├── .github
│   └── copilot-instructions.md
├── .gitignore
├── .gitmodules
├── README.md
├── checkpoints
│   └── .gitkeep
├── configs
│   ├── dataset
│   │   ├── nclt
│   │   │   ├── all_camera_lidar.yaml
│   │   │   ├── all_camera_semantic.yaml
│   │   │   ├── all_camera_semantic_lidar.yaml
│   │   │   ├── all_camera_semantic_text.yaml
│   │   │   ├── all_camera_semantic_text_lidar.yaml
│   │   │   ├── all_camera_text.yaml
│   │   │   ├── all_camera_text_lidar.yaml
│   │   │   ├── camera1.yaml
│   │   │   ├── camera1_lidar.yaml
│   │   │   ├── camera2-front-back.yaml
│   │   │   ├── camera2-left-right.yaml
│   │   │   ├── camera5.yaml
│   │   │   ├── lidar.yaml
│   │   │   ├── semantic1.yaml
│   │   │   ├── semantic2-front-back.yaml
│   │   │   ├── semantic2-left-right.yaml
│   │   │   ├── semantic5.yaml
│   │   │   ├── text1_clip-base.yaml
│   │   │   ├── text1_clip-large.yaml
│   │   │   ├── text1_tfidf.yaml
│   │   │   ├── text2-front-back_clip-base.yaml
│   │   │   ├── text2-front-back_clip-large.yaml
│   │   │   ├── text2-front-back_tfidf.yaml
│   │   │   ├── text2-left-right_clip-base.yaml
│   │   │   ├── text2-left-right_clip-large.yaml
│   │   │   ├── text2-left-right_tfidf.yaml
│   │   │   ├── text5_clip-base.yaml
│   │   │   ├── text5_clip-large.yaml
│   │   │   └── text5_tfidf.yaml
│   │   └── oxford
│   │       ├── all_camera_lidar.yaml
│   │       ├── all_camera_semantic.yaml
│   │       ├── all_camera_semantic_lidar.yaml
│   │       ├── all_camera_semantic_text.yaml
│   │       ├── all_camera_semantic_text_lidar.yaml
│   │       ├── all_camera_text.yaml
│   │       ├── all_camera_text_lidar.yaml
│   │       ├── camera1.yaml
│   │       ├── camera1_lidar.yaml
│   │       ├── camera2-front-back.yaml
│   │       ├── camera2-left-right.yaml
│   │       ├── camera4.yaml
│   │       ├── lidar.yaml
│   │       ├── semantic1.yaml
│   │       ├── semantic2-front-back.yaml
│   │       ├── semantic2-left-right.yaml
│   │       ├── semantic4.yaml
│   │       ├── text1_clip-base.yaml
│   │       ├── text1_clip-large.yaml
│   │       ├── text1_tfidf.yaml
│   │       ├── text2-front-back_clip-base.yaml
│   │       ├── text2-front-back_clip-large.yaml
│   │       ├── text2-front-back_tfidf.yaml
│   │       ├── text2-left-right_clip-base.yaml
│   │       ├── text2-left-right_clip-large.yaml
│   │       ├── text2-left-right_tfidf.yaml
│   │       ├── text4_clip-base.yaml
│   │       ├── text4_clip-large.yaml
│   │       └── text4_tfidf.yaml
│   ├── loss
│   │   └── batch_hard_triplet_margin.yaml
│   ├── model
│   │   ├── camera1.yaml
│   │   ├── camera2_add.yaml
│   │   ├── camera2_concat.yaml
│   │   ├── camera2_gem.yaml
│   │   ├── camera2_mlp-full.yaml
│   │   ├── camera2_mlp-half.yaml
│   │   ├── camera2_sa-add.yaml
│   │   ├── camera2_sa-concat.yaml
│   │   ├── convnext_camera1.yaml
│   │   ├── convnext_semantic1.yaml
│   │   ├── lidar.yaml
│   │   ├── minkloc-multimodal.yaml
│   │   ├── minkloc3dv2.yaml
│   │   ├── mssplace-i.yaml
│   │   ├── mssplace-li.yaml
│   │   ├── mssplace-lis.yaml
│   │   ├── mssplace-list.yaml
│   │   ├── mssplace-lit.yaml
│   │   ├── semantic1.yaml
│   │   ├── semantic2_add.yaml
│   │   ├── semantic2_concat.yaml
│   │   ├── semantic2_mlp-full.yaml
│   │   ├── semantic2_mlp-half.yaml
│   │   ├── semantic2_sa-add.yaml
│   │   ├── semantic2_sa-concat.yaml
│   │   ├── text1_clip-base-mlp.yaml
│   │   ├── text1_clip-large-mlp.yaml
│   │   ├── text1_tfidf-mlp.yaml
│   │   ├── text2_clip-base-mlp-add.yaml
│   │   ├── text2_clip-base-mlp-concat.yaml
│   │   ├── text2_clip-large-mlp-add.yaml
│   │   ├── text2_clip-large-mlp-concat.yaml
│   │   ├── text2_tfidf-mlp-add.yaml
│   │   └── text2_tfidf-mlp-concat.yaml
│   ├── optimizer
│   │   └── adam.yaml
│   ├── sampler
│   │   └── batch_sampler.yaml
│   ├── scheduler
│   │   └── multi_step.yaml
│   └── train_unimodal.yaml
├── docker
│   ├── Dockerfile.cuda
│   ├── build.sh
│   ├── into.sh
│   └── start.sh
├── images
│   └── mssplace_overview.jpg
├── pyproject.toml
├── requirements-dev.txt
├── requirements.txt
├── scripts
│   └── evaluation
│       ├── evaluate_checkpoints.py
│       └── failure_cases.ipynb
├── src
│   └── mssplace
│       ├── __init__.py
│       ├── datasets.py
│       ├── modality_interaction_layers.py
│       └── models.py
└── train_unimodal.py
/.github/copilot-instructions.md:
--------------------------------------------------------------------------------
---
applyTo: '**'
---

# Project-Specific Guidance – Research Reproducibility

## 🧪 Research Code Clarity

- Prioritize code **readability and clarity** over performance.
- Use **explicit, well-named variables** and **clear control flow**.

## 🧾 Reproducibility and Documentation

- Ensure all code is **well-documented** with in-line comments and Google-style docstrings.
- Write **comprehensive Markdown instructions** for reproducing experiments.

## 🔬 PyTorch and Python Modernity

- Use **Python 3.10+** and **PyTorch 2.1+** features idiomatically.
- Prefer built-in types and modern syntax for clarity.

## Goal

Support reproducible research with clean, understandable code and thorough documentation.
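For illustration, a small function in the style this guidance asks for (a hypothetical example, not code taken from this repository): built-in generic types, explicit names, and a Google-style docstring.

```python
def pairwise_distances(descriptors: list[list[float]]) -> list[list[float]]:
    """Compute pairwise Euclidean distances between descriptor vectors.

    Args:
        descriptors: A list of equal-length descriptor vectors.

    Returns:
        A square matrix where entry (i, j) is the Euclidean distance
        between descriptors i and j.
    """
    num_vectors = len(descriptors)
    distances = [[0.0] * num_vectors for _ in range(num_vectors)]
    for i in range(num_vectors):
        for j in range(i + 1, num_vectors):
            squared_sum = sum(
                (a - b) ** 2 for a, b in zip(descriptors[i], descriptors[j])
            )
            distances[i][j] = distances[j][i] = squared_sum**0.5
    return distances
```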
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
checkpoints/*
!checkpoints/.gitkeep
outputs/
.ruff_cache/

# Development and organization files (keep local, don't publish)
work_in_progress/

### Defaults for Python:

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
[submodule "third_party/OpenPlaceRecognition"]
	path = third_party/OpenPlaceRecognition
	url = https://github.com/alexmelekhin/OpenPlaceRecognition.git
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # MSSPlace: Multi-Sensor Place Recognition with Visual and Text Semantics
2 |
3 | This repository contains the code for the paper "MSSPlace: Multi-Sensor Place Recognition with Visual and Text Semantics".
4 |
5 | 
6 |
7 | _High-level overview of the proposed multimodal MSSPlace method. The MSSPlace Model has a modular architecture and consists of four branches: the Image Encoder, Semantic Masks Encoder, Text Encoder, and Point Cloud Encoder. Each branch encodes the input data into a descriptor, capturing the essential information specific to its respective modality. Subsequently, a descriptor aggregation step is performed to combine these individual descriptors and obtain the global place descriptor, which represents the comprehensive characteristics of the vehicle place._
8 |
9 | ## Installation
10 |
11 | Initialize submodules and build the Docker environment:
12 |
13 | ```bash
14 | git submodule update --init --recursive
15 | bash docker/build.sh
16 | bash docker/start.sh [DATASETS_DIR] # DATASETS_DIR will be mounted at /home/docker_mssplace/Datasets
17 | bash docker/into.sh
18 | ```
19 |
20 | ### Package Installation
21 |
22 | Install the MSSPlace package in editable mode for development:
23 |
24 | ```bash
25 | pip install -e .
26 | ```
27 |
28 | This installs the package from the `src/mssplace` directory, allowing you to import `mssplace` modules directly without path modifications.
29 |
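As a quick sanity check of the editable install, the modules listed under `src/mssplace/` should be importable (a minimal check, not part of the repository's tooling):

```python
# Verify the editable install; module names come from the src/mssplace/ listing above.
import mssplace.datasets
import mssplace.modality_interaction_layers
import mssplace.models

print("mssplace package is importable")
```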
## Quick Start

Evaluate pre-trained models on the Oxford RobotCar or NCLT datasets:

```bash
# Download checkpoints and datasets first (see sections below)
python scripts/evaluation/evaluate_checkpoints.py --dataset oxford --model mssplace-li
python scripts/evaluation/evaluate_checkpoints.py --dataset nclt --model mssplace-list --verbose
```

## Evaluation

### Performance Metrics

- **AR@1**: Average Recall (%) — the fraction of queries whose top-1 retrieved database entry is a true match
- **AR@1%**: Average Recall (%) — the fraction of queries with at least one true match among the top 1% of the database (see the sketch below)
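For concreteness, here is a minimal sketch of how both metrics can be computed from precomputed global descriptors. It is illustrative only: the repository's actual evaluation logic lives in `scripts/evaluation/evaluate_checkpoints.py`, and the variable names and normalization assumptions below are ours.

```python
import numpy as np

def average_recall(queries: np.ndarray, database: np.ndarray,
                   positives: list[set[int]]) -> tuple[float, float]:
    """Compute (AR@1, AR@1%) given L2-normalized descriptor matrices.

    `positives[i]` holds the database indices that are true matches for
    query i; queries without any positives are skipped.
    """
    top_one_percent = max(1, round(0.01 * len(database)))
    hits_at_1 = hits_at_1p = num_valid = 0
    for query, true_matches in zip(queries, positives):
        if not true_matches:
            continue
        num_valid += 1
        # Cosine similarity reduces to a dot product for normalized vectors.
        ranking = np.argsort(-(database @ query))
        hits_at_1 += int(int(ranking[0]) in true_matches)
        hits_at_1p += int(any(int(i) in true_matches for i in ranking[:top_one_percent]))
    return 100.0 * hits_at_1 / num_valid, 100.0 * hits_at_1p / num_valid
```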
### Model Variants

| Model | Modalities | AR@1 (Oxford) | AR@1% (Oxford) | AR@1 (NCLT) | AR@1% (NCLT) | Description |
|-------|------------|---------------|----------------|-------------|--------------|-------------|
| `mssplace-li` | LiDAR + Images | 98.21% | 99.53% | 94.67% | 97.72% | Basic multimodal |
| `mssplace-lis` | LiDAR + Images + Semantic | **98.55%** | **99.64%** | **95.37%** | **97.84%** | Adds semantic segmentation |
| `mssplace-lit` | LiDAR + Images + Text | 98.22% | 99.53% | 92.36% | 96.51% | Adds text descriptions |
| `mssplace-list` | LiDAR + Images + Semantic + Text | **98.55%** | **99.64%** | 94.15% | 96.97% | Complete multimodal |

*Performance metrics measured on the Oxford RobotCar and NCLT datasets. Best results per dataset are highlighted in bold.*

**Key Insights:**
- `mssplace-lis` achieves the best performance on NCLT, while `mssplace-lis` and `mssplace-list` tie for best on Oxford
- Semantic segmentation consistently helps place recognition on both datasets
- The text modality is dataset-dependent: it hurts performance on NCLT but is roughly neutral on Oxford
- The Oxford dataset appears easier than NCLT: all models exceed 98% AR@1, versus 92-95% on NCLT
- The complete multimodal `mssplace-list` performs well but does not consistently exceed the text-free `mssplace-lis`

### Pre-trained Checkpoints

⚠️ **Work in Progress**: Checkpoint download links will be updated soon. Please check back later for access to pre-trained models.

### Datasets

⚠️ **Work in Progress**: Preprocessed datasets will be made publicly available for download soon. Please check back later for dataset access.

### Directory Structure

```
/home/docker_mssplace/
├── MSSPlace/                      # This repository
│   ├── checkpoints/               # Downloaded checkpoints
│   ├── configs/                   # Configuration files
│   ├── docker/                    # Docker environment setup
│   ├── docs/                      # Documentation and examples
│   ├── images/                    # Example images and figures
│   ├── scripts/                   # Organized scripts
│   ├── src/                       # Core source code
│   └── third_party/               # External dependencies
│       └── OpenPlaceRecognition/  # Core OPR library
└── Datasets/                      # Dataset directory (configurable with --datasets-dir)
    ├── pnvlad_oxford_robotcar/
    └── NCLT_preprocessed/
```

### Key Arguments

| Argument | Default | Description |
|----------|---------|-------------|
| `--dataset` | *Required* | `oxford` or `nclt` |
| `--model` | *Required* | Model variant (see table above) |
| `--datasets-dir` | `/home/docker_mssplace/Datasets` | Path to datasets directory |
| `--checkpoints-dir` | `./checkpoints` | Path to model checkpoints |
| `--configs-dir` | `./configs/model` | Path to model configurations |
| `--batch-size` | `32` | Evaluation batch size |
| `--verbose` | `False` | Enable detailed logging |

**Example Usage:**
```bash
# Basic evaluation
python scripts/evaluation/evaluate_checkpoints.py --dataset oxford --model mssplace-li

# Custom dataset location
python scripts/evaluation/evaluate_checkpoints.py \
    --dataset nclt --model mssplace-lis \
    --datasets-dir /path/to/your/datasets \
    --verbose
```

## Training (Optional)

⚠️ **Work in Progress**: Training documentation and scripts will be updated soon. Please check back later for training instructions.

## Troubleshooting

- **Missing checkpoints**: Download all `.pth` files into `checkpoints/`
- **Dataset errors**: Verify that the directory structure matches the expected format
- **CUDA memory**: Reduce `--batch-size` on out-of-memory errors
- **Dependencies**: Use the provided Docker environment
--------------------------------------------------------------------------------
/checkpoints/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alexmelekhin/MSSPlace/9ac48de75ee6a4ea01dac99d336d3a529bd73b61/checkpoints/.gitkeep
--------------------------------------------------------------------------------
/configs/dataset/nclt/all_camera_lidar.yaml:
--------------------------------------------------------------------------------
_target_: src.datasets.NCLTDatasetWithText

dataset_root: /path/to/dataset  # replace with the actual dataset location
# All five NCLT cameras plus the LiDAR point cloud.
data_to_load: [image_Cam1, image_Cam2, image_Cam3, image_Cam4, image_Cam5,
               pointcloud_lidar,]
positive_threshold: 10.0  # meters; frames closer than this count as positives
negative_threshold: 50.0  # meters; frames farther than this count as negatives
images_dirname: images_small
masks_dirname: segmentation_masks_small
text_embeddings_dirname: clip-vit-base-patch32
pointclouds_dirname: velodyne_data
pointcloud_quantization_size: 0.5
max_point_distance: 100.0
spherical_coords: False
use_intensity_values: False
image_transform: null
semantic_transform: null
pointcloud_transform: null
pointcloud_set_transform: null
--------------------------------------------------------------------------------
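These dataset configs follow the Hydra convention: `_target_` names the class to instantiate, and every other key is forwarded as a constructor argument. Below is a minimal sketch of loading the file above (it assumes `hydra-core`/`omegaconf` are installed and uses an illustrative dataset path; the repository's real entry points may wire this up differently):

```python
from hydra.utils import instantiate
from omegaconf import OmegaConf

# Load the config above and point it at an actual dataset location.
cfg = OmegaConf.load("configs/dataset/nclt/all_camera_lidar.yaml")
cfg.dataset_root = "/home/docker_mssplace/Datasets/NCLT_preprocessed"  # assumed path

# Resolves `_target_` and calls the class with the remaining keys as kwargs.
dataset = instantiate(cfg)
```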
/configs/dataset/nclt/all_camera_semantic.yaml:
--------------------------------------------------------------------------------
_target_: src.datasets.NCLTDatasetWithText

dataset_root: /path/to/dataset
data_to_load: [image_Cam1, image_Cam2, image_Cam3, image_Cam4, image_Cam5,
               mask_Cam1, mask_Cam2, mask_Cam3, mask_Cam4, mask_Cam5,]
positive_threshold: 10.0
negative_threshold: 50.0
images_dirname: images_small
masks_dirname: segmentation_masks_small
text_embeddings_dirname: clip-vit-base-patch32
pointclouds_dirname: velodyne_data
pointcloud_quantization_size: 0.5
max_point_distance: 100.0
spherical_coords: False
use_intensity_values: False
image_transform: null
semantic_transform: null
pointcloud_transform: null
pointcloud_set_transform: null
--------------------------------------------------------------------------------
/configs/dataset/nclt/all_camera_semantic_lidar.yaml:
--------------------------------------------------------------------------------
_target_: src.datasets.NCLTDatasetWithText

dataset_root: /path/to/dataset
data_to_load: [image_Cam1, image_Cam2, image_Cam3, image_Cam4, image_Cam5,
               mask_Cam1, mask_Cam2, mask_Cam3, mask_Cam4, mask_Cam5,
               pointcloud_lidar,]
positive_threshold: 10.0
negative_threshold: 50.0
images_dirname: images_small
masks_dirname: segmentation_masks_small
text_embeddings_dirname: clip-vit-base-patch32
pointclouds_dirname: velodyne_data
pointcloud_quantization_size: 0.5
max_point_distance: 100.0
spherical_coords: False
use_intensity_values: False
image_transform: null
semantic_transform: null
pointcloud_transform: null
pointcloud_set_transform: null
--------------------------------------------------------------------------------
/configs/dataset/nclt/all_camera_semantic_text.yaml:
--------------------------------------------------------------------------------
_target_: src.datasets.NCLTDatasetWithText

dataset_root: /path/to/dataset
data_to_load: [image_Cam1, image_Cam2, image_Cam3, image_Cam4, image_Cam5,
               mask_Cam1, mask_Cam2, mask_Cam3, mask_Cam4, mask_Cam5,
               text_Cam1, text_Cam2, text_Cam3, text_Cam4, text_Cam5]
positive_threshold: 10.0
negative_threshold: 50.0
images_dirname: images_small
masks_dirname: segmentation_masks_small
text_embeddings_dirname: clip-vit-base-patch32
pointclouds_dirname: velodyne_data
pointcloud_quantization_size: 0.5
max_point_distance: 100.0
spherical_coords: False
use_intensity_values: False
image_transform: null
semantic_transform: null
pointcloud_transform: null
pointcloud_set_transform: null
--------------------------------------------------------------------------------
/configs/dataset/nclt/all_camera_semantic_text_lidar.yaml:
--------------------------------------------------------------------------------
_target_: src.datasets.NCLTDatasetWithText

dataset_root: /path/to/dataset
data_to_load: [image_Cam1, image_Cam2, image_Cam3, image_Cam4, image_Cam5,
               mask_Cam1, mask_Cam2, mask_Cam3, mask_Cam4, mask_Cam5,
               text_Cam1, text_Cam2, text_Cam3, text_Cam4, text_Cam5,
               pointcloud_lidar,]
positive_threshold: 10.0
negative_threshold: 50.0
images_dirname: images_small
masks_dirname: segmentation_masks_small
text_embeddings_dirname: clip-vit-base-patch32
pointclouds_dirname: velodyne_data
pointcloud_quantization_size: 0.5
max_point_distance: 100.0
spherical_coords: False
use_intensity_values: False
image_transform: null
semantic_transform: null
pointcloud_transform: null
pointcloud_set_transform: null
--------------------------------------------------------------------------------
/configs/dataset/nclt/all_camera_text.yaml:
--------------------------------------------------------------------------------
_target_: src.datasets.NCLTDatasetWithText

dataset_root: /path/to/dataset
data_to_load: [image_Cam1, image_Cam2, image_Cam3, image_Cam4, image_Cam5,
               text_Cam1, text_Cam2, text_Cam3, text_Cam4, text_Cam5,]
positive_threshold: 10.0
negative_threshold: 50.0
images_dirname: images_small
masks_dirname: segmentation_masks_small
text_embeddings_dirname: clip-vit-base-patch32
pointclouds_dirname: velodyne_data
pointcloud_quantization_size: 0.5
max_point_distance: 100.0
spherical_coords: False
use_intensity_values: False
image_transform: null
semantic_transform: null
pointcloud_transform: null
pointcloud_set_transform: null
--------------------------------------------------------------------------------
/configs/dataset/nclt/all_camera_text_lidar.yaml:
--------------------------------------------------------------------------------
_target_: src.datasets.NCLTDatasetWithText

dataset_root: /path/to/dataset
data_to_load: [image_Cam1, image_Cam2, image_Cam3, image_Cam4, image_Cam5,
               text_Cam1, text_Cam2, text_Cam3, text_Cam4, text_Cam5,
               pointcloud_lidar,]
positive_threshold: 10.0
negative_threshold: 50.0
images_dirname: images_small
masks_dirname: segmentation_masks_small
text_embeddings_dirname: clip-vit-base-patch32
pointclouds_dirname: velodyne_data
pointcloud_quantization_size: 0.5
max_point_distance: 100.0
spherical_coords: False
use_intensity_values: False
image_transform: null
semantic_transform: null
pointcloud_transform: null
pointcloud_set_transform: null
--------------------------------------------------------------------------------
/configs/dataset/nclt/camera1.yaml:
--------------------------------------------------------------------------------
_target_: opr.datasets.nclt.NCLTDataset

dataset_root: /path/to/dataset
data_to_load: [image_Cam5,]
positive_threshold: 10.0
negative_threshold: 50.0
images_dirname: images_small
masks_dirname: segmentation_masks_small
pointclouds_dirname: velodyne_data
pointcloud_quantization_size: 0.5
max_point_distance: 100.0
spherical_coords: False
use_intensity_values: False
image_transform: null
semantic_transform: null
pointcloud_transform: null
pointcloud_set_transform: null
--------------------------------------------------------------------------------
/configs/dataset/nclt/camera1_lidar.yaml:
--------------------------------------------------------------------------------
_target_: src.datasets.NCLTDatasetWithText

dataset_root: /path/to/dataset
data_to_load: [image_Cam1,
               pointcloud_lidar,]
positive_threshold: 10.0
negative_threshold: 50.0
images_dirname: images_small
masks_dirname: segmentation_masks_small
text_embeddings_dirname: clip-vit-base-patch32
pointclouds_dirname: velodyne_data
pointcloud_quantization_size: 0.5
max_point_distance: 100.0
spherical_coords: False
use_intensity_values: False
image_transform: null
semantic_transform: null
pointcloud_transform: null
pointcloud_set_transform: null
--------------------------------------------------------------------------------
/configs/dataset/nclt/camera2-front-back.yaml:
--------------------------------------------------------------------------------
_target_: opr.datasets.nclt.NCLTDataset

dataset_root: /path/to/dataset
data_to_load: [image_Cam5, image_Cam2]
positive_threshold: 10.0
negative_threshold: 50.0
images_dirname: images_small
masks_dirname: segmentation_masks_small
pointclouds_dirname: velodyne_data
pointcloud_quantization_size: 0.5
max_point_distance: 100.0
spherical_coords: False
use_intensity_values: False
image_transform: null
semantic_transform: null
pointcloud_transform: null
pointcloud_set_transform: null
--------------------------------------------------------------------------------
/configs/dataset/nclt/camera2-left-right.yaml:
--------------------------------------------------------------------------------
_target_: opr.datasets.nclt.NCLTDataset

dataset_root: /path/to/dataset
data_to_load: [image_Cam1, image_Cam4]
positive_threshold: 10.0
negative_threshold: 50.0
images_dirname: images_small
masks_dirname: segmentation_masks_small
pointclouds_dirname: velodyne_data
pointcloud_quantization_size: 0.5
max_point_distance: 100.0
spherical_coords: False
use_intensity_values: False
image_transform: null
semantic_transform: null
pointcloud_transform: null
pointcloud_set_transform: null
--------------------------------------------------------------------------------
/configs/dataset/nclt/camera5.yaml:
--------------------------------------------------------------------------------
_target_: opr.datasets.nclt.NCLTDataset

dataset_root: /path/to/dataset
data_to_load: [image_Cam1, image_Cam2, image_Cam3, image_Cam4, image_Cam5,]
positive_threshold: 10.0
negative_threshold: 50.0
images_dirname: images_small
masks_dirname: segmentation_masks_small
pointclouds_dirname: velodyne_data
pointcloud_quantization_size: 0.5
max_point_distance: 100.0
spherical_coords: False
use_intensity_values: False
image_transform: null
semantic_transform: null
pointcloud_transform: null
pointcloud_set_transform: null
--------------------------------------------------------------------------------
/configs/dataset/nclt/lidar.yaml:
--------------------------------------------------------------------------------
_target_: opr.datasets.nclt.NCLTDataset

dataset_root: /path/to/dataset
data_to_load: [pointcloud_lidar,]
positive_threshold: 10.0
negative_threshold: 50.0
images_dirname: images_small
masks_dirname: segmentation_masks_small
pointclouds_dirname: velodyne_data
pointcloud_quantization_size: 0.5
max_point_distance: 100.0
spherical_coords: False
use_intensity_values: False
image_transform: null
semantic_transform: null
pointcloud_transform: null
pointcloud_set_transform: null
--------------------------------------------------------------------------------
/configs/dataset/nclt/semantic1.yaml:
--------------------------------------------------------------------------------
_target_: opr.datasets.nclt.NCLTDataset

dataset_root: /path/to/dataset
data_to_load: [mask_Cam5,]
positive_threshold: 10.0
negative_threshold: 50.0
images_dirname: images_small
masks_dirname: segmentation_masks_small
pointclouds_dirname: velodyne_data
pointcloud_quantization_size: 0.5
max_point_distance: 100.0
spherical_coords: False
use_intensity_values: False
image_transform: null
semantic_transform: null
pointcloud_transform: null
pointcloud_set_transform: null
--------------------------------------------------------------------------------
/configs/dataset/nclt/semantic2-front-back.yaml:
--------------------------------------------------------------------------------
_target_: opr.datasets.nclt.NCLTDataset

dataset_root: /path/to/dataset
data_to_load: [mask_Cam5, mask_Cam2]
positive_threshold: 10.0
negative_threshold: 50.0
images_dirname: images_small
masks_dirname: segmentation_masks_small
pointclouds_dirname: velodyne_data
pointcloud_quantization_size: 0.5
max_point_distance: 100.0
spherical_coords: False
use_intensity_values: False
image_transform: null
semantic_transform: null
pointcloud_transform: null
pointcloud_set_transform: null
--------------------------------------------------------------------------------
/configs/dataset/nclt/semantic2-left-right.yaml:
--------------------------------------------------------------------------------
_target_: opr.datasets.nclt.NCLTDataset

dataset_root: /path/to/dataset
data_to_load: [mask_Cam1, mask_Cam4]
positive_threshold: 10.0
negative_threshold: 50.0
images_dirname: images_small
masks_dirname: segmentation_masks_small
pointclouds_dirname: velodyne_data
pointcloud_quantization_size: 0.5
max_point_distance: 100.0
spherical_coords: False
use_intensity_values: False
image_transform: null
semantic_transform: null
pointcloud_transform: null
pointcloud_set_transform: null
--------------------------------------------------------------------------------
/configs/dataset/nclt/semantic5.yaml:
--------------------------------------------------------------------------------
_target_: opr.datasets.nclt.NCLTDataset

dataset_root: /path/to/dataset
data_to_load: [mask_Cam1, mask_Cam2, mask_Cam3, mask_Cam4, mask_Cam5,]
positive_threshold: 10.0
negative_threshold: 50.0
images_dirname: images_small
masks_dirname: segmentation_masks_small
pointclouds_dirname: velodyne_data
pointcloud_quantization_size: 0.5
max_point_distance: 100.0
spherical_coords: False
use_intensity_values: False
image_transform: null
semantic_transform: null
pointcloud_transform: null
pointcloud_set_transform: null
--------------------------------------------------------------------------------
/configs/dataset/nclt/text1_clip-base.yaml:
--------------------------------------------------------------------------------
_target_: src.datasets.NCLTDatasetWithText

dataset_root: /path/to/dataset
data_to_load: [text_Cam5,]
positive_threshold: 10.0
negative_threshold: 50.0
images_dirname: images_small
masks_dirname: segmentation_masks_small
text_embeddings_dirname: clip-vit-base-patch32
pointclouds_dirname: velodyne_data
pointcloud_quantization_size: 0.5
max_point_distance: 100.0
spherical_coords: False
use_intensity_values: False
image_transform: null
semantic_transform: null
pointcloud_transform: null
pointcloud_set_transform: null
--------------------------------------------------------------------------------
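The `text_embeddings_dirname` values (`clip-vit-base-patch32`, `clip-vit-large-patch14`, `tfidf_pca`) refer to directories of precomputed text embeddings. Below is a hedged sketch of how CLIP-based embeddings of this kind could be produced with Hugging Face `transformers`; the caption and pooling choice are assumptions, as the repository's actual preprocessing pipeline is not shown in this dump.

```python
import torch
from transformers import CLIPTextModel, CLIPTokenizer

model_name = "openai/clip-vit-base-patch32"  # matches the dirname above
tokenizer = CLIPTokenizer.from_pretrained(model_name)
text_encoder = CLIPTextModel.from_pretrained(model_name).eval()

caption = "a brick building beside a tree-lined campus road"  # hypothetical caption
inputs = tokenizer(caption, return_tensors="pt", padding=True, truncation=True)
with torch.no_grad():
    # Pooled (EOS-token) representation: one embedding per caption.
    embedding = text_encoder(**inputs).pooler_output.squeeze(0)
print(embedding.shape)  # torch.Size([512]) for the base model
```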
/configs/dataset/nclt/text1_clip-large.yaml:
--------------------------------------------------------------------------------
_target_: src.datasets.NCLTDatasetWithText

dataset_root: /path/to/dataset
data_to_load: [text_Cam5,]
positive_threshold: 10.0
negative_threshold: 50.0
images_dirname: images_small
masks_dirname: segmentation_masks_small
text_embeddings_dirname: clip-vit-large-patch14
pointclouds_dirname: velodyne_data
pointcloud_quantization_size: 0.5
max_point_distance: 100.0
spherical_coords: False
use_intensity_values: False
image_transform: null
semantic_transform: null
pointcloud_transform: null
pointcloud_set_transform: null
--------------------------------------------------------------------------------
/configs/dataset/nclt/text1_tfidf.yaml:
--------------------------------------------------------------------------------
_target_: src.datasets.NCLTDatasetWithText

dataset_root: /path/to/dataset
data_to_load: [text_Cam5,]
positive_threshold: 10.0
negative_threshold: 50.0
images_dirname: images_small
masks_dirname: segmentation_masks_small
text_embeddings_dirname: tfidf_pca
pointclouds_dirname: velodyne_data
pointcloud_quantization_size: 0.5
max_point_distance: 100.0
spherical_coords: False
use_intensity_values: False
image_transform: null
semantic_transform: null
pointcloud_transform: null
pointcloud_set_transform: null
--------------------------------------------------------------------------------
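The `tfidf_pca` variant suggests TF-IDF text features reduced with PCA. A hedged sketch of that pipeline with scikit-learn follows (the captions, vocabulary, and output dimensionality are illustrative; the repository's actual preprocessing is not shown in this dump):

```python
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer

captions = [
    "a brick building beside a road",     # hypothetical captions,
    "an open parking lot with few cars",  # one per camera frame
    "a tree-lined path near a lawn",
]
tfidf = TfidfVectorizer().fit_transform(captions).toarray()
embeddings = PCA(n_components=2).fit_transform(tfidf)  # small n for the toy example
print(embeddings.shape)  # (3, 2)
```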
/configs/dataset/nclt/text2-front-back_clip-base.yaml:
--------------------------------------------------------------------------------
_target_: src.datasets.NCLTDatasetWithText

dataset_root: /path/to/dataset
data_to_load: [text_Cam5, text_Cam2,]
positive_threshold: 10.0
negative_threshold: 50.0
images_dirname: images_small
masks_dirname: segmentation_masks_small
text_embeddings_dirname: clip-vit-base-patch32
pointclouds_dirname: velodyne_data
pointcloud_quantization_size: 0.5
max_point_distance: 100.0
spherical_coords: False
use_intensity_values: False
image_transform: null
semantic_transform: null
pointcloud_transform: null
pointcloud_set_transform: null
--------------------------------------------------------------------------------
/configs/dataset/nclt/text2-front-back_clip-large.yaml:
--------------------------------------------------------------------------------
_target_: src.datasets.NCLTDatasetWithText

dataset_root: /path/to/dataset
data_to_load: [text_Cam5, text_Cam2,]
positive_threshold: 10.0
negative_threshold: 50.0
images_dirname: images_small
masks_dirname: segmentation_masks_small
text_embeddings_dirname: clip-vit-large-patch14
pointclouds_dirname: velodyne_data
pointcloud_quantization_size: 0.5
max_point_distance: 100.0
spherical_coords: False
use_intensity_values: False
image_transform: null
semantic_transform: null
pointcloud_transform: null
pointcloud_set_transform: null
--------------------------------------------------------------------------------
/configs/dataset/nclt/text2-front-back_tfidf.yaml:
--------------------------------------------------------------------------------
_target_: src.datasets.NCLTDatasetWithText

dataset_root: /path/to/dataset
data_to_load: [text_Cam5, text_Cam2,]
positive_threshold: 10.0
negative_threshold: 50.0
images_dirname: images_small
masks_dirname: segmentation_masks_small
text_embeddings_dirname: tfidf_pca
pointclouds_dirname: velodyne_data
pointcloud_quantization_size: 0.5
max_point_distance: 100.0
spherical_coords: False
use_intensity_values: False
image_transform: null
semantic_transform: null
pointcloud_transform: null
pointcloud_set_transform: null
--------------------------------------------------------------------------------
/configs/dataset/nclt/text2-left-right_clip-base.yaml:
--------------------------------------------------------------------------------
_target_: src.datasets.NCLTDatasetWithText

dataset_root: /path/to/dataset
data_to_load: [text_Cam1, text_Cam4,]
positive_threshold: 10.0
negative_threshold: 50.0
images_dirname: images_small
masks_dirname: segmentation_masks_small
text_embeddings_dirname: clip-vit-base-patch32
pointclouds_dirname: velodyne_data
pointcloud_quantization_size: 0.5
max_point_distance: 100.0
spherical_coords: False
use_intensity_values: False
image_transform: null
semantic_transform: null
pointcloud_transform: null
pointcloud_set_transform: null
--------------------------------------------------------------------------------
/configs/dataset/nclt/text2-left-right_clip-large.yaml:
--------------------------------------------------------------------------------
_target_: src.datasets.NCLTDatasetWithText

dataset_root: /path/to/dataset
data_to_load: [text_Cam1, text_Cam4,]
positive_threshold: 10.0
negative_threshold: 50.0
images_dirname: images_small
masks_dirname: segmentation_masks_small
text_embeddings_dirname: clip-vit-large-patch14
pointclouds_dirname: velodyne_data
pointcloud_quantization_size: 0.5
max_point_distance: 100.0
spherical_coords: False
use_intensity_values: False
image_transform: null
semantic_transform: null
pointcloud_transform: null
pointcloud_set_transform: null
--------------------------------------------------------------------------------
/configs/dataset/nclt/text2-left-right_tfidf.yaml:
--------------------------------------------------------------------------------
_target_: src.datasets.NCLTDatasetWithText

dataset_root: /path/to/dataset
data_to_load: [text_Cam1, text_Cam4,]
positive_threshold: 10.0
negative_threshold: 50.0
images_dirname: images_small
masks_dirname: segmentation_masks_small
text_embeddings_dirname: tfidf_pca
pointclouds_dirname: velodyne_data
pointcloud_quantization_size: 0.5
max_point_distance: 100.0
spherical_coords: False
use_intensity_values: False
image_transform: null
semantic_transform: null
pointcloud_transform: null
pointcloud_set_transform: null
--------------------------------------------------------------------------------
/configs/dataset/nclt/text5_clip-base.yaml:
--------------------------------------------------------------------------------
_target_: src.datasets.NCLTDatasetWithText

dataset_root: /path/to/dataset
data_to_load: [text_Cam1, text_Cam2, text_Cam3, text_Cam4, text_Cam5,]
positive_threshold: 10.0
negative_threshold: 50.0
images_dirname: images_small
masks_dirname: segmentation_masks_small
text_embeddings_dirname: clip-vit-base-patch32
pointclouds_dirname: velodyne_data
pointcloud_quantization_size: 0.5
max_point_distance: 100.0
spherical_coords: False
use_intensity_values: False
image_transform: null
semantic_transform: null
pointcloud_transform: null
pointcloud_set_transform: null
--------------------------------------------------------------------------------
/configs/dataset/nclt/text5_clip-large.yaml:
--------------------------------------------------------------------------------
_target_: src.datasets.NCLTDatasetWithText

dataset_root: /path/to/dataset
data_to_load: [text_Cam1, text_Cam2, text_Cam3, text_Cam4, text_Cam5,]
positive_threshold: 10.0
negative_threshold: 50.0
images_dirname: images_small
masks_dirname: segmentation_masks_small
text_embeddings_dirname: clip-vit-large-patch14
pointclouds_dirname: velodyne_data
pointcloud_quantization_size: 0.5
max_point_distance: 100.0
spherical_coords: False
use_intensity_values: False
image_transform: null
semantic_transform: null
pointcloud_transform: null
pointcloud_set_transform: null
--------------------------------------------------------------------------------
/configs/dataset/nclt/text5_tfidf.yaml:
--------------------------------------------------------------------------------
_target_: src.datasets.NCLTDatasetWithText

dataset_root: /path/to/dataset
data_to_load: [text_Cam1, text_Cam2, text_Cam3, text_Cam4, text_Cam5,]
positive_threshold: 10.0
negative_threshold: 50.0
images_dirname: images_small
masks_dirname: segmentation_masks_small
text_embeddings_dirname: tfidf_pca
pointclouds_dirname: velodyne_data
pointcloud_quantization_size: 0.5
max_point_distance: 100.0
spherical_coords: False
use_intensity_values: False
image_transform: null
semantic_transform: null
pointcloud_transform: null
pointcloud_set_transform: null
--------------------------------------------------------------------------------
/configs/dataset/oxford/all_camera_lidar.yaml:
--------------------------------------------------------------------------------
_target_: src.datasets.OxfordDatasetWithText

dataset_root: /path/to/dataset  # replace with the actual dataset location
# All four Oxford RobotCar cameras plus the LiDAR point cloud.
data_to_load: [image_stereo_centre, image_mono_left, image_mono_rear, image_mono_right,
               pointcloud_lidar,]
positive_threshold: 10.0
negative_threshold: 50.0
images_dirname: images_small
masks_dirname: segmentation_masks_small
text_embeddings_dirname: clip-vit-base-patch32
pointclouds_dirname: null
pointcloud_quantization_size: 0.01
max_point_distance: null
spherical_coords: False
image_transform: null
semantic_transform: null
pointcloud_transform: null
pointcloud_set_transform: null
--------------------------------------------------------------------------------
/configs/dataset/oxford/all_camera_semantic.yaml:
--------------------------------------------------------------------------------
_target_: src.datasets.OxfordDatasetWithText

dataset_root: /path/to/dataset
data_to_load: [image_stereo_centre, image_mono_left, image_mono_rear, image_mono_right,
               mask_stereo_centre, mask_mono_left, mask_mono_rear, mask_mono_right,]
positive_threshold: 10.0
negative_threshold: 50.0
images_dirname: images_small
masks_dirname: segmentation_masks_small
text_embeddings_dirname: clip-vit-base-patch32
pointclouds_dirname: null
pointcloud_quantization_size: 0.01
max_point_distance: null
spherical_coords: False
image_transform: null
semantic_transform: null
pointcloud_transform: null
pointcloud_set_transform: null
--------------------------------------------------------------------------------
/configs/dataset/oxford/all_camera_semantic_lidar.yaml:
--------------------------------------------------------------------------------
_target_: src.datasets.OxfordDatasetWithText

dataset_root: /path/to/dataset
data_to_load: [image_stereo_centre, image_mono_left, image_mono_rear, image_mono_right,
               mask_stereo_centre, mask_mono_left, mask_mono_rear, mask_mono_right,
               pointcloud_lidar,]
positive_threshold: 10.0
negative_threshold: 50.0
images_dirname: images_small
masks_dirname: segmentation_masks_small
text_embeddings_dirname: clip-vit-base-patch32
pointclouds_dirname: null
pointcloud_quantization_size: 0.01
max_point_distance: null
spherical_coords: False
image_transform: null
semantic_transform: null
pointcloud_transform: null
pointcloud_set_transform: null
--------------------------------------------------------------------------------
/configs/dataset/oxford/all_camera_semantic_text.yaml:
--------------------------------------------------------------------------------
_target_: src.datasets.OxfordDatasetWithText

dataset_root: /path/to/dataset
data_to_load: [image_stereo_centre, image_mono_left, image_mono_rear, image_mono_right,
               mask_stereo_centre, mask_mono_left, mask_mono_rear, mask_mono_right,
               text_stereo_centre, text_mono_left, text_mono_rear, text_mono_right,]
positive_threshold: 10.0
negative_threshold: 50.0
images_dirname: images_small
masks_dirname: segmentation_masks_small
text_embeddings_dirname: clip-vit-base-patch32
pointclouds_dirname: null
pointcloud_quantization_size: 0.01
max_point_distance: null
spherical_coords: False
image_transform: null
semantic_transform: null
pointcloud_transform: null
pointcloud_set_transform: null
--------------------------------------------------------------------------------
/configs/dataset/oxford/all_camera_semantic_text_lidar.yaml:
--------------------------------------------------------------------------------
_target_: src.datasets.OxfordDatasetWithText

dataset_root: /path/to/dataset
data_to_load: [image_stereo_centre, image_mono_left, image_mono_rear, image_mono_right,
               mask_stereo_centre, mask_mono_left, mask_mono_rear, mask_mono_right,
               text_stereo_centre, text_mono_left, text_mono_rear, text_mono_right,
               pointcloud_lidar,]
positive_threshold: 10.0
negative_threshold: 50.0
images_dirname: images_small
masks_dirname: segmentation_masks_small
text_embeddings_dirname: clip-vit-base-patch32
pointclouds_dirname: null
pointcloud_quantization_size: 0.01
max_point_distance: null
spherical_coords: False
image_transform: null
semantic_transform: null
pointcloud_transform: null
pointcloud_set_transform: null
--------------------------------------------------------------------------------
/configs/dataset/oxford/all_camera_text.yaml:
--------------------------------------------------------------------------------
_target_: src.datasets.OxfordDatasetWithText

dataset_root: /path/to/dataset
data_to_load: [image_stereo_centre, image_mono_left, image_mono_rear, image_mono_right,
               text_stereo_centre, text_mono_left, text_mono_rear, text_mono_right,]
positive_threshold: 10.0
negative_threshold: 50.0
images_dirname: images_small
masks_dirname: segmentation_masks_small
text_embeddings_dirname: clip-vit-base-patch32
pointclouds_dirname: null
pointcloud_quantization_size: 0.01
max_point_distance: null
spherical_coords: False
image_transform: null
semantic_transform: null
pointcloud_transform: null
pointcloud_set_transform: null
--------------------------------------------------------------------------------
/configs/dataset/oxford/all_camera_text_lidar.yaml:
--------------------------------------------------------------------------------
_target_: src.datasets.OxfordDatasetWithText

dataset_root: /path/to/dataset
data_to_load: [image_stereo_centre, image_mono_left, image_mono_rear, image_mono_right,
               text_stereo_centre, text_mono_left, text_mono_rear, text_mono_right,
               pointcloud_lidar,]
positive_threshold: 10.0
negative_threshold: 50.0
images_dirname: images_small
masks_dirname: segmentation_masks_small
text_embeddings_dirname: clip-vit-base-patch32
pointclouds_dirname: null
pointcloud_quantization_size: 0.01
max_point_distance: null
spherical_coords: False
image_transform: null
semantic_transform: null
pointcloud_transform: null
pointcloud_set_transform: null
--------------------------------------------------------------------------------
/configs/dataset/oxford/camera1.yaml:
--------------------------------------------------------------------------------
_target_: opr.datasets.OxfordDataset

dataset_root: /path/to/dataset
data_to_load: [image_stereo_centre,]
positive_threshold: 10.0
negative_threshold: 50.0
images_dirname: images_small
masks_dirname: segmentation_masks_small
pointclouds_dirname: null
pointcloud_quantization_size: 0.01
max_point_distance: null
spherical_coords: False
image_transform: null
semantic_transform: null
pointcloud_transform: null
pointcloud_set_transform: null
--------------------------------------------------------------------------------
/configs/dataset/oxford/camera1_lidar.yaml:
--------------------------------------------------------------------------------
_target_: src.datasets.OxfordDatasetWithText

dataset_root: /path/to/dataset
data_to_load: [image_stereo_centre,
               pointcloud_lidar,]
positive_threshold: 10.0
negative_threshold: 50.0
images_dirname: images_small
masks_dirname: segmentation_masks_small
text_embeddings_dirname: clip-vit-base-patch32
pointclouds_dirname: null
pointcloud_quantization_size: 0.01
max_point_distance: null
spherical_coords: False
image_transform: null
semantic_transform: null
pointcloud_transform: null
pointcloud_set_transform: null
--------------------------------------------------------------------------------
/configs/dataset/oxford/camera2-front-back.yaml:
--------------------------------------------------------------------------------
_target_: opr.datasets.OxfordDataset

dataset_root: /path/to/dataset
data_to_load: [image_stereo_centre, image_mono_rear]
positive_threshold: 10.0
negative_threshold: 50.0
images_dirname: images_small
masks_dirname: segmentation_masks_small
pointclouds_dirname: null
pointcloud_quantization_size: 0.01
max_point_distance: null
spherical_coords: False
image_transform: null
semantic_transform: null
pointcloud_transform: null
pointcloud_set_transform: null
--------------------------------------------------------------------------------
/configs/dataset/oxford/camera2-left-right.yaml:
--------------------------------------------------------------------------------
_target_: opr.datasets.OxfordDataset

dataset_root: /path/to/dataset
data_to_load: [image_mono_left, image_mono_right]
positive_threshold: 10.0
negative_threshold: 50.0
images_dirname: images_small
masks_dirname: segmentation_masks_small
pointclouds_dirname: null
pointcloud_quantization_size: 0.01
max_point_distance: null
spherical_coords: False
image_transform: null
semantic_transform: null
pointcloud_transform: null
pointcloud_set_transform: null
--------------------------------------------------------------------------------
/configs/dataset/oxford/camera4.yaml:
--------------------------------------------------------------------------------
_target_: opr.datasets.OxfordDataset

dataset_root: /path/to/dataset
data_to_load: [image_stereo_centre, image_mono_left, image_mono_rear, image_mono_right,]
positive_threshold: 10.0
negative_threshold: 50.0
images_dirname: images_small
masks_dirname: segmentation_masks_small
pointclouds_dirname: null
pointcloud_quantization_size: 0.01
max_point_distance: null
spherical_coords: False
image_transform: null
semantic_transform: null
pointcloud_transform: null
pointcloud_set_transform: null
--------------------------------------------------------------------------------
/configs/dataset/oxford/lidar.yaml:
--------------------------------------------------------------------------------
_target_: opr.datasets.OxfordDataset

dataset_root: /path/to/dataset
data_to_load: [pointcloud_lidar,]
positive_threshold: 10.0
negative_threshold: 50.0
images_dirname: images_small
masks_dirname: segmentation_masks_small
pointclouds_dirname: null
pointcloud_quantization_size: 0.01
max_point_distance: null
spherical_coords: False
image_transform: null
semantic_transform: null
pointcloud_transform: null
pointcloud_set_transform: null
--------------------------------------------------------------------------------
/configs/dataset/oxford/semantic1.yaml:
--------------------------------------------------------------------------------
_target_: opr.datasets.OxfordDataset

dataset_root: /path/to/dataset
data_to_load: [mask_stereo_centre,]
positive_threshold: 10.0
negative_threshold: 50.0
images_dirname: images_small
masks_dirname: segmentation_masks_small
pointclouds_dirname: null
pointcloud_quantization_size: 0.01
max_point_distance: null
spherical_coords: False
image_transform: null
semantic_transform: null
pointcloud_transform: null
pointcloud_set_transform: null
--------------------------------------------------------------------------------
/configs/dataset/oxford/semantic2-front-back.yaml:
--------------------------------------------------------------------------------
_target_: opr.datasets.OxfordDataset

dataset_root: /path/to/dataset
data_to_load: [mask_stereo_centre, mask_mono_rear]
positive_threshold: 10.0
negative_threshold: 50.0
images_dirname: images_small
masks_dirname: segmentation_masks_small
pointclouds_dirname: null
pointcloud_quantization_size: 0.01
max_point_distance: null
spherical_coords: False
image_transform: null
semantic_transform: null
pointcloud_transform: null
pointcloud_set_transform: null
--------------------------------------------------------------------------------
/configs/dataset/oxford/semantic2-left-right.yaml:
--------------------------------------------------------------------------------
_target_: opr.datasets.OxfordDataset

dataset_root: /path/to/dataset
data_to_load: [mask_mono_left, mask_mono_right]
positive_threshold: 10.0
negative_threshold: 50.0
images_dirname: images_small
masks_dirname: segmentation_masks_small
pointclouds_dirname: null
pointcloud_quantization_size: 0.01
max_point_distance: null
spherical_coords: False
image_transform: null
semantic_transform: null
pointcloud_transform: null
pointcloud_set_transform: null
--------------------------------------------------------------------------------
/configs/dataset/oxford/semantic4.yaml:
--------------------------------------------------------------------------------
_target_: opr.datasets.OxfordDataset

dataset_root: /path/to/dataset
data_to_load: [mask_stereo_centre, mask_mono_left, mask_mono_rear, mask_mono_right]
positive_threshold: 10.0
negative_threshold: 50.0
images_dirname: images_small
masks_dirname: segmentation_masks_small
pointclouds_dirname: null
pointcloud_quantization_size: 0.01
max_point_distance: null
spherical_coords: False
image_transform: null
semantic_transform: null
pointcloud_transform: null
pointcloud_set_transform: null
--------------------------------------------------------------------------------
/configs/dataset/oxford/text1_clip-base.yaml:
--------------------------------------------------------------------------------
_target_: src.datasets.OxfordDatasetWithText

dataset_root: /path/to/dataset
data_to_load: [text_stereo_centre,]
positive_threshold: 10.0
negative_threshold: 50.0
images_dirname: images_small
masks_dirname: segmentation_masks_small
text_embeddings_dirname: clip-vit-base-patch32
pointclouds_dirname: null
pointcloud_quantization_size: 0.01
max_point_distance: null
spherical_coords: False
image_transform: null
semantic_transform: null
pointcloud_transform: null
pointcloud_set_transform: null
--------------------------------------------------------------------------------
/configs/dataset/oxford/text1_clip-large.yaml:
--------------------------------------------------------------------------------
_target_: src.datasets.OxfordDatasetWithText

dataset_root: /path/to/dataset
data_to_load: [text_stereo_centre,]
positive_threshold: 10.0
negative_threshold: 50.0
images_dirname: images_small
masks_dirname: segmentation_masks_small
text_embeddings_dirname: clip-vit-large-patch14
pointclouds_dirname: null
pointcloud_quantization_size: 0.01
max_point_distance: null
spherical_coords: False
image_transform: null
semantic_transform: null
pointcloud_transform: null
pointcloud_set_transform: null
--------------------------------------------------------------------------------
/configs/dataset/oxford/text1_tfidf.yaml:
--------------------------------------------------------------------------------
_target_: src.datasets.OxfordDatasetWithText

dataset_root: /path/to/dataset
data_to_load: [text_stereo_centre,]
positive_threshold: 10.0
negative_threshold: 50.0
images_dirname: images_small
masks_dirname: segmentation_masks_small
text_embeddings_dirname: tfidf_pca
pointclouds_dirname: null
pointcloud_quantization_size: 0.01
max_point_distance: null
spherical_coords: False
image_transform: null
semantic_transform: null
pointcloud_transform: null
pointcloud_set_transform: null
--------------------------------------------------------------------------------
/configs/dataset/oxford/text2-front-back_clip-base.yaml:
--------------------------------------------------------------------------------
_target_: src.datasets.OxfordDatasetWithText

dataset_root: /path/to/dataset
data_to_load: [text_stereo_centre, text_mono_rear,]
positive_threshold: 10.0
negative_threshold: 50.0
images_dirname: images_small
masks_dirname: segmentation_masks_small
text_embeddings_dirname: clip-vit-base-patch32
pointclouds_dirname: null
pointcloud_quantization_size: 0.01
max_point_distance: null
spherical_coords: False
image_transform: null
semantic_transform: null
pointcloud_transform: null
pointcloud_set_transform: null
--------------------------------------------------------------------------------
/configs/dataset/oxford/text2-front-back_clip-large.yaml:
--------------------------------------------------------------------------------
1 | _target_: src.datasets.OxfordDatasetWithText
2 |
3 | dataset_root: /path/to/dataset
4 | data_to_load: [text_stereo_centre, text_mono_rear,]
5 | positive_threshold: 10.0
6 | negative_threshold: 50.0
7 | images_dirname: images_small
8 | masks_dirname: segmentation_masks_small
9 | text_embeddings_dirname: clip-vit-large-patch14
10 | pointclouds_dirname: null
11 | pointcloud_quantization_size: 0.01
12 | max_point_distance: null
13 | spherical_coords: False
14 | image_transform: null
15 | semantic_transform: null
16 | pointcloud_transform: null
17 | pointcloud_set_transform: null
18 |
--------------------------------------------------------------------------------
/configs/dataset/oxford/text2-front-back_tfidf.yaml:
--------------------------------------------------------------------------------
1 | _target_: src.datasets.OxfordDatasetWithText
2 |
3 | dataset_root: /path/to/dataset
4 | data_to_load: [text_stereo_centre, text_mono_rear,]
5 | positive_threshold: 10.0
6 | negative_threshold: 50.0
7 | images_dirname: images_small
8 | masks_dirname: segmentation_masks_small
9 | text_embeddings_dirname: tfidf_pca
10 | pointclouds_dirname: null
11 | pointcloud_quantization_size: 0.01
12 | max_point_distance: null
13 | spherical_coords: False
14 | image_transform: null
15 | semantic_transform: null
16 | pointcloud_transform: null
17 | pointcloud_set_transform: null
18 |
--------------------------------------------------------------------------------
/configs/dataset/oxford/text2-left-right_clip-base.yaml:
--------------------------------------------------------------------------------
1 | _target_: src.datasets.OxfordDatasetWithText
2 |
3 | dataset_root: /path/to/dataset
4 | data_to_load: [text_mono_left, text_mono_right,]
5 | positive_threshold: 10.0
6 | negative_threshold: 50.0
7 | images_dirname: images_small
8 | masks_dirname: segmentation_masks_small
9 | text_embeddings_dirname: clip-vit-base-patch32
10 | pointclouds_dirname: null
11 | pointcloud_quantization_size: 0.01
12 | max_point_distance: null
13 | spherical_coords: False
14 | image_transform: null
15 | semantic_transform: null
16 | pointcloud_transform: null
17 | pointcloud_set_transform: null
18 |
--------------------------------------------------------------------------------
/configs/dataset/oxford/text2-left-right_clip-large.yaml:
--------------------------------------------------------------------------------
1 | _target_: src.datasets.OxfordDatasetWithText
2 |
3 | dataset_root: /path/to/dataset
4 | data_to_load: [text_mono_left, text_mono_right,]
5 | positive_threshold: 10.0
6 | negative_threshold: 50.0
7 | images_dirname: images_small
8 | masks_dirname: segmentation_masks_small
9 | text_embeddings_dirname: clip-vit-large-patch14
10 | pointclouds_dirname: null
11 | pointcloud_quantization_size: 0.01
12 | max_point_distance: null
13 | spherical_coords: False
14 | image_transform: null
15 | semantic_transform: null
16 | pointcloud_transform: null
17 | pointcloud_set_transform: null
18 |
--------------------------------------------------------------------------------
/configs/dataset/oxford/text2-left-right_tfidf.yaml:
--------------------------------------------------------------------------------
1 | _target_: src.datasets.OxfordDatasetWithText
2 |
3 | dataset_root: /path/to/dataset
4 | data_to_load: [text_mono_left, text_mono_right,]
5 | positive_threshold: 10.0
6 | negative_threshold: 50.0
7 | images_dirname: images_small
8 | masks_dirname: segmentation_masks_small
9 | text_embeddings_dirname: tfidf_pca
10 | pointclouds_dirname: null
11 | pointcloud_quantization_size: 0.01
12 | max_point_distance: null
13 | spherical_coords: False
14 | image_transform: null
15 | semantic_transform: null
16 | pointcloud_transform: null
17 | pointcloud_set_transform: null
18 |
--------------------------------------------------------------------------------
/configs/dataset/oxford/text4_clip-base.yaml:
--------------------------------------------------------------------------------
1 | _target_: src.datasets.OxfordDatasetWithText
2 |
3 | dataset_root: /path/to/dataset
4 | data_to_load: [text_stereo_centre, text_mono_left, text_mono_rear, text_mono_right,]
5 | positive_threshold: 10.0
6 | negative_threshold: 50.0
7 | images_dirname: images_small
8 | masks_dirname: segmentation_masks_small
9 | text_embeddings_dirname: clip-vit-base-patch32
10 | pointclouds_dirname: null
11 | pointcloud_quantization_size: 0.01
12 | max_point_distance: null
13 | spherical_coords: False
14 | image_transform: null
15 | semantic_transform: null
16 | pointcloud_transform: null
17 | pointcloud_set_transform: null
18 |
--------------------------------------------------------------------------------
/configs/dataset/oxford/text4_clip-large.yaml:
--------------------------------------------------------------------------------
1 | _target_: src.datasets.OxfordDatasetWithText
2 |
3 | dataset_root: /path/to/dataset
4 | data_to_load: [text_stereo_centre, text_mono_left, text_mono_rear, text_mono_right,]
5 | positive_threshold: 10.0
6 | negative_threshold: 50.0
7 | images_dirname: images_small
8 | masks_dirname: segmentation_masks_small
9 | text_embeddings_dirname: clip-vit-large-patch14
10 | pointclouds_dirname: null
11 | pointcloud_quantization_size: 0.01
12 | max_point_distance: null
13 | spherical_coords: False
14 | image_transform: null
15 | semantic_transform: null
16 | pointcloud_transform: null
17 | pointcloud_set_transform: null
18 |
--------------------------------------------------------------------------------
/configs/dataset/oxford/text4_tfidf.yaml:
--------------------------------------------------------------------------------
1 | _target_: src.datasets.OxfordDatasetWithText
2 |
3 | dataset_root: /path/to/dataset
4 | data_to_load: [text_stereo_centre, text_mono_left, text_mono_rear, text_mono_right,]
5 | positive_threshold: 10.0
6 | negative_threshold: 50.0
7 | images_dirname: images_small
8 | masks_dirname: segmentation_masks_small
9 | text_embeddings_dirname: tfidf_pca
10 | pointclouds_dirname: null
11 | pointcloud_quantization_size: 0.01
12 | max_point_distance: null
13 | spherical_coords: False
14 | image_transform: null
15 | semantic_transform: null
16 | pointcloud_transform: null
17 | pointcloud_set_transform: null
18 |
--------------------------------------------------------------------------------
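Note: every dataset config above is a plain Hydra node: _target_ names the dataset
class and the remaining keys are its constructor arguments. A minimal instantiation
sketch in Python (the dataset_root value is a placeholder, not a real path):

    from hydra.utils import instantiate
    from omegaconf import OmegaConf

    # load one of the config files above and build the dataset it describes
    cfg = OmegaConf.load("configs/dataset/oxford/text1_clip-base.yaml")
    cfg.dataset_root = "/data/pnvlad_oxford_robotcar"  # placeholder; point at your local copy
    dataset = instantiate(cfg)  # builds src.datasets.OxfordDatasetWithText
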
/configs/loss/batch_hard_triplet_margin.yaml:
--------------------------------------------------------------------------------
1 | _target_: opr.losses.BatchHardTripletMarginLoss
2 |
3 | margin: 0.2
4 |
--------------------------------------------------------------------------------
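The loss config selects a batch-hard triplet margin loss with margin 0.2. The
opr.losses implementation is not reproduced in this dump; a generic sketch of the
batch-hard idea (hardest positive and hardest negative per anchor, then a hinge):

    import torch
    import torch.nn.functional as F

    def batch_hard_triplet_margin(anchors, positives, negatives, margin=0.2):
        # per anchor: farthest positive and closest negative, hinged at the margin
        d_pos = torch.cdist(anchors, positives).max(dim=1).values
        d_neg = torch.cdist(anchors, negatives).min(dim=1).values
        return F.relu(d_pos - d_neg + margin).mean()
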
/configs/model/camera1.yaml:
--------------------------------------------------------------------------------
1 | _target_: opr.models.place_recognition.ResNet18
2 |
3 | in_channels: 3
4 | out_channels: 256
5 | num_top_down: 0
6 | pooling: gem
7 |
--------------------------------------------------------------------------------
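"pooling: gem" refers to generalized-mean (GeM) pooling. Assuming opr.modules.GeM
follows the standard formulation f_c = (mean of x_c^p over spatial locations)^(1/p),
a minimal sketch:

    import torch
    import torch.nn.functional as F

    def gem(x: torch.Tensor, p: float = 3.0, eps: float = 1e-6) -> torch.Tensor:
        # x: (B, C, H, W) feature map -> (B, C) descriptor;
        # p = 1 recovers average pooling, p -> inf approaches max pooling
        return F.avg_pool2d(x.clamp(min=eps).pow(p), x.shape[-2:]).pow(1.0 / p).flatten(1)
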
/configs/model/camera2_add.yaml:
--------------------------------------------------------------------------------
1 | _target_: opr.models.place_recognition.base.ImageModel
2 |
3 | backbone:
4 | _target_: opr.modules.feature_extractors.ResNet18FPNFeatureExtractor
5 | in_channels: 3
6 | lateral_dim: 256
7 | fh_num_bottom_up: 4
8 | fh_num_top_down: 0
9 | pretrained: True
10 | head:
11 | _target_: opr.modules.GeM
12 | fusion:
13 | _target_: opr.modules.Add
14 |
--------------------------------------------------------------------------------
/configs/model/camera2_concat.yaml:
--------------------------------------------------------------------------------
1 | _target_: opr.models.place_recognition.base.ImageModel
2 |
3 | backbone:
4 | _target_: opr.modules.feature_extractors.ResNet18FPNFeatureExtractor
5 | in_channels: 3
6 | lateral_dim: 256
7 | fh_num_bottom_up: 4
8 | fh_num_top_down: 0
9 | pretrained: True
10 | head:
11 | _target_: opr.modules.GeM
12 | fusion:
13 | _target_: opr.modules.Concat
14 |
--------------------------------------------------------------------------------
/configs/model/camera2_gem.yaml:
--------------------------------------------------------------------------------
1 | _target_: opr.models.place_recognition.base.ImageModel
2 |
3 | backbone:
4 | _target_: opr.modules.feature_extractors.ResNet18FPNFeatureExtractor
5 | in_channels: 3
6 | lateral_dim: 256
7 | fh_num_bottom_up: 4
8 | fh_num_top_down: 0
9 | pretrained: True
10 | head:
11 | _target_: opr.modules.GeM
12 | fusion:
13 | _target_: opr.modules.GeMFusion
14 |
--------------------------------------------------------------------------------
/configs/model/camera2_mlp-full.yaml:
--------------------------------------------------------------------------------
1 | _target_: opr.models.place_recognition.base.ImageModel
2 |
3 | backbone:
4 | _target_: opr.modules.feature_extractors.ResNet18FPNFeatureExtractor
5 | in_channels: 3
6 | lateral_dim: 256
7 | fh_num_bottom_up: 4
8 | fh_num_top_down: 0
9 | pretrained: True
10 | head:
11 | _target_: opr.modules.GeM
12 | fusion:
13 | _target_: torch.nn.Sequential
14 | _args_:
15 | - _target_: opr.modules.Concat
16 | - _target_: opr.modules.MLP
17 | in_features: 512
18 | out_features: 512
19 |
--------------------------------------------------------------------------------
/configs/model/camera2_mlp-half.yaml:
--------------------------------------------------------------------------------
1 | _target_: opr.models.place_recognition.base.ImageModel
2 |
3 | backbone:
4 | _target_: opr.modules.feature_extractors.ResNet18FPNFeatureExtractor
5 | in_channels: 3
6 | lateral_dim: 256
7 | fh_num_bottom_up: 4
8 | fh_num_top_down: 0
9 | pretrained: True
10 | head:
11 | _target_: opr.modules.GeM
12 | fusion:
13 | _target_: torch.nn.Sequential
14 | _args_:
15 | - _target_: opr.modules.Concat
16 | - _target_: opr.modules.MLP
17 | in_features: 512
18 | out_features: 256
19 |
--------------------------------------------------------------------------------
/configs/model/camera2_sa-add.yaml:
--------------------------------------------------------------------------------
1 | _target_: opr.models.place_recognition.base.ImageModel
2 |
3 | backbone:
4 | _target_: opr.modules.feature_extractors.ResNet18FPNFeatureExtractor
5 | in_channels: 3
6 | lateral_dim: 256
7 | fh_num_bottom_up: 4
8 | fh_num_top_down: 0
9 | pretrained: True
10 | head:
11 | _target_: opr.modules.GeM
12 | fusion:
13 | _target_: torch.nn.Sequential
14 | _args_:
15 | - _target_: opr.modules.SelfAttention
16 | embed_size: 256
17 | - _target_: opr.modules.Add
18 |
--------------------------------------------------------------------------------
/configs/model/camera2_sa-concat.yaml:
--------------------------------------------------------------------------------
1 | _target_: opr.models.place_recognition.base.ImageModel
2 |
3 | backbone:
4 | _target_: opr.modules.feature_extractors.ResNet18FPNFeatureExtractor
5 | in_channels: 3
6 | lateral_dim: 256
7 | fh_num_bottom_up: 4
8 | fh_num_top_down: 0
9 | pretrained: True
10 | head:
11 | _target_: opr.modules.GeM
12 | fusion:
13 | _target_: torch.nn.Sequential
14 | _args_:
15 | - _target_: opr.modules.SelfAttention
16 | embed_size: 256
17 | - _target_: opr.modules.Concat
18 |
--------------------------------------------------------------------------------
/configs/model/convnext_camera1.yaml:
--------------------------------------------------------------------------------
1 | _target_: opr.models.place_recognition.base.ImageModel
2 |
3 | backbone:
4 | _target_: opr.modules.feature_extractors.ConvNeXtTinyFeatureExtractor
5 | in_channels: 3
6 | pretrained: True
7 | head:
8 | _target_: torch.nn.Sequential
9 | _args_:
10 | - _target_: opr.modules.GeM
11 | - _target_: torch.nn.Linear
12 | in_features: 768
13 | out_features: 256
14 |
--------------------------------------------------------------------------------
/configs/model/convnext_semantic1.yaml:
--------------------------------------------------------------------------------
1 | _target_: opr.models.place_recognition.base.SemanticModel
2 |
3 | backbone:
4 | _target_: opr.modules.feature_extractors.ConvNeXtTinyFeatureExtractor
5 | in_channels: 1
6 | pretrained: False
7 | head:
8 | _target_: torch.nn.Sequential
9 | _args_:
10 | - _target_: opr.modules.GeM
11 | - _target_: torch.nn.Linear
12 | in_features: 768
13 | out_features: 256
14 |
--------------------------------------------------------------------------------
/configs/model/lidar.yaml:
--------------------------------------------------------------------------------
1 | _target_: opr.models.place_recognition.MinkLoc3Dv2
2 |
3 | in_channels: 1
4 | out_channels: 256
5 | num_top_down: 2
6 | conv0_kernel_size: 5
7 | block: ECABasicBlock
8 | layers: [1, 1, 1, 1]
9 | planes: [64, 128, 64, 32]
10 | pooling: gem
11 |
--------------------------------------------------------------------------------
/configs/model/minkloc-multimodal.yaml:
--------------------------------------------------------------------------------
1 | _target_: opr.models.place_recognition.base.LateFusionModel
2 |
3 | image_module:
4 | _target_: opr.models.place_recognition.ResNet18
5 | in_channels: 3
6 | out_channels: 128
7 | num_top_down: 0
8 | pooling: gem
9 |
10 | cloud_module:
11 | _target_: opr.models.place_recognition.MinkLoc3D
12 | in_channels: 1
13 | out_channels: 128
14 | num_top_down: 1
15 | conv0_kernel_size: 5
16 | block: ECABasicBlock
17 | layers: [1, 1, 1]
18 | planes: [32, 64, 64]
19 | pooling: gem
20 |
21 | fusion_module:
22 | _target_: opr.modules.Concat
23 |
--------------------------------------------------------------------------------
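Here the image and cloud branches each emit a 128-d descriptor, and the Concat fusion
stacks them into a single 256-d place descriptor; a quick shape check under that
assumption:

    import torch

    img_desc = torch.randn(8, 128)    # image branch output (out_channels: 128)
    cloud_desc = torch.randn(8, 128)  # cloud branch output (out_channels: 128)
    fused = torch.cat([img_desc, cloud_desc], dim=1)
    assert fused.shape == (8, 256)
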
/configs/model/minkloc3dv2.yaml:
--------------------------------------------------------------------------------
1 | _target_: opr.models.place_recognition.MinkLoc3Dv2
2 |
3 | in_channels: 1
4 | out_channels: 256
5 | num_top_down: 2
6 | conv0_kernel_size: 5
7 | block: ECABasicBlock
8 | layers: [1, 1, 1, 1]
9 | planes: [64, 128, 64, 32]
10 | pooling: gem
11 |
--------------------------------------------------------------------------------
/configs/model/mssplace-i.yaml:
--------------------------------------------------------------------------------
1 | _target_: opr.models.place_recognition.base.ImageModel
2 | backbone:
3 | _target_: opr.modules.feature_extractors.ResNet18FPNFeatureExtractor
4 | in_channels: 3
5 | lateral_dim: 256
6 | fh_num_bottom_up: 4
7 | fh_num_top_down: 0
8 | pretrained: True
9 | head:
10 | _target_: opr.modules.GeM
11 | fusion:
12 | _target_: opr.modules.Add
13 |
--------------------------------------------------------------------------------
/configs/model/mssplace-li.yaml:
--------------------------------------------------------------------------------
1 | _target_: opr.models.place_recognition.base.LateFusionModel
2 |
3 | image_module:
4 | _target_: opr.models.place_recognition.base.ImageModel
5 | backbone:
6 | _target_: opr.modules.feature_extractors.ResNet18FPNFeatureExtractor
7 | in_channels: 3
8 | lateral_dim: 256
9 | fh_num_bottom_up: 4
10 | fh_num_top_down: 0
11 | pretrained: True
12 | head:
13 | _target_: opr.modules.GeM
14 | fusion:
15 | _target_: opr.modules.Add
16 |
17 | cloud_module:
18 | _target_: opr.models.place_recognition.MinkLoc3Dv2
19 | in_channels: 1
20 | out_channels: 256
21 | num_top_down: 2
22 | conv0_kernel_size: 5
23 | block: ECABasicBlock
24 | layers: [1, 1, 1, 1]
25 | planes: [64, 128, 64, 32]
26 | pooling: gem
27 |
28 | fusion_module:
29 | _target_: opr.modules.Concat
30 |
--------------------------------------------------------------------------------
/configs/model/mssplace-lis.yaml:
--------------------------------------------------------------------------------
1 | _target_: opr.models.place_recognition.base.LateFusionModel
2 |
3 | image_module:
4 | _target_: opr.models.place_recognition.base.ImageModel
5 | backbone:
6 | _target_: opr.modules.feature_extractors.ResNet18FPNFeatureExtractor
7 | in_channels: 3
8 | lateral_dim: 256
9 | fh_num_bottom_up: 4
10 | fh_num_top_down: 0
11 | pretrained: True
12 | head:
13 | _target_: opr.modules.GeM
14 | fusion:
15 | _target_: opr.modules.Add
16 |
17 | cloud_module:
18 | _target_: opr.models.place_recognition.MinkLoc3Dv2
19 | in_channels: 1
20 | out_channels: 256
21 | num_top_down: 2
22 | conv0_kernel_size: 5
23 | block: ECABasicBlock
24 | layers: [1, 1, 1, 1]
25 | planes: [64, 128, 64, 32]
26 | pooling: gem
27 |
28 | semantic_module:
29 | _target_: opr.models.place_recognition.base.SemanticModel
30 | backbone:
31 | _target_: opr.modules.feature_extractors.ResNet18FPNFeatureExtractor
32 | in_channels: 1
33 | lateral_dim: 256
34 | fh_num_bottom_up: 4
35 | fh_num_top_down: 0
36 | pretrained: False
37 | head:
38 | _target_: opr.modules.GeM
39 | fusion:
40 | _target_: opr.modules.Add
41 |
42 | fusion_module:
43 | _target_: opr.modules.Concat
44 |
--------------------------------------------------------------------------------
/configs/model/mssplace-list.yaml:
--------------------------------------------------------------------------------
1 | _target_: src.models.LateFusionModel
2 |
3 | image_module:
4 | _target_: opr.models.place_recognition.base.ImageModel
5 | backbone:
6 | _target_: opr.modules.feature_extractors.ResNet18FPNFeatureExtractor
7 | in_channels: 3
8 | lateral_dim: 256
9 | fh_num_bottom_up: 4
10 | fh_num_top_down: 0
11 | pretrained: True
12 | head:
13 | _target_: opr.modules.GeM
14 | fusion:
15 | _target_: opr.modules.Add
16 |
17 | cloud_module:
18 | _target_: opr.models.place_recognition.MinkLoc3Dv2
19 | in_channels: 1
20 | out_channels: 256
21 | num_top_down: 2
22 | conv0_kernel_size: 5
23 | block: ECABasicBlock
24 | layers: [1, 1, 1, 1]
25 | planes: [64, 128, 64, 32]
26 | pooling: gem
27 |
28 | semantic_module:
29 | _target_: opr.models.place_recognition.base.SemanticModel
30 | backbone:
31 | _target_: opr.modules.feature_extractors.ResNet18FPNFeatureExtractor
32 | in_channels: 1
33 | lateral_dim: 256
34 | fh_num_bottom_up: 4
35 | fh_num_top_down: 0
36 | pretrained: False
37 | head:
38 | _target_: opr.modules.GeM
39 | fusion:
40 | _target_: opr.modules.Add
41 |
42 | text_module:
43 | _target_: src.models.TextModel
44 | model:
45 | _target_: opr.modules.MLP
46 | in_features: 512
47 | out_features: 256
48 | drop: 0.5
49 | fusion:
50 | _target_: opr.modules.Add
51 |
52 | fusion_module:
53 | _target_: opr.modules.Concat
54 |
--------------------------------------------------------------------------------
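src.models.LateFusionModel lives in src/mssplace/models.py, which is not reproduced
in this section; a hypothetical sketch of the four-branch late-fusion pattern the
config implies (the batch dict keys below are illustrative, not the real API):

    import torch
    from torch import nn

    class LateFusionSketch(nn.Module):
        """Hypothetical stand-in for src.models.LateFusionModel, not the actual code."""

        def __init__(self, image_module, cloud_module, semantic_module, text_module, fusion_module):
            super().__init__()
            self.image_module = image_module
            self.cloud_module = cloud_module
            self.semantic_module = semantic_module
            self.text_module = text_module
            self.fusion_module = fusion_module

        def forward(self, batch: dict) -> torch.Tensor:
            descriptors = [
                self.image_module(batch["images"]),
                self.cloud_module(batch["pointclouds"]),
                self.semantic_module(batch["masks"]),
                self.text_module(batch["text_embeddings"]),
            ]
            # e.g. opr.modules.Concat joins the per-modality descriptors into one vector
            return self.fusion_module(descriptors)
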
/configs/model/mssplace-lit.yaml:
--------------------------------------------------------------------------------
1 | _target_: src.models.LateFusionModel
2 |
3 | image_module:
4 | _target_: opr.models.place_recognition.base.ImageModel
5 | backbone:
6 | _target_: opr.modules.feature_extractors.ResNet18FPNFeatureExtractor
7 | in_channels: 3
8 | lateral_dim: 256
9 | fh_num_bottom_up: 4
10 | fh_num_top_down: 0
11 | pretrained: True
12 | head:
13 | _target_: opr.modules.GeM
14 | fusion:
15 | _target_: opr.modules.Add
16 |
17 | cloud_module:
18 | _target_: opr.models.place_recognition.MinkLoc3Dv2
19 | in_channels: 1
20 | out_channels: 256
21 | num_top_down: 2
22 | conv0_kernel_size: 5
23 | block: ECABasicBlock
24 | layers: [1, 1, 1, 1]
25 | planes: [64, 128, 64, 32]
26 | pooling: gem
27 |
28 | text_module:
29 | _target_: src.models.TextModel
30 | model:
31 | _target_: opr.modules.MLP
32 | in_features: 512
33 | out_features: 256
34 | drop: 0.5
35 | fusion:
36 | _target_: opr.modules.Add
37 |
38 | fusion_module:
39 | _target_: opr.modules.Concat
40 |
--------------------------------------------------------------------------------
/configs/model/semantic1.yaml:
--------------------------------------------------------------------------------
1 | _target_: opr.models.place_recognition.SemanticResNet18
2 |
3 | in_channels: 1
4 | out_channels: 256
5 | num_top_down: 0
6 | pooling: gem
7 |
--------------------------------------------------------------------------------
/configs/model/semantic2_add.yaml:
--------------------------------------------------------------------------------
1 | _target_: opr.models.place_recognition.base.SemanticModel
2 |
3 | backbone:
4 | _target_: opr.modules.feature_extractors.ResNet18FPNFeatureExtractor
5 | in_channels: 1
6 | lateral_dim: 256
7 | fh_num_bottom_up: 4
8 | fh_num_top_down: 0
9 | pretrained: False
10 | head:
11 | _target_: opr.modules.GeM
12 | fusion:
13 | _target_: opr.modules.Add
14 |
--------------------------------------------------------------------------------
/configs/model/semantic2_concat.yaml:
--------------------------------------------------------------------------------
1 | _target_: opr.models.place_recognition.base.SemanticModel
2 |
3 | backbone:
4 | _target_: opr.modules.feature_extractors.ResNet18FPNFeatureExtractor
5 | in_channels: 1
6 | lateral_dim: 256
7 | fh_num_bottom_up: 4
8 | fh_num_top_down: 0
9 | pretrained: False
10 | head:
11 | _target_: opr.modules.GeM
12 | fusion:
13 | _target_: opr.modules.Concat
14 |
--------------------------------------------------------------------------------
/configs/model/semantic2_mlp-full.yaml:
--------------------------------------------------------------------------------
1 | _target_: opr.models.place_recognition.base.SemanticModel
2 |
3 | backbone:
4 | _target_: opr.modules.feature_extractors.ResNet18FPNFeatureExtractor
5 | in_channels: 1
6 | lateral_dim: 256
7 | fh_num_bottom_up: 4
8 | fh_num_top_down: 0
9 | pretrained: False
10 | head:
11 | _target_: opr.modules.GeM
12 | fusion:
13 | _target_: torch.nn.Sequential
14 | _args_:
15 | - _target_: opr.modules.Concat
16 | - _target_: opr.modules.MLP
17 | in_features: 512
18 | out_features: 512
19 |
--------------------------------------------------------------------------------
/configs/model/semantic2_mlp-half.yaml:
--------------------------------------------------------------------------------
1 | _target_: opr.models.place_recognition.base.SemanticModel
2 |
3 | backbone:
4 | _target_: opr.modules.feature_extractors.ResNet18FPNFeatureExtractor
5 | in_channels: 1
6 | lateral_dim: 256
7 | fh_num_bottom_up: 4
8 | fh_num_top_down: 0
9 | pretrained: False
10 | head:
11 | _target_: opr.modules.GeM
12 | fusion:
13 | _target_: torch.nn.Sequential
14 | _args_:
15 | - _target_: opr.modules.Concat
16 | - _target_: opr.modules.MLP
17 | in_features: 512
18 | out_features: 256
19 |
--------------------------------------------------------------------------------
/configs/model/semantic2_sa-add.yaml:
--------------------------------------------------------------------------------
1 | _target_: opr.models.place_recognition.base.SemanticModel
2 |
3 | backbone:
4 | _target_: opr.modules.feature_extractors.ResNet18FPNFeatureExtractor
5 | in_channels: 1
6 | lateral_dim: 256
7 | fh_num_bottom_up: 4
8 | fh_num_top_down: 0
9 | pretrained: False
10 | head:
11 | _target_: opr.modules.GeM
12 | fusion:
13 | _target_: torch.nn.Sequential
14 | _args_:
15 | - _target_: opr.modules.SelfAttention
16 | embed_size: 256
17 | - _target_: opr.modules.Add
18 |
--------------------------------------------------------------------------------
/configs/model/semantic2_sa-concat.yaml:
--------------------------------------------------------------------------------
1 | _target_: opr.models.place_recognition.base.SemanticModel
2 |
3 | backbone:
4 | _target_: opr.modules.feature_extractors.ResNet18FPNFeatureExtractor
5 | in_channels: 1
6 | lateral_dim: 256
7 | fh_num_bottom_up: 4
8 | fh_num_top_down: 0
9 | pretrained: False
10 | head:
11 | _target_: opr.modules.GeM
12 | fusion:
13 | _target_: torch.nn.Sequential
14 | _args_:
15 | - _target_: opr.modules.SelfAttention
16 | embed_size: 256
17 | - _target_: opr.modules.Concat
18 |
--------------------------------------------------------------------------------
/configs/model/text1_clip-base-mlp.yaml:
--------------------------------------------------------------------------------
1 | _target_: src.models.TextModel
2 |
3 | model:
4 | _target_: opr.modules.MLP
5 | in_features: 512
6 | out_features: 256
7 | drop: 0.5
8 | fusion: null
9 |
--------------------------------------------------------------------------------
/configs/model/text1_clip-large-mlp.yaml:
--------------------------------------------------------------------------------
1 | _target_: src.models.TextModel
2 |
3 | model:
4 | _target_: opr.modules.MLP
5 | in_features: 768
6 | out_features: 256
7 | drop: 0.5
8 | fusion: null
9 |
--------------------------------------------------------------------------------
/configs/model/text1_tfidf-mlp.yaml:
--------------------------------------------------------------------------------
1 | _target_: src.models.TextModel
2 |
3 | model:
4 | _target_: opr.modules.MLP
5 | in_features: 128
6 | out_features: 128
7 | drop: 0.5
8 | fusion: null
9 |
--------------------------------------------------------------------------------
/configs/model/text2_clip-base-mlp-add.yaml:
--------------------------------------------------------------------------------
1 | _target_: src.models.TextModel
2 |
3 | model:
4 | _target_: opr.modules.MLP
5 | in_features: 512
6 | out_features: 256
7 | drop: 0.5
8 | fusion:
9 | _target_: opr.modules.Add
10 |
--------------------------------------------------------------------------------
/configs/model/text2_clip-base-mlp-concat.yaml:
--------------------------------------------------------------------------------
1 | _target_: src.models.TextModel
2 |
3 | model:
4 | _target_: opr.modules.MLP
5 | in_features: 512
6 | out_features: 256
7 | drop: 0.5
8 | fusion:
9 | _target_: opr.modules.Concat
10 |
--------------------------------------------------------------------------------
/configs/model/text2_clip-large-mlp-add.yaml:
--------------------------------------------------------------------------------
1 | _target_: src.models.TextModel
2 |
3 | model:
4 | _target_: opr.modules.MLP
5 | in_features: 768
6 | out_features: 256
7 | drop: 0.5
8 | fusion:
9 | _target_: opr.modules.Add
10 |
--------------------------------------------------------------------------------
/configs/model/text2_clip-large-mlp-concat.yaml:
--------------------------------------------------------------------------------
1 | _target_: src.models.TextModel
2 |
3 | model:
4 | _target_: opr.modules.MLP
5 | in_features: 768
6 | out_features: 256
7 | drop: 0.5
8 | fusion:
9 | _target_: opr.modules.Concat
10 |
--------------------------------------------------------------------------------
/configs/model/text2_tfidf-mlp-add.yaml:
--------------------------------------------------------------------------------
1 | _target_: src.models.TextModel
2 |
3 | model:
4 | _target_: opr.modules.MLP
5 | in_features: 128
6 | out_features: 128
7 | drop: 0.5
8 | fusion:
9 | _target_: opr.modules.Add
10 |
--------------------------------------------------------------------------------
/configs/model/text2_tfidf-mlp-concat.yaml:
--------------------------------------------------------------------------------
1 | _target_: src.models.TextModel
2 |
3 | model:
4 | _target_: opr.modules.MLP
5 | in_features: 128
6 | out_features: 128
7 | drop: 0.5
8 | fusion:
9 | _target_: opr.modules.Concat
10 |
--------------------------------------------------------------------------------
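Across the text-model configs, in_features tracks the embedding source: 512 for CLIP
ViT-B/32, 768 for CLIP ViT-L/14, and 128 for the TF-IDF + PCA vectors. A sanity check
using torch.nn.Linear as a stand-in for opr.modules.MLP (whose exact layers are not
shown here):

    import torch

    for dim_in, dim_out in [(512, 256), (768, 256), (128, 128)]:
        mlp = torch.nn.Linear(dim_in, dim_out)  # stand-in; the real MLP also applies dropout (drop: 0.5)
        emb = torch.randn(4, dim_in)
        assert mlp(emb).shape == (4, dim_out)
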
/configs/optimizer/adam.yaml:
--------------------------------------------------------------------------------
1 | _target_: torch.optim.Adam
2 | _convert_: all
3 |
4 | lr: 0.001
5 | weight_decay: 0.0001
6 |
--------------------------------------------------------------------------------
/configs/sampler/batch_sampler.yaml:
--------------------------------------------------------------------------------
1 | _target_: opr.samplers.BatchSampler
2 |
3 | batch_size: 16
4 | batch_size_limit: 128
5 | batch_expansion_rate: 1.4
6 | max_batches: null
7 | positives_per_group: 2
8 | seed: ${seed}
9 | drop_last: True
10 |
--------------------------------------------------------------------------------
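batch_expansion_rate works together with batch_expansion_threshold in
train_unimodal.yaml: when too few triplets in a batch remain active (non-zero loss),
the batch grows multiplicatively until it reaches batch_size_limit. A sketch of that
rule, assuming it matches the usual MinkLoc-style dynamic batch sizing:

    def expand_batch(current_size: int, rate: float = 1.4, limit: int = 128) -> int:
        # grow the batch multiplicatively, but never past the hard limit
        return min(int(current_size * rate), limit)

    assert expand_batch(16) == 22    # 16 * 1.4 = 22.4, truncated
    assert expand_batch(128) == 128  # already at the limit
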
/configs/scheduler/multi_step.yaml:
--------------------------------------------------------------------------------
1 | _target_: torch.optim.lr_scheduler.MultiStepLR
2 | gamma: 0.1
3 | milestones: [40, 60]
4 |
--------------------------------------------------------------------------------
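Combined with the Adam config above (lr: 0.001), this scheduler cuts the learning
rate tenfold at epochs 40 and 60, i.e. 1e-3 -> 1e-4 -> 1e-5 over the 80-epoch run:

    import torch
    from torch.optim.lr_scheduler import MultiStepLR

    opt = torch.optim.Adam([torch.nn.Parameter(torch.zeros(1))], lr=0.001, weight_decay=0.0001)
    sched = MultiStepLR(opt, milestones=[40, 60], gamma=0.1)
    for epoch in range(80):
        opt.step()    # stands in for one training epoch
        sched.step()  # lr: 1e-3 (epochs 0-39), 1e-4 (40-59), 1e-5 (60-79)
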
/configs/train_unimodal.yaml:
--------------------------------------------------------------------------------
1 | defaults:
2 | - _self_
3 | - dataset: nclt/lidar
4 | - sampler: batch_sampler
5 | - model: lidar
6 | - loss: batch_hard_triplet_margin
7 | - optimizer: adam
8 | - scheduler: multi_step
9 |
10 | wandb:
11 | disabled: false
12 | project: CVPR2024
13 |
14 | debug: false
15 | device: cuda
16 | seed: 3121999
17 | num_workers: 4
18 |
19 | exp_name: ???
20 |
21 | epochs: 80
22 | batch_expansion_threshold: 0.7
23 |
--------------------------------------------------------------------------------
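exp_name: ??? is Hydra's mandatory-value marker, so every run must set it, e.g.
"python train_unimodal.py exp_name=my_run dataset=oxford/camera1 model=camera1".
The same composition can be reproduced programmatically via Hydra's compose API
(config_path is resolved relative to the calling file; adjust as needed):

    from hydra import compose, initialize

    with initialize(version_base=None, config_path="configs"):
        cfg = compose(
            config_name="train_unimodal",
            overrides=["exp_name=demo", "dataset=oxford/camera1", "model=camera1"],
        )
    print(cfg.exp_name, cfg.epochs)  # demo 80
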
/docker/Dockerfile.cuda:
--------------------------------------------------------------------------------
1 | FROM alexmelekhin/open-place-recognition:base
2 |
3 | RUN apt-get update && apt-get upgrade -y && apt-get install -y \
4 | libcairo2-dev \
5 | libgirepository1.0-dev \
6 | libdbus-1-dev \
7 | libdbus-glib-1-dev \
8 | && rm -rf /var/lib/apt/lists/*
9 |
10 | # to install "dvc[gdrive]" we need to install the "distro" package first
11 | ARG DISTRO_VERSION=1.9.0
12 | RUN pip install distro==${DISTRO_VERSION}
13 |
14 | # install other requirements from requirements.txt
15 | COPY requirements.txt .
16 | RUN pip install -r requirements.txt && \
17 | rm requirements.txt
18 |
19 | # add a user and set their password
20 | ENV USER=docker_mssplace
21 | ARG UID=1000
22 | ARG GID=1000
23 | # default password
24 | ARG PW=user
25 |
26 | RUN useradd -m ${USER} --uid=${UID} && echo "${USER}:${PW}" | chpasswd && adduser ${USER} sudo
27 | WORKDIR /home/${USER}
28 |
29 | # create some directories for mounting volumes
30 | RUN mkdir MSSPlace && chown -R ${UID}:${GID} /home/${USER}
31 | RUN mkdir Datasets && chown -R ${UID}:${GID} /home/${USER}
32 |
33 | USER ${UID}:${GID}
34 |
35 | # install OpenPlaceRecognition library
36 | COPY --chown=${UID}:${GID} ./third_party/OpenPlaceRecognition ./OpenPlaceRecognition
37 | RUN cd OpenPlaceRecognition && \
38 | pip install --user . && \
39 | cd .. && \
40 | rm -rf OpenPlaceRecognition
41 |
--------------------------------------------------------------------------------
/docker/build.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | orange=`tput setaf 3`
4 | reset_color=`tput sgr0`
5 |
6 | ARCH=`uname -m`
7 | if [ "$ARCH" != "x86_64" ]; then
8 | echo "${orange}${ARCH}${reset_color} architecture is not supported"
9 | exit 1
10 | fi
11 |
12 | if command -v nvidia-smi &> /dev/null; then
13 | echo "Detected ${orange}CUDA${reset_color} hardware"
14 | DOCKERFILE=Dockerfile.cuda
15 | DEVICE=cuda
16 | else
17 | echo "${orange}CPU-only${reset_color} build is not supported"
18 | exit 1
19 | fi
20 |
21 | echo "Building for ${orange}${ARCH}${reset_color} with ${orange}${DEVICE}${reset_color}"
22 |
23 | PROJECT_ROOT_DIR=$(cd ./"`dirname $0`"/.. || exit; pwd)
24 |
25 | docker build $PROJECT_ROOT_DIR \
26 | -f $PROJECT_ROOT_DIR/docker/$DOCKERFILE \
27 | --build-arg UID=$(id -u) \
28 | --build-arg GID=$(id -g) \
29 | -t mssplace:latest
30 |
--------------------------------------------------------------------------------
/docker/into.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | docker exec --user docker_mssplace -it ${USER}_mssplace \
4 | /bin/bash -c "cd /home/docker_mssplace; echo ${USER}_mssplace container; echo ; /bin/bash"
5 |
--------------------------------------------------------------------------------
/docker/start.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | orange=`tput setaf 3`
4 | reset_color=`tput sgr0`
5 |
6 | get_real_path(){
7 | if [ "${1:0:1}" == "/" ]; then
8 | echo "$1"
9 | else
10 | realpath -m "$PWD"/"$1"
11 | fi
12 | }
13 |
14 | ARCH=`uname -m`
15 | if [ "$ARCH" == "x86_64" ]; then
16 | if command -v nvidia-smi &> /dev/null; then
17 | DEVICE=cuda
18 | ARGS="--ipc host --gpus all -e NVIDIA_DRIVER_CAPABILITIES=all"
19 | else
20 | echo "${orange}CPU-only${reset_color} build is not supported"
21 | exit 1
22 | fi
23 | else
24 | echo "${orange}${ARCH}${reset_color} architecture is not supported"
25 | exit 1
26 | fi
27 |
28 | if [ $# != 1 ]; then
29 | echo "Usage:
30 | bash start.sh [DATASETS_DIR]
31 | "
32 | exit 1
33 | fi
34 |
35 | DATASETS_DIR=$(get_real_path "$1")
36 |
37 | if [ ! -d "$DATASETS_DIR" ]; then
38 | echo "Error: DATASETS_DIR=$DATASETS_DIR is not an existing directory."
39 | exit 1
40 | fi
41 |
42 | PROJECT_ROOT_DIR=$(cd ./"`dirname $0`"/.. || exit; pwd)
43 |
44 | echo "Running on ${orange}${ARCH}${reset_color} with ${orange}${DEVICE}${reset_color}"
45 |
46 | docker run -it -d --rm \
47 | $ARGS \
48 | --privileged \
49 | --name ${USER}_mssplace \
50 | --net host \
51 | -v $PROJECT_ROOT_DIR:/home/docker_mssplace/MSSPlace:rw \
52 | -v $DATASETS_DIR:/home/docker_mssplace/Datasets:rw \
53 | mssplace:latest
54 |
55 | docker exec --user root \
56 | ${USER}_mssplace bash -c "/etc/init.d/ssh start"
57 |
--------------------------------------------------------------------------------
/images/mssplace_overview.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alexmelekhin/MSSPlace/9ac48de75ee6a4ea01dac99d336d3a529bd73b61/images/mssplace_overview.jpg
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = ["setuptools>=61.0", "wheel"]
3 | build-backend = "setuptools.build_meta"
4 |
5 | [project]
6 | name = "mssplace"
7 | version = "0.1.0"
8 | description = "Multi-Sensor Place Recognition with Visual and Text Semantics"
9 | readme = "README.md"
10 | requires-python = ">=3.10"
11 | license = {text = "MIT"}
12 | authors = [
13 | {name = "MSSPlace Team"}
14 | ]
15 | keywords = ["place recognition", "multimodal", "computer vision", "robotics"]
16 | classifiers = [
17 | "Development Status :: 4 - Beta",
18 | "Intended Audience :: Developers",
19 | "Intended Audience :: Science/Research",
20 | "License :: OSI Approved :: MIT License",
21 | "Operating System :: OS Independent",
22 | "Programming Language :: Python :: 3",
23 | "Programming Language :: Python :: 3.10",
24 | "Programming Language :: Python :: 3.11",
25 | "Programming Language :: Python :: 3.12",
26 | "Topic :: Scientific/Engineering :: Artificial Intelligence",
27 | "Topic :: Scientific/Engineering :: Image Recognition",
28 | ]
29 |
30 | # Dependencies - using requirements.txt for now, can be moved here later
31 | dependencies = []
32 |
33 | [tool.setuptools.packages.find]
34 | where = ["src"]
35 |
36 | [tool.setuptools.package-dir]
37 | "" = "src"
38 |
39 | [tool.ruff]
40 | line-length = 110
41 | src = ["src"]
42 |
43 | [tool.ruff.format]
44 | quote-style = "double"
45 | indent-style = "space"
46 | skip-magic-trailing-comma = false
47 | line-ending = "auto"
48 |
--------------------------------------------------------------------------------
/requirements-dev.txt:
--------------------------------------------------------------------------------
1 | ruff==0.11.11
2 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | albumentations==1.3.1
2 | antlr4-python3-runtime==4.9.3
3 | appdirs==1.4.4
4 | certifi==2025.4.26
5 | chardet==3.0.4
6 | charset-normalizer==3.3.2
7 | click==8.1.7
8 | contourpy==1.1.1
9 | cycler==0.12.1
10 | dbus-python==1.2.16
11 | distro==1.4.0
12 | docker-pycreds==0.4.0
13 | fonttools==4.45.1
14 | gitdb==4.0.11
15 | GitPython==3.1.44
16 | hydra-core==1.3.2
17 | idna==3.10
18 | imageio==2.33.0
19 | importlib-resources==6.1.1
20 | joblib==1.3.2
21 | kaleido==0.2.1
22 | kiwisolver==1.4.5
23 | lazy_loader==0.3
24 | loguru==0.7.2
25 | matplotlib==3.7.4
26 | networkx==3.1
27 | ninja==1.11.1.1
28 | numpy==1.24.4
29 | numpy-quaternion==2024.0.8
30 | omegaconf==2.3.0
31 | opencv-python==4.8.1.78
32 | opencv-python-headless==4.8.1.78
33 | packaging==23.2
34 | pandas==2.0.3
35 | Pillow==11.2.1
36 | plotly==5.18.0
37 | protobuf==4.25.1
38 | psutil==5.9.6
39 | PyGObject==3.36.0
40 | pyparsing==3.1.1
41 | python-dateutil==2.8.2
42 | pytorch-metric-learning==2.3.0
43 | pytz==2023.3.post1
44 | PyWavelets==1.4.1
45 | PyYAML==6.0.1
46 | qudida==0.0.4
47 | requests==2.32.3
48 | scikit-image==0.21.0
49 | scikit-learn==1.6.1
50 | scipy==1.10.1
51 | seaborn==0.13.2
52 | sentry-sdk==2.29.1
53 | setproctitle==1.3.3
54 | six==1.14.0
55 | smmap==5.0.1
56 | ssh-import-id==5.10
57 | tenacity==8.2.3
58 | threadpoolctl==3.2.0
59 | tifffile==2023.7.10
60 | tqdm==4.67.1
61 | typing_extensions==4.8.0
62 | tzdata==2023.3
63 | urllib3==2.4.0
64 | wandb==0.16.0
65 | zipp==3.21.0
66 |
--------------------------------------------------------------------------------
/scripts/evaluation/evaluate_checkpoints.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | """
3 | Checkpoint Evaluation Script for MSSPlace Models
4 |
5 | This script evaluates pre-trained MSSPlace model checkpoints on Oxford and NCLT
6 | datasets to verify the results reported in our paper. It provides a clean interface
7 | for testing different model variants with comprehensive logging and error handling.
8 |
9 | Note: This script evaluates released checkpoints only. Full experimental
10 | reproduction, including training from scratch, requires the training scripts,
11 | which are not publicly released.
12 |
13 | Key Features:
14 | - Supports text-enabled datasets for all model variants
15 | - Dynamic sensor setup configuration per model type
16 | - Loguru-based logging with colored output
17 | - Automatic sensor setup selection based on model name
18 |
19 | Usage:
20 | python evaluate_checkpoints.py --dataset oxford --model mssplace-li
21 | python evaluate_checkpoints.py --dataset nclt --model mssplace-lis --batch-size 16
22 | python evaluate_checkpoints.py --dataset oxford --model mssplace-lit --verbose
23 |
24 | Requirements:
25 | - PyTorch 2.1+
26 | - Python 3.10+
27 | - Hydra/OmegaConf for configuration management
28 | - Custom OPR (Open Place Recognition) library
29 | - loguru for enhanced logging
30 | - Custom datasets module with text support
31 |
32 | Source: adapted from what_is_in_checkpoint.ipynb
33 | Date: May 23, 2025
34 | """
35 |
36 | import argparse
37 | import sys
38 | from datetime import datetime
39 | from pathlib import Path
40 |
41 | import torch
42 | from omegaconf import DictConfig, OmegaConf
43 | from hydra.utils import instantiate
44 | from torch.utils.data import DataLoader
45 | from loguru import logger
46 |
47 | # Import custom modules from the OPR library
48 | from opr.testers.place_recognition.model import ModelTester, RetrievalResultsCollection
49 |
50 | # Import text-enabled datasets from the installed mssplace package
51 | from mssplace.datasets import NCLTDatasetWithText, OxfordDatasetWithText
52 |
53 |
54 | # Configuration constants following the original notebook structure
55 | DATASET_CHOICES = ["oxford", "nclt"]
56 |
57 | MODEL_CHOICES = [
58 | "mssplace-li",
59 | "mssplace-lis",
60 | "mssplace-lit",
61 | "mssplace-list",
62 | "mssplace-i",
63 | "minkloc-multimodal",
64 | "minkloc3dv2",
65 | ]
66 |
67 | MODEL_CONFIG_NAMES = {
68 | "mssplace-li": "mssplace-li.yaml",
69 | "mssplace-lis": "mssplace-lis.yaml",
70 | "mssplace-lit": "mssplace-lit.yaml",
71 | "mssplace-list": "mssplace-list.yaml",
72 | "mssplace-i": "mssplace-i.yaml",
73 | "minkloc-multimodal": "minkloc-multimodal.yaml",
74 | "minkloc3dv2": "minkloc3dv2.yaml",
75 | }
76 |
77 | CHECKPOINT_NAMES = {
78 | "oxford": {
79 | "mssplace-li": "oxford_mssplace_li.pth",
80 | "mssplace-lis": "oxford_mssplace_lis.pth",
81 | "mssplace-lit": "oxford_mssplace_lit.pth",
82 | "mssplace-list": "oxford_mssplace_list.pth",
83 | },
84 | "nclt": {
85 | "mssplace-li": "nclt_mssplace_li.pth",
86 | "mssplace-lis": "nclt_mssplace_lis.pth",
87 | "mssplace-lit": "nclt_mssplace_lit.pth",
88 | "mssplace-list": "nclt_mssplace_list.pth",
89 | "mssplace-i": "nclt_mssplace_i.pth",
90 | "minkloc-multimodal": "nclt_minkloc_multimodal.pth",
91 | "minkloc3dv2": "nclt_minkloc3dv2.pth",
92 | },
93 | }
94 |
95 | SENSOR_SETUPS = {
96 | "oxford": {
97 | "mssplace-li": [
98 | "pointcloud_lidar",
99 | "image_stereo_centre",
100 | "image_mono_left",
101 | "image_mono_rear",
102 | "image_mono_right"
103 | ],
104 | "mssplace-lis": [
105 | "pointcloud_lidar",
106 | "image_stereo_centre",
107 | "image_mono_left",
108 | "image_mono_rear",
109 | "image_mono_right",
110 | "mask_stereo_centre",
111 | "mask_mono_left",
112 | "mask_mono_rear",
113 | "mask_mono_right",
114 | ],
115 | "mssplace-lit": [
116 | "pointcloud_lidar",
117 | "image_stereo_centre",
118 | "image_mono_left",
119 | "image_mono_rear",
120 | "image_mono_right",
121 | "text_stereo_centre",
122 | "text_mono_left",
123 | "text_mono_rear",
124 | "text_mono_right",
125 | ],
126 | "mssplace-list": [
127 | "pointcloud_lidar",
128 | "image_stereo_centre",
129 | "image_mono_left",
130 | "image_mono_rear",
131 | "image_mono_right",
132 | "mask_stereo_centre",
133 | "mask_mono_left",
134 | "mask_mono_rear",
135 | "mask_mono_right",
136 | "text_stereo_centre",
137 | "text_mono_left",
138 | "text_mono_rear",
139 | "text_mono_right",
140 | ],
141 | },
142 | "nclt": {
143 | "mssplace-li": [
144 | "pointcloud_lidar",
145 | "image_Cam1",
146 | "image_Cam2",
147 | "image_Cam3",
148 | "image_Cam4",
149 | "image_Cam5"
150 | ],
151 | "mssplace-lis": [
152 | "pointcloud_lidar",
153 | "image_Cam1",
154 | "image_Cam2",
155 | "image_Cam3",
156 | "image_Cam4",
157 | "image_Cam5",
158 | "mask_Cam1",
159 | "mask_Cam2",
160 | "mask_Cam3",
161 | "mask_Cam4",
162 | "mask_Cam5"
163 | ],
164 | "mssplace-lit": [
165 | "pointcloud_lidar",
166 | "image_Cam1",
167 | "image_Cam2",
168 | "image_Cam3",
169 | "image_Cam4",
170 | "image_Cam5",
171 | "text_Cam1",
172 | "text_Cam2",
173 | "text_Cam3",
174 | "text_Cam4",
175 | "text_Cam5"
176 | ],
177 | "mssplace-list": [
178 | "pointcloud_lidar",
179 | "image_Cam1",
180 | "image_Cam2",
181 | "image_Cam3",
182 | "image_Cam4",
183 | "image_Cam5",
184 | "mask_Cam1",
185 | "mask_Cam2",
186 | "mask_Cam3",
187 | "mask_Cam4",
188 | "mask_Cam5",
189 | "text_Cam1",
190 | "text_Cam2",
191 | "text_Cam3",
192 | "text_Cam4",
193 | "text_Cam5"
194 | ],
195 | "mssplace-i": [
196 | "image_Cam1",
197 | "image_Cam2",
198 | "image_Cam3",
199 | "image_Cam4",
200 | "image_Cam5",
201 | ],
202 | "minkloc-multimodal": [
203 | "pointcloud_lidar",
204 | "image_Cam5",
205 | ],
206 | "minkloc3dv2": [
207 | "pointcloud_lidar",
208 | ],
209 | }
210 | }
211 |
212 |
213 | def setup_logging(verbose: bool = False) -> None:
214 | """
215 | Configure loguru logging for the script.
216 |
217 | Args:
218 | verbose: If True, set logging level to DEBUG, otherwise INFO
219 | """
220 | # Remove default logger
221 | logger.remove()
222 |
223 | # Configure loguru with appropriate level
224 | log_level = "DEBUG" if verbose else "INFO"
225 | logger.add(
226 | sys.stdout,
227 | level=log_level,
228 | format="{time:YYYY-MM-DD HH:mm:ss} | {level: <8} | {name}:{function}:{line} - {message}"
229 | )
230 |
231 |
232 | def validate_paths(datasets_dir: Path, checkpoint_dir: Path, config_dir: Path) -> None:
233 | """
234 | Validate that all required directories exist.
235 |
236 | Args:
237 | datasets_dir: Path to datasets directory
238 | checkpoint_dir: Path to checkpoints directory
239 | config_dir: Path to configs directory
240 |
241 | Raises:
242 | FileNotFoundError: If any required directory is missing
243 | """
244 | if not datasets_dir.exists():
245 | raise FileNotFoundError(f"Datasets directory does not exist: {datasets_dir}")
246 | if not checkpoint_dir.exists():
247 | raise FileNotFoundError(f"Checkpoints directory does not exist: {checkpoint_dir}")
248 | if not config_dir.exists():
249 | raise FileNotFoundError(f"Configs directory does not exist: {config_dir}")
250 |
251 |
252 | def get_dataset_path(dataset_name: str, datasets_dir: Path) -> Path:
253 | """
254 | Get the specific dataset path based on dataset name.
255 |
256 | Args:
257 | dataset_name: Name of the dataset ('oxford' or 'nclt')
258 | datasets_dir: Base datasets directory path
259 |
260 | Returns:
261 | Path to the specific dataset directory
262 | """
263 | if dataset_name == "oxford":
264 | return datasets_dir / "pnvlad_oxford_robotcar"
265 | elif dataset_name == "nclt":
266 | return datasets_dir / "NCLT_preprocessed"
267 | else:
268 | raise ValueError(f"Unknown dataset: {dataset_name}")
269 |
270 |
271 | def load_checkpoint(checkpoint_path: Path, device: str = "cpu") -> dict:
272 | """
273 | Load model checkpoint from file.
274 |
275 | Args:
276 | checkpoint_path: Path to checkpoint file
277 | device: Device to load checkpoint to
278 |
279 | Returns:
280 | Dictionary containing model state dict
281 |
282 | Raises:
283 | FileNotFoundError: If checkpoint file doesn't exist
284 | """
285 | if not checkpoint_path.exists():
286 | raise FileNotFoundError(f"Checkpoint not found at {checkpoint_path}")
287 |
288 | logger.info(f"Loading checkpoint from: {checkpoint_path}")
289 | checkpoint = torch.load(checkpoint_path, map_location=device)
290 |
291 | # Handle different checkpoint formats
292 | if "model_state_dict" in checkpoint:
293 | checkpoint = checkpoint["model_state_dict"]
294 |
295 | logger.info(f"Checkpoint loaded with {len(checkpoint.keys())} parameter groups")
296 | return checkpoint
297 |
298 |
299 | def load_model_config(config_path: Path) -> DictConfig:
300 | """
301 | Load model configuration from YAML file.
302 |
303 | Args:
304 | config_path: Path to configuration file
305 |
306 | Returns:
307 | OmegaConf configuration object
308 |
309 | Raises:
310 | FileNotFoundError: If config file doesn't exist
311 | """
312 | if not config_path.exists():
313 | raise FileNotFoundError(f"Config not found at {config_path}")
314 |
315 | logger.info(f"Loading config from: {config_path}")
316 | config = OmegaConf.load(config_path)
317 |
318 | # Log config details for reproducibility
319 | config_dict = OmegaConf.to_container(config, resolve=True)
320 | logger.debug(f"Model configuration: {config_dict}")
321 |
322 | return config
323 |
324 |
325 | def create_dataset(dataset_name: str, data_dir: Path, sensor_setup: list[str]) -> torch.utils.data.Dataset:
326 | """
327 | Create dataset instance based on dataset name with specified sensor setup.
328 |
329 | Always uses text-enabled dataset classes (*DatasetWithText) to ensure
330 | compatibility with all model variants, including text-based models.
331 |
332 | Args:
333 | dataset_name: Name of dataset ('oxford' or 'nclt')
334 | data_dir: Path to dataset directory
335 | sensor_setup: List of sensors/modalities to load
336 |
337 | Returns:
338 | Dataset instance ready for testing (text-enabled)
339 | """
340 | logger.info(f"Creating {dataset_name} dataset from: {data_dir}")
341 | logger.debug(f"Sensor setup: {sensor_setup}")
342 |
343 | if dataset_name == "oxford":
344 | dataset = OxfordDatasetWithText(
345 | dataset_root=data_dir,
346 | subset="test",
347 | data_to_load=sensor_setup,
348 | pointcloud_quantization_size=0.01,
349 | )
350 | elif dataset_name == "nclt":
351 | dataset = NCLTDatasetWithText(
352 | dataset_root=data_dir,
353 | subset="test",
354 | data_to_load=sensor_setup,
355 | )
356 | else:
357 | raise ValueError(f"Unknown dataset: {dataset_name}")
358 |
359 | logger.info(f"Dataset created with {len(dataset)} samples")
360 | return dataset
361 |
362 |
363 | def evaluate_model(
364 | model: torch.nn.Module,
365 | dataloader: DataLoader,
366 | device: str,
367 | distance_threshold: float = 25.0,
368 | memory_batch_size: int | None = None,
369 | verbose: bool = True,
370 | ) -> tuple[dict[str, float], RetrievalResultsCollection]:
371 | """
372 | Evaluate model performance using comprehensive ModelTester.
373 |
374 | This function leverages the advanced ModelTester class to provide detailed
375 | place recognition analysis beyond simple aggregate metrics. It supports
376 | memory-efficient evaluation and returns comprehensive results suitable
377 | for research analysis and reproducibility.
378 |
379 | Args:
380 | model: PyTorch model to evaluate
381 | dataloader: DataLoader for test data
382 | device: Device to run evaluation on
383 | distance_threshold: Distance threshold for positive matches (meters)
384 | memory_batch_size: If specified, compute distance matrix in batches
385 | to reduce peak memory usage. Useful for large datasets.
386 | verbose: Whether to show detailed progress information
387 |
388 | Returns:
389 | Tuple containing:
390 | - dict: Aggregate metrics (recall_at_n, recall_at_one_percent, etc.)
391 | - RetrievalResultsCollection: Detailed per-query results for analysis
392 |
393 | Note:
394 | The memory_batch_size parameter trades computation speed for memory
395 | efficiency. For datasets with >10k samples, consider using batch
396 | sizes of 1000-5000 depending on available RAM.
397 | """
398 | logger.info("Starting comprehensive model evaluation with ModelTester...")
399 |
400 | # Initialize ModelTester with memory-efficient settings
401 | tester = ModelTester(
402 | model=model,
403 | dataloader=dataloader,
404 | dist_thresh=distance_threshold,
405 | at_n=25, # Standard benchmark value
406 | device=device,
407 | verbose=verbose,
408 | batch_size=memory_batch_size, # Enable memory-efficient computation
409 | )
410 |
411 | # Run comprehensive evaluation
412 | results_collection = tester.run()
413 |
414 | # Extract aggregate metrics for backward compatibility
415 | aggregate_metrics = results_collection.aggregate_metrics()
416 |
417 | # Convert to format expected by existing display logic
418 | recall_at_n_array = aggregate_metrics["recall_at_n"]
419 | recall_at_one_percent = aggregate_metrics["recall_at_one_percent"]
420 |
421 | # For top1_distance, use the aggregate value or compute if None
422 | top1_distance = aggregate_metrics.get("top1_distance")
423 | if top1_distance is None:
424 | # Fallback: compute mean embedding distance of correct top-1 matches
425 | top1_distances = []
426 | for result in results_collection.results:
427 | if result.queries_with_matches > 0 and result.top1_distance is not None:
428 | top1_distances.append(result.top1_distance)
429 | top1_distance = sum(top1_distances) / len(top1_distances) if top1_distances else 0.0
430 |
431 | # Create backward-compatible metrics dict
432 | backward_compatible_metrics = {
433 | "recall_at_n": recall_at_n_array,
434 | "recall_at_one_percent": recall_at_one_percent,
435 | "mean_top1_descriptor_distance": top1_distance,
436 | }
437 |
438 | logger.info("Comprehensive model evaluation completed")
439 | logger.info(f"Processed {results_collection.num_pairs} track pairs with "
440 | f"{results_collection.num_queries} total queries")
441 |
442 | return backward_compatible_metrics, results_collection
443 |
444 |
445 | def save_evaluation_results(
446 | results_collection: RetrievalResultsCollection,
447 | dataset_name: str,
448 | model_name: str,
449 | results_dir: Path,
450 | additional_metadata: dict | None = None,
451 | ) -> Path:
452 | """
453 | Save detailed evaluation results to disk for later analysis.
454 |
455 | Creates a structured filename and saves both the raw results collection
456 | and additional metadata for research reproducibility.
457 |
458 | Args:
459 | results_collection: Detailed results from ModelTester
460 | dataset_name: Name of the evaluated dataset
461 | model_name: Name of the evaluated model
462 | results_dir: Directory to save results
463 | additional_metadata: Optional dict with extra information to save
464 |
465 | Returns:
466 | Path to the saved results file
467 |
468 | Note:
469 | Results are saved in JSON format with timestamp for uniqueness.
470 | The file includes both detailed per-query results and aggregate metrics.
471 | """
472 | from datetime import datetime
473 |
474 | # Create results directory if it doesn't exist
475 | results_dir.mkdir(parents=True, exist_ok=True)
476 |
477 | # Generate timestamped filename
478 | timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
479 | filename = f"{dataset_name}_{model_name}_results_{timestamp}.json"
480 | results_path = results_dir / filename
481 |
482 | logger.info(f"Saving detailed evaluation results to: {results_path}")
483 |
484 | # Save the results collection (includes built-in JSON serialization)
485 | results_collection.save(str(results_path))
486 |
487 | # If metadata provided, save it alongside
488 | if additional_metadata:
489 | metadata_path = results_dir / f"{dataset_name}_{model_name}_metadata_{timestamp}.json"
490 | import json
491 | with open(metadata_path, 'w') as f:
492 | json.dump(additional_metadata, f, indent=2)
493 | logger.info(f"Saved evaluation metadata to: {metadata_path}")
494 |
495 | return results_path
496 |
497 |
498 | def format_percentage(value: float) -> str:
499 | """
500 | Format a decimal value as a percentage with 2 decimal places (truncated, not rounded).
501 |
502 | Args:
503 | value: Decimal value between 0 and 1
504 |
505 | Returns:
506 | Formatted percentage string
507 | """
508 | # Truncate to 2 decimal places without rounding, as in original notebook
509 | integer_part = int(value * 100)
510 | decimal_part = int((value * 100) % 1 * 100)
511 | return f"{integer_part}.{decimal_part:02d}%"
512 |
513 |
514 | def main() -> None:
515 | """
516 | Main function that orchestrates the checkpoint testing process.
517 | """
518 | parser = argparse.ArgumentParser(
519 | description="Test MSSPlace model checkpoints on Oxford and NCLT datasets",
520 | formatter_class=argparse.ArgumentDefaultsHelpFormatter
521 | )
522 |
523 | parser.add_argument(
524 | "--dataset",
525 | type=str,
526 | choices=DATASET_CHOICES,
527 | required=True,
528 | help="Dataset to test on"
529 | )
530 |
531 | parser.add_argument(
532 | "--model",
533 | type=str,
534 | choices=MODEL_CHOICES,
535 | required=True,
536 | help="Model variant to test"
537 | )
538 |
539 | parser.add_argument(
540 | "--datasets-dir",
541 | type=Path,
542 | default=Path("/home/docker_mssplace/Datasets"),
543 | help="Path to datasets directory"
544 | )
545 |
546 | parser.add_argument(
547 | "--checkpoints-dir",
548 | type=Path,
549 | default=Path(__file__).parent.parent.parent / "checkpoints",
550 | help="Path to checkpoints directory"
551 | )
552 |
553 | parser.add_argument(
554 | "--configs-dir",
555 | type=Path,
556 | default=Path(__file__).parent.parent.parent / "configs" / "model",
557 | help="Path to model configs directory"
558 | )
559 |
560 | parser.add_argument(
561 | "--batch-size",
562 | type=int,
563 | default=32,
564 | help="Batch size for evaluation"
565 | )
566 |
567 | parser.add_argument(
568 | "--num-workers",
569 | type=int,
570 | default=4,
571 | help="Number of dataloader workers"
572 | )
573 |
574 | parser.add_argument(
575 | "--distance-threshold",
576 | type=float,
577 | default=25.0,
578 | help="Distance threshold for positive matches (meters)"
579 | )
580 |
581 | parser.add_argument(
582 | "--device",
583 | type=str,
584 | default="cuda" if torch.cuda.is_available() else "cpu",
585 | help="Device to run evaluation on"
586 | )
587 |
588 | parser.add_argument(
589 | "--verbose",
590 | action="store_true",
591 | help="Enable verbose logging"
592 | )
593 |
594 | parser.add_argument(
595 | "--memory-batch-size",
596 | type=int,
597 | default=None,
598 | help="Batch size for memory-efficient distance computation (reduces memory usage)"
599 | )
600 |
601 | parser.add_argument(
602 | "--save-results",
603 | action="store_true",
604 | help="Save detailed evaluation results to JSON file for later analysis"
605 | )
606 |
607 | parser.add_argument(
608 | "--results-dir",
609 | type=Path,
610 | default=Path("./evaluation_results"),
611 | help="Directory to save detailed evaluation results"
612 | )
613 |
614 | args = parser.parse_args()
615 |
616 | # Setup logging
617 | setup_logging(args.verbose)
618 | logger.info(f"Starting checkpoint testing for {args.model} on {args.dataset}")
619 |
620 | try:
621 | # Validate all required paths exist
622 | validate_paths(args.datasets_dir, args.checkpoints_dir, args.configs_dir)
623 |
624 | # Get specific paths for this dataset/model combination
625 | checkpoint_name = CHECKPOINT_NAMES[args.dataset][args.model]
626 | config_name = MODEL_CONFIG_NAMES[args.model]
627 | sensor_setup = SENSOR_SETUPS[args.dataset][args.model]
628 |
629 | checkpoint_path = args.checkpoints_dir / checkpoint_name
630 | config_path = args.configs_dir / config_name
631 | data_dir = get_dataset_path(args.dataset, args.datasets_dir)
632 |
633 | # Load checkpoint and config
634 | checkpoint = load_checkpoint(checkpoint_path, device="cpu")
635 | config = load_model_config(config_path)
636 |
637 | # Initialize model and load weights
638 | logger.info("Initializing model...")
639 | model = instantiate(config)
640 | model.load_state_dict(checkpoint, strict=True)
641 |
642 | num_parameters = sum(p.numel() for p in model.parameters())
643 | logger.info(f"Model loaded successfully with {num_parameters:,} parameters")
644 |
645 | # Create dataset and dataloader
646 | dataset = create_dataset(args.dataset, data_dir, sensor_setup)
647 | dataloader = DataLoader(
648 | dataset=dataset,
649 | batch_size=args.batch_size,
650 | shuffle=False,
651 | num_workers=args.num_workers,
652 |             pin_memory=args.device.startswith("cuda"),  # covers "cuda" and "cuda:N"
653 | collate_fn=dataset.collate_fn,
654 | drop_last=False,
655 | )
656 |
657 | # Evaluate model using comprehensive ModelTester
658 | metrics, results_collection = evaluate_model(
659 | model=model,
660 | dataloader=dataloader,
661 | device=args.device,
662 | distance_threshold=args.distance_threshold,
663 | memory_batch_size=args.memory_batch_size,
664 | verbose=args.verbose,
665 | )
666 |
667 | # Extract metrics for display (backward compatibility)
668 | recall_at_n = metrics["recall_at_n"]
669 | recall_at_one_percent = metrics["recall_at_one_percent"]
670 | mean_top1_descriptor_distance = metrics["mean_top1_descriptor_distance"]
671 |
672 | # Optionally save detailed results for research analysis
673 | if args.save_results:
674 | evaluation_metadata = {
675 | "dataset": args.dataset,
676 | "model": args.model,
677 | "device": args.device,
678 | "distance_threshold": args.distance_threshold,
679 | "batch_size": args.batch_size,
680 | "num_workers": args.num_workers,
681 | "memory_batch_size": args.memory_batch_size,
682 | "model_parameters": num_parameters,
683 | "dataset_size": len(dataset),
684 | "evaluation_timestamp": datetime.now().isoformat(),
685 | }
686 |
687 | results_path = save_evaluation_results(
688 | results_collection=results_collection,
689 | dataset_name=args.dataset,
690 | model_name=args.model,
691 | results_dir=args.results_dir,
692 | additional_metadata=evaluation_metadata,
693 | )
694 |
695 | logger.info(f"Detailed results saved for future analysis: {results_path}")
696 |
697 | # Display results with enhanced information
698 | print("\n" + "="*60)
699 | print("COMPREHENSIVE EVALUATION RESULTS")
700 | print("="*60)
701 | print(f"Dataset: {args.dataset}")
702 | print(f"Model: {args.model}")
703 | print(f"Device: {args.device}")
704 | print(f"Distance threshold: {args.distance_threshold}m")
705 | if args.memory_batch_size:
706 | print(f"Memory batch size: {args.memory_batch_size} (memory-efficient mode)")
707 | print("-"*60)
708 |
709 | # Traditional metrics (backward compatibility)
710 | print("PLACE RECOGNITION METRICS:")
711 | print(f" AR@1 = {format_percentage(recall_at_n[0])}")
712 | print(f" AR@1% = {format_percentage(recall_at_one_percent)}")
713 | print(f" Mean top-1 descriptor distance: {mean_top1_descriptor_distance:.6f}")
714 |
715 | # Enhanced insights from detailed analysis
716 | print("\nDETAILED ANALYSIS:")
717 | aggregate_metrics = results_collection.aggregate_metrics()
718 | print(f" Total track pairs evaluated: {results_collection.num_pairs}")
719 | print(f" Total query samples: {results_collection.num_queries}")
720 | print(f" Queries with ground truth matches: {aggregate_metrics['queries_with_matches']}")
721 | print(f" Overall accuracy (correct top-1): {format_percentage(aggregate_metrics['overall_accuracy'])}")
722 |
723 | # Additional recall metrics for research insight
724 | if len(recall_at_n) >= 5:
725 | print(f" AR@5 = {format_percentage(recall_at_n[4])}")
726 | if len(recall_at_n) >= 10:
727 | print(f" AR@10 = {format_percentage(recall_at_n[9])}")
728 |
729 | print("="*60)
730 |
731 | logger.info("Checkpoint testing completed successfully")
732 |
733 | except Exception as e:
734 |         logger.exception(f"Error during checkpoint testing: {e}")
735 | sys.exit(1)
736 |
737 |
738 | if __name__ == "__main__":
739 | main()
740 |
--------------------------------------------------------------------------------
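
For reference, `format_percentage` above truncates rather than rounds, so its output can differ from `round()` in the last digit. A minimal standalone sketch of the difference (the helper is copied verbatim from the script above):

```python
def format_percentage(value: float) -> str:
    """Truncate (not round) a 0..1 fraction to a 2-decimal percentage string."""
    integer_part = int(value * 100)
    decimal_part = int((value * 100) % 1 * 100)
    return f"{integer_part}.{decimal_part:02d}%"

print(format_percentage(0.87659))  # "87.65%" -- truncated; round(87.659, 2) would give 87.66
print(format_percentage(0.5))      # "50.00%"
```
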
/src/mssplace/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alexmelekhin/MSSPlace/9ac48de75ee6a4ea01dac99d336d3a529bd73b61/src/mssplace/__init__.py
--------------------------------------------------------------------------------
/src/mssplace/datasets.py:
--------------------------------------------------------------------------------
1 | """Datasets implementation."""
2 | from pathlib import Path
3 | from typing import Any, Dict, List, Literal, Optional, Tuple, Union
4 |
5 | import cv2
6 | import MinkowskiEngine as ME # type: ignore
7 | import numpy as np
8 | import torch
9 | from torch import Tensor
10 | from opr.datasets.base import BasePlaceRecognitionDataset
11 | from opr.utils import cartesian_to_spherical
12 |
13 |
14 | def collate_data_dict(
15 | dataset: BasePlaceRecognitionDataset, data_list: List[Dict[str, Tensor]]
16 | ) -> Dict[str, Tensor]:
17 | """Pack input data list into batch."""
18 | result: Dict[str, Tensor] = {}
19 | result["idxs"] = torch.stack([e["idx"] for e in data_list], dim=0)
20 | for data_key in data_list[0].keys():
21 | if data_key == "idx":
22 | continue
23 | elif data_key == "utm":
24 | result["utms"] = torch.stack([e["utm"] for e in data_list], dim=0)
25 | elif data_key.startswith("image_"):
26 | result[f"images_{data_key[6:]}"] = torch.stack([e[data_key] for e in data_list])
27 | elif data_key.startswith("mask_"):
28 | result[f"masks_{data_key[5:]}"] = torch.stack([e[data_key] for e in data_list])
29 | elif data_key.startswith("text_"):
30 | result[f"texts_{data_key[5:]}"] = torch.stack([e[data_key] for e in data_list])
31 | elif data_key == "pointcloud_lidar_coords":
32 | coords_list = [e["pointcloud_lidar_coords"] for e in data_list]
33 | feats_list = [e["pointcloud_lidar_feats"] for e in data_list]
34 | n_points = [int(e.shape[0]) for e in coords_list]
35 | coords_tensor = torch.cat(coords_list, dim=0).unsqueeze(0) # (1,batch_size*n_points,3)
36 | if dataset.pointcloud_set_transform is not None:
37 | # Apply the same transformation on all dataset elements
38 | coords_tensor = dataset.pointcloud_set_transform(coords_tensor)
39 | coords_list = torch.split(coords_tensor.squeeze(0), split_size_or_sections=n_points, dim=0)
40 | quantized_coords_list = []
41 | quantized_feats_list = []
42 | for coords, feats in zip(coords_list, feats_list):
43 | quantized_coords, quantized_feats = ME.utils.sparse_quantize(
44 | coordinates=coords,
45 | features=feats,
46 | quantization_size=dataset._pointcloud_quantization_size,
47 | )
48 | quantized_coords_list.append(quantized_coords)
49 | quantized_feats_list.append(quantized_feats)
50 |
51 | result["pointclouds_lidar_coords"] = ME.utils.batched_coordinates(quantized_coords_list)
52 | result["pointclouds_lidar_feats"] = torch.cat(quantized_feats_list)
53 | elif data_key == "pointcloud_lidar_feats":
54 | continue
55 | else:
56 | raise ValueError(f"Unknown data key: {data_key!r}")
57 | return result
58 |
59 |
60 | class NCLTDatasetWithText(BasePlaceRecognitionDataset):
61 | """NCLT dataset implementation with text embeddings."""
62 |
63 | _images_dirname: str
64 | _masks_dirname: str
65 | _pointclouds_dirname: str
66 | _pointcloud_quantization_size: Optional[Union[float, Tuple[float, float, float]]]
67 | _max_point_distance: Optional[float]
68 | _spherical_coords: bool
69 | _use_intensity_values: bool
70 | _valid_data: Tuple[str, ...] = (
71 | "image_Cam0",
72 | "image_Cam1",
73 | "image_Cam2",
74 | "image_Cam3",
75 | "image_Cam4",
76 | "image_Cam5",
77 | "pointcloud_lidar",
78 | "mask_Cam0",
79 | "mask_Cam1",
80 | "mask_Cam2",
81 | "mask_Cam3",
82 | "mask_Cam4",
83 | "mask_Cam5",
84 | "text_Cam0",
85 | "text_Cam1",
86 | "text_Cam2",
87 | "text_Cam3",
88 | "text_Cam4",
89 | "text_Cam5",
90 | )
91 |
92 | def __init__(
93 | self,
94 | dataset_root: Union[str, Path],
95 | subset: Literal["train", "val", "test"],
96 | data_to_load: Union[str, Tuple[str, ...]],
97 | positive_threshold: float = 10.0,
98 | negative_threshold: float = 50.0,
99 | images_dirname: str = "images_small",
100 | masks_dirname: str = "segmentation_masks_small",
101 | text_embeddings_dirname: str = "clip-vit-base-patch32",
102 | pointclouds_dirname: str = "velodyne_data",
103 | pointcloud_quantization_size: Optional[Union[float, Tuple[float, float, float]]] = 0.5,
104 | max_point_distance: Optional[float] = None,
105 | spherical_coords: bool = False,
106 | use_intensity_values: bool = False,
107 | image_transform: Optional[Any] = None,
108 | semantic_transform: Optional[Any] = None,
109 | pointcloud_transform: Optional[Any] = None,
110 | pointcloud_set_transform: Optional[Any] = None,
111 | ) -> None:
112 | """NCLT dataset implementation.
113 |
114 | Args:
115 | dataset_root (Union[str, Path]): Path to the dataset root directory.
116 |             subset (Literal["train", "val", "test"]): Current subset to load.
117 | data_to_load (Union[str, Tuple[str, ...]]): The list of data to load.
118 | Check the documentation for the list of available data.
119 | positive_threshold (float): The UTM distance threshold value for positive samples.
120 | Defaults to 10.0.
121 | negative_threshold (float): The UTM distance threshold value for negative samples.
122 | Defaults to 50.0.
123 | images_dirname (str): Images directory name. It should be specified explicitly
124 |                 if custom preprocessing was done. Defaults to "images_small".
125 | masks_dirname (str): Masks directory name. It should be specified explicitly
126 |                 if custom preprocessing was done. Defaults to "segmentation_masks_small".
127 | text_embeddings_dirname (str): Text embeddings directory name. Defaults to "clip-vit-base-patch32".
128 | pointclouds_dirname (str): Point clouds directory name. It should be specified
129 | explicitly if custom preprocessing was done. Defaults to "velodyne_data".
130 | pointcloud_quantization_size (float, optional): The quantization size for point clouds.
131 |                 Defaults to 0.5.
132 | max_point_distance (float, optional): The maximum distance of points from the origin.
133 | Defaults to None.
134 | spherical_coords (bool): Whether to use spherical coordinates for point clouds.
135 | Defaults to False.
136 | use_intensity_values (bool): Whether to use intensity values for point clouds. Defaults to False.
137 | image_transform (Any, optional): Images transform. If None, DefaultImageTransform will be used.
138 | Defaults to None.
139 | semantic_transform (Any, optional): Semantic masks transform. If None, DefaultSemanticTransform
140 | will be used. Defaults to None.
141 | pointcloud_transform (Any, optional): Point clouds transform. If None, DefaultCloudTransform
142 | will be used. Defaults to None.
143 | pointcloud_set_transform (Any, optional): Point clouds set transform. If None,
144 | DefaultCloudSetTransform will be used. Defaults to None.
145 |
146 | Raises:
147 | ValueError: If data_to_load contains invalid data source names.
148 | FileNotFoundError: If images, masks or pointclouds directory does not exist.
149 | """
150 | super().__init__(
151 | dataset_root,
152 | subset,
153 | data_to_load,
154 | positive_threshold,
155 | negative_threshold,
156 | image_transform,
157 | semantic_transform,
158 | pointcloud_transform,
159 | pointcloud_set_transform,
160 | )
161 |
162 | if subset == "test":
163 | self.dataset_df["in_query"] = True # for compatibility with Oxford Dataset
164 |
165 | if any(elem not in self._valid_data for elem in self.data_to_load):
166 | raise ValueError(f"Invalid data_to_load argument. Valid data list: {self._valid_data!r}")
167 |
168 | _track_name = self.dataset_df.iloc[0]["track"]
169 |
170 | if any(elem.startswith("image") for elem in self.data_to_load):
171 | self._images_dirname = images_dirname
172 | if not (self.dataset_root / _track_name / self._images_dirname).exists():
173 | raise FileNotFoundError(f"Images directory {self._images_dirname!r} does not exist.")
174 |
175 | if any(elem.startswith("mask") for elem in self.data_to_load):
176 | self._masks_dirname = masks_dirname
177 | if not (self.dataset_root / _track_name / self._masks_dirname).exists():
178 | raise FileNotFoundError(f"Masks directory {self._masks_dirname!r} does not exist.")
179 |
180 | if any(elem.startswith("text") for elem in self.data_to_load):
181 | self._text_embeddings_dirname = text_embeddings_dirname
182 | if not (self.dataset_root / _track_name / self._text_embeddings_dirname).exists():
183 | raise FileNotFoundError(
184 | f"Text embeddings directory {self._text_embeddings_dirname!r} does not exist."
185 | )
186 |
187 | if "pointcloud_lidar" in self.data_to_load:
188 | self._pointclouds_dirname = pointclouds_dirname
189 | if not (self.dataset_root / _track_name / self._pointclouds_dirname).exists():
190 | raise FileNotFoundError(
191 | f"Pointclouds directory {self._pointclouds_dirname!r} does not exist."
192 | )
193 |
194 | self._pointcloud_quantization_size = pointcloud_quantization_size
195 | self._max_point_distance = max_point_distance
196 | self._spherical_coords = spherical_coords
197 | self._use_intensity_values = use_intensity_values
198 |
199 | def __getitem__(self, idx: int) -> Dict[str, Tensor]: # noqa: D105
200 | row = self.dataset_df.iloc[idx]
201 | data = {"idx": torch.tensor(idx, dtype=int)}
202 | data["utm"] = torch.tensor(row[["northing", "easting"]].to_numpy(dtype=np.float64))
203 | track_dir = self.dataset_root / str(row["track"])
204 |
205 | for data_source in self.data_to_load:
206 | if data_source.startswith("image_"):
207 | cam_name = data_source[6:] # remove "image_" prefix
208 | image_ts = int(row["image"])
209 |                 im_filepath = track_dir / self._images_dirname / cam_name / f"{image_ts}.png"
210 | im = cv2.imread(str(im_filepath))
211 | im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB)
212 | im = self.image_transform(im)
213 | data[data_source] = im
214 | elif data_source.startswith("mask_"):
215 | cam_name = data_source[5:] # remove "mask_" prefix
216 | image_ts = int(row["image"])
217 |                 mask_filepath = track_dir / self._masks_dirname / cam_name / f"{image_ts}.png"
218 | mask = cv2.imread(str(mask_filepath), cv2.IMREAD_UNCHANGED)
219 | mask = self.semantic_transform(mask)
220 | data[data_source] = mask
221 | elif data_source.startswith("text_"):
222 | cam_name = data_source[5:] # remove "text_" prefix
223 | image_ts = int(row["image"])
224 |                 text_filepath = track_dir / self._text_embeddings_dirname / cam_name / f"{image_ts}.pt"
225 | text_embedding = torch.load(text_filepath, map_location="cpu").squeeze()
226 | data[data_source] = text_embedding
227 | elif data_source == "pointcloud_lidar":
228 | pc_filepath = track_dir / self._pointclouds_dirname / f"{row['pointcloud']}.bin"
229 | pointcloud = self._load_pc(pc_filepath)
230 | data[f"{data_source}_coords"] = self.pointcloud_transform(pointcloud[:, :3])
231 | if self._use_intensity_values:
232 |                     data[f"{data_source}_feats"] = pointcloud[:, 3].unsqueeze(1)  # unreachable for now: _load_pc raises if intensity is enabled
233 | else:
234 | data[f"{data_source}_feats"] = torch.ones_like(pointcloud[:, :1])
235 |
236 | return data
237 |
238 | def _load_pc(self, filepath: Union[str, Path]) -> Tensor:
239 | if self._use_intensity_values:
240 | raise NotImplementedError("Intensity values are not supported yet.")
241 | pc = np.fromfile(filepath, dtype=np.float32).reshape(-1, 3)
242 | if self._max_point_distance is not None:
243 | pc = pc[np.linalg.norm(pc, axis=1) < self._max_point_distance]
244 | if self._spherical_coords:
245 | pc = cartesian_to_spherical(pc, dataset_name="nclt")
246 | pc_tensor = torch.tensor(pc, dtype=torch.float)
247 | return pc_tensor
248 |
249 | def collate_fn(self, data_list: List[Dict[str, Tensor]]) -> Dict[str, Tensor]:
250 | """Pack input data list into batch.
251 |
252 | Args:
253 | data_list (List[Dict[str, Tensor]]): batch data list generated by DataLoader.
254 |
255 | Returns:
256 | Dict[str, Tensor]: dictionary of batched data.
257 | """
258 | return collate_data_dict(self, data_list)
259 |
260 |
261 | class OxfordDatasetWithText(BasePlaceRecognitionDataset):
262 | """PointNetVLAD Oxford RobotCar dataset implementation with text embeddings."""
263 |
264 | _images_dirname: str
265 | _masks_dirname: str
266 | _pointclouds_dirname: str
267 | _pointcloud_quantization_size: Optional[Union[float, Tuple[float, float, float]]]
268 | _max_point_distance: Optional[float]
269 | _spherical_coords: bool
270 | _valid_data: Tuple[str, ...] = (
271 | "image_stereo_centre",
272 | "image_mono_left",
273 | "image_mono_rear",
274 | "image_mono_right",
275 | "pointcloud_lidar",
276 | "mask_stereo_centre",
277 | "mask_mono_left",
278 | "mask_mono_rear",
279 | "mask_mono_right",
280 | "text_stereo_centre",
281 | "text_mono_left",
282 | "text_mono_rear",
283 | "text_mono_right",
284 | )
285 |
286 | def __init__(
287 | self,
288 | dataset_root: Union[str, Path],
289 | subset: Literal["train", "val", "test"],
290 | data_to_load: Union[str, Tuple[str, ...]],
291 | positive_threshold: float = 10.0,
292 | negative_threshold: float = 50.0,
293 | images_dirname: str = "images_small",
294 | masks_dirname: str = "segmentation_masks_small",
295 | text_embeddings_dirname: str = "clip-vit-base-patch32",
296 | pointclouds_dirname: Optional[str] = None,
297 | pointcloud_quantization_size: Optional[Union[float, Tuple[float, float, float]]] = 0.01,
298 | max_point_distance: Optional[float] = None,
299 | spherical_coords: bool = False,
300 | image_transform: Optional[Any] = None,
301 | semantic_transform: Optional[Any] = None,
302 | pointcloud_transform: Optional[Any] = None,
303 | pointcloud_set_transform: Optional[Any] = None,
304 | ) -> None:
305 | """Oxford RobotCar dataset implementation.
306 |
307 | Original dataset site: https://robotcar-dataset.robots.ox.ac.uk/
308 |
309 | We use the preprocessed version of the dataset that was introduced
310 | in PointNetVLAD paper: https://arxiv.org/abs/1804.03492.
311 |
312 | Args:
313 | dataset_root (Union[str, Path]): Path to the dataset root directory.
314 |             subset (Literal["train", "val", "test"]): Current subset to load.
315 | data_to_load (Union[str, Tuple[str, ...]]): The list of data to load.
316 | Check the documentation for the list of available data.
317 | positive_threshold (float): The UTM distance threshold value for positive samples.
318 | Defaults to 10.0.
319 | negative_threshold (float): The UTM distance threshold value for negative samples.
320 | Defaults to 50.0.
321 | images_dirname (str): Images directory name. It should be specified explicitly
322 | if custom preprocessing was done. Defaults to "images_small".
323 | masks_dirname (str): Masks directory name. It should be specified explicitly
324 | if custom preprocessing was done. Defaults to "segmentation_masks_small".
325 | text_embeddings_dirname (str): Text embeddings directory name. Defaults to "clip-vit-base-patch32".
326 | pointclouds_dirname (Optional[str]): Point clouds directory name. It should be specified
327 | explicitly if custom preprocessing was done. Defaults to None, which sets the dirnames
328 | like in original PointNetVLAD dataset configuration.
329 | pointcloud_quantization_size (float, optional): The quantization size for point clouds.
330 | Defaults to 0.01.
331 | max_point_distance (float, optional): The maximum distance of points from the origin.
332 | Defaults to None.
333 | spherical_coords (bool): Whether to use spherical coordinates for point clouds.
334 | Defaults to False.
335 | image_transform (Any, optional): Images transform. If None, DefaultImageTransform will be used.
336 | Defaults to None.
337 | semantic_transform (Any, optional): Semantic masks transform. If None, DefaultSemanticTransform
338 | will be used. Defaults to None.
339 | pointcloud_transform (Any, optional): Point clouds transform. If None, DefaultCloudTransform
340 | will be used. Defaults to None.
341 | pointcloud_set_transform (Any, optional): Point clouds set transform. If None,
342 | DefaultCloudSetTransform will be used. Defaults to None.
343 |
344 | Raises:
345 | ValueError: If data_to_load contains invalid data source names.
346 | FileNotFoundError: If images, masks or pointclouds directory does not exist.
347 | """
348 | super().__init__(
349 | dataset_root,
350 | subset,
351 | data_to_load,
352 | positive_threshold,
353 | negative_threshold,
354 | image_transform,
355 | semantic_transform,
356 | pointcloud_transform,
357 | pointcloud_set_transform,
358 | )
359 |
360 | if any(elem not in self._valid_data for elem in self.data_to_load):
361 | raise ValueError(f"Invalid data_to_load argument. Valid data list: {self._valid_data!r}")
362 |
363 | _track_name = self.dataset_df.iloc[0]["track"]
364 |
365 | if any(elem.startswith("image") for elem in self.data_to_load):
366 | self._images_dirname = images_dirname
367 | if not (self.dataset_root / _track_name / self._images_dirname).exists():
368 | raise FileNotFoundError(f"Images directory {self._images_dirname!r} does not exist.")
369 |
370 | if any(elem.startswith("mask") for elem in self.data_to_load):
371 | self._masks_dirname = masks_dirname
372 | if not (self.dataset_root / _track_name / self._masks_dirname).exists():
373 | raise FileNotFoundError(f"Masks directory {self._masks_dirname!r} does not exist.")
374 |
375 | if any(elem.startswith("text") for elem in self.data_to_load):
376 | self._text_embeddings_dirname = text_embeddings_dirname
377 | if not (self.dataset_root / _track_name / self._text_embeddings_dirname).exists():
378 | raise FileNotFoundError(
379 | f"Text embeddings directory {self._text_embeddings_dirname!r} does not exist."
380 | )
381 |
382 | if "pointcloud_lidar" in self.data_to_load:
383 | if pointclouds_dirname is not None:
384 | self._pointclouds_dirname = pointclouds_dirname
385 | elif subset in ("train", "val"):
386 | self._pointclouds_dirname = "pointcloud_20m_10overlap"
387 | else:
388 | self._pointclouds_dirname = "pointcloud_20m"
389 | if not (self.dataset_root / _track_name / self._pointclouds_dirname).exists():
390 | raise FileNotFoundError(
391 | f"Pointclouds directory {self._pointclouds_dirname!r} does not exist."
392 | )
393 |
394 | self._pointcloud_quantization_size = pointcloud_quantization_size
395 | self._max_point_distance = max_point_distance
396 | self._spherical_coords = spherical_coords
397 |
398 | def __getitem__(self, idx: int) -> Dict[str, Tensor]: # noqa: D105
399 | row = self.dataset_df.iloc[idx]
400 | data = {"idx": torch.tensor(idx, dtype=int)}
401 | data["utm"] = torch.tensor(row[["northing", "easting"]].to_numpy(dtype=np.float64))
402 | track_dir = self.dataset_root / str(row["track"])
403 |
404 | for data_source in self.data_to_load:
405 | if data_source.startswith("image_"):
406 | cam_name = data_source[6:] # remove "image_" prefix
407 | image_ts = int(row[cam_name])
408 |                 im_filepath = track_dir / self._images_dirname / cam_name / f"{image_ts}.png"
409 | im = cv2.imread(str(im_filepath))
410 | im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB)
411 | im = self.image_transform(im)
412 | data[data_source] = im
413 | elif data_source.startswith("mask_"):
414 | cam_name = data_source[5:] # remove "mask_" prefix
415 | image_ts = int(row[cam_name])
416 |                 mask_filepath = track_dir / self._masks_dirname / cam_name / f"{image_ts}.png"
417 | mask = cv2.imread(str(mask_filepath), cv2.IMREAD_UNCHANGED)
418 | mask = self.semantic_transform(mask)
419 | data[data_source] = mask
420 | elif data_source.startswith("text_"):
421 | cam_name = data_source[5:] # remove "text_" prefix
422 | image_ts = int(row[cam_name])
423 |                 text_filepath = track_dir / self._text_embeddings_dirname / cam_name / f"{image_ts}.pt"
424 | text_embedding = torch.load(text_filepath, map_location="cpu").squeeze()
425 | data[data_source] = text_embedding
426 | elif data_source == "pointcloud_lidar":
427 | pc_filepath = track_dir / self._pointclouds_dirname / f"{row['pointcloud']}.bin"
428 | coords = self._load_pc(pc_filepath)
429 | coords = self.pointcloud_transform(coords)
430 | if self._spherical_coords:
431 | raise NotImplementedError("Spherical coords are not implemented yet.")
432 | data[f"{data_source}_coords"] = coords
433 | data[f"{data_source}_feats"] = torch.ones_like(coords[:, :1])
434 |
435 | return data
436 |
437 | def _load_pc(self, filepath: Union[str, Path]) -> Tensor:
438 | pc = np.fromfile(filepath, dtype=np.float64).reshape(-1, 3)
439 | if self._max_point_distance is not None:
440 | pc = pc[np.linalg.norm(pc, axis=1) < self._max_point_distance]
441 | pc_tensor = torch.tensor(pc, dtype=torch.float)
442 | return pc_tensor
443 |
444 | def collate_fn(self, data_list: List[Dict[str, Tensor]]) -> Dict[str, Tensor]:
445 | """Pack input data list into batch.
446 |
447 | Args:
448 | data_list (List[Dict[str, Tensor]]): batch data list generated by DataLoader.
449 |
450 | Returns:
451 | Dict[str, Tensor]: dictionary of batched data.
452 | """
453 | return collate_data_dict(self, data_list)
454 |
--------------------------------------------------------------------------------
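
A minimal usage sketch for the datasets above, wired into a standard `DataLoader`. The `dataset_root` path is hypothetical (adjust to your local layout); the batch keys follow what `collate_data_dict` produces:

```python
from torch.utils.data import DataLoader

from mssplace.datasets import OxfordDatasetWithText

# Hypothetical dataset root -- adjust to your local layout.
dataset = OxfordDatasetWithText(
    dataset_root="/home/docker_mssplace/Datasets/pnvlad_oxford_robotcar",
    subset="test",
    data_to_load=("image_stereo_centre", "text_stereo_centre", "pointcloud_lidar"),
)
loader = DataLoader(dataset, batch_size=8, shuffle=False, collate_fn=dataset.collate_fn)
batch = next(iter(loader))
# collate_data_dict renames keys: "image_*" -> "images_*", "text_*" -> "texts_*",
# and point clouds become "pointclouds_lidar_coords" / "pointclouds_lidar_feats".
print(sorted(batch.keys()))
```
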
/src/mssplace/modality_interaction_layers.py:
--------------------------------------------------------------------------------
1 | from typing import Optional
2 |
3 | import MinkowskiEngine as ME # noqa: N817
4 | import torch
5 | from torch import nn, Tensor
6 | from opr.models.place_recognition.base import ImageModel, SemanticModel, CloudModel
7 | from opr.modules import Concat
8 |
9 | _modalities = ("image", "cloud", "semantic", "text")
10 |
11 |
12 | class LateFusionModel(nn.Module):
13 | """Meta-model for multimodal Place Recognition architectures with late fusion."""
14 |
15 | def __init__(
16 | self,
17 | image_module: Optional[ImageModel] = None,
18 | semantic_module: Optional[SemanticModel] = None,
19 | cloud_module: Optional[CloudModel] = None,
20 | text_module: Optional[nn.Module] = None,
21 | soc_module: Optional[nn.Module] = None,
22 | fusion_module: Optional[nn.Module] = None,
23 | ) -> None:
24 | """Meta-model for multimodal Place Recognition architectures with late fusion.
25 |
26 | Args:
27 |             image_module (ImageModel, optional): Image modality branch. Defaults to None.
28 |             semantic_module (SemanticModel, optional): Semantic modality branch. Defaults to None.
29 |             cloud_module (CloudModel, optional): Cloud modality branch. Defaults to None.
30 |             text_module (nn.Module, optional): Text modality branch. Defaults to None.
31 |             soc_module (nn.Module, optional): SOC modality branch. Defaults to None.
32 |             fusion_module (nn.Module, optional): Module to fuse different modalities;
33 | """
34 | super().__init__()
35 |
36 | self.image_module = image_module
37 | self.semantic_module = semantic_module
38 | self.cloud_module = cloud_module
39 | self.text_module = text_module
40 | self.soc_module = soc_module
41 | if fusion_module:
42 | self.fusion_module = fusion_module
43 | else:
44 | self.fusion_module = Concat()
45 |
46 | def forward(self, batch: dict[str, Tensor]) -> dict[str, Tensor]: # noqa: D102
47 | out_dict: dict[str, Tensor] = {}
48 |
49 | if self.image_module is not None:
50 | out_dict["image"] = self.image_module(batch)["final_descriptor"]
51 |
52 | if self.semantic_module is not None:
53 | out_dict["semantic"] = self.semantic_module(batch)["final_descriptor"]
54 |
55 | if self.cloud_module is not None:
56 | out_dict["cloud"] = self.cloud_module(batch)["final_descriptor"]
57 |
58 | if self.text_module is not None:
59 | out_dict["text"] = self.text_module(batch)["final_descriptor"]
60 |
61 | if self.soc_module is not None:
62 | out_dict["soc"] = self.soc_module(batch)["final_descriptor"]
63 |
64 | out_dict = self.fusion_module(out_dict)
65 |
66 | if not isinstance(out_dict, dict):
67 | out_dict = {"final_descriptor": out_dict}
68 |
69 | return out_dict
70 |
71 |
72 | class MiddleFusionModel(LateFusionModel):
73 | def __init__(
74 | self,
75 | image_module: Optional[ImageModel] = None,
76 | semantic_module: Optional[SemanticModel] = None,
77 | cloud_module: Optional[CloudModel] = None,
78 | soc_module: Optional[nn.Module] = None,
79 | fusion_module: Optional[nn.Module] = None,
80 | ) -> None:
81 |         super().__init__(image_module, semantic_module, cloud_module, soc_module=soc_module, fusion_module=fusion_module)
82 | self.cloud_dim_reduction = ME.MinkowskiAvgPooling(kernel_size=3, stride=3, dimension=3)
83 | self.final_fusion = Concat()
84 |
85 | def forward(self, batch: dict[str, Tensor]) -> dict[str, Tensor]: # noqa: D102
86 | ### step 1: feature extraction
87 | if self.image_module is not None:
88 | img_features = {}
89 | img_features_shapes = {}
90 | for key, value in batch.items():
91 | if key.startswith("images_"):
92 | img_features[key] = self.image_module.backbone(value)
93 | img_features_shapes[key] = img_features[key].shape
94 | img_features[key] = (
95 | img_features[key]
96 | .view(img_features[key].shape[0], img_features[key].shape[1], -1)
97 | .permute(0, 2, 1)
98 | ) # (B, N_feats, Desc_dim)
99 | if self.semantic_module is not None:
100 | semantic_features = {}
101 | semantic_features_shapes = {}
102 | for key, value in batch.items():
103 | if key.startswith("masks_"):
104 | semantic_features[key] = self.semantic_module.backbone(value)
105 | semantic_features_shapes[key] = semantic_features[key].shape
106 | semantic_features[key] = (
107 | semantic_features[key]
108 | .view(semantic_features[key].shape[0], semantic_features[key].shape[1], -1)
109 | .permute(0, 2, 1)
110 | ) # (B, N_feats, Desc_dim)
111 | if self.cloud_module is not None:
112 | sparse_voxel = ME.SparseTensor(
113 | features=batch["pointclouds_lidar_feats"], coordinates=batch["pointclouds_lidar_coords"]
114 | )
115 | sparse_cloud_features = self.cloud_module.backbone(sparse_voxel)
116 | sparse_cloud_features = self.cloud_dim_reduction(sparse_cloud_features)
117 | # TODO: add text model
118 |
119 | ### step 2: transformer interaction
120 | tokens_dict = {}
121 | if self.image_module is not None:
122 | tokens_dict["image"] = torch.cat(list(img_features.values()), dim=1)
123 | if self.semantic_module is not None:
124 | tokens_dict["semantic"] = torch.cat(list(semantic_features.values()), dim=1)
125 | if self.cloud_module is not None:
126 | min_coordinate = torch.tensor(
127 | [
128 | torch.min(sparse_cloud_features.C[:, 1]),
129 | torch.min(sparse_cloud_features.C[:, 2]),
130 | torch.min(sparse_cloud_features.C[:, 3]),
131 | ]
132 | )
133 | dense_cloud_features, _, _ = sparse_cloud_features.dense(min_coordinate=min_coordinate)
134 | dense_cloud_shape = dense_cloud_features.shape
135 | dense_cloud_features = dense_cloud_features.view(
136 | dense_cloud_features.shape[0], dense_cloud_features.shape[1], -1
137 | ).permute(0, 2, 1) # (B, N_feats, Desc_dim)
138 | tokens_dict["cloud"] = dense_cloud_features
139 | tokens_dict = self.fusion_module(tokens_dict)
140 |
141 | ### step 3: back into initial states and finish processing
142 | out_dict = {}
143 | if self.image_module is not None:
144 | image_feat_lens = [s[-1] * s[-2] for s in img_features_shapes.values()]
145 | img_features_list = torch.split(tokens_dict["image"], image_feat_lens, dim=1)
146 | for key, feats in zip(list(img_features.keys()), img_features_list):
147 | img_features[key] = feats.permute(0, 2, 1).view(*img_features_shapes[key])
148 | img_features[key] = self.image_module.head(img_features[key])
149 | out_dict["image"] = self.image_module.fusion(img_features)
150 | if self.cloud_module is not None:
151 | dense_cloud_features = tokens_dict["cloud"].permute(0, 2, 1).view(*dense_cloud_shape)
152 | out_dict["cloud"] = self.cloud_module.head(ME.to_sparse(dense_cloud_features))
153 | out_dict["final_descriptor"] = self.final_fusion(out_dict)
154 | return out_dict
155 |
156 |
157 | class TransformerModalityInteraction(nn.Module):
158 | def __init__(
159 | self,
160 | desc_dim: int = 256,
161 | image: bool = True,
162 | cloud: bool = True,
163 | semantic: bool = False,
164 | text: bool = False,
165 | use_modality_embeddings: bool = False,
166 | n_heads: int = 4,
167 | n_layers: int = 4,
168 | hidden_dim: int = 1024,
169 | dropout: float = 0.0,
170 | activation: str = "gelu",
171 | ) -> None:
172 | super().__init__()
173 |
174 | self.use_modality_embeddings = use_modality_embeddings
175 |
176 | self.modalities = []
177 | if image:
178 | self.modalities.append("image")
179 | if cloud:
180 | self.modalities.append("cloud")
181 | if semantic:
182 | self.modalities.append("semantic")
183 | if text:
184 | self.modalities.append("text")
185 |
186 | if self.use_modality_embeddings:
187 | self.modality_embeddings = nn.ParameterDict(
188 | {
189 | "image": nn.Parameter(torch.randn(desc_dim) * 0.02) if image else None,
190 | "cloud": nn.Parameter(torch.randn(desc_dim) * 0.02) if cloud else None,
191 | "semantic": nn.Parameter(torch.randn(desc_dim) * 0.02) if semantic else None,
192 | "text": nn.Parameter(torch.randn(desc_dim) * 0.02) if text else None,
193 | }
194 | )
195 |
196 | transformer_encoder_layer = nn.TransformerEncoderLayer(
197 | d_model=desc_dim,
198 | nhead=n_heads,
199 | dim_feedforward=hidden_dim,
200 | dropout=dropout,
201 | activation=activation,
202 | batch_first=True,
203 | )
204 | self.transformer_encoder = nn.TransformerEncoder(
205 | transformer_encoder_layer, num_layers=n_layers, enable_nested_tensor=False
206 | )
207 |
208 |     def forward(self, data: dict[str, Tensor]) -> dict[str, Tensor]:
209 | descriptors = []
210 |
211 | for key in self.modalities:
212 | if self.use_modality_embeddings:
213 | descriptors.append(data[key] + self.modality_embeddings[key])
214 | else:
215 | descriptors.append(data[key])
216 |
217 | descriptors = torch.stack(descriptors, dim=1)
218 | # desc_lens = [d.shape[1] for d in descriptors]
219 | # descriptors = torch.cat(descriptors, dim=1)
220 | descriptors = torch.unbind(self.transformer_encoder(descriptors), dim=1)
221 | # descriptors = torch.split(self.transformer_encoder(descriptors), desc_lens, dim=1)
222 | out_dict = {}
223 | for i, key in enumerate(self.modalities):
224 | out_dict[key] = descriptors[i]
225 | out_dict["final_descriptor"] = torch.cat(descriptors, dim=-1)
226 | return out_dict
227 |
228 |
229 | class SelfAttentionModalityInteraction(nn.Module):
230 | def __init__(
231 | self,
232 | desc_dim: int = 256,
233 | image: bool = True,
234 | cloud: bool = True,
235 | semantic: bool = False,
236 | text: bool = False,
237 | use_modality_embeddings: bool = False,
238 | n_heads: int = 4,
239 | dropout: float = 0.0,
240 | ) -> None:
241 | super().__init__()
242 |
243 | self.use_modality_embeddings = use_modality_embeddings
244 |
245 | self.modalities = []
246 | if image:
247 | self.modalities.append("image")
248 | if cloud:
249 | self.modalities.append("cloud")
250 | if semantic:
251 | self.modalities.append("semantic")
252 | if text:
253 | self.modalities.append("text")
254 |
255 | if self.use_modality_embeddings:
256 | self.modality_embeddings = nn.ParameterDict(
257 | {
258 | "image": nn.Parameter(torch.randn(desc_dim) * 0.02) if image else None,
259 | "cloud": nn.Parameter(torch.randn(desc_dim) * 0.02) if cloud else None,
260 | "semantic": nn.Parameter(torch.randn(desc_dim) * 0.02) if semantic else None,
261 | "text": nn.Parameter(torch.randn(desc_dim) * 0.02) if text else None,
262 | }
263 | )
264 |
265 | self.self_attention = nn.MultiheadAttention(
266 | embed_dim=desc_dim, num_heads=n_heads, dropout=dropout, batch_first=True
267 | )
268 |
269 |     def forward(self, data: dict[str, Tensor]) -> dict[str, Tensor]:
270 | descriptors = []
271 |
272 | for key in self.modalities:
273 | if self.use_modality_embeddings:
274 | descriptors.append(data[key] + self.modality_embeddings[key])
275 | else:
276 | descriptors.append(data[key])
277 |
278 | # descriptors = torch.stack(descriptors, dim=1)
279 | # descriptors = torch.unbind(self.self_attention(descriptors, descriptors, descriptors)[0], dim=1)
280 | desc_lens = [d.shape[1] for d in descriptors]
281 | descriptors = torch.cat(descriptors, dim=1)
282 | descriptors = torch.split(
283 | self.self_attention(descriptors, descriptors, descriptors, need_weights=False)[0],
284 | desc_lens,
285 | dim=1,
286 | )
287 | out_dict = {}
288 | for i, key in enumerate(self.modalities):
289 | out_dict[key] = descriptors[i]
290 | # out_dict["final_descriptor"] = torch.cat(descriptors, dim=-1)
291 | return out_dict
292 |
293 |
294 | class CrossAttentionModalityInteraction(nn.Module):
295 | def __init__(
296 | self,
297 | desc_dim: int = 256,
298 | image: bool = True,
299 | cloud: bool = True,
300 | semantic: bool = False,
301 | text: bool = False,
302 | use_modality_embeddings: bool = False,
303 | n_heads: int = 4,
304 | dropout: float = 0.0,
305 | ) -> None:
306 | super().__init__()
307 |
308 | self.use_modality_embeddings = use_modality_embeddings
309 |
310 | self.modalities = []
311 | if image:
312 | self.modalities.append("image")
313 | if cloud:
314 | self.modalities.append("cloud")
315 | if semantic:
316 | self.modalities.append("semantic")
317 | if text:
318 | self.modalities.append("text")
319 |
320 | if self.use_modality_embeddings:
321 | self.modality_embeddings = nn.ParameterDict(
322 | {
323 | "image": nn.Parameter(torch.randn(desc_dim) * 0.02) if image else None,
324 | "cloud": nn.Parameter(torch.randn(desc_dim) * 0.02) if cloud else None,
325 | "semantic": nn.Parameter(torch.randn(desc_dim) * 0.02) if semantic else None,
326 | "text": nn.Parameter(torch.randn(desc_dim) * 0.02) if text else None,
327 | }
328 | )
329 |
330 | self.cross_attn_dict = nn.ModuleDict({})
331 | for key in self.modalities:
332 | self.cross_attn_dict[key] = nn.MultiheadAttention(
333 | embed_dim=desc_dim, num_heads=n_heads, dropout=dropout, batch_first=True
334 | )
335 |
336 | def forward(self, data: dict[str, Tensor]) -> dict[str, Tensor]:
337 | out_dict = {}
338 |
339 | for query_modality in self.modalities:
340 | query = data[query_modality].unsqueeze(1)
341 | if self.use_modality_embeddings:
342 |                 query = query + self.modality_embeddings[query_modality]  # out-of-place: do not mutate the input view
343 |
344 | # Prepare keys and values from other modalities
345 | keys = []
346 | for key_modality in self.modalities:
347 | if key_modality != query_modality:
348 | key_value = data[key_modality]
349 | if self.use_modality_embeddings:
350 |                         key_value = key_value + self.modality_embeddings[key_modality]  # out-of-place: keep data[...] intact
351 | keys.append(key_value)
352 | # Stack keys and values from all other modalities
353 | keys = values = torch.stack(keys, dim=1)
354 |
355 | # Apply cross-attention
356 | attn_output, _ = self.cross_attn_dict[query_modality](query=query, key=keys, value=values)
357 |             out_dict[query_modality] = attn_output.squeeze(1)  # (B, 1, D) -> (B, D)
358 |
359 |         out_dict["final_descriptor"] = torch.cat(list(out_dict.values()), dim=-1)
360 |
361 | return out_dict
362 |
--------------------------------------------------------------------------------
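
A quick shape check for `TransformerModalityInteraction` on dummy descriptors. Note that `MiddleFusionModel.forward` above currently re-projects only the image and cloud tokens after the interaction step; semantic tokens enter the transformer but are not decoded back, and the text branch is still a TODO. A minimal sketch, assuming (B, D) per-modality descriptors:

```python
import torch

from mssplace.modality_interaction_layers import TransformerModalityInteraction

# Dummy per-modality descriptors: batch of 4, descriptor dim 256.
data = {"image": torch.randn(4, 256), "cloud": torch.randn(4, 256)}

interaction = TransformerModalityInteraction(desc_dim=256, image=True, cloud=True)
out = interaction(data)
print(out["final_descriptor"].shape)  # torch.Size([4, 512]) -- both modalities concatenated
```
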
/src/mssplace/models.py:
--------------------------------------------------------------------------------
1 | """Models implementation."""
2 | from typing import Dict, Optional
3 |
4 | import torch
5 | from torch import Tensor, nn
6 | from opr.modules import Concat
7 | from opr.modules.gem import SeqGeM
8 |
9 |
10 | class GeMMultiFeatureMapsFusion(nn.Module):
11 | """GeM fusion module for multiple 2D feature maps."""
12 |
13 | def __init__(self, p: int = 3, eps: float = 1e-6) -> None:
14 | """Generalized-Mean fusion module.
15 |
16 | Args:
17 | p (int): Initial value of learnable parameter 'p', see paper for more details. Defaults to 3.
18 | eps (float): Negative values will be clamped to `eps` (ReLU). Defaults to 1e-6.
19 | """
20 | super().__init__()
21 | self.gem = SeqGeM(p=p, eps=eps)
22 |
23 | def forward(self, data: Dict[str, Tensor]) -> Tensor: # noqa: D102
24 | data = {key: value for key, value in data.items() if value is not None}
25 | features = list(data.values())
26 | features = [f.view(f.shape[0], f.shape[1], -1) for f in features]
27 | features = torch.cat(features, dim=-1)
28 | out = self.gem(features)
29 | if len(out.shape) == 1:
30 | out = out.unsqueeze(0)
31 | return out
32 |
33 |
34 | class TextModel(nn.Module):
35 | """Meta-model for text-based Place Recognition."""
36 |
37 | def __init__(
38 | self,
39 | model: nn.Module,
40 | fusion: Optional[nn.Module] = None,
41 | ) -> None:
42 | """Meta-model for text-based Place Recognition.
43 |
44 | Args:
45 | model (nn.Module): Text backbone.
46 | fusion (nn.Module, optional): Module to fuse descriptors for multiple texts in batch.
47 | Defaults to None.
48 | """
49 | super().__init__()
50 | self.model = model
51 | self.fusion = fusion
52 |
53 | def forward(self, batch: Dict[str, Tensor]) -> Dict[str, Tensor]: # noqa: D102
54 | text_descriptors = {}
55 | for key, value in batch.items():
56 | if key.startswith("texts_"):
57 | text_descriptors[key] = self.model(value)
58 | if len(text_descriptors) > 1:
59 | if self.fusion is None:
60 | raise ValueError("Fusion module is not defined but multiple texts are provided")
61 | descriptor = self.fusion(text_descriptors)
62 | else:
63 | if self.fusion is not None:
64 | raise ValueError("Fusion module is defined but only one text is provided")
65 | descriptor = list(text_descriptors.values())[0]
66 | out_dict: Dict[str, Tensor] = {"final_descriptor": descriptor}
67 | return out_dict
68 |
69 |
70 | class LateFusionModel(nn.Module):
71 | """Meta-model for multimodal Place Recognition architectures with late fusion."""
72 |
73 | def __init__(
74 | self,
75 | image_module: Optional[nn.Module] = None,
76 | semantic_module: Optional[nn.Module] = None,
77 | cloud_module: Optional[nn.Module] = None,
78 | text_module: Optional[nn.Module] = None,
79 | fusion_module: Optional[nn.Module] = None,
80 | ) -> None:
81 | """Meta-model for multimodal Place Recognition architectures with late fusion.
82 |
83 | Args:
84 | image_module (nn.Module, optional): Image modality branch. Defaults to None.
85 | semantic_module (nn.Module, optional): Semantic modality branch. Defaults to None.
86 | cloud_module (nn.Module, optional): Cloud modality branch. Defaults to None.
87 | text_module (nn.Module, optional): Text modality branch. Defaults to None.
88 | fusion_module (nn.Module, optional): Module to fuse different modalities.
89 | If None, will be set to opr.modules.Concat(). Defaults to None.
90 | """
91 | super().__init__()
92 |
93 | self.image_module = image_module
94 | self.semantic_module = semantic_module
95 | self.cloud_module = cloud_module
96 | self.text_module = text_module
97 | if fusion_module:
98 | self.fusion_module = fusion_module
99 | else:
100 | self.fusion_module = Concat()
101 |
102 | def forward(self, batch: Dict[str, Tensor]) -> Dict[str, Tensor]: # noqa: D102
103 | out_dict: Dict[str, Tensor] = {}
104 |
105 | if self.image_module is not None:
106 | out_dict["image"] = self.image_module(batch)["final_descriptor"]
107 |
108 | if self.semantic_module is not None:
109 | out_dict["semantic"] = self.semantic_module(batch)["final_descriptor"]
110 |
111 | if self.cloud_module is not None:
112 | out_dict["cloud"] = self.cloud_module(batch)["final_descriptor"]
113 |
114 | if self.text_module is not None:
115 | out_dict["text"] = self.text_module(batch)["final_descriptor"]
116 |
117 | out_dict["final_descriptor"] = self.fusion_module(out_dict)
118 |
119 | return out_dict
120 |
--------------------------------------------------------------------------------
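
A minimal sketch of the `TextModel` meta-model with a single text source, assuming 512-d CLIP ViT-B/32 text embeddings and a hypothetical MLP head (the real heads are instantiated from `configs/model/text*_*.yaml` via Hydra):

```python
import torch
from torch import nn

from mssplace.models import TextModel

# Hypothetical MLP head: 512-d CLIP-base text embedding -> 256-d place descriptor.
text_head = nn.Sequential(nn.Linear(512, 256), nn.ReLU(), nn.Linear(256, 256))
text_model = TextModel(model=text_head)  # single text source -> no fusion module needed

batch = {"texts_stereo_centre": torch.randn(8, 512)}
out = text_model(batch)
print(out["final_descriptor"].shape)  # torch.Size([8, 256])
```
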
/train_unimodal.py:
--------------------------------------------------------------------------------
1 | """Script to train a single-modal Place Recognition model."""
2 | import logging
3 | import pprint
4 | import sys
5 | from datetime import datetime
6 | from pathlib import Path
7 | from typing import Dict, Literal
8 |
9 | import hydra
10 | import wandb
11 | from hydra.utils import instantiate
12 | from loguru import logger
13 | from omegaconf import DictConfig, OmegaConf
14 | from torch.utils.data import DataLoader
15 |
16 | from opr.datasets.dataloader_factory import make_dataloaders
17 | from opr.trainers.place_recognition import UnimodalPlaceRecognitionTrainer
18 | from opr.utils import set_seed
19 |
20 | REPO_ROOT = Path(__file__).resolve().parent
21 |
22 | @hydra.main(config_path="configs", config_name="train_unimodal", version_base=None)
23 | def main(cfg: DictConfig) -> None:
24 | """Training code.
25 |
26 | Args:
27 | cfg (DictConfig): config to train with
28 | """
29 | config_dict = OmegaConf.to_container(cfg, resolve=True, throw_on_missing=True)
30 | logger.info(f"Config:\n{pprint.pformat(config_dict, compact=True)}")
31 |
32 | if not cfg.debug and not cfg.wandb.disabled:
33 |         # config_dict was already resolved above and is reused for wandb
34 | wandb.init(
35 | dir=hydra.core.hydra_config.HydraConfig.get().runtime.output_dir,
36 | name=cfg.exp_name,
37 | project=cfg.wandb.project,
38 | settings=wandb.Settings(start_method="thread"),
39 | config=config_dict,
40 | )
41 | logger.debug(f"Initialized wandb run with name: {wandb.run.name}")
42 |
43 | logger.info(f"Output directory: {hydra.core.hydra_config.HydraConfig.get().runtime.output_dir}")
44 | checkpoints_dir = Path(hydra.core.hydra_config.HydraConfig.get().runtime.output_dir) / "checkpoints"
45 | if not checkpoints_dir.exists():
46 | checkpoints_dir.mkdir(parents=True)
47 |
48 | set_seed(seed=cfg.seed, make_deterministic=False)
49 | logger.info(f"=> Seed: {cfg.seed}")
50 |
51 | logger.debug("=> Instantiating model...")
52 | model = instantiate(cfg.model)
53 |
54 | logger.debug("=> Instantiating loss...")
55 | loss_fn = instantiate(cfg.loss)
56 |
57 | logger.debug("=> Making dataloaders...")
58 | dataloaders: Dict[Literal["train", "val", "test"], DataLoader] = make_dataloaders(
59 | dataset_cfg=cfg.dataset,
60 | batch_sampler_cfg=cfg.sampler,
61 | num_workers=cfg.num_workers,
62 | )
63 |
64 | logger.debug("=> Instantiating optimizer...")
65 | optimizer = instantiate(cfg.optimizer, params=model.parameters())
66 | logger.debug("=> Instantiating scheduler...")
67 | scheduler = instantiate(cfg.scheduler, optimizer=optimizer)
68 |
69 | logger.debug("=> Instantiating trainer...")
70 | trainer = UnimodalPlaceRecognitionTrainer(
71 | checkpoints_dir=checkpoints_dir,
72 | model=model,
73 | loss_fn=loss_fn,
74 | optimizer=optimizer,
75 | scheduler=scheduler,
76 | batch_expansion_threshold=cfg.batch_expansion_threshold,
77 | wandb_log=(not cfg.debug and not cfg.wandb.disabled),
78 | device=cfg.device,
79 | )
80 |
81 | logger.info(f"=====> {trainer.__class__.__name__} is ready, starting training for {cfg.epochs} epochs.")
82 |
83 | trainer.train(
84 | epochs=cfg.epochs,
85 | train_dataloader=dataloaders["train"],
86 | val_dataloader=dataloaders["val"],
87 | test_dataloader=dataloaders["test"],
88 | )
89 |
90 |
91 | if __name__ == "__main__":
92 | run_dir = REPO_ROOT / "outputs" / (r"${exp_name}" + f"_{datetime.now().strftime('%Y-%m-%d-%H-%M-%S')}")
93 | sys.argv.append(f"hydra.run.dir={run_dir}")
94 | main()
95 |
--------------------------------------------------------------------------------
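
The training entry point reads Hydra overrides from the command line. A sketch of an equivalent programmatic launch; the override values are assumptions based on the `configs/` tree (e.g. `configs/dataset/oxford/camera1.yaml` and `configs/model/camera1.yaml`), not a documented API:

```python
import sys

from train_unimodal import main

# Assumed Hydra group overrides matching the configs/ layout; the __main__
# block above additionally appends hydra.run.dir before calling main().
sys.argv += [
    "dataset=oxford/camera1",
    "model=camera1",
    "exp_name=camera1_baseline",
]
main()
```
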