├── limoe ├── __init__.py ├── models │ ├── backbones │ │ ├── dinov2 │ │ │ ├── __init__.py │ │ │ ├── layers │ │ │ │ ├── __init__.py │ │ │ │ ├── layer_scale.py │ │ │ │ ├── drop_path.py │ │ │ │ ├── mlp.py │ │ │ │ ├── swiglu_ffn.py │ │ │ │ ├── dino_head.py │ │ │ │ ├── attention.py │ │ │ │ ├── patch_embed.py │ │ │ │ └── block.py │ │ │ └── dinov2_vision_transformer.py │ │ ├── __init__.py │ │ ├── vit.py │ │ ├── spvcnn.py │ │ └── minkunet.py │ ├── selfsup │ │ ├── __init__.py │ │ └── slidr.py │ ├── data_preprocessors │ │ ├── __init__.py │ │ └── data_preprocessor.py │ ├── seg_heads │ │ ├── __init__.py │ │ ├── upsample_head.py │ │ └── linear_head.py │ └── __init__.py └── datasets │ ├── __init__.py │ ├── transforms │ ├── __init__.py │ ├── formatting.py │ ├── loading.py │ └── transforms.py │ └── nuscenes_dataset.py ├── docs ├── figs │ ├── logo.png │ ├── paths.png │ ├── heatmaps.png │ ├── teaser.png │ ├── framework.png │ ├── activation1.png │ ├── activation2.png │ ├── qualitative1.png │ └── qualitative2.png ├── DATA_PREPAER.md ├── GET_STARTED.md └── INSTALL.md ├── .gitignore ├── configs ├── slidr │ ├── slidr_minkunet.py │ └── slidr_spvcnn.py └── _base_ │ ├── schedules │ └── pretrain.py │ ├── default_runtime.py │ ├── models │ ├── slidr_spvcnn.py │ └── slidr_minkunet.py │ └── datasets │ └── nuscenes_pretrain.py ├── setup.cfg ├── dist_train.sh ├── .pre-commit-config.yaml ├── train.py ├── LICENSE └── README.md /limoe/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /limoe/models/backbones/dinov2/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /docs/figs/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Xiangxu-0103/LiMoE/HEAD/docs/figs/logo.png -------------------------------------------------------------------------------- /docs/figs/paths.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Xiangxu-0103/LiMoE/HEAD/docs/figs/paths.png -------------------------------------------------------------------------------- /limoe/models/selfsup/__init__.py: -------------------------------------------------------------------------------- 1 | from .slidr import SLidR 2 | 3 | __all__ = ['SLidR'] 4 | -------------------------------------------------------------------------------- /docs/figs/heatmaps.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Xiangxu-0103/LiMoE/HEAD/docs/figs/heatmaps.png -------------------------------------------------------------------------------- /docs/figs/teaser.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Xiangxu-0103/LiMoE/HEAD/docs/figs/teaser.png -------------------------------------------------------------------------------- /docs/figs/framework.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Xiangxu-0103/LiMoE/HEAD/docs/figs/framework.png -------------------------------------------------------------------------------- /docs/figs/activation1.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Xiangxu-0103/LiMoE/HEAD/docs/figs/activation1.png -------------------------------------------------------------------------------- /docs/figs/activation2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Xiangxu-0103/LiMoE/HEAD/docs/figs/activation2.png -------------------------------------------------------------------------------- /docs/figs/qualitative1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Xiangxu-0103/LiMoE/HEAD/docs/figs/qualitative1.png -------------------------------------------------------------------------------- /docs/figs/qualitative2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Xiangxu-0103/LiMoE/HEAD/docs/figs/qualitative2.png -------------------------------------------------------------------------------- /limoe/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | from .nuscenes_dataset import NuScenesSegDataset 2 | 3 | __all__ = ['NuScenesSegDataset'] 4 | -------------------------------------------------------------------------------- /limoe/models/data_preprocessors/__init__.py: -------------------------------------------------------------------------------- 1 | from .data_preprocessor import LiMoEDataPreprocessor 2 | 3 | __all__ = ['LiMoEDataPreprocessor'] 4 | -------------------------------------------------------------------------------- /limoe/models/seg_heads/__init__.py: -------------------------------------------------------------------------------- 1 | from .linear_head import LinearHead 2 | from .upsample_head import UpsampleHead 3 | 4 | __all__ = ['UpsampleHead', 'LinearHead'] 5 | -------------------------------------------------------------------------------- /limoe/models/backbones/__init__.py: -------------------------------------------------------------------------------- 1 | from .minkunet import MinkUNetBackbone 2 | from .spvcnn import SPVCNNBackbone 3 | from .vit import ViT 4 | 5 | __all__ = ['ViT', 'MinkUNetBackbone', 'SPVCNNBackbone'] 6 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | 4 | # Pytorch 5 | *.pt 6 | *.pkl 7 | 8 | data/ 9 | output/ 10 | logs/ 11 | work_dirs/ 12 | dinov2_weights/ 13 | 14 | *.DS_Store 15 | -------------------------------------------------------------------------------- /configs/slidr/slidr_minkunet.py: -------------------------------------------------------------------------------- 1 | _base_ = [ 2 | '../_base_/datasets/nuscenes_pretrain.py', 3 | '../_base_/models/slidr_minkunet.py', '../_base_/schedules/pretrain.py', 4 | '../_base_/default_runtime.py' 5 | ] 6 | -------------------------------------------------------------------------------- /configs/slidr/slidr_spvcnn.py: -------------------------------------------------------------------------------- 1 | _base_ = [ 2 | '../_base_/datasets/nuscenes_pretrain.py', 3 | '../_base_/models/slidr_spvcnn.py', '../_base_/schedules/pretrain.py', 4 | '../_base_/default_runtime.py' 5 | ] 6 | -------------------------------------------------------------------------------- /limoe/models/__init__.py: -------------------------------------------------------------------------------- 1 | from 
.backbones import * # noqa: F401,F403 2 | from .data_preprocessors import * # noqa: F401,F403 3 | from .seg_heads import * # noqa: F401,F403 4 | from .selfsup import * # noqa: F401,F403 5 | -------------------------------------------------------------------------------- /limoe/datasets/transforms/__init__.py: -------------------------------------------------------------------------------- 1 | from .formatting import LiMoEInputs 2 | from .loading import LoadMultiModalityData 3 | from .transforms import FlipHorizontal, ResizedCrop 4 | 5 | __all__ = [ 6 | 'LoadMultiModalityData', 'ResizedCrop', 'FlipHorizontal', 'LiMoEInputs' 7 | ] 8 | -------------------------------------------------------------------------------- /limoe/models/backbones/dinov2/layers/__init__.py: -------------------------------------------------------------------------------- 1 | from .attention import MemEffAttention 2 | from .block import NestedTensorBlock 3 | from .dino_head import DINOHead 4 | from .mlp import Mlp 5 | from .patch_embed import PatchEmbed 6 | from .swiglu_ffn import SwiGLUFFN, SwiGLUFFNFused 7 | 8 | __all__ = [ 9 | 'DINOHead', 'Mlp', 'PatchEmbed', 'SwiGLUFFN', 'SwiGLUFFNFused', 10 | 'NestedTensorBlock', 'MemEffAttention' 11 | ] 12 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [yapf] 2 | BASED_ON_STYLE = pep8 3 | BLANK_LINE_BEFORE_NESTED_CLASS_OR_DEF = true 4 | SPLIT_BEFORE_EXPRESSION_AFTER_OPENING_PAREN = true 5 | 6 | [isort] 7 | line_length = 79 8 | multi_line_output = 0 9 | extra_standard_library = setuptools 10 | known_third_party = mmcv,mmengine,mmdet,mmdet3d,numpy,nuscenes 11 | no_lines_before = STDLIB,LOCALFOLDER 12 | default_section = THIRDPARTY 13 | 14 | [codespell] 15 | ignore-words-list = ans,refridgerator,crate,hist,formating,dout,wan,nd,fo,avod,AVOD,warmup 16 | -------------------------------------------------------------------------------- /dist_train.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | CONFIG=$1 4 | GPUS=$2 5 | NNODES=${NNODES:-1} 6 | NODE_RANK=${NODE_RANK:-0} 7 | PORT=${PORT:-29500} 8 | MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"} 9 | 10 | PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ 11 | python -m torch.distributed.launch \ 12 | --nnodes=$NNODES \ 13 | --node_rank=$NODE_RANK \ 14 | --master_addr=$MASTER_ADDR \ 15 | --nproc_per_node=$GPUS \ 16 | --master_port=$PORT \ 17 | $(dirname "$0")/train.py \ 18 | $CONFIG \ 19 | --launcher pytorch ${@:3} 20 | -------------------------------------------------------------------------------- /limoe/models/backbones/dinov2/layers/layer_scale.py: -------------------------------------------------------------------------------- 1 | from typing import Union 2 | 3 | import torch 4 | from torch import Tensor, nn 5 | 6 | 7 | class LayerScale(nn.Module): 8 | 9 | def __init__( 10 | self, 11 | dim: int, 12 | init_values: Union[float, Tensor] = 1e-5, 13 | inplace: bool = False, 14 | ) -> None: 15 | super().__init__() 16 | self.inplace = inplace 17 | self.gamma = nn.Parameter(init_values * torch.ones(dim)) 18 | 19 | def forward(self, x: Tensor) -> Tensor: 20 | return x.mul_(self.gamma) if self.inplace else x * self.gamma 21 | -------------------------------------------------------------------------------- /configs/_base_/schedules/pretrain.py: -------------------------------------------------------------------------------- 1 | lr = 0.01 2 | optim_wrapper = dict( 
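    # Pretraining optimizer: AdamW wrapped in MMEngine's OptimWrapper
    # (train.py switches the type to AmpOptimWrapper when --amp is passed).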
3 | type='OptimWrapper', 4 | optimizer=dict( 5 | type='AdamW', lr=lr, betas=(0.9, 0.999), weight_decay=0.01, eps=1e-6)) 6 | 7 | param_scheduler = [ 8 | dict( 9 | type='OneCycleLR', 10 | total_steps=100, 11 | by_epoch=True, 12 | eta_max=lr, 13 | pct_start=0.2, 14 | div_factor=25.0, 15 | final_div_factor=100.0, 16 | convert_to_iter_based=True) 17 | ] 18 | train_cfg = dict(by_epoch=True, max_epochs=100, val_interval=100) 19 | 20 | auto_scale_lr = dict(enable=False, base_batch_size=32) 21 | -------------------------------------------------------------------------------- /docs/DATA_PREPAER.md: -------------------------------------------------------------------------------- 1 | # Data Preparation 2 | 3 | ## Overall Structure 4 | 5 | ``` 6 | LiMoE 7 | ├── data 8 | │ ├── nuscenes 9 | │ │ ├── lidarseg 10 | │ │ ├── maps 11 | │ │ ├── samples 12 | │ │ ├── sweeps 13 | │ │ ├── v1.0-test 14 | | | ├── v1.0-trainval 15 | | | ├── superflow_nus_info.pkl 16 | | | ├── downstream_nuscenes_infos_train_1.pkl 17 | | | ├── downstream_nuscenes_infos_train_5.pkl 18 | | | ├── downstream_nuscenes_infos_train_10.pkl 19 | | | ├── downstream_nuscenes_infos_train_25.pkl 20 | | | ├── downstream_nuscenes_infos_train_100.pkl 21 | | | ├── downstream_nuscenes_infos_val.pkl 22 | │ ├── openseed_inst17 23 | ``` 24 | 25 | The `pkl` files and `superpixels` can be downloaded from [huggingface](https://huggingface.co/datasets/Xiangxu-0103/SuperFlow_SuperPixel). 26 | -------------------------------------------------------------------------------- /configs/_base_/default_runtime.py: -------------------------------------------------------------------------------- 1 | default_scope = 'mmdet3d' 2 | 3 | custom_imports = dict( 4 | imports=['limoe.datasets', 'limoe.datasets.transforms', 'limoe.models'], 5 | allow_failed_imports=False) 6 | 7 | default_hooks = dict( 8 | timer=dict(type='IterTimerHook'), 9 | logger=dict(type='LoggerHook', interval=50), 10 | param_scheduler=dict(type='ParamSchedulerHook'), 11 | checkpoint=dict(type='CheckpointHook', interval=1, max_keep_ckpts=1), 12 | sampler_seed=dict(type='DistSamplerSeedHook'), 13 | visualization=dict(type='Det3DVisualizationHook')) 14 | 15 | env_cfg = dict( 16 | cudnn_benchmark=False, 17 | mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), 18 | dist_cfg=dict(backend='nccl')) 19 | 20 | log_processor = dict(type='LogProcessor', window_size=50, by_epoch=True) 21 | 22 | log_level = 'INFO' 23 | load_from = None 24 | resume = False 25 | -------------------------------------------------------------------------------- /docs/GET_STARTED.md: -------------------------------------------------------------------------------- 1 | # Getting Started 2 | 3 | Before training, you may encounter an error `MMCV=={xxx} is used but incompatible. Please install mmcv>={xxx}, <{xxx}`. 
We suggest modifying `__init__.py` under the `mmdet` and `mmdet3d` packages as follows: 4 | 5 | ```python 6 | mmcv_maximum_version = '3.0.0' 7 | ``` 8 | 9 | In addition, you should modify `Lines 123-124` in `mmdet3d/datasets/seg3d_dataset.py` as follows: 10 | 11 | ```python 12 | if scene_idxs is not None: 13 | self.scene_idxs = self.get_scene_idxs(scene_idxs) 14 | self.data_list = [self.data_list[i] for i in self.scene_idxs] 15 | ``` 16 | 17 | ## Train with a single GPU 18 | 19 | ```bash 20 | python train.py ${CONFIG_FILE} 21 | ``` 22 | 23 | ## Train with multiple GPUs 24 | 25 | ```bash 26 | bash dist_train.sh ${CONFIG_FILE} ${GPU_NUM} 27 | ``` 28 | 29 | **Note**: For the pretraining phase, we suggest using 8 GPUs, and 4 GPUs for downstream tasks. 30 | -------------------------------------------------------------------------------- /limoe/models/backbones/dinov2/layers/drop_path.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | 3 | 4 | def drop_path(x, drop_prob: float = 0.0, training: bool = False): 5 | if drop_prob == 0.0 or not training: 6 | return x 7 | keep_prob = 1 - drop_prob 8 | shape = (x.shape[0], ) + (1, ) * ( 9 | x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets 10 | random_tensor = x.new_empty(shape).bernoulli_(keep_prob) 11 | if keep_prob > 0.0: 12 | random_tensor.div_(keep_prob) 13 | output = x * random_tensor 14 | return output 15 | 16 | 17 | class DropPath(nn.Module): 18 | """Drop paths (Stochastic Depth) per sample (when applied in main path of 19 | residual blocks).""" 20 | 21 | def __init__(self, drop_prob=None): 22 | super(DropPath, self).__init__() 23 | self.drop_prob = drop_prob 24 | 25 | def forward(self, x): 26 | return drop_path(x, self.drop_prob, self.training) 27 | -------------------------------------------------------------------------------- /limoe/models/seg_heads/upsample_head.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | from mmdet3d.registry import MODELS 3 | from mmdet3d.utils import OptMultiConfig 4 | from mmengine.model import BaseModule 5 | from torch import Tensor 6 | 7 | 8 | @MODELS.register_module() 9 | class UpsampleHead(BaseModule): 10 | 11 | def __init__(self, 12 | in_channels: int, 13 | out_channels: int, 14 | scale_factor: int, 15 | mode: str = 'bilinear', 16 | align_corners: bool = True, 17 | init_cfg: OptMultiConfig = None) -> None: 18 | super(UpsampleHead, self).__init__(init_cfg=init_cfg) 19 | self.head = nn.Sequential( 20 | nn.Conv2d(in_channels, out_channels, kernel_size=1), 21 | nn.Upsample( 22 | scale_factor=scale_factor, 23 | mode=mode, 24 | align_corners=align_corners)) 25 | 26 | def forward(self, x: Tensor) -> Tensor: 27 | return self.head(x) 28 | -------------------------------------------------------------------------------- /configs/_base_/models/slidr_spvcnn.py: -------------------------------------------------------------------------------- 1 | model = dict( 2 | type='SLidR', 3 | data_preprocessor=dict( 4 | type='LiMoEDataPreprocessor', 5 | H=32, 6 | W=480, 7 | fov_up=10.0, 8 | fov_down=-30.0, 9 | ignore_index=16, 10 | voxel_size=[0.1, 1, 0.1], 11 | voxel_type='cylinder', 12 | mean=[0.485, 0.456, 0.406], 13 | std=[0.229, 0.224, 0.225]), 14 | backbone_3d=dict( 15 | type='SPVCNNBackbone', 16 | in_channels=4, 17 | base_channels=32, 18 | layers=[2, 3, 4, 6, 2, 2, 2, 2], 19 | planes=[32, 64, 128, 256, 256, 128, 96, 96], 20 | block_type='basic', 21 | bn_momentum=0.05), 22 | head_3d=dict( 23 |
type='LinearHead', channels=96, num_classes=64, dropout_ratio=0), 24 | backbone_2d=dict(type='ViT', images_encoder='dinov2_vit_base_p14'), 25 | head_2d=dict( 26 | type='UpsampleHead', in_channels=768, out_channels=64, 27 | scale_factor=14), 28 | superpixel_size=150, 29 | temperature=0.07) 30 | -------------------------------------------------------------------------------- /configs/_base_/models/slidr_minkunet.py: -------------------------------------------------------------------------------- 1 | model = dict( 2 | type='SLidR', 3 | data_preprocessor=dict( 4 | type='LiMoEDataPreprocessor', 5 | H=32, 6 | W=480, 7 | fov_up=10.0, 8 | fov_down=-30.0, 9 | ignore_index=16, 10 | voxel_size=[0.1, 1, 0.1], 11 | voxel_type='cylinder', 12 | mean=[0.485, 0.456, 0.406], 13 | std=[0.229, 0.224, 0.225]), 14 | backbone_3d=dict( 15 | type='MinkUNetBackbone', 16 | in_channels=4, 17 | base_channels=32, 18 | layers=[2, 3, 4, 6, 2, 2, 2, 2], 19 | planes=[32, 64, 128, 256, 256, 128, 96, 96], 20 | block_type='basic', 21 | bn_momentum=0.05), 22 | head_3d=dict( 23 | type='LinearHead', channels=96, num_classes=64, dropout_ratio=0), 24 | backbone_2d=dict(type='ViT', images_encoder='dinov2_vit_base_p14'), 25 | head_2d=dict( 26 | type='UpsampleHead', in_channels=768, out_channels=64, 27 | scale_factor=14), 28 | superpixel_size=150, 29 | temperature=0.07) 30 | -------------------------------------------------------------------------------- /limoe/models/backbones/dinov2/layers/mlp.py: -------------------------------------------------------------------------------- 1 | from typing import Callable, Optional 2 | 3 | from torch import Tensor, nn 4 | 5 | 6 | class Mlp(nn.Module): 7 | 8 | def __init__( 9 | self, 10 | in_features: int, 11 | hidden_features: Optional[int] = None, 12 | out_features: Optional[int] = None, 13 | act_layer: Callable[..., nn.Module] = nn.GELU, 14 | drop: float = 0.0, 15 | bias: bool = True, 16 | ) -> None: 17 | super().__init__() 18 | out_features = out_features or in_features 19 | hidden_features = hidden_features or in_features 20 | self.fc1 = nn.Linear(in_features, hidden_features, bias=bias) 21 | self.act = act_layer() 22 | self.fc2 = nn.Linear(hidden_features, out_features, bias=bias) 23 | self.drop = nn.Dropout(drop) 24 | 25 | def forward(self, x: Tensor) -> Tensor: 26 | x = self.fc1(x) 27 | x = self.act(x) 28 | x = self.drop(x) 29 | x = self.fc2(x) 30 | x = self.drop(x) 31 | return x 32 | -------------------------------------------------------------------------------- /docs/INSTALL.md: -------------------------------------------------------------------------------- 1 | # Installation 2 | 3 | ## Prerequisites 4 | 5 | This codebase is tested with `torch==1.12.1`, `mmengine==0.10.4`, `mmcv==2.2.0`, `mmdet==3.3.0`, and `mmdet3d==1.4.0`, with `CUDA 11.3`. 6 | 7 | **Step 1.** Create a conda environment and activate it. 8 | 9 | ```bash 10 | conda create --name limoe python==3.8 -y 11 | conda activate limoe 12 | ``` 13 | 14 | **Step 2.** Install PyTorch following [official instructions](https://pytorch.org/get-started/locally/). 15 | 16 | ```bash 17 | conda install pytorch torchvision -c pytorch 18 | ``` 19 | 20 | **Step 3.** Install [MMEngine](https://github.com/open-mmlab/mmengine), [MMCV](https://github.com/open-mmlab/mmcv), [MMDetection](https://github.com/open-mmlab/mmdetection), and [MMDetection3D](https://github.com/open-mmlab/mmdetection3d) using [MIM](https://github.com/open-mmlab/mim).
21 | 22 | ```bash 23 | pip install -U openmim 24 | mim install mmengine 25 | mim install mmcv 26 | mim install mmdet 27 | mim install mmdet3d 28 | ``` 29 | 30 | Optionally, you can also install the above projects from the source, e.g.: 31 | 32 | ```bash 33 | git clone https://github.com/open-mmlab/mmdetection3d 34 | cd mmdetection3d 35 | pip install -v -e . 36 | ``` 37 | 38 | Meanwhile, you also need to install [`nuScenes-devkit`](https://github.com/nutonomy/nuscenes-devkit). 39 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/PyCQA/flake8 3 | rev: 5.0.4 4 | hooks: 5 | - id: flake8 6 | - repo: https://github.com/PyCQA/isort 7 | rev: 5.11.5 8 | hooks: 9 | - id: isort 10 | - repo: https://github.com/pre-commit/mirrors-yapf 11 | rev: v0.32.0 12 | hooks: 13 | - id: yapf 14 | - repo: https://github.com/pre-commit/pre-commit-hooks 15 | rev: v4.3.0 16 | hooks: 17 | - id: trailing-whitespace 18 | - id: check-yaml 19 | - id: end-of-file-fixer 20 | - id: requirements-txt-fixer 21 | - id: double-quote-string-fixer 22 | - id: check-merge-conflict 23 | - id: fix-encoding-pragma 24 | args: ["--remove"] 25 | - id: mixed-line-ending 26 | args: ["--fix=lf"] 27 | - repo: https://github.com/codespell-project/codespell 28 | rev: v2.2.1 29 | hooks: 30 | - id: codespell 31 | - repo: https://github.com/executablebooks/mdformat 32 | rev: 0.7.9 33 | hooks: 34 | - id: mdformat 35 | args: [ "--number" ] 36 | additional_dependencies: 37 | - mdformat-openmmlab 38 | - mdformat_frontmatter 39 | - linkify-it-py 40 | - repo: https://github.com/myint/docformatter 41 | rev: v1.3.1 42 | hooks: 43 | - id: docformatter 44 | args: ["--in-place", "--wrap-descriptions", "79"] 45 | -------------------------------------------------------------------------------- /limoe/datasets/transforms/formatting.py: -------------------------------------------------------------------------------- 1 | from typing import Sequence 2 | 3 | import torch 4 | from mmcv.transforms import BaseTransform 5 | from mmdet3d.registry import TRANSFORMS 6 | from mmdet3d.structures import Det3DDataSample, PointData 7 | from mmdet3d.structures.points import BasePoints 8 | 9 | 10 | @TRANSFORMS.register_module() 11 | class LiMoEInputs(BaseTransform): 12 | 13 | def __init__(self, keys: Sequence[str] = None): 14 | self.keys = keys 15 | 16 | def transform(self, results: dict) -> dict: 17 | if 'points' in results: 18 | if isinstance(results['points'], BasePoints): 19 | results['points'] = results['points'].tensor 20 | 21 | if 'pairing_points' in results: 22 | results['pairing_points'] = torch.tensor(results['pairing_points']) 23 | 24 | if 'pairing_images' in results: 25 | results['pairing_images'] = torch.tensor(results['pairing_images']) 26 | 27 | data_sample = Det3DDataSample() 28 | gt_pts_seg = PointData() 29 | 30 | inputs = {} 31 | for key in self.keys: 32 | if key in ('points', 'imgs'): 33 | inputs[key] = results[key] 34 | elif key in ('pairing_points', 'pairing_images', 'superpixels', 35 | 'pts_semantic_mask'): 36 | gt_pts_seg[key] = results[key] 37 | 38 | data_sample.gt_pts_seg = gt_pts_seg 39 | 40 | packed_results = dict() 41 | packed_results['data_samples'] = data_sample 42 | packed_results['inputs'] = inputs 43 | 44 | return packed_results 45 | -------------------------------------------------------------------------------- /limoe/models/backbones/dinov2/layers/swiglu_ffn.py: 
-------------------------------------------------------------------------------- 1 | from typing import Callable, Optional 2 | 3 | import torch.nn.functional as F 4 | from torch import Tensor, nn 5 | 6 | 7 | class SwiGLUFFN(nn.Module): 8 | 9 | def __init__( 10 | self, 11 | in_features: int, 12 | hidden_features: Optional[int] = None, 13 | out_features: Optional[int] = None, 14 | act_layer: Callable[..., nn.Module] = None, 15 | drop: float = 0.0, 16 | bias: bool = True, 17 | ) -> None: 18 | super().__init__() 19 | out_features = out_features or in_features 20 | hidden_features = hidden_features or in_features 21 | self.w12 = nn.Linear(in_features, 2 * hidden_features, bias=bias) 22 | self.w3 = nn.Linear(hidden_features, out_features, bias=bias) 23 | 24 | def forward(self, x: Tensor) -> Tensor: 25 | x12 = self.w12(x) 26 | x1, x2 = x12.chunk(2, dim=-1) 27 | hidden = F.silu(x1) * x2 28 | return self.w3(hidden) 29 | 30 | 31 | try: 32 | from xformers.ops import SwiGLU 33 | 34 | XFORMERS_AVAILABLE = True 35 | except ImportError: 36 | SwiGLU = SwiGLUFFN 37 | XFORMERS_AVAILABLE = False 38 | 39 | 40 | class SwiGLUFFNFused(SwiGLU): 41 | 42 | def __init__( 43 | self, 44 | in_features: int, 45 | hidden_features: Optional[int] = None, 46 | out_features: Optional[int] = None, 47 | act_layer: Callable[..., nn.Module] = None, 48 | drop: float = 0.0, 49 | bias: bool = True, 50 | ) -> None: 51 | out_features = out_features or in_features 52 | hidden_features = hidden_features or in_features 53 | hidden_features = (int(hidden_features * 2 / 3) + 7) // 8 * 8 54 | super().__init__( 55 | in_features=in_features, 56 | hidden_features=hidden_features, 57 | out_features=out_features, 58 | bias=bias, 59 | ) 60 | -------------------------------------------------------------------------------- /limoe/models/backbones/vit.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from mmdet3d.registry import MODELS 4 | from torch import Tensor 5 | 6 | from .dinov2 import dinov2_vision_transformer as dinov2_vit 7 | 8 | DINOv2_MODELS = { 9 | 'dinov2_vit_small_p14': ('dinov2_vits14', 14, 384), 10 | 'dinov2_vit_base_p14': ('dinov2_vitb14', 14, 768), 11 | 'dinov2_vit_large_p14': ('dinov2_vitl14', 14, 1024) 12 | } 13 | 14 | 15 | @MODELS.register_module() 16 | class ViT(nn.Module): 17 | 18 | def __init__(self, 19 | images_encoder: str, 20 | feat: str = 'x_pre_norm', 21 | height: int = 224, 22 | width: int = 448) -> None: 23 | super(ViT, self).__init__() 24 | 25 | # ViT parameters 26 | model_name, patch_size, embed_dim = DINOv2_MODELS.get(images_encoder) 27 | self.patch_size = patch_size 28 | self.embed_dim = embed_dim 29 | self.which_feature = feat 30 | 31 | # Compute feature size 32 | assert (height % self.patch_size) == 0 33 | assert (width % self.patch_size) == 0 34 | self.f_height = height // self.patch_size 35 | self.f_width = width // self.patch_size 36 | 37 | # Load ViT 38 | self.encoder = dinov2_vit.__dict__[model_name]( 39 | patch_size=patch_size, pretrained=True) 40 | 41 | # Teacher must stay frozen 42 | for param in self.encoder.parameters(): 43 | param.requires_grad = False 44 | self.encoder.eval() 45 | 46 | def forward(self, x: Tensor) -> Tensor: 47 | 48 | # Go through frozen encoder 49 | with torch.no_grad(): 50 | batch_size = x.shape[0] 51 | 52 | output = self.encoder.forward_get_last_n(x) 53 | feat = output[self.which_feature] 54 | x = torch.cat(feat, dim=2) 55 | 56 | # Remove the CLS token and reshape the patch token features. 
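        # Tokens are (batch_size, 1 + f_height * f_width, embed_dim); index 0
        # is the CLS token, so the slice below drops it and the transpose +
        # view recovers a (batch_size, embed_dim, f_height, f_width) map.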
57 | x = ( 58 | x[:, 1:, :].transpose(1, 2).view(batch_size, self.embed_dim, 59 | self.f_height, self.f_width)) 60 | 61 | return x 62 | -------------------------------------------------------------------------------- /limoe/models/backbones/dinov2/layers/dino_head.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torch.nn.init import trunc_normal_ 4 | from torch.nn.utils import weight_norm 5 | 6 | 7 | class DINOHead(nn.Module): 8 | 9 | def __init__( 10 | self, 11 | in_dim, 12 | out_dim, 13 | use_bn=False, 14 | nlayers=3, 15 | hidden_dim=2048, 16 | bottleneck_dim=256, 17 | mlp_bias=True, 18 | ): 19 | super().__init__() 20 | nlayers = max(nlayers, 1) 21 | self.mlp = _build_mlp( 22 | nlayers, 23 | in_dim, 24 | bottleneck_dim, 25 | hidden_dim=hidden_dim, 26 | use_bn=use_bn, 27 | bias=mlp_bias, 28 | ) 29 | self.apply(self._init_weights) 30 | self.last_layer = weight_norm( 31 | nn.Linear(bottleneck_dim, out_dim, bias=False)) 32 | self.last_layer.weight_g.data.fill_(1) 33 | 34 | def _init_weights(self, m): 35 | if isinstance(m, nn.Linear): 36 | trunc_normal_(m.weight, std=0.02) 37 | if isinstance(m, nn.Linear) and m.bias is not None: 38 | nn.init.constant_(m.bias, 0) 39 | 40 | def forward(self, x): 41 | x = self.mlp(x) 42 | eps = 1e-6 if x.dtype == torch.float16 else 1e-12 43 | x = nn.functional.normalize(x, dim=-1, p=2, eps=eps) 44 | x = self.last_layer(x) 45 | return x 46 | 47 | 48 | def _build_mlp(nlayers, 49 | in_dim, 50 | bottleneck_dim, 51 | hidden_dim=None, 52 | use_bn=False, 53 | bias=True): 54 | if nlayers == 1: 55 | return nn.Linear(in_dim, bottleneck_dim, bias=bias) 56 | else: 57 | layers = [nn.Linear(in_dim, hidden_dim, bias=bias)] 58 | if use_bn: 59 | layers.append(nn.BatchNorm1d(hidden_dim)) 60 | layers.append(nn.GELU()) 61 | for _ in range(nlayers - 2): 62 | layers.append(nn.Linear(hidden_dim, hidden_dim, bias=bias)) 63 | if use_bn: 64 | layers.append(nn.BatchNorm1d(hidden_dim)) 65 | layers.append(nn.GELU()) 66 | layers.append(nn.Linear(hidden_dim, bottleneck_dim, bias=bias)) 67 | return nn.Sequential(*layers) 68 | -------------------------------------------------------------------------------- /limoe/models/backbones/dinov2/layers/attention.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from torch import Tensor, nn 4 | 5 | logger = logging.getLogger('dinov2') 6 | 7 | try: 8 | from xformers.ops import memory_efficient_attention, unbind 9 | 10 | XFORMERS_AVAILABLE = True 11 | except ImportError: 12 | logger.warning('xFormers not available') 13 | XFORMERS_AVAILABLE = False 14 | 15 | 16 | class Attention(nn.Module): 17 | 18 | def __init__( 19 | self, 20 | dim: int, 21 | num_heads: int = 8, 22 | qkv_bias: bool = False, 23 | proj_bias: bool = True, 24 | attn_drop: float = 0.0, 25 | proj_drop: float = 0.0, 26 | ) -> None: 27 | super().__init__() 28 | self.num_heads = num_heads 29 | head_dim = dim // num_heads 30 | self.scale = head_dim**-0.5 31 | 32 | self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) 33 | self.attn_drop = nn.Dropout(attn_drop) 34 | self.proj = nn.Linear(dim, dim, bias=proj_bias) 35 | self.proj_drop = nn.Dropout(proj_drop) 36 | 37 | def forward(self, x: Tensor) -> Tensor: 38 | B, N, C = x.shape 39 | qkv = ( 40 | self.qkv(x).reshape(B, N, 3, self.num_heads, 41 | C // self.num_heads).permute(2, 0, 3, 1, 4)) 42 | 43 | q, k, v = qkv[0] * self.scale, qkv[1], qkv[2] 44 | attn = q @ k.transpose(-2, -1) 45 | 46 | attn = 
attn.softmax(dim=-1) 47 | attn = self.attn_drop(attn) 48 | 49 | x = (attn @ v).transpose(1, 2).reshape(B, N, C) 50 | x = self.proj(x) 51 | x = self.proj_drop(x) 52 | return x 53 | 54 | 55 | class MemEffAttention(Attention): 56 | 57 | def forward(self, x: Tensor, attn_bias=None) -> Tensor: 58 | if not XFORMERS_AVAILABLE: 59 | assert attn_bias is None, \ 60 | 'xFormers is required for nested tensors usage' 61 | return super().forward(x) 62 | 63 | B, N, C = x.shape 64 | qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads) 65 | 66 | q, k, v = unbind(qkv, 2) 67 | 68 | x = memory_efficient_attention(q, k, v, attn_bias=attn_bias) 69 | x = x.reshape([B, N, C]) 70 | 71 | x = self.proj(x) 72 | x = self.proj_drop(x) 73 | return x 74 | -------------------------------------------------------------------------------- /limoe/models/backbones/dinov2/layers/patch_embed.py: -------------------------------------------------------------------------------- 1 | from typing import Callable, Optional, Tuple, Union 2 | 3 | import torch.nn as nn 4 | from torch import Tensor 5 | 6 | 7 | def make_2tuple(x): 8 | if isinstance(x, tuple): 9 | assert len(x) == 2 10 | return x 11 | 12 | assert isinstance(x, int) 13 | return (x, x) 14 | 15 | 16 | class PatchEmbed(nn.Module): 17 | """2D image to patch embedding: (B,C,H,W) -> (B,N,D) 18 | 19 | Args: 20 | img_size: Image size. 21 | patch_size: Patch token size. 22 | in_chans: Number of input image channels. 23 | embed_dim: Number of linear projection output channels. 24 | norm_layer: Normalization layer. 25 | """ 26 | 27 | def __init__( 28 | self, 29 | img_size: Union[int, Tuple[int, int]] = 224, 30 | patch_size: Union[int, Tuple[int, int]] = 16, 31 | in_chans: int = 3, 32 | embed_dim: int = 768, 33 | norm_layer: Optional[Callable] = None, 34 | flatten_embedding: bool = True, 35 | ) -> None: 36 | super().__init__() 37 | 38 | image_HW = make_2tuple(img_size) 39 | patch_HW = make_2tuple(patch_size) 40 | patch_grid_size = ( 41 | image_HW[0] // patch_HW[0], 42 | image_HW[1] // patch_HW[1], 43 | ) 44 | 45 | self.img_size = image_HW 46 | self.patch_size = patch_HW 47 | self.patches_resolution = patch_grid_size 48 | self.num_patches = patch_grid_size[0] * patch_grid_size[1] 49 | 50 | self.in_chans = in_chans 51 | self.embed_dim = embed_dim 52 | 53 | self.flatten_embedding = flatten_embedding 54 | 55 | self.proj = nn.Conv2d( 56 | in_chans, embed_dim, kernel_size=patch_HW, stride=patch_HW) 57 | self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity() 58 | 59 | def forward(self, x: Tensor) -> Tensor: 60 | _, _, H, W = x.shape 61 | patch_H, patch_W = self.patch_size 62 | 63 | assert ( 64 | H % patch_H == 0 65 | ), f'Input image height {H} is not a multiple of patch height {patch_H}' # noqa: E501 66 | assert ( 67 | W % patch_W == 0 68 | ), f'Input image width {W} is not a multiple of patch width: {patch_W}' 69 | 70 | x = self.proj(x) # B C H W 71 | H, W = x.size(2), x.size(3) 72 | x = x.flatten(2).transpose(1, 2) # B HW C 73 | x = self.norm(x) 74 | if not self.flatten_embedding: 75 | x = x.reshape(-1, H, W, self.embed_dim) # B H W C 76 | return x 77 | 78 | def flops(self) -> float: 79 | Ho, Wo = self.patches_resolution 80 | flops = ( 81 | Ho * Wo * self.embed_dim * self.in_chans * 82 | (self.patch_size[0] * self.patch_size[1])) 83 | if self.norm is not None: 84 | flops += Ho * Wo * self.embed_dim 85 | return flops 86 | -------------------------------------------------------------------------------- /limoe/models/seg_heads/linear_head.py: 
-------------------------------------------------------------------------------- 1 | from typing import Dict, List 2 | 3 | import torch 4 | import torch.nn as nn 5 | from mmdet3d.models import Base3DDecodeHead 6 | from mmdet3d.registry import MODELS 7 | from mmdet3d.structures.det3d_data_sample import SampleList 8 | from mmdet3d.utils import ConfigType, OptConfigType 9 | from torch import Tensor 10 | 11 | 12 | @MODELS.register_module() 13 | class LinearHead(Base3DDecodeHead): 14 | 15 | def __init__(self, loss_lovasz: OptConfigType = None, **kwargs) -> None: 16 | super(LinearHead, self).__init__(**kwargs) 17 | 18 | if loss_lovasz is not None: 19 | self.loss_lovasz = MODELS.build(loss_lovasz) 20 | else: 21 | self.loss_lovasz = None 22 | 23 | def build_conv_seg(self, channels: int, num_classes: int, 24 | kernel_size: int) -> nn.Module: 25 | return nn.Linear(channels, num_classes) 26 | 27 | def forward(self, feat_dict: dict) -> dict: 28 | logits = self.cls_seg(feat_dict['voxel_feats']) 29 | feat_dict['logits'] = logits 30 | return feat_dict 31 | 32 | def loss_by_feat(self, feat_dict: dict, 33 | batch_data_samples: SampleList) -> Dict[str, Tensor]: 34 | voxel_semantic_segs = [] 35 | voxel_inds = feat_dict['voxel_inds'] 36 | for batch_idx, data_sample in enumerate(batch_data_samples): 37 | pts_semantic_mask = data_sample.gt_pts_seg.pts_semantic_mask 38 | voxel_semantic_mask = pts_semantic_mask[voxel_inds[batch_idx]] 39 | voxel_semantic_segs.append(voxel_semantic_mask) 40 | seg_label = torch.cat(voxel_semantic_segs) 41 | seg_logit_feat = feat_dict['logits'] 42 | loss = dict() 43 | loss['loss_ce'] = self.loss_decode( 44 | seg_logit_feat, seg_label, ignore_index=self.ignore_index) 45 | if self.loss_lovasz is not None: 46 | loss['loss_lovasz'] = self.loss_lovasz( 47 | seg_logit_feat, seg_label, ignore_index=self.ignore_index) 48 | return loss 49 | 50 | def predict(self, feat_dict: dict, batch_input_metas: List[dict], 51 | test_cfg: ConfigType) -> List[Tensor]: 52 | feat_dict = self.forward(feat_dict) 53 | seg_pred_list = self.predict_by_feat(feat_dict, batch_input_metas) 54 | return seg_pred_list 55 | 56 | def predict_by_feat(self, feat_dict: dict, 57 | batch_input_metas: List[dict]) -> List[Tensor]: 58 | seg_logits = feat_dict['logits'] 59 | 60 | seg_pred_list = [] 61 | coors = feat_dict['coors'] 62 | for batch_idx in range(len(batch_input_metas)): 63 | batch_mask = coors[:, -1] == batch_idx 64 | seg_logits_sample = seg_logits[batch_mask] 65 | point2voxel_map = feat_dict['point2voxel_maps'][batch_idx].long() 66 | point_seg_predicts = seg_logits_sample[point2voxel_map] 67 | seg_pred_list.append(point_seg_predicts) 68 | 69 | return seg_pred_list 70 | -------------------------------------------------------------------------------- /configs/_base_/datasets/nuscenes_pretrain.py: -------------------------------------------------------------------------------- 1 | dataset_type = 'NuScenesSegDataset' 2 | data_root = 'data/nuscenes/' 3 | class_names = [ 4 | 'barrier', 'bicycle', 'bus', 'car', 'construction_vehicle', 'motorcycle', 5 | 'pedestrian', 'traffic_cone', 'trailer', 'truck', 'driveable_surface', 6 | 'other_flat', 'sidewalk', 'terrain', 'manmade', 'vegetation' 7 | ] 8 | labels_map = { 9 | 0: 16, 10 | 1: 16, 11 | 2: 6, 12 | 3: 6, 13 | 4: 6, 14 | 5: 16, 15 | 6: 6, 16 | 7: 16, 17 | 8: 16, 18 | 9: 0, 19 | 10: 16, 20 | 11: 16, 21 | 12: 7, 22 | 13: 16, 23 | 14: 1, 24 | 15: 2, 25 | 16: 2, 26 | 17: 3, 27 | 18: 4, 28 | 19: 16, 29 | 20: 16, 30 | 21: 5, 31 | 22: 8, 32 | 23: 9, 33 | 24: 10, 34 | 25: 11, 35 | 26: 
12, 36 | 27: 13, 37 | 28: 14, 38 | 29: 16, 39 | 30: 15, 40 | 31: 16 41 | } 42 | 43 | metainfo = dict( 44 | classes=class_names, seg_label_mapping=labels_map, max_label=31) 45 | input_modality = dict(use_lidar=True, use_camera=True) 46 | 47 | data_prefix = dict( 48 | pts='samples/LIDAR_TOP', 49 | CAM_FRONT='samples/CAM_FRONT', 50 | CAM_FRONT_LEFT='samples/CAM_FRONT_LEFT', 51 | CAM_FRONT_RIGHT='samples/CAM_FRONT_RIGHT', 52 | CAM_BACK='samples/CAM_BACK', 53 | CAM_BACK_LEFT='samples/CAM_BACK_LEFT', 54 | CAM_BACK_RIGHT='samples/CAM_BACK_RIGHT', 55 | pts_semantic_mask='lidarseg/v1.0-trainval') 56 | 57 | train_pipeline = [ 58 | dict(type='LoadPointsFromFile', coord_type='LIDAR', load_dim=5, use_dim=4), 59 | dict( 60 | type='LoadMultiModalityData', 61 | superpixel_root='data/superpixels/slic/', 62 | num_cameras=6), 63 | dict( 64 | type='RandomFlip3D', 65 | sync_2d=False, 66 | flip_ratio_bev_horizontal=0.5, 67 | flip_ratio_bev_vertical=0.55), 68 | dict( 69 | type='GlobalRotScaleTrans', 70 | rot_range=[0, 3.14159265359], 71 | scale_ratio_range=[0.95, 1.05]), 72 | dict( 73 | type='ResizedCrop', 74 | image_crop_size=[224, 448], 75 | image_crop_ratio=[1.5555555555555556, 1.8888888888888888], 76 | crop_center=True), 77 | dict(type='FlipHorizontal'), 78 | dict( 79 | type='LiMoEInputs', 80 | keys=[ 81 | 'points', 'imgs', 'pairing_points', 'pairing_images', 'superpixels' 82 | ]) 83 | ] 84 | 85 | train_dataloader = dict( 86 | batch_size=4, 87 | num_workers=4, 88 | persistent_workers=True, 89 | sampler=dict(type='DefaultSampler', shuffle=True), 90 | dataset=dict( 91 | type='NuScenesSegDataset', 92 | data_root=data_root, 93 | ann_file='superflow_nus_info.pkl', 94 | data_prefix=data_prefix, 95 | pipeline=train_pipeline, 96 | metainfo=metainfo, 97 | modality=input_modality, 98 | ignore_index=16)) 99 | 100 | vis_backends = [dict(type='LocalVisBackend')] 101 | visualizer = dict( 102 | type='Det3DLocalVisualizer', vis_backends=vis_backends, name='visualizer') 103 | -------------------------------------------------------------------------------- /limoe/datasets/nuscenes_dataset.py: -------------------------------------------------------------------------------- 1 | import os.path as osp 2 | from typing import Callable, List, Optional, Union 3 | 4 | import numpy as np 5 | from mmdet3d.datasets import Seg3DDataset 6 | from mmdet3d.registry import DATASETS 7 | 8 | 9 | @DATASETS.register_module() 10 | class NuScenesSegDataset(Seg3DDataset): 11 | 12 | METAINFO = { 13 | 'classes': ('barrier', 'bicycle', 'bus', 'car', 'construction_vehicle', 14 | 'motorcycle', 'pedestrian', 'traffic_cone', 'trailer', 15 | 'truck', 'driveable_surface', 'other_flat', 'sidewalk', 16 | 'terrain', 'manmade', 'vegetation'), 17 | 'palette': [[255, 120, 50], [255, 192, 203], [255, 255, 0], 18 | [0, 150, 245], [0, 255, 255], [255, 127, 0], [255, 0, 0], 19 | [255, 240, 150], [135, 60, 0], [160, 32, 20 | 240], [255, 0, 255], 21 | [139, 137, 137], [75, 0, 75], [150, 240, 80], 22 | [230, 230, 250], [0, 175, 0]], 23 | 'seg_valid_class_ids': 24 | tuple(range(16)), 25 | 'seg_all_class_ids': 26 | tuple(range(16)), 27 | } 28 | 29 | def __init__(self, 30 | data_root: Optional[str] = None, 31 | ann_file: str = '', 32 | metainfo: Optional[dict] = None, 33 | data_prefix: dict = dict( 34 | pts='', 35 | img='', 36 | pts_instance_mask='', 37 | pts_semantic_mask=''), 38 | pipeline: List[Union[dict, Callable]] = [], 39 | modality: dict = dict(use_lidar=True, use_camera=False), 40 | ignore_index: Optional[int] = None, 41 | scene_idxs: Optional[Union[str, 
np.ndarray]] = None, 42 | test_mode: bool = False, 43 | serialize_data: bool = True, 44 | **kwargs) -> None: 45 | super(NuScenesSegDataset, self).__init__( 46 | data_root=data_root, 47 | ann_file=ann_file, 48 | metainfo=metainfo, 49 | data_prefix=data_prefix, 50 | pipeline=pipeline, 51 | modality=modality, 52 | ignore_index=ignore_index, 53 | scene_idxs=scene_idxs, 54 | test_mode=test_mode, 55 | serialize_data=serialize_data, 56 | **kwargs) 57 | 58 | def get_seg_label_mapping(self, metainfo: dict) -> np.ndarray: 59 | seg_label_mapping = np.zeros(metainfo['max_label'] + 1, dtype=np.int64) 60 | for idx in metainfo['seg_label_mapping']: 61 | seg_label_mapping[idx] = metainfo['seg_label_mapping'][idx] 62 | return seg_label_mapping 63 | 64 | def parse_data_info(self, info: dict) -> dict: 65 | if self.modality['use_lidar']: 66 | info['lidar_points']['lidar_path'] = \ 67 | osp.join( 68 | self.data_prefix.get('pts', ''), 69 | info['lidar_points']['lidar_path']) 70 | if 'num_pts_feats' in info['lidar_points']: 71 | info['num_pts_feats'] = info['lidar_points']['num_pts_feats'] 72 | info['lidar_path'] = info['lidar_points']['lidar_path'] 73 | 74 | if self.modality['use_camera']: 75 | for cam_id, img_info in info['images'].items(): 76 | if 'img_path' in img_info: 77 | if cam_id in self.data_prefix: 78 | cam_prefix = self.data_prefix[cam_id] 79 | else: 80 | cam_prefix = self.data_prefix.get('img', '') 81 | img_info['img_path'] = osp.join(cam_prefix, 82 | img_info['img_path']) 83 | 84 | if 'pts_instance_mask_path' in info: 85 | info['pts_instance_mask_path'] = \ 86 | osp.join(self.data_prefix.get('pts_instance_mask', ''), 87 | info['pts_instance_mask_path']) 88 | 89 | if 'pts_semantic_mask_path' in info: 90 | info['pts_semantic_mask_path'] = \ 91 | osp.join(self.data_prefix.get('pts_semantic_mask', ''), 92 | info['pts_semantic_mask_path']) 93 | 94 | info['seg_label_mapping'] = self.seg_label_mapping 95 | 96 | if self.test_mode and self.load_eval_anns: 97 | info['eval_ann_info'] = dict() 98 | 99 | return info 100 | -------------------------------------------------------------------------------- /limoe/datasets/transforms/loading.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import os.path as osp 3 | 4 | import numpy as np 5 | import torch 6 | from mmcv.transforms import BaseTransform 7 | from mmdet3d.registry import TRANSFORMS 8 | from nuscenes.utils.data_classes import LidarPointCloud 9 | from nuscenes.utils.geometry_utils import view_points 10 | from PIL import Image 11 | 12 | 13 | @TRANSFORMS.register_module() 14 | class LoadMultiModalityData(BaseTransform): 15 | 16 | def __init__(self, 17 | superpixel_root: str, 18 | num_cameras: int = 6, 19 | min_dist: float = 1.0) -> None: 20 | self.superpixel_root = superpixel_root 21 | self.min_dist = min_dist 22 | self.num_cameras = num_cameras 23 | 24 | def transform(self, results: dict) -> dict: 25 | points = results['points'].numpy() 26 | pc_original = LidarPointCloud(points.T) 27 | pairing_points = np.empty(0, dtype=np.int64) 28 | pairing_images = np.empty((0, 3), dtype=np.int64) 29 | 30 | images = [] 31 | superpixels = [] 32 | 33 | camera_list = [ 34 | 'CAM_FRONT', 35 | 'CAM_FRONT_LEFT', 36 | 'CAM_FRONT_RIGHT', 37 | 'CAM_BACK', 38 | 'CAM_BACK_LEFT', 39 | 'CAM_BACK_RIGHT', 40 | ] 41 | camera_list = np.random.choice( 42 | camera_list, size=self.num_cameras, replace=False) 43 | np.random.shuffle(camera_list) 44 | 45 | for i, cam in enumerate(camera_list): 46 | # load point clouds 47 | pc = 
copy.deepcopy(pc_original) 48 | 49 | # load camera images 50 | img = np.array(Image.open(results['images'][cam]['img_path'])) 51 | 52 | # load superpixels 53 | sp_path = osp.join( 54 | self.superpixel_root, 55 | results['images'][cam]['sample_data_token'] + '.png') 56 | sp = np.array(Image.open(sp_path)) 57 | 58 | # transform the point cloud to the vehicle frame for the 59 | # timestamp of the sweep. 60 | pc.rotate(results['lidar2ego_rotation']) 61 | pc.translate(results['lidar2ego_translation']) 62 | 63 | # transform from ego to the global frame. 64 | pc.rotate(results['ego2global_rotation']) 65 | pc.translate(results['ego2global_translation']) 66 | 67 | # transform from global frame to the ego vehicle frame for the 68 | # timestamp of the image. 69 | pc.translate(-results['images'][cam]['ego2global_translation']) 70 | pc.rotate(results['images'][cam]['ego2global_rotation'].T) 71 | 72 | # transform from ego to the camera. 73 | pc.translate(-results['images'][cam]['sensor2ego_translation']) 74 | pc.rotate(results['images'][cam]['sensor2ego_rotation'].T) 75 | 76 | # camera frame z axis points away from the camera 77 | depths = pc.points[2, :] 78 | 79 | # matrix multiplication with camera-matrix + renormalization. 80 | points = view_points( 81 | pc.points[:3, :], 82 | results['images'][cam]['cam_intrinsic'], 83 | normalize=True) 84 | 85 | # Remove points that are either outside or behind the camera. 86 | # Also make sure points are at least 1m in front of the camera to 87 | # avoid seeing the lidar points on the camera. 88 | points = points[:2].T 89 | mask = np.ones(depths.shape[0], dtype=bool) 90 | mask = np.logical_and(mask, depths > self.min_dist) 91 | mask = np.logical_and(mask, points[:, 0] > 0) 92 | mask = np.logical_and(mask, points[:, 0] < img.shape[1] - 1) 93 | mask = np.logical_and(mask, points[:, 1] > 0) 94 | mask = np.logical_and(mask, points[:, 1] < img.shape[0] - 1) 95 | 96 | matching_points = np.where(mask)[0] 97 | matching_pixels = np.round( 98 | np.flip(points[matching_points], axis=1)).astype(np.int64) 99 | 100 | images.append(img / 255.) 
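            # Accumulate this camera's superpixel map and the point-to-pixel
            # pairings; each row of pairing_images stores (camera index i,
            # pixel row, pixel column) for the matched entry in pairing_points.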
101 | superpixels.append(sp) 102 | pairing_points = np.concatenate((pairing_points, matching_points)) 103 | pairing_images = np.concatenate( 104 | (pairing_images, 105 | np.concatenate((np.ones( 106 | (matching_pixels.shape[0], 1), dtype=np.int64) * i, 107 | matching_pixels), 108 | axis=1))) 109 | 110 | results['imgs'] = torch.tensor( 111 | np.array(images, dtype=np.float32).transpose(0, 3, 1, 2)) 112 | results['superpixels'] = torch.tensor(np.stack(superpixels)) 113 | results['pairing_points'] = pairing_points 114 | results['pairing_images'] = pairing_images 115 | return results 116 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import os 4 | import os.path as osp 5 | 6 | from mmdet3d.utils import replace_ceph_backend 7 | from mmengine.config import Config, DictAction 8 | from mmengine.logging import print_log 9 | from mmengine.registry import RUNNERS 10 | from mmengine.runner import Runner 11 | 12 | 13 | def parse_args(): 14 | parser = argparse.ArgumentParser(description='Train a 3D model') 15 | parser.add_argument('config', help='train config file path') 16 | parser.add_argument('--work-dir', help='the dir to save logs and models') 17 | parser.add_argument( 18 | '--amp', 19 | action='store_true', 20 | default=False, 21 | help='enable automatic-mixed-precision training') 22 | parser.add_argument( 23 | '--auto-scale-lr', 24 | action='store_true', 25 | help='enable automatically scaling LR.') 26 | parser.add_argument( 27 | '--resume', 28 | nargs='?', 29 | type=str, 30 | const='auto', 31 | help='If specify checkpoint path, resume from it, while if not ' 32 | 'specify, try to auto resume from the latest checkpoint ' 33 | 'in the work directory.') 34 | parser.add_argument( 35 | '--ceph', action='store_true', help='Use ceph as data storage backend') 36 | parser.add_argument( 37 | '--cfg-options', 38 | nargs='+', 39 | action=DictAction, 40 | help='override some settings in the used config, the key-value pair ' 41 | 'in xxx=yyy format will be merged into config file. If the value to ' 42 | 'be overwritten is a list, it should be like key="[a,b]" or key=a,b ' 43 | 'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" ' 44 | 'Note that the quotation marks are necessary and that no white space ' 45 | 'is allowed.') 46 | parser.add_argument( 47 | '--launcher', 48 | choices=['none', 'pytorch', 'slurm', 'mpi'], 49 | default='none', 50 | help='job launcher') 51 | # When using PyTorch version >= 2.0.0, the `torch.distributed.launch` 52 | # will pass the `--local-rank` parameter to `tools/train.py` instead 53 | # of `--local_rank`. 
54 | parser.add_argument('--local_rank', '--local-rank', type=int, default=0) 55 | args = parser.parse_args() 56 | if 'LOCAL_RANK' not in os.environ: 57 | os.environ['LOCAL_RANK'] = str(args.local_rank) 58 | return args 59 | 60 | 61 | def main(): 62 | args = parse_args() 63 | 64 | # load config 65 | cfg = Config.fromfile(args.config) 66 | 67 | # TODO: We will unify the ceph support approach with other OpenMMLab repos 68 | if args.ceph: 69 | cfg = replace_ceph_backend(cfg) 70 | 71 | cfg.launcher = args.launcher 72 | if args.cfg_options is not None: 73 | cfg.merge_from_dict(args.cfg_options) 74 | 75 | # work_dir is determined in this priority: CLI > segment in file > filename 76 | if args.work_dir is not None: 77 | # update configs according to CLI args if args.work_dir is not None 78 | cfg.work_dir = args.work_dir 79 | elif cfg.get('work_dir', None) is None: 80 | # use config filename as default work_dir if cfg.work_dir is None 81 | cfg.work_dir = osp.join('./work_dirs', 82 | osp.splitext(osp.basename(args.config))[0]) 83 | 84 | # enable automatic-mixed-precision training 85 | if args.amp is True: 86 | optim_wrapper = cfg.optim_wrapper.type 87 | if optim_wrapper == 'AmpOptimWrapper': 88 | print_log( 89 | 'AMP training is already enabled in your config.', 90 | logger='current', 91 | level=logging.WARNING) 92 | else: 93 | assert optim_wrapper == 'OptimWrapper', ( 94 | '`--amp` is only supported when the optimizer wrapper type is ' 95 | f'`OptimWrapper` but got {optim_wrapper}.') 96 | cfg.optim_wrapper.type = 'AmpOptimWrapper' 97 | cfg.optim_wrapper.loss_scale = 'dynamic' 98 | 99 | # enable automatically scaling LR 100 | if args.auto_scale_lr: 101 | if 'auto_scale_lr' in cfg and \ 102 | 'enable' in cfg.auto_scale_lr and \ 103 | 'base_batch_size' in cfg.auto_scale_lr: 104 | cfg.auto_scale_lr.enable = True 105 | else: 106 | raise RuntimeError('Can not find "auto_scale_lr" or ' 107 | '"auto_scale_lr.enable" or ' 108 | '"auto_scale_lr.base_batch_size" in your' 109 | ' configuration file.') 110 | 111 | # resume is determined in this priority: resume from > auto_resume 112 | if args.resume == 'auto': 113 | cfg.resume = True 114 | cfg.load_from = None 115 | elif args.resume is not None: 116 | cfg.resume = True 117 | cfg.load_from = args.resume 118 | 119 | # build the runner from config 120 | if 'runner_type' not in cfg: 121 | # build the default runner 122 | runner = Runner.from_cfg(cfg) 123 | else: 124 | # build customized runner from the registry 125 | # if 'runner_type' is set in the cfg 126 | runner = RUNNERS.build(cfg) 127 | 128 | # start training 129 | runner.train() 130 | 131 | 132 | if __name__ == '__main__': 133 | main() 134 | -------------------------------------------------------------------------------- /limoe/datasets/transforms/transforms.py: -------------------------------------------------------------------------------- 1 | import random 2 | from typing import Sequence 3 | 4 | import numpy as np 5 | import torch 6 | from mmcv.transforms import BaseTransform 7 | from mmdet3d.registry import TRANSFORMS 8 | from torchvision.transforms import InterpolationMode, RandomResizedCrop 9 | from torchvision.transforms.functional import hflip, resize, resized_crop 10 | 11 | 12 | @TRANSFORMS.register_module() 13 | class ResizedCrop(BaseTransform): 14 | 15 | def __init__(self, 16 | image_crop_size: Sequence[int] = (224, 416), 17 | image_crop_range: Sequence[float] = (0.3, 1.0), 18 | image_crop_ratio: Sequence[float] = (14.0 / 9.0, 17.0 / 9.0), 19 | crop_center: bool = False) -> None: 20 | 
self.crop_size = image_crop_size 21 | self.crop_range = image_crop_range 22 | self.crop_ratio = image_crop_ratio 23 | self.crop_center = crop_center 24 | 25 | def transform(self, results: dict) -> dict: 26 | images = results['imgs'] 27 | superpixels = results['superpixels'].unsqueeze(1) 28 | pairing_points = results['pairing_points'] 29 | pairing_images = results['pairing_images'] 30 | 31 | imgs = torch.empty( 32 | (images.shape[0], 3) + tuple(self.crop_size), dtype=torch.float32) 33 | sps = torch.empty( 34 | (images.shape[0], ) + tuple(self.crop_size), dtype=torch.uint8) 35 | pairing_points_out = np.empty(0, dtype=np.int64) 36 | pairing_images_out = np.empty((0, 3), dtype=np.int64) 37 | 38 | if self.crop_center: 39 | pairing_points_out = pairing_points 40 | 41 | _, _, h, w = images.shape 42 | for id, img in enumerate(images): 43 | mask = pairing_images[:, 0] == id 44 | p2 = pairing_images[mask] 45 | p2 = np.round( 46 | np.multiply( 47 | p2, 48 | [1.0, self.crop_size[0] / h, self.crop_size[1] / w 49 | ])).astype(np.int64) 50 | imgs[id] = resize(img, self.crop_size, 51 | InterpolationMode.BILINEAR) 52 | sps[id] = resize(superpixels[id], self.crop_size, 53 | InterpolationMode.NEAREST) 54 | p2[:, 1] = np.clip(0, self.crop_size[0] - 1, p2[:, 1]) 55 | p2[:, 2] = np.clip(0, self.crop_size[1] - 1, p2[:, 2]) 56 | pairing_images_out = np.concatenate((pairing_images_out, p2)) 57 | else: 58 | for id, img in enumerate(images): 59 | successful = False 60 | mask = pairing_images[:, 0] == id 61 | P1 = pairing_points[mask] 62 | P2 = pairing_images[mask] 63 | while not successful: 64 | i, j, h, w = RandomResizedCrop.get_params( 65 | img, self.crop_range, self.crop_ratio) 66 | p1 = P1.copy() 67 | p2 = P2.copy() 68 | p2 = np.round( 69 | np.multiply(p2 - [0, i, j], [ 70 | 1.0, self.crop_size[0] / h, self.crop_size[1] / w 71 | ])).astype(np.int64) 72 | valid_indexes_0 = np.logical_and( 73 | p2[:, 1] < self.crop_size[0], p2[:, 1] >= 0) 74 | valid_indexes_1 = np.logical_and( 75 | p2[:, 2] < self.crop_size[1], p2[:, 2] >= 0) 76 | valid_indexes = np.logical_and(valid_indexes_0, 77 | valid_indexes_1) 78 | sum_indexes = valid_indexes.sum() 79 | len_indexes = len(valid_indexes) 80 | if sum_indexes > 1024 or sum_indexes / len_indexes > 0.75: 81 | successful = True 82 | imgs[id] = resized_crop(img, i, j, h, w, self.crop_size, 83 | InterpolationMode.BILINEAR) 84 | sps[id] = resized_crop(superpixels[id], i, j, h, w, 85 | self.crop_size, 86 | InterpolationMode.NEAREST) 87 | pairing_points_out = np.concatenate( 88 | (pairing_points_out, p1[valid_indexes])) 89 | pairing_images_out = np.concatenate( 90 | (pairing_images_out, p2[valid_indexes])) 91 | 92 | results['imgs'] = imgs 93 | results['superpixels'] = sps 94 | results['pairing_points'] = pairing_points_out 95 | results['pairing_images'] = pairing_images_out 96 | return results 97 | 98 | 99 | @TRANSFORMS.register_module() 100 | class FlipHorizontal(BaseTransform): 101 | 102 | def __init__(self, flip_ratio: float = 0.5) -> None: 103 | self.flip_ratio = flip_ratio 104 | 105 | def transform(self, results: dict) -> dict: 106 | images = results['imgs'] 107 | superpixels = results['superpixels'] 108 | pairing_images = results['pairing_images'] 109 | 110 | w = images.shape[3] 111 | for i, img in enumerate(images): 112 | if random.random() < self.flip_ratio: 113 | images[i] = hflip(img) 114 | superpixels[i] = hflip(superpixels[i:i + 1]) 115 | mask = pairing_images[:, 0] == i 116 | pairing_images[mask, 2] = w - 1 - pairing_images[mask, 2] 117 | 118 | results['imgs'] = images 119 
| results['superpixels'] = superpixels 120 | results['pairing_images'] = pairing_images 121 | return results 122 | -------------------------------------------------------------------------------- /limoe/models/selfsup/slidr.py: -------------------------------------------------------------------------------- 1 | from typing import Dict 2 | 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | from mmdet3d.registry import MODELS 7 | from mmdet3d.structures.det3d_data_sample import (ForwardResults, 8 | OptSampleList, SampleList) 9 | from mmdet3d.utils import ConfigType, OptConfigType, OptMultiConfig 10 | from mmengine.model import BaseModel 11 | from torch import Tensor 12 | 13 | 14 | class ContrastiveLoss(nn.Module): 15 | 16 | def __init__(self, temperature: float) -> None: 17 | super(ContrastiveLoss, self).__init__() 18 | self.temperature = temperature 19 | self.criterion = nn.CrossEntropyLoss() 20 | 21 | def forward(self, k: Tensor, q: Tensor) -> Tensor: 22 | logits = torch.mm(k, q.transpose(1, 0)) 23 | target = torch.arange(k.shape[0], device=k.device).long() 24 | out = torch.div(logits, self.temperature) 25 | out = out.contiguous() 26 | loss = self.criterion(out, target) 27 | return loss 28 | 29 | 30 | @MODELS.register_module() 31 | class SLidR(BaseModel): 32 | 33 | def __init__(self, 34 | backbone_3d: ConfigType, 35 | head_3d: ConfigType, 36 | backbone_2d: ConfigType, 37 | head_2d: ConfigType, 38 | superpixel_size: int, 39 | temperature: float, 40 | voxel_encoder_3d: OptConfigType = None, 41 | data_preprocessor: OptConfigType = None, 42 | train_cfg: ConfigType = None, 43 | init_cfg: OptMultiConfig = None): 44 | super(SLidR, self).__init__( 45 | data_preprocessor=data_preprocessor, init_cfg=init_cfg) 46 | 47 | self.backbone_2d = MODELS.build(backbone_2d) 48 | self.head_2d = MODELS.build(head_2d) 49 | 50 | self.backbone_3d = MODELS.build(backbone_3d) 51 | self.head_3d = MODELS.build(head_3d) 52 | if voxel_encoder_3d is not None: 53 | self.voxel_encoder_3d = MODELS.build(voxel_encoder_3d) 54 | self.range = True 55 | else: 56 | self.voxel_encoder_3d = None 57 | self.range = False 58 | 59 | self.superpixel_size = superpixel_size 60 | self.contrastive_loss = ContrastiveLoss(temperature) 61 | self.train_cfg = train_cfg 62 | 63 | def extract_3d_feature(self, feat_dict: dict) -> Tensor: 64 | if self.range: 65 | feat_dict = self.voxel_encoder_3d(feat_dict) 66 | feat_dict = self.backbone_3d(feat_dict) 67 | features = self.head_3d(feat_dict)['logits'] 68 | features = F.normalize(features, p=2, dim=1) 69 | return features 70 | 71 | def extract_2d_feature(self, images: Tensor) -> Tensor: 72 | features = self.backbone_2d(images) 73 | features = self.head_2d(features) 74 | features = F.normalize(features, p=2, dim=1) 75 | return features 76 | 77 | def loss(self, inputs: dict, 78 | data_samples: SampleList) -> Dict[str, Tensor]: 79 | 80 | # forward 81 | features_2d = self.extract_2d_feature(inputs['imgs']) 82 | 83 | feat_dict = inputs['ranges'].copy( 84 | ) if self.range else inputs['voxels'].copy() 85 | features_3d = self.extract_3d_feature(feat_dict) 86 | 87 | superpixels = [] 88 | pairing_images = [] 89 | pairing_points = [] 90 | offset = 0 91 | 92 | if self.range: 93 | coors = feat_dict['coors'] 94 | for i, data_sample in enumerate(data_samples): 95 | superpixel = data_sample.gt_pts_seg.superpixels 96 | pairing_image = data_sample.gt_pts_seg.pairing_images 97 | pairing_image[:, 0] += i * superpixel.shape[0] 98 | pairing_point = data_sample.gt_pts_seg.pairing_points 99 | 
pairing_point = pairing_point.long() + offset 100 | offset += sum(coors[:, 0] == i) 101 | 102 | superpixels.append(superpixel) 103 | pairing_images.append(pairing_image) 104 | pairing_points.append(pairing_point) 105 | 106 | else: 107 | for i, data_sample in enumerate(data_samples): 108 | superpixel = data_sample.gt_pts_seg.superpixels 109 | pairing_image = data_sample.gt_pts_seg.pairing_images 110 | pairing_image[:, 0] += i * superpixel.shape[0] 111 | pairing_point = data_sample.gt_pts_seg.pairing_points 112 | inverse_map = feat_dict['point2voxel_maps'][i] 113 | pairing_point = inverse_map[pairing_point].long() + offset 114 | offset += feat_dict['voxel_inds'][i].shape[0] 115 | 116 | superpixels.append(superpixel) 117 | pairing_images.append(pairing_image) 118 | pairing_points.append(pairing_point) 119 | 120 | superpixels = torch.cat(superpixels) 121 | pairing_images = torch.cat(pairing_images) 122 | pairing_points = torch.cat(pairing_points) 123 | 124 | superpixels = ( 125 | torch.arange( 126 | 0, 127 | features_2d.shape[0] * self.superpixel_size, 128 | self.superpixel_size, 129 | device=features_2d.device)[:, None, None] + superpixels) 130 | 131 | m = tuple(pairing_images.cpu().T.long()) 132 | superpixels_I = superpixels.flatten() 133 | idx_P = torch.arange( 134 | pairing_points.shape[0], device=features_2d.device) 135 | total_pixels = superpixels_I.shape[0] 136 | idx_I = torch.arange(total_pixels, device=features_2d.device) 137 | 138 | with torch.no_grad(): 139 | one_hot_P = torch.sparse_coo_tensor( 140 | torch.stack((superpixels[m], idx_P), dim=0), 141 | torch.ones(pairing_points.shape[0], device=features_2d.device), 142 | (superpixels.shape[0] * self.superpixel_size, 143 | pairing_points.shape[0])) 144 | one_hot_I = torch.sparse_coo_tensor( 145 | torch.stack((superpixels_I, idx_I), dim=0), 146 | torch.ones(total_pixels, device=features_2d.device), 147 | (superpixels.shape[0] * self.superpixel_size, total_pixels)) 148 | 149 | k = one_hot_P @ features_3d[pairing_points] 150 | k = k / (torch.sparse.sum(one_hot_P, 1).to_dense()[:, None] + 1e-6) 151 | q = one_hot_I @ features_2d.permute(0, 2, 3, 1).flatten(0, 2) 152 | q = q / (torch.sparse.sum(one_hot_I, 1).to_dense()[:, None] + 1e-6) 153 | 154 | mask = torch.where(k[:, 0] != 0) 155 | valid_k = k[mask] 156 | valid_q = q[mask] 157 | 158 | loss = dict() 159 | loss['loss_spatial'] = self.contrastive_loss(valid_k, valid_q) 160 | 161 | return loss 162 | 163 | def forward(self, 164 | inputs: dict, 165 | data_samples: OptSampleList = None, 166 | mode: str = 'tensor') -> ForwardResults: 167 | if mode == 'loss': 168 | return self.loss(inputs, data_samples) 169 | -------------------------------------------------------------------------------- /limoe/models/backbones/spvcnn.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, Sequence 2 | 3 | import torch 4 | import torch.nn as nn 5 | import torchsparse 6 | import torchsparse.nn.functional as F 7 | from mmdet3d.registry import MODELS 8 | from mmdet3d.utils import OptMultiConfig 9 | from torchsparse import PointTensor, SparseTensor 10 | from torchsparse.nn.utils import get_kernel_offsets 11 | 12 | from .minkunet import MinkUNetBackbone 13 | 14 | 15 | @MODELS.register_module() 16 | class SPVCNNBackbone(MinkUNetBackbone): 17 | 18 | def __init__(self, 19 | in_channels: int = 4, 20 | base_channels: int = 32, 21 | layers: Sequence[int] = [2, 3, 4, 6, 2, 2, 2, 2], 22 | planes: Sequence[int] = [32, 64, 128, 256, 256, 128, 96, 96], 23 | 
block_type: str = 'basic', 24 | bn_momentum: float = 0.1, 25 | drop_ratio: float = 0.3, 26 | init_cfg: OptMultiConfig = None) -> None: 27 | super(SPVCNNBackbone, self).__init__( 28 | in_channels=in_channels, 29 | base_channels=base_channels, 30 | layers=layers, 31 | planes=planes, 32 | block_type=block_type, 33 | bn_momentum=bn_momentum, 34 | init_cfg=init_cfg) 35 | 36 | self.point_transforms = nn.ModuleList([ 37 | nn.Sequential( 38 | nn.Linear(base_channels, planes[3]), nn.BatchNorm1d(planes[3]), 39 | nn.ReLU(True)), 40 | nn.Sequential( 41 | nn.Linear(planes[3], planes[5]), nn.BatchNorm1d(planes[5]), 42 | nn.ReLU(True)), 43 | nn.Sequential( 44 | nn.Linear(planes[5], planes[7]), nn.BatchNorm1d(planes[7]), 45 | nn.ReLU(True)), 46 | ]) 47 | self.dropout = nn.Dropout(drop_ratio, True) 48 | 49 | def forward(self, feat_dict: dict) -> dict: 50 | voxel_features = feat_dict['voxels'] 51 | coors = feat_dict['coors'] 52 | 53 | # x: SparseTensor z: PointTensor 54 | x = SparseTensor(voxel_features, coors) 55 | z = PointTensor(x.F, x.C.float()) 56 | x = initial_voxelize(z) 57 | 58 | out0 = self.conv0(x) 59 | z0 = voxel_to_point(out0, z) 60 | out0 = point_to_voxel(out0, z0) 61 | 62 | out1 = self.conv1(out0) 63 | out1 = self.block1(out1) 64 | 65 | out2 = self.conv2(out1) 66 | out2 = self.block2(out2) 67 | 68 | out3 = self.conv3(out2) 69 | out3 = self.block3(out3) 70 | 71 | out4 = self.conv4(out3) 72 | out4 = self.block4(out4) 73 | 74 | z1 = voxel_to_point(out4, z0, self.point_transforms[0]) 75 | out4 = point_to_voxel(out4, z1) 76 | out4.F = self.dropout(out4.F) 77 | 78 | out = self.conv5(out4) 79 | out = torchsparse.cat((out, out3)) 80 | out = self.block5(out) 81 | 82 | out = self.conv6(out) 83 | out = torchsparse.cat((out, out2)) 84 | out = self.block6(out) 85 | 86 | z2 = voxel_to_point(out, z1, self.point_transforms[1]) 87 | out = point_to_voxel(out, z2) 88 | out.F = self.dropout(out.F) 89 | 90 | out = self.conv7(out) 91 | out = torchsparse.cat((out, out1)) 92 | out = self.block7(out) 93 | 94 | out = self.conv8(out) 95 | out = torchsparse.cat((out, out0)) 96 | out = self.block8(out) 97 | 98 | out = voxel_to_point(out, z2, self.point_transforms[2]) 99 | feat_dict['voxel_feats'] = out.F 100 | return feat_dict 101 | 102 | 103 | def initial_voxelize(points: PointTensor) -> SparseTensor: 104 | """Voxelize again based on input PointTensor. 105 | 106 | Args: 107 | points (PointTensor): Input points after voxelization. 108 | 109 | Returns: 110 | SparseTensor: New voxels. 111 | """ 112 | pc_hash = F.sphash(torch.floor(points.C).int()) 113 | sparse_hash = torch.unique(pc_hash) 114 | idx_query = F.sphashquery(pc_hash, sparse_hash) 115 | counts = F.spcount(idx_query.int(), len(sparse_hash)) 116 | 117 | inserted_coords = F.spvoxelize(torch.floor(points.C), idx_query, counts) 118 | inserted_coords = torch.round(inserted_coords).int() 119 | inserted_feat = F.spvoxelize(points.F, idx_query, counts) 120 | 121 | new_tensor = SparseTensor(inserted_feat, inserted_coords, 1) 122 | new_tensor.cmaps.setdefault(new_tensor.stride, new_tensor.coords) 123 | points.additional_features['idx_query'][1] = idx_query 124 | points.additional_features['counts'][1] = counts 125 | return new_tensor 126 | 127 | 128 | def voxel_to_point(voxels: SparseTensor, 129 | points: PointTensor, 130 | point_transform: Optional[nn.Module] = None, 131 | nearest: bool = False) -> PointTensor: 132 | """Fead voxel features to points. 133 | 134 | Args: 135 | voxels (SparseTensor): Input voxels. 136 | points (PointTensor): Input points. 
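point_transform (nn.Module, optional): Optional module applied to the point features before they are added to the devoxelized voxel features. Defaults to None.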
137 | nearest (bool): Whether to use nearest neighbor interpolation. 138 | Defaults to False. 139 | 140 | Returns: 141 | PointTensor: Points with new features. 142 | """ 143 | if points.idx_query is None or points.weights is None or \ 144 | points.idx_query.get(voxels.s) is None or \ 145 | points.weights.get(voxels.s) is None: 146 | offsets = get_kernel_offsets(2, voxels.s, 1, device=points.F.device) 147 | old_hash = F.sphash( 148 | torch.cat([ 149 | torch.floor(points.C[:, :3] / voxels.s[0]).int() * voxels.s[0], 150 | points.C[:, -1].int().view(-1, 1) 151 | ], 1), offsets) 152 | pc_hash = F.sphash(voxels.C.to(points.F.device)) 153 | idx_query = F.sphashquery(old_hash, pc_hash) 154 | weights = F.calc_ti_weights( 155 | points.C, idx_query, scale=voxels.s[0]).transpose(0, 156 | 1).contiguous() 157 | idx_query = idx_query.transpose(0, 1).contiguous() 158 | if nearest: 159 | weights[:, 1:] = 0. 160 | idx_query[:, 1:] = -1 161 | new_features = F.spdevoxelize(voxels.F, idx_query, weights) 162 | new_tensor = PointTensor( 163 | new_features, 164 | points.C, 165 | idx_query=points.idx_query, 166 | weights=points.weights) 167 | new_tensor.additional_features = points.additional_features 168 | new_tensor.idx_query[voxels.s] = idx_query 169 | new_tensor.weights[voxels.s] = weights 170 | points.idx_query[voxels.s] = idx_query 171 | points.weights[voxels.s] = weights 172 | else: 173 | new_features = F.spdevoxelize(voxels.F, points.idx_query.get(voxels.s), 174 | points.weights.get(voxels.s)) 175 | new_tensor = PointTensor( 176 | new_features, 177 | points.C, 178 | idx_query=points.idx_query, 179 | weights=points.weights) 180 | new_tensor.additional_features = points.additional_features 181 | 182 | if point_transform is not None: 183 | new_tensor.F = new_tensor.F + point_transform(points.F) 184 | 185 | return new_tensor 186 | 187 | 188 | def point_to_voxel(voxels: SparseTensor, points: PointTensor) -> SparseTensor: 189 | """Feed point features to voxels. 190 | 191 | Args: 192 | voxels (SparseTensor): Input voxels. 193 | points (PointTensor): Input points. 194 | 195 | Returns: 196 | SparseTensor: Voxels with new features. 
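Note: the point-to-voxel assignment (idx_query / counts) is computed once per voxel stride with F.sphash / F.sphashquery and cached in points.additional_features, so repeated calls at the same stride reuse it.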
197 | """ 198 | if points.additional_features is None or \ 199 | points.additional_features.get('idx_query') is None or \ 200 | points.additional_features['idx_query'].get(voxels.s) is None: 201 | pc_hash = F.sphash( 202 | torch.cat([ 203 | torch.floor(points.C[:, :3] / voxels.s[0]).int() * voxels.s[0], 204 | points.C[:, -1].int().view(-1, 1) 205 | ], 1)) 206 | sparse_hash = F.sphash(voxels.C) 207 | idx_query = F.sphashquery(pc_hash, sparse_hash) 208 | counts = F.spcount(idx_query.int(), voxels.C.shape[0]) 209 | points.additional_features['idx_query'][voxels.s] = idx_query 210 | points.additional_features['counts'][voxels.s] = counts 211 | else: 212 | idx_query = points.additional_features['idx_query'][voxels.s] 213 | counts = points.additional_features['counts'][voxels.s] 214 | 215 | inserted_features = F.spvoxelize(points.F, idx_query, counts) 216 | new_tensor = SparseTensor(inserted_features, voxels.C, voxels.s) 217 | new_tensor.cmaps = voxels.cmaps 218 | new_tensor.kmaps = voxels.kmaps 219 | 220 | return new_tensor 221 | -------------------------------------------------------------------------------- /limoe/models/data_preprocessors/data_preprocessor.py: -------------------------------------------------------------------------------- 1 | from typing import List, Optional, Sequence, Union 2 | 3 | import numpy as np 4 | import torch 5 | import torch.nn.functional as F 6 | from mmdet3d.registry import MODELS 7 | from mmdet3d.structures.det3d_data_sample import SampleList 8 | from mmengine.model import ImgDataPreprocessor 9 | from mmengine.utils import is_seq_of 10 | from torch import Tensor 11 | 12 | 13 | @MODELS.register_module() 14 | class LiMoEDataPreprocessor(ImgDataPreprocessor): 15 | 16 | def __init__(self, 17 | H: int, 18 | W: int, 19 | fov_up: float, 20 | fov_down: float, 21 | ignore_index: int, 22 | voxel_size: Sequence[float], 23 | voxel_type: str = 'cubic', 24 | mean: Optional[Sequence[Union[float, int]]] = None, 25 | std: Optional[Sequence[Union[float, int]]] = None, 26 | pad_size_divisor: int = 1, 27 | pad_value: Union[float, int] = 0, 28 | to_rgb: bool = False, 29 | bgr_to_rgb: bool = False, 30 | rgb_to_bgr: bool = False, 31 | non_blocking: bool = False) -> None: 32 | super(LiMoEDataPreprocessor, self).__init__( 33 | mean=mean, 34 | std=std, 35 | pad_size_divisor=pad_size_divisor, 36 | pad_value=pad_value, 37 | bgr_to_rgb=bgr_to_rgb, 38 | rgb_to_bgr=rgb_to_bgr, 39 | non_blocking=non_blocking) 40 | self._channel_conversion = to_rgb or bgr_to_rgb or rgb_to_bgr 41 | self.H = H 42 | self.W = W 43 | self.fov_up = fov_up / 180 * np.pi 44 | self.fov_down = fov_down / 180 * np.pi 45 | self.fov = abs(self.fov_down) + abs(self.fov_up) 46 | self.ignore_index = ignore_index 47 | self.voxel_size = voxel_size 48 | self.voxel_type = voxel_type 49 | 50 | def forward(self, data: dict, training: bool = False) -> dict: 51 | data = self.collate_data(data) 52 | inputs, data_samples = data['inputs'], data['data_samples'] 53 | batch_inputs = dict() 54 | 55 | if 'points' in inputs: 56 | batch_inputs['points'] = inputs['points'] 57 | voxel_dict = self.voxelize(inputs['points'], data_samples) 58 | range_dict = self.frustum_region_group(inputs['points'], 59 | data_samples) 60 | batch_inputs['voxels'] = voxel_dict 61 | batch_inputs['ranges'] = range_dict 62 | 63 | if 'imgs' in inputs: 64 | imgs = inputs['imgs'] 65 | 66 | if data_samples is not None: 67 | batch_input_shape = tuple(imgs[0].size()[-2:]) 68 | for data_sample in data_samples: 69 | data_sample.set_metainfo( 70 | {'batch_input_shape': 
batch_input_shape}) 71 | 72 | batch_inputs['imgs'] = imgs 73 | 74 | return {'inputs': batch_inputs, 'data_samples': data_samples} 75 | 76 | def preprocess_img(self, _batch_img: Tensor) -> Tensor: 77 | if self._channel_conversion: 78 | _batch_img = _batch_img[[2, 1, 0], ...] 79 | _batch_img = _batch_img.float() 80 | if self._enable_normalize: 81 | if self.mean.shape[0] == 3: 82 | assert _batch_img.dim() == 3 and _batch_img.shape[0] == 3 83 | _batch_img = (_batch_img - self.mean) / self.std 84 | return _batch_img 85 | 86 | def collate_data(self, data: dict) -> dict: 87 | data = self.cast_data(data) 88 | 89 | if 'imgs' in data['inputs']: 90 | _batch_imgs = data['inputs']['imgs'] 91 | assert is_seq_of(_batch_imgs, Tensor) 92 | 93 | batch_imgs = [] 94 | for _batch_img in _batch_imgs: 95 | _batch_img = [self.preprocess_img(_img) for _img in _batch_img] 96 | _batch_img = torch.stack(_batch_img, dim=0) 97 | batch_imgs.append(_batch_img) 98 | 99 | batch_imgs = torch.concat(batch_imgs, dim=0) 100 | data['inputs']['imgs'] = batch_imgs 101 | 102 | data.setdefault('data_samples', None) 103 | return data 104 | 105 | @torch.no_grad() 106 | def voxelize(self, points: List[Tensor], data_samples: SampleList) -> dict: 107 | voxel_dict = dict() 108 | 109 | voxels = [] 110 | coors = [] 111 | point2voxel_maps = [] 112 | voxel_inds = [] 113 | 114 | voxel_size = points[0].new_tensor(self.voxel_size) 115 | 116 | for i, res in enumerate(points): 117 | if self.voxel_type == 'cubic': 118 | res_coors = torch.round(res[:, :3] / voxel_size).int() 119 | elif self.voxel_type == 'cylinder': 120 | rho = torch.sqrt(res[:, 0]**2 + res[:, 1]**2) 121 | phi = torch.atan2(res[:, 1], res[:, 0]) * 180 / np.pi 122 | polar_res = torch.stack((rho, phi, res[:, 2]), dim=1) 123 | res_coors = torch.round(polar_res[:, :3] / voxel_size).int() 124 | 125 | res_coors -= res_coors.min(0)[0] 126 | 127 | res_coors_numpy = res_coors.cpu().numpy() 128 | inds, point2voxel_map = self.sparse_quantize( 129 | res_coors_numpy, return_index=True, return_inverse=True) 130 | point2voxel_map = torch.from_numpy(point2voxel_map).cuda() 131 | inds = torch.from_numpy(inds).cuda() 132 | res_voxel_coors = res_coors[inds] 133 | res_voxels = res[inds] 134 | res_voxel_coors = F.pad( 135 | res_voxel_coors, (0, 1), mode='constant', value=i) 136 | voxels.append(res_voxels) 137 | coors.append(res_voxel_coors) 138 | point2voxel_maps.append(point2voxel_map) 139 | voxel_inds.append(inds) 140 | 141 | voxels = torch.cat(voxels, dim=0) 142 | coors = torch.cat(coors, dim=0) 143 | 144 | voxel_dict['voxels'] = voxels 145 | voxel_dict['coors'] = coors 146 | voxel_dict['point2voxel_maps'] = point2voxel_maps 147 | voxel_dict['voxel_inds'] = voxel_inds 148 | 149 | return voxel_dict 150 | 151 | def ravel_hash(self, x: np.ndarray) -> np.ndarray: 152 | assert x.ndim == 2, x.shape 153 | 154 | x = x - np.min(x, axis=0) 155 | x = x.astype(np.uint64, copy=False) 156 | xmax = np.max(x, axis=0).astype(np.uint64) + 1 157 | 158 | h = np.zeros(x.shape[0], dtype=np.uint64) 159 | for k in range(x.shape[1] - 1): 160 | h += x[:, k] 161 | h *= xmax[k + 1] 162 | h += x[:, -1] 163 | return h 164 | 165 | def sparse_quantize(self, 166 | coords: np.ndarray, 167 | return_index: bool = False, 168 | return_inverse: bool = False) -> List[np.ndarray]: 169 | _, indices, inverse_indices = np.unique( 170 | self.ravel_hash(coords), return_index=True, return_inverse=True) 171 | 172 | outputs = [] 173 | if return_index: 174 | outputs += [indices] 175 | if return_inverse: 176 | outputs += [inverse_indices] 177 | 
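# Illustrative example (not executed): for quantized coords
# [[0, 0, 0], [0, 0, 0], [1, 0, 0]], ravel_hash gives the two identical rows the
# same key, so np.unique returns indices == [0, 2] (the first point of each voxel,
# used above to gather voxel coords/features) and inverse_indices == [0, 0, 1]
# (the per-point point2voxel map used later to scatter voxel features back to points).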
return outputs 178 | 179 | @torch.no_grad() 180 | def frustum_region_group(self, points: List[Tensor], 181 | data_samples: SampleList) -> dict: 182 | range_dict = dict() 183 | 184 | coors = [] 185 | voxels = [] 186 | 187 | for i, res in enumerate(points): 188 | depth = torch.linalg.norm(res[:, :3], 2, dim=1) 189 | yaw = -torch.atan2(res[:, 1], res[:, 0]) 190 | pitch = torch.arcsin(res[:, 2] / depth) 191 | 192 | coors_x = 0.5 * (yaw / np.pi + 1.0) 193 | coors_y = 1.0 - (pitch + abs(self.fov_down)) / self.fov 194 | 195 | # scale to image size using angular resolution 196 | coors_x *= self.W 197 | coors_y *= self.H 198 | 199 | # round and clamp for use as index 200 | coors_x = torch.floor(coors_x) 201 | coors_x = torch.clamp( 202 | coors_x, min=0, max=self.W - 1).type(torch.int64) 203 | 204 | coors_y = torch.floor(coors_y) 205 | coors_y = torch.clamp( 206 | coors_y, min=0, max=self.H - 1).type(torch.int64) 207 | 208 | res_coors = torch.stack([coors_y, coors_x], dim=1) 209 | res_coors = F.pad(res_coors, (1, 0), mode='constant', value=i) 210 | coors.append(res_coors) 211 | voxels.append(res) 212 | 213 | if 'pts_semantic_mask' in data_samples[i].gt_pts_seg: 214 | import torch_scatter 215 | pts_semantic_mask = data_samples[ 216 | i].gt_pts_seg.pts_semantic_mask 217 | seg_label = torch.ones( 218 | (self.H, self.W), 219 | dtype=torch.long, 220 | device=pts_semantic_mask.device) * self.ignore_index 221 | res_voxel_coors, inverse_map = torch.unique( 222 | res_coors, return_inverse=True, dim=0) 223 | voxel_semantic_mask = torch_scatter.scatter_mean( 224 | F.one_hot(pts_semantic_mask).float(), inverse_map, dim=0) 225 | voxel_semantic_mask = torch.argmax(voxel_semantic_mask, dim=-1) 226 | seg_label[res_voxel_coors[:, 1], 227 | res_voxel_coors[:, 2]] = voxel_semantic_mask 228 | data_samples[i].gt_pts_seg.semantic_seg = seg_label 229 | 230 | voxels = torch.cat(voxels, dim=0) 231 | coors = torch.cat(coors, dim=0) 232 | range_dict['voxels'] = voxels 233 | range_dict['coors'] = coors 234 | 235 | return range_dict 236 | -------------------------------------------------------------------------------- /limoe/models/backbones/dinov2/layers/block.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from typing import Any, Callable, Dict, List, Tuple 3 | 4 | import torch 5 | from torch import Tensor, nn 6 | 7 | from .attention import Attention, MemEffAttention 8 | from .drop_path import DropPath 9 | from .layer_scale import LayerScale 10 | from .mlp import Mlp 11 | 12 | logger = logging.getLogger('dinov2') 13 | 14 | try: 15 | from xformers.ops import fmha, index_select_cat, scaled_index_add 16 | 17 | XFORMERS_AVAILABLE = True 18 | except ImportError: 19 | logger.warning('xFormers not available') 20 | XFORMERS_AVAILABLE = False 21 | 22 | 23 | class Block(nn.Module): 24 | 25 | def __init__( 26 | self, 27 | dim: int, 28 | num_heads: int, 29 | mlp_ratio: float = 4.0, 30 | qkv_bias: bool = False, 31 | proj_bias: bool = True, 32 | ffn_bias: bool = True, 33 | drop: float = 0.0, 34 | attn_drop: float = 0.0, 35 | init_values=None, 36 | drop_path: float = 0.0, 37 | act_layer: Callable[..., nn.Module] = nn.GELU, 38 | norm_layer: Callable[..., nn.Module] = nn.LayerNorm, 39 | attn_class: Callable[..., nn.Module] = Attention, 40 | ffn_layer: Callable[..., nn.Module] = Mlp, 41 | ) -> None: 42 | super().__init__() 43 | # print(f"biases: qkv: {qkv_bias}, proj: {proj_bias}, ffn: {ffn_bias}") 44 | self.norm1 = norm_layer(dim) 45 | self.attn = attn_class( 46 | dim, 47 | 
num_heads=num_heads, 48 | qkv_bias=qkv_bias, 49 | proj_bias=proj_bias, 50 | attn_drop=attn_drop, 51 | proj_drop=drop, 52 | ) 53 | self.ls1 = ( 54 | LayerScale(dim, init_values=init_values) 55 | if init_values else nn.Identity()) 56 | self.drop_path1 = DropPath( 57 | drop_path) if drop_path > 0.0 else nn.Identity() 58 | 59 | self.norm2 = norm_layer(dim) 60 | mlp_hidden_dim = int(dim * mlp_ratio) 61 | self.mlp = ffn_layer( 62 | in_features=dim, 63 | hidden_features=mlp_hidden_dim, 64 | act_layer=act_layer, 65 | drop=drop, 66 | bias=ffn_bias, 67 | ) 68 | self.ls2 = ( 69 | LayerScale(dim, init_values=init_values) 70 | if init_values else nn.Identity()) 71 | self.drop_path2 = DropPath( 72 | drop_path) if drop_path > 0.0 else nn.Identity() 73 | 74 | self.sample_drop_ratio = drop_path 75 | 76 | def forward(self, x: Tensor) -> Tensor: 77 | 78 | def attn_residual_func(x: Tensor) -> Tensor: 79 | return self.ls1(self.attn(self.norm1(x))) 80 | 81 | def ffn_residual_func(x: Tensor) -> Tensor: 82 | return self.ls2(self.mlp(self.norm2(x))) 83 | 84 | if self.training and self.sample_drop_ratio > 0.1: 85 | # the overhead is compensated only for a drop path rate 86 | # larger than 0.1 87 | x = drop_add_residual_stochastic_depth( 88 | x, 89 | residual_func=attn_residual_func, 90 | sample_drop_ratio=self.sample_drop_ratio, 91 | ) 92 | x = drop_add_residual_stochastic_depth( 93 | x, 94 | residual_func=ffn_residual_func, 95 | sample_drop_ratio=self.sample_drop_ratio, 96 | ) 97 | elif self.training and self.sample_drop_ratio > 0.0: 98 | x = x + self.drop_path1(attn_residual_func(x)) 99 | x = x + self.drop_path1(ffn_residual_func(x)) # FIXME: drop_path2 100 | else: 101 | x = x + attn_residual_func(x) 102 | x = x + ffn_residual_func(x) 103 | return x 104 | 105 | 106 | def drop_add_residual_stochastic_depth( 107 | x: Tensor, 108 | residual_func: Callable[[Tensor], Tensor], 109 | sample_drop_ratio: float = 0.0, 110 | ) -> Tensor: 111 | # 1) extract subset using permutation 112 | b, n, d = x.shape 113 | sample_subset_size = max(int(b * (1 - sample_drop_ratio)), 1) 114 | brange = (torch.randperm(b, device=x.device))[:sample_subset_size] 115 | x_subset = x[brange] 116 | 117 | # 2) apply residual_func to get residual 118 | residual = residual_func(x_subset) 119 | 120 | x_flat = x.flatten(1) 121 | residual = residual.flatten(1) 122 | 123 | residual_scale_factor = b / sample_subset_size 124 | 125 | # 3) add the residual 126 | x_plus_residual = torch.index_add( 127 | x_flat, 128 | 0, 129 | brange, 130 | residual.to(dtype=x.dtype), 131 | alpha=residual_scale_factor) 132 | return x_plus_residual.view_as(x) 133 | 134 | 135 | def get_branges_scales(x, sample_drop_ratio=0.0): 136 | b, n, d = x.shape 137 | sample_subset_size = max(int(b * (1 - sample_drop_ratio)), 1) 138 | brange = (torch.randperm(b, device=x.device))[:sample_subset_size] 139 | residual_scale_factor = b / sample_subset_size 140 | return brange, residual_scale_factor 141 | 142 | 143 | def add_residual(x, 144 | brange, 145 | residual, 146 | residual_scale_factor, 147 | scaling_vector=None): 148 | if scaling_vector is None: 149 | x_flat = x.flatten(1) 150 | residual = residual.flatten(1) 151 | x_plus_residual = torch.index_add( 152 | x_flat, 153 | 0, 154 | brange, 155 | residual.to(dtype=x.dtype), 156 | alpha=residual_scale_factor) 157 | else: 158 | x_plus_residual = scaled_index_add( 159 | x, 160 | brange, 161 | residual.to(dtype=x.dtype), 162 | scaling=scaling_vector, 163 | alpha=residual_scale_factor, 164 | ) 165 | return x_plus_residual 166 | 167 | 168 | 
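# Worked example for the stochastic-depth helpers above (illustrative only): with
# batch size b = 8 and sample_drop_ratio = 0.5, get_branges_scales keeps a random
# subset of 4 samples and returns residual_scale_factor = 8 / 4 = 2.0; the residual
# branch runs only on that subset, and index_add / scaled_index_add writes it back
# scaled by 2.0, so the expected update matches the full-batch residual at roughly
# half the branch compute.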
attn_bias_cache: Dict[Tuple, Any] = {} 169 | 170 | 171 | def get_attn_bias_and_cat(x_list, branges=None): 172 | """this will perform the index select, cat the tensors, and provide the 173 | attn_bias from cache.""" 174 | batch_sizes = ([b.shape[0] for b in branges] 175 | if branges is not None else [x.shape[0] for x in x_list]) 176 | all_shapes = tuple((b, x.shape[1]) for b, x in zip(batch_sizes, x_list)) 177 | if all_shapes not in attn_bias_cache.keys(): 178 | seqlens = [] 179 | for b, x in zip(batch_sizes, x_list): 180 | for _ in range(b): 181 | seqlens.append(x.shape[1]) 182 | attn_bias = fmha.BlockDiagonalMask.from_seqlens(seqlens) 183 | attn_bias._batch_sizes = batch_sizes 184 | attn_bias_cache[all_shapes] = attn_bias 185 | 186 | if branges is not None: 187 | cat_tensors = index_select_cat([x.flatten(1) for x in x_list], 188 | branges).view(1, -1, 189 | x_list[0].shape[-1]) 190 | else: 191 | tensors_bs1 = tuple(x.reshape([1, -1, *x.shape[2:]]) for x in x_list) 192 | cat_tensors = torch.cat(tensors_bs1, dim=1) 193 | 194 | return attn_bias_cache[all_shapes], cat_tensors 195 | 196 | 197 | def drop_add_residual_stochastic_depth_list( 198 | x_list: List[Tensor], 199 | residual_func: Callable[[Tensor, Any], Tensor], 200 | sample_drop_ratio: float = 0.0, 201 | scaling_vector=None, 202 | ) -> Tensor: 203 | # 1) generate random set of indices for dropping samples in the batch 204 | branges_scales = [ 205 | get_branges_scales(x, sample_drop_ratio=sample_drop_ratio) 206 | for x in x_list 207 | ] 208 | branges = [s[0] for s in branges_scales] 209 | residual_scale_factors = [s[1] for s in branges_scales] 210 | 211 | # 2) get attention bias and index+concat the tensors 212 | attn_bias, x_cat = get_attn_bias_and_cat(x_list, branges) 213 | 214 | # 3) apply residual_func to get residual, and split the result 215 | residual_list = attn_bias.split(residual_func( 216 | x_cat, attn_bias=attn_bias)) # type: ignore 217 | 218 | outputs = [] 219 | for x, brange, residual, residual_scale_factor in zip( 220 | x_list, branges, residual_list, residual_scale_factors): 221 | outputs.append( 222 | add_residual(x, brange, residual, residual_scale_factor, 223 | scaling_vector).view_as(x)) 224 | return outputs 225 | 226 | 227 | class NestedTensorBlock(Block): 228 | 229 | def forward_nested(self, x_list: List[Tensor]) -> List[Tensor]: 230 | """x_list contains a list of tensors to nest together and run.""" 231 | assert isinstance(self.attn, MemEffAttention) 232 | 233 | if self.training and self.sample_drop_ratio > 0.0: 234 | 235 | def attn_residual_func(x: Tensor, attn_bias=None) -> Tensor: 236 | return self.attn(self.norm1(x), attn_bias=attn_bias) 237 | 238 | def ffn_residual_func(x: Tensor, attn_bias=None) -> Tensor: 239 | return self.mlp(self.norm2(x)) 240 | 241 | x_list = drop_add_residual_stochastic_depth_list( 242 | x_list, 243 | residual_func=attn_residual_func, 244 | sample_drop_ratio=self.sample_drop_ratio, 245 | scaling_vector=self.ls1.gamma if isinstance( 246 | self.ls1, LayerScale) else None, 247 | ) 248 | x_list = drop_add_residual_stochastic_depth_list( 249 | x_list, 250 | residual_func=ffn_residual_func, 251 | sample_drop_ratio=self.sample_drop_ratio, 252 | scaling_vector=self.ls2.gamma if isinstance( 253 | self.ls1, LayerScale) else None, 254 | ) 255 | return x_list 256 | else: 257 | 258 | def attn_residual_func(x: Tensor, attn_bias=None) -> Tensor: 259 | return self.ls1(self.attn(self.norm1(x), attn_bias=attn_bias)) 260 | 261 | def ffn_residual_func(x: Tensor, attn_bias=None) -> Tensor: 262 | return 
self.ls2(self.mlp(self.norm2(x))) 263 | 264 | attn_bias, x = get_attn_bias_and_cat(x_list) 265 | x = x + attn_residual_func(x, attn_bias=attn_bias) 266 | x = x + ffn_residual_func(x) 267 | return attn_bias.split(x) 268 | 269 | def forward(self, x_or_x_list): 270 | if isinstance(x_or_x_list, Tensor): 271 | return super().forward(x_or_x_list) 272 | elif isinstance(x_or_x_list, list): 273 | assert (XFORMERS_AVAILABLE 274 | ), 'Please install xFormers for nested tensors usage' 275 | return self.forward_nested(x_or_x_list) 276 | else: 277 | raise AssertionError 278 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2018-2019 Open-MMLab. All rights reserved. 2 | 3 | Apache License 4 | Version 2.0, January 2004 5 | http://www.apache.org/licenses/ 6 | 7 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 8 | 9 | 1. Definitions. 10 | 11 | "License" shall mean the terms and conditions for use, reproduction, 12 | and distribution as defined by Sections 1 through 9 of this document. 13 | 14 | "Licensor" shall mean the copyright owner or entity authorized by 15 | the copyright owner that is granting the License. 16 | 17 | "Legal Entity" shall mean the union of the acting entity and all 18 | other entities that control, are controlled by, or are under common 19 | control with that entity. For the purposes of this definition, 20 | "control" means (i) the power, direct or indirect, to cause the 21 | direction or management of such entity, whether by contract or 22 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 23 | outstanding shares, or (iii) beneficial ownership of such entity. 24 | 25 | "You" (or "Your") shall mean an individual or Legal Entity 26 | exercising permissions granted by this License. 27 | 28 | "Source" form shall mean the preferred form for making modifications, 29 | including but not limited to software source code, documentation 30 | source, and configuration files. 31 | 32 | "Object" form shall mean any form resulting from mechanical 33 | transformation or translation of a Source form, including but 34 | not limited to compiled object code, generated documentation, 35 | and conversions to other media types. 36 | 37 | "Work" shall mean the work of authorship, whether in Source or 38 | Object form, made available under the License, as indicated by a 39 | copyright notice that is included in or attached to the work 40 | (an example is provided in the Appendix below). 41 | 42 | "Derivative Works" shall mean any work, whether in Source or Object 43 | form, that is based on (or derived from) the Work and for which the 44 | editorial revisions, annotations, elaborations, or other modifications 45 | represent, as a whole, an original work of authorship. For the purposes 46 | of this License, Derivative Works shall not include works that remain 47 | separable from, or merely link (or bind by name) to the interfaces of, 48 | the Work and Derivative Works thereof. 49 | 50 | "Contribution" shall mean any work of authorship, including 51 | the original version of the Work and any modifications or additions 52 | to that Work or Derivative Works thereof, that is intentionally 53 | submitted to Licensor for inclusion in the Work by the copyright owner 54 | or by an individual or Legal Entity authorized to submit on behalf of 55 | the copyright owner. 
For the purposes of this definition, "submitted" 56 | means any form of electronic, verbal, or written communication sent 57 | to the Licensor or its representatives, including but not limited to 58 | communication on electronic mailing lists, source code control systems, 59 | and issue tracking systems that are managed by, or on behalf of, the 60 | Licensor for the purpose of discussing and improving the Work, but 61 | excluding communication that is conspicuously marked or otherwise 62 | designated in writing by the copyright owner as "Not a Contribution." 63 | 64 | "Contributor" shall mean Licensor and any individual or Legal Entity 65 | on behalf of whom a Contribution has been received by Licensor and 66 | subsequently incorporated within the Work. 67 | 68 | 2. Grant of Copyright License. Subject to the terms and conditions of 69 | this License, each Contributor hereby grants to You a perpetual, 70 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 71 | copyright license to reproduce, prepare Derivative Works of, 72 | publicly display, publicly perform, sublicense, and distribute the 73 | Work and such Derivative Works in Source or Object form. 74 | 75 | 3. Grant of Patent License. Subject to the terms and conditions of 76 | this License, each Contributor hereby grants to You a perpetual, 77 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 78 | (except as stated in this section) patent license to make, have made, 79 | use, offer to sell, sell, import, and otherwise transfer the Work, 80 | where such license applies only to those patent claims licensable 81 | by such Contributor that are necessarily infringed by their 82 | Contribution(s) alone or by combination of their Contribution(s) 83 | with the Work to which such Contribution(s) was submitted. If You 84 | institute patent litigation against any entity (including a 85 | cross-claim or counterclaim in a lawsuit) alleging that the Work 86 | or a Contribution incorporated within the Work constitutes direct 87 | or contributory patent infringement, then any patent licenses 88 | granted to You under this License for that Work shall terminate 89 | as of the date such litigation is filed. 90 | 91 | 4. Redistribution. 
You may reproduce and distribute copies of the 92 | Work or Derivative Works thereof in any medium, with or without 93 | modifications, and in Source or Object form, provided that You 94 | meet the following conditions: 95 | 96 | (a) You must give any other recipients of the Work or 97 | Derivative Works a copy of this License; and 98 | 99 | (b) You must cause any modified files to carry prominent notices 100 | stating that You changed the files; and 101 | 102 | (c) You must retain, in the Source form of any Derivative Works 103 | that You distribute, all copyright, patent, trademark, and 104 | attribution notices from the Source form of the Work, 105 | excluding those notices that do not pertain to any part of 106 | the Derivative Works; and 107 | 108 | (d) If the Work includes a "NOTICE" text file as part of its 109 | distribution, then any Derivative Works that You distribute must 110 | include a readable copy of the attribution notices contained 111 | within such NOTICE file, excluding those notices that do not 112 | pertain to any part of the Derivative Works, in at least one 113 | of the following places: within a NOTICE text file distributed 114 | as part of the Derivative Works; within the Source form or 115 | documentation, if provided along with the Derivative Works; or, 116 | within a display generated by the Derivative Works, if and 117 | wherever such third-party notices normally appear. The contents 118 | of the NOTICE file are for informational purposes only and 119 | do not modify the License. You may add Your own attribution 120 | notices within Derivative Works that You distribute, alongside 121 | or as an addendum to the NOTICE text from the Work, provided 122 | that such additional attribution notices cannot be construed 123 | as modifying the License. 124 | 125 | You may add Your own copyright statement to Your modifications and 126 | may provide additional or different license terms and conditions 127 | for use, reproduction, or distribution of Your modifications, or 128 | for any such Derivative Works as a whole, provided Your use, 129 | reproduction, and distribution of the Work otherwise complies with 130 | the conditions stated in this License. 131 | 132 | 5. Submission of Contributions. Unless You explicitly state otherwise, 133 | any Contribution intentionally submitted for inclusion in the Work 134 | by You to the Licensor shall be under the terms and conditions of 135 | this License, without any additional terms or conditions. 136 | Notwithstanding the above, nothing herein shall supersede or modify 137 | the terms of any separate license agreement you may have executed 138 | with Licensor regarding such Contributions. 139 | 140 | 6. Trademarks. This License does not grant permission to use the trade 141 | names, trademarks, service marks, or product names of the Licensor, 142 | except as required for reasonable and customary use in describing the 143 | origin of the Work and reproducing the content of the NOTICE file. 144 | 145 | 7. Disclaimer of Warranty. Unless required by applicable law or 146 | agreed to in writing, Licensor provides the Work (and each 147 | Contributor provides its Contributions) on an "AS IS" BASIS, 148 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 149 | implied, including, without limitation, any warranties or conditions 150 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 151 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 152 | appropriateness of using or redistributing the Work and assume any 153 | risks associated with Your exercise of permissions under this License. 154 | 155 | 8. Limitation of Liability. In no event and under no legal theory, 156 | whether in tort (including negligence), contract, or otherwise, 157 | unless required by applicable law (such as deliberate and grossly 158 | negligent acts) or agreed to in writing, shall any Contributor be 159 | liable to You for damages, including any direct, indirect, special, 160 | incidental, or consequential damages of any character arising as a 161 | result of this License or out of the use or inability to use the 162 | Work (including but not limited to damages for loss of goodwill, 163 | work stoppage, computer failure or malfunction, or any and all 164 | other commercial damages or losses), even if such Contributor 165 | has been advised of the possibility of such damages. 166 | 167 | 9. Accepting Warranty or Additional Liability. While redistributing 168 | the Work or Derivative Works thereof, You may choose to offer, 169 | and charge a fee for, acceptance of support, warranty, indemnity, 170 | or other liability obligations and/or rights consistent with this 171 | License. However, in accepting such obligations, You may act only 172 | on Your own behalf and on Your sole responsibility, not on behalf 173 | of any other Contributor, and only if You agree to indemnify, 174 | defend, and hold each Contributor harmless for any liability 175 | incurred by, or claims asserted against, such Contributor by reason 176 | of your accepting any such warranty or additional liability. 177 | 178 | END OF TERMS AND CONDITIONS 179 | 180 | APPENDIX: How to apply the Apache License to your work. 181 | 182 | To apply the Apache License to your work, attach the following 183 | boilerplate notice, with the fields enclosed by brackets "[]" 184 | replaced with your own identifying information. (Don't include 185 | the brackets!) The text should be enclosed in the appropriate 186 | comment syntax for the file format. We also recommend that a 187 | file or class name and description of purpose be included on the 188 | same "printed page" as the copyright notice for easier 189 | identification within third-party archives. 190 | 191 | Copyright 2018-2019 Open-MMLab. 192 | 193 | Licensed under the Apache License, Version 2.0 (the "License"); 194 | you may not use this file except in compliance with the License. 195 | You may obtain a copy of the License at 196 | 197 | http://www.apache.org/licenses/LICENSE-2.0 198 | 199 | Unless required by applicable law or agreed to in writing, software 200 | distributed under the License is distributed on an "AS IS" BASIS, 201 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 202 | See the License for the specific language governing permissions and 203 | limitations under the License. 
204 | -------------------------------------------------------------------------------- /limoe/models/backbones/minkunet.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, Sequence 2 | 3 | import torch.nn as nn 4 | import torchsparse 5 | import torchsparse.nn as spnn 6 | from mmdet3d.registry import MODELS 7 | from mmdet3d.utils import OptMultiConfig 8 | from mmengine.model import BaseModule 9 | from torchsparse.tensor import SparseTensor 10 | 11 | 12 | class TorchsparseConvModule(BaseModule): 13 | 14 | def __init__(self, 15 | inplanes: int, 16 | planes: int, 17 | kernel_size: int, 18 | stride: int = 1, 19 | dilation: int = 1, 20 | bn_momentum: float = 0.1, 21 | transposed: bool = False, 22 | activate: bool = True, 23 | init_cfg: OptMultiConfig = None) -> None: 24 | super(TorchsparseConvModule, self).__init__(init_cfg=init_cfg) 25 | 26 | self.conv = spnn.Conv3d( 27 | inplanes, 28 | planes, 29 | kernel_size=kernel_size, 30 | stride=stride, 31 | dilation=dilation, 32 | bias=False, 33 | transposed=transposed) 34 | self.norm = spnn.BatchNorm(planes, momentum=bn_momentum) 35 | if activate: 36 | self.relu = spnn.ReLU(inplace=True) 37 | else: 38 | self.relu = None 39 | 40 | def forward(self, x: SparseTensor) -> SparseTensor: 41 | out = self.conv(x) 42 | out = self.norm(out) 43 | if self.relu is not None: 44 | out = self.relu(out) 45 | return out 46 | 47 | 48 | class TorchsparseBasicBlock(BaseModule): 49 | expansion = 1 50 | 51 | def __init__(self, 52 | inplanes: int, 53 | planes: int, 54 | stride: int = 1, 55 | dilation: int = 1, 56 | downsample: Optional[nn.Module] = None, 57 | bn_momentum: float = 0.1, 58 | init_cfg: OptMultiConfig = None) -> None: 59 | super(TorchsparseBasicBlock, self).__init__(init_cfg=init_cfg) 60 | 61 | self.conv1 = spnn.Conv3d( 62 | inplanes, 63 | planes, 64 | kernel_size=3, 65 | stride=stride, 66 | dilation=dilation, 67 | bias=False) 68 | self.norm1 = spnn.BatchNorm(planes, momentum=bn_momentum) 69 | 70 | self.conv2 = spnn.Conv3d( 71 | planes, 72 | planes, 73 | kernel_size=3, 74 | stride=1, 75 | dilation=dilation, 76 | bias=False) 77 | self.norm2 = spnn.BatchNorm(planes, momentum=bn_momentum) 78 | self.relu = spnn.ReLU(inplace=True) 79 | self.downsample = downsample 80 | 81 | def forward(self, x: SparseTensor) -> SparseTensor: 82 | residual = x 83 | 84 | out = self.conv1(x) 85 | out = self.norm1(out) 86 | out = self.relu(out) 87 | 88 | out = self.conv2(out) 89 | out = self.norm2(out) 90 | 91 | if self.downsample is not None: 92 | residual = self.downsample(x) 93 | 94 | out += residual 95 | out = self.relu(out) 96 | return out 97 | 98 | 99 | class TorchsparseBottleneck(BaseModule): 100 | expansion = 4 101 | 102 | def __init__(self, 103 | inplanes: int, 104 | planes: int, 105 | stride: int = 1, 106 | dilation: int = 1, 107 | downsample: Optional[nn.Module] = None, 108 | bn_momentum: float = 0.1, 109 | init_cfg: OptMultiConfig = None) -> None: 110 | super(TorchsparseBottleneck, self).__init__(init_cfg=init_cfg) 111 | 112 | self.conv1 = spnn.Conv3d(inplanes, planes, kernel_size=1, bias=False) 113 | self.norm1 = spnn.BatchNorm(planes, momentum=bn_momentum) 114 | 115 | self.conv2 = spnn.Conv3d( 116 | planes, 117 | planes, 118 | kernel_size=3, 119 | stride=stride, 120 | dilation=dilation, 121 | bias=False) 122 | self.norm2 = spnn.BatchNorm(planes, momentum=bn_momentum) 123 | 124 | self.conv3 = spnn.Conv3d( 125 | planes, planes * self.expansion, kernel_size=1, bias=False) 126 | self.norm3 = spnn.BatchNorm( 127 | 
planes * self.expansion, momentum=bn_momentum) 128 | 129 | self.relu = spnn.ReLU(inplace=True) 130 | self.downsample = downsample 131 | 132 | def forward(self, x: SparseTensor) -> SparseTensor: 133 | residual = x 134 | 135 | out = self.conv1(x) 136 | out = self.norm1(out) 137 | out = self.relu(out) 138 | 139 | out = self.conv2(out) 140 | out = self.norm2(out) 141 | out = self.relu(out) 142 | 143 | out = self.conv3(out) 144 | out = self.norm3(out) 145 | 146 | if self.downsample is not None: 147 | residual = self.downsample(x) 148 | 149 | out += residual 150 | out = self.relu(out) 151 | return out 152 | 153 | 154 | @MODELS.register_module() 155 | class MinkUNetBackbone(BaseModule): 156 | 157 | def __init__(self, 158 | in_channels: int = 4, 159 | base_channels: int = 32, 160 | layers: Sequence[int] = [2, 3, 4, 6, 2, 2, 2, 2], 161 | planes: Sequence[int] = [32, 64, 128, 256, 256, 128, 96, 96], 162 | block_type: str = 'basic', 163 | bn_momentum: float = 0.1, 164 | init_cfg: OptMultiConfig = None) -> None: 165 | super(MinkUNetBackbone, self).__init__(init_cfg=init_cfg) 166 | assert block_type in ['basic', 'bottleneck'] 167 | 168 | conv_module = TorchsparseConvModule 169 | if block_type == 'basic': 170 | block = TorchsparseBasicBlock 171 | elif block_type == 'bottleneck': 172 | block = TorchsparseBottleneck 173 | 174 | self.conv0 = nn.Sequential( 175 | conv_module( 176 | in_channels, 177 | base_channels, 178 | kernel_size=3, 179 | bn_momentum=bn_momentum), 180 | conv_module( 181 | base_channels, 182 | base_channels, 183 | kernel_size=3, 184 | bn_momentum=bn_momentum)) 185 | 186 | self.inplanes = base_channels 187 | 188 | self.conv1 = conv_module( 189 | self.inplanes, 190 | self.inplanes, 191 | kernel_size=2, 192 | stride=2, 193 | dilation=1, 194 | bn_momentum=bn_momentum) 195 | self.block1 = self._make_layer( 196 | block, conv_module, planes[0], layers[0], bn_momentum=bn_momentum) 197 | 198 | self.conv2 = conv_module( 199 | self.inplanes, 200 | self.inplanes, 201 | kernel_size=2, 202 | stride=2, 203 | dilation=1, 204 | bn_momentum=bn_momentum) 205 | self.block2 = self._make_layer( 206 | block, conv_module, planes[1], layers[1], bn_momentum=bn_momentum) 207 | 208 | self.conv3 = conv_module( 209 | self.inplanes, 210 | self.inplanes, 211 | kernel_size=2, 212 | stride=2, 213 | dilation=1, 214 | bn_momentum=bn_momentum) 215 | self.block3 = self._make_layer( 216 | block, conv_module, planes[2], layers[2], bn_momentum=bn_momentum) 217 | 218 | self.conv4 = conv_module( 219 | self.inplanes, 220 | self.inplanes, 221 | kernel_size=2, 222 | stride=2, 223 | dilation=1, 224 | bn_momentum=bn_momentum) 225 | self.block4 = self._make_layer( 226 | block, conv_module, planes[3], layers[3], bn_momentum=bn_momentum) 227 | 228 | self.conv5 = conv_module( 229 | self.inplanes, 230 | planes[4], 231 | kernel_size=2, 232 | stride=2, 233 | dilation=1, 234 | bn_momentum=bn_momentum, 235 | transposed=True) 236 | self.inplanes = planes[4] + planes[2] * block.expansion 237 | self.block5 = self._make_layer( 238 | block, conv_module, planes[4], layers[4], bn_momentum=bn_momentum) 239 | 240 | self.conv6 = conv_module( 241 | self.inplanes, 242 | planes[5], 243 | kernel_size=2, 244 | stride=2, 245 | dilation=1, 246 | bn_momentum=bn_momentum, 247 | transposed=True) 248 | self.inplanes = planes[5] + planes[1] * block.expansion 249 | self.block6 = self._make_layer( 250 | block, conv_module, planes[5], layers[5], bn_momentum=bn_momentum) 251 | 252 | self.conv7 = conv_module( 253 | self.inplanes, 254 | planes[6], 255 | kernel_size=2, 256 
| stride=2, 257 | dilation=1, 258 | bn_momentum=bn_momentum, 259 | transposed=True) 260 | self.inplanes = planes[6] + planes[0] * block.expansion 261 | self.block7 = self._make_layer( 262 | block, conv_module, planes[6], layers[6], bn_momentum=bn_momentum) 263 | 264 | self.conv8 = conv_module( 265 | self.inplanes, 266 | planes[7], 267 | kernel_size=2, 268 | stride=2, 269 | dilation=1, 270 | bn_momentum=bn_momentum, 271 | transposed=True) 272 | self.inplanes = planes[7] + base_channels 273 | self.block8 = self._make_layer( 274 | block, conv_module, planes[7], layers[7], bn_momentum=bn_momentum) 275 | 276 | def _make_layer(self, 277 | block: nn.Module, 278 | conv_module: nn.Module, 279 | planes: int, 280 | blocks: int, 281 | stride: int = 1, 282 | dilation: int = 1, 283 | bn_momentum: float = 0.1) -> nn.Module: 284 | downsample = None 285 | if stride != 1 or self.inplanes != planes * block.expansion: 286 | downsample = conv_module( 287 | self.inplanes, 288 | planes * block.expansion, 289 | kernel_size=1, 290 | stride=stride, 291 | bn_momentum=bn_momentum, 292 | activate=False) 293 | layers = [] 294 | 295 | layers.append( 296 | block( 297 | self.inplanes, 298 | planes, 299 | stride=stride, 300 | dilation=dilation, 301 | bn_momentum=bn_momentum, 302 | downsample=downsample)) 303 | 304 | self.inplanes = planes * block.expansion 305 | for i in range(1, blocks): 306 | layers.append( 307 | block( 308 | self.inplanes, 309 | planes, 310 | stride=1, 311 | dilation=dilation, 312 | bn_momentum=bn_momentum)) 313 | 314 | return nn.Sequential(*layers) 315 | 316 | def forward(self, feat_dict: dict) -> dict: 317 | voxel_features = feat_dict['voxels'] 318 | coors = feat_dict['coors'] 319 | x = torchsparse.SparseTensor(voxel_features, coors) 320 | 321 | out1 = self.conv0(x) 322 | 323 | out = self.conv1(out1) 324 | out2 = self.block1(out) 325 | 326 | out = self.conv2(out2) 327 | out3 = self.block2(out) 328 | 329 | out = self.conv3(out3) 330 | out4 = self.block3(out) 331 | 332 | out = self.conv4(out4) 333 | out5 = self.block4(out) 334 | 335 | out = self.conv5(out5) 336 | out = torchsparse.cat((out, out4)) 337 | out = self.block5(out) 338 | 339 | out = self.conv6(out) 340 | out = torchsparse.cat((out, out3)) 341 | out = self.block6(out) 342 | 343 | out = self.conv7(out) 344 | out = torchsparse.cat((out, out2)) 345 | out = self.block7(out) 346 | 347 | out = self.conv8(out) 348 | out = torchsparse.cat((out, out1)) 349 | out = self.block8(out) 350 | 351 | feat_dict['voxel_feats'] = out.F 352 | return feat_dict 353 | -------------------------------------------------------------------------------- /limoe/models/backbones/dinov2/dinov2_vision_transformer.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import math 3 | from functools import partial 4 | from typing import Callable, Sequence, Tuple, Union 5 | 6 | import torch 7 | import torch.nn as nn 8 | from torch.nn.init import trunc_normal_ 9 | 10 | from .layers import MemEffAttention, Mlp 11 | from .layers import NestedTensorBlock as Block 12 | from .layers import PatchEmbed, SwiGLUFFNFused 13 | 14 | logger = logging.getLogger('dinov2') 15 | 16 | 17 | def named_apply(fn: Callable, 18 | module: nn.Module, 19 | name='', 20 | depth_first=True, 21 | include_root=False) -> nn.Module: 22 | if not depth_first and include_root: 23 | fn(module=module, name=name) 24 | for child_name, child_module in module.named_children(): 25 | child_name = '.'.join((name, child_name)) if name else child_name 26 | named_apply( 27 | 
fn=fn, 28 | module=child_module, 29 | name=child_name, 30 | depth_first=depth_first, 31 | include_root=True, 32 | ) 33 | if depth_first and include_root: 34 | fn(module=module, name=name) 35 | return module 36 | 37 | 38 | class BlockChunk(nn.ModuleList): 39 | 40 | def forward(self, x): 41 | for b in self: 42 | x = b(x) 43 | return x 44 | 45 | 46 | class DinoVisionTransformer(nn.Module): 47 | 48 | def __init__( 49 | self, 50 | img_size=224, 51 | patch_size=16, 52 | in_chans=3, 53 | embed_dim=768, 54 | depth=12, 55 | num_heads=12, 56 | mlp_ratio=4.0, 57 | qkv_bias=True, 58 | ffn_bias=True, 59 | proj_bias=True, 60 | drop_path_rate=0.0, 61 | drop_path_uniform=False, 62 | init_values=None, # for layerscale: None or 0 => no layerscale 63 | embed_layer=PatchEmbed, 64 | act_layer=nn.GELU, 65 | block_fn=Block, 66 | ffn_layer='mlp', 67 | block_chunks=1, 68 | ): 69 | """ 70 | Args: 71 | img_size (int, tuple): input image size 72 | patch_size (int, tuple): patch size 73 | in_chans (int): number of input channels 74 | embed_dim (int): embedding dimension 75 | depth (int): depth of transformer 76 | num_heads (int): number of attention heads 77 | mlp_ratio (int): ratio of mlp hidden dim to embedding dim 78 | qkv_bias (bool): enable bias for qkv if True 79 | proj_bias (bool): enable bias for proj in attn if True 80 | ffn_bias (bool): enable bias for ffn if True 81 | drop_path_rate (float): stochastic depth rate 82 | drop_path_uniform (bool): apply uniform drop rate across blocks 83 | weight_init (str): weight init scheme 84 | init_values (float): layer-scale init values 85 | embed_layer (nn.Module): patch embedding layer 86 | act_layer (nn.Module): MLP activation layer 87 | block_fn (nn.Module): transformer block class 88 | ffn_layer (str): "mlp", "swiglu", "swiglufused" or "identity" 89 | block_chunks: (int) split block sequence into block_chunks units 90 | for FSDP wrap 91 | """ 92 | super().__init__() 93 | norm_layer = partial(nn.LayerNorm, eps=1e-6) 94 | 95 | self.num_features = ( 96 | self.embed_dim 97 | ) = embed_dim # num_features for consistency with other models 98 | self.num_tokens = 1 99 | self.n_blocks = depth 100 | self.num_heads = num_heads 101 | self.patch_size = patch_size 102 | 103 | self.patch_embed = embed_layer( 104 | img_size=img_size, 105 | patch_size=patch_size, 106 | in_chans=in_chans, 107 | embed_dim=embed_dim, 108 | ) 109 | num_patches = self.patch_embed.num_patches 110 | 111 | self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim)) 112 | self.pos_embed = nn.Parameter( 113 | torch.zeros(1, num_patches + self.num_tokens, embed_dim)) 114 | 115 | if drop_path_uniform is True: 116 | dpr = [drop_path_rate] * depth 117 | else: 118 | dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth) 119 | ] # stochastic depth decay rule 120 | 121 | if ffn_layer == 'mlp': 122 | logger.info('using MLP layer as FFN') 123 | ffn_layer = Mlp 124 | elif ffn_layer == 'swiglufused' or ffn_layer == 'swiglu': 125 | logger.info('using SwiGLU layer as FFN') 126 | ffn_layer = SwiGLUFFNFused 127 | elif ffn_layer == 'identity': 128 | logger.info('using Identity layer as FFN') 129 | 130 | def f(*args, **kwargs): 131 | return nn.Identity() 132 | 133 | ffn_layer = f 134 | else: 135 | raise NotImplementedError 136 | 137 | blocks_list = [ 138 | block_fn( 139 | dim=embed_dim, 140 | num_heads=num_heads, 141 | mlp_ratio=mlp_ratio, 142 | qkv_bias=qkv_bias, 143 | proj_bias=proj_bias, 144 | ffn_bias=ffn_bias, 145 | drop_path=dpr[i], 146 | norm_layer=norm_layer, 147 | act_layer=act_layer, 148 | 
ffn_layer=ffn_layer, 149 | init_values=init_values, 150 | ) for i in range(depth) 151 | ] 152 | if block_chunks > 0: 153 | self.chunked_blocks = True 154 | chunked_blocks = [] 155 | chunksize = depth // block_chunks 156 | for i in range(0, depth, chunksize): 157 | # this is to keep the block index consistent 158 | # if we chunk the block list 159 | chunked_blocks.append([nn.Identity()] * i + 160 | blocks_list[i:i + chunksize]) 161 | self.blocks = nn.ModuleList( 162 | [BlockChunk(p) for p in chunked_blocks]) 163 | else: 164 | self.chunked_blocks = False 165 | self.blocks = nn.ModuleList(blocks_list) 166 | 167 | self.norm = norm_layer(embed_dim) 168 | self.head = nn.Identity() 169 | 170 | self.mask_token = nn.Parameter(torch.zeros(1, embed_dim)) 171 | 172 | self.init_weights() 173 | 174 | def init_weights(self): 175 | trunc_normal_(self.pos_embed, std=0.02) 176 | nn.init.normal_(self.cls_token, std=1e-6) 177 | named_apply(init_weights_vit_timm, self) 178 | 179 | def interpolate_pos_encoding(self, x, w, h): 180 | previous_dtype = x.dtype 181 | npatch = x.shape[1] - 1 182 | N = self.pos_embed.shape[1] - 1 183 | if npatch == N and w == h: 184 | return self.pos_embed 185 | pos_embed = self.pos_embed.float() 186 | class_pos_embed = pos_embed[:, 0] 187 | patch_pos_embed = pos_embed[:, 1:] 188 | dim = x.shape[-1] 189 | w0 = w // self.patch_size 190 | h0 = h // self.patch_size 191 | # we add a small number to avoid floating point error in the 192 | # interpolation 193 | # see discussion at https://github.com/facebookresearch/dino/issues/8 194 | w0, h0 = w0 + 0.1, h0 + 0.1 195 | 196 | patch_pos_embed = nn.functional.interpolate( 197 | patch_pos_embed.reshape(1, int(math.sqrt(N)), int(math.sqrt(N)), 198 | dim).permute(0, 3, 1, 2), 199 | scale_factor=(w0 / math.sqrt(N), h0 / math.sqrt(N)), 200 | mode='bicubic', 201 | ) 202 | 203 | assert (int(w0) == patch_pos_embed.shape[-2] 204 | and int(h0) == patch_pos_embed.shape[-1]) 205 | patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim) 206 | return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), 207 | dim=1).to(previous_dtype) 208 | 209 | def prepare_tokens_with_masks(self, x, masks=None): 210 | B, nc, w, h = x.shape 211 | x = self.patch_embed(x) 212 | if masks is not None: 213 | x = torch.where( 214 | masks.unsqueeze(-1), 215 | self.mask_token.to(x.dtype).unsqueeze(0), x) 216 | 217 | x = torch.cat((self.cls_token.expand(x.shape[0], -1, -1), x), dim=1) 218 | x = x + self.interpolate_pos_encoding(x, w, h) 219 | 220 | return x 221 | 222 | def forward_features_list(self, x_list, masks_list): 223 | x = [ 224 | self.prepare_tokens_with_masks(x, masks) 225 | for x, masks in zip(x_list, masks_list) 226 | ] 227 | for blk in self.blocks: 228 | x = blk(x) 229 | 230 | all_x = x 231 | output = [] 232 | for x, masks in zip(all_x, masks_list): 233 | x_norm = self.norm(x) 234 | output.append({ 235 | 'x_norm_clstoken': x_norm[:, 0], 236 | 'x_norm_patchtokens': x_norm[:, 1:], 237 | 'x_prenorm': x, 238 | 'masks': masks, 239 | }) 240 | return output 241 | 242 | def forward_features(self, x, masks=None): 243 | if isinstance(x, list): 244 | return self.forward_features_list(x, masks) 245 | 246 | x = self.prepare_tokens_with_masks(x, masks) 247 | 248 | for blk in self.blocks: 249 | x = blk(x) 250 | 251 | x_norm = self.norm(x) 252 | return { 253 | 'x_norm_clstoken': x_norm[:, 0], 254 | 'x_norm_patchtokens': x_norm[:, 1:], 255 | 'x_prenorm': x, 256 | 'masks': masks, 257 | } 258 | 259 | def forward_get_last_n(self, x, n=1): 260 | x = 
self.prepare_tokens_with_masks(x) 261 | # If n is an int, take the n last blocks. If it's a list, take them 262 | total_block_len = len(self.blocks) 263 | blocks_to_take = ( 264 | range(total_block_len - 265 | n, total_block_len) if isinstance(n, int) else n) 266 | output = {'x': [], 'x_pre_norm': []} 267 | for i, blk in enumerate(self.blocks): 268 | x = blk(x) 269 | if i in blocks_to_take: 270 | output['x'].append(self.norm(x)) 271 | output['x_pre_norm'].append(x) 272 | 273 | return output 274 | 275 | def _get_intermediate_layers_not_chunked(self, x, n=1): 276 | x = self.prepare_tokens_with_masks(x) 277 | # If n is an int, take the n last blocks. If it's a list, take them 278 | output, total_block_len = [], len(self.blocks) 279 | blocks_to_take = ( 280 | range(total_block_len - 281 | n, total_block_len) if isinstance(n, int) else n) 282 | for i, blk in enumerate(self.blocks): 283 | x = blk(x) 284 | if i in blocks_to_take: 285 | output.append(x) 286 | assert len(output) == len( 287 | blocks_to_take 288 | ), f'only {len(output)} / {len(blocks_to_take)} blocks found' 289 | return output 290 | 291 | def _get_intermediate_layers_chunked(self, x, n=1): 292 | x = self.prepare_tokens_with_masks(x) 293 | output, i, total_block_len = [], 0, len(self.blocks[-1]) 294 | # If n is an int, take the n last blocks. If it's a list, take them 295 | blocks_to_take = ( 296 | range(total_block_len - 297 | n, total_block_len) if isinstance(n, int) else n) 298 | for block_chunk in self.blocks: 299 | for blk in block_chunk[i:]: # Passing the nn.Identity() 300 | x = blk(x) 301 | if i in blocks_to_take: 302 | output.append(x) 303 | i += 1 304 | assert len(output) == len( 305 | blocks_to_take 306 | ), f'only {len(output)} / {len(blocks_to_take)} blocks found' 307 | return output 308 | 309 | def get_intermediate_layers( 310 | self, 311 | x: torch.Tensor, 312 | n: Union[int, Sequence] = 1, # Layers or n last layers to take 313 | reshape: bool = False, 314 | return_class_token: bool = False, 315 | norm=True, 316 | ) -> Tuple[Union[torch.Tensor, Tuple[torch.Tensor]]]: 317 | if self.chunked_blocks: 318 | outputs = self._get_intermediate_layers_chunked(x, n) 319 | else: 320 | outputs = self._get_intermediate_layers_not_chunked(x, n) 321 | if norm: 322 | outputs = [self.norm(out) for out in outputs] 323 | class_tokens = [out[:, 0] for out in outputs] 324 | outputs = [out[:, 1:] for out in outputs] 325 | if reshape: 326 | B, _, w, h = x.shape 327 | outputs = [ 328 | out.reshape(B, w // self.patch_size, h // self.patch_size, 329 | -1).permute(0, 3, 1, 2).contiguous() 330 | for out in outputs 331 | ] 332 | if return_class_token: 333 | return tuple(zip(outputs, class_tokens)) 334 | return tuple(outputs) 335 | 336 | def forward(self, *args, is_training=False, **kwargs): 337 | ret = self.forward_features(*args, **kwargs) 338 | if is_training: 339 | return ret 340 | else: 341 | return self.head(ret['x_norm_clstoken']) 342 | 343 | 344 | def init_weights_vit_timm(module: nn.Module, name: str = ''): 345 | """ViT weight initialization, original timm impl (for reproducibility)""" 346 | if isinstance(module, nn.Linear): 347 | trunc_normal_(module.weight, std=0.02) 348 | if module.bias is not None: 349 | nn.init.zeros_(module.bias) 350 | 351 | 352 | def vit_small(patch_size=16, **kwargs): 353 | model = DinoVisionTransformer( 354 | patch_size=patch_size, 355 | embed_dim=384, 356 | depth=12, 357 | num_heads=6, 358 | mlp_ratio=4, 359 | block_fn=partial(Block, attn_class=MemEffAttention), 360 | **kwargs, 361 | ) 362 | return model 363 | 364 
| 365 | def vit_base(patch_size=16, **kwargs): 366 | model = DinoVisionTransformer( 367 | patch_size=patch_size, 368 | embed_dim=768, 369 | depth=12, 370 | num_heads=12, 371 | mlp_ratio=4, 372 | block_fn=partial(Block, attn_class=MemEffAttention), 373 | **kwargs, 374 | ) 375 | return model 376 | 377 | 378 | def vit_large(patch_size=16, **kwargs): 379 | model = DinoVisionTransformer( 380 | patch_size=patch_size, 381 | embed_dim=1024, 382 | depth=24, 383 | num_heads=16, 384 | mlp_ratio=4, 385 | block_fn=partial(Block, attn_class=MemEffAttention), 386 | **kwargs, 387 | ) 388 | return model 389 | 390 | 391 | def vit_giant2(patch_size=16, **kwargs): 392 | """Close to ViT-giant, with embed-dim 1536 and 24 heads => embed-dim per 393 | head 64.""" 394 | model = DinoVisionTransformer( 395 | patch_size=patch_size, 396 | embed_dim=1536, 397 | depth=40, 398 | num_heads=24, 399 | mlp_ratio=4, 400 | block_fn=partial(Block, attn_class=MemEffAttention), 401 | **kwargs, 402 | ) 403 | return model 404 | 405 | 406 | _DINOV2_BASE_URL = 'https://dl.fbaipublicfiles.com/dinov2' 407 | 408 | 409 | def _make_dinov2_model_name(arch_name: str, patch_size: int) -> str: 410 | compact_arch_name = arch_name.replace('_', '')[:4] 411 | return f'dinov2_{compact_arch_name}{patch_size}' 412 | 413 | 414 | def _make_dinov2_model( 415 | *, 416 | arch_name: str = 'vit_large', 417 | img_size: int = 518, 418 | patch_size: int = 14, 419 | init_values: float = 1.0, 420 | ffn_layer: str = 'mlp', 421 | block_chunks: int = 0, 422 | pretrained: bool = True, 423 | **kwargs, 424 | ): 425 | 426 | vits__dict__ = { 427 | 'vit_large': vit_large, 428 | 'vit_base': vit_base, 429 | 'vit_small': vit_small, 430 | } 431 | 432 | model_name = _make_dinov2_model_name(arch_name, patch_size) 433 | vit_kwargs = dict( 434 | img_size=img_size, 435 | patch_size=patch_size, 436 | init_values=init_values, 437 | ffn_layer=ffn_layer, 438 | block_chunks=block_chunks, 439 | ) 440 | vit_kwargs.update(**kwargs) 441 | model = vits__dict__[arch_name](**vit_kwargs) 442 | 443 | if pretrained: 444 | url = _DINOV2_BASE_URL + f'/{model_name}/{model_name}_pretrain.pth' 445 | state_dict = torch.hub.load_state_dict_from_url( 446 | url, model_dir='./dinov2_weights/', map_location='cpu') 447 | model.load_state_dict(state_dict, strict=False) 448 | 449 | return model 450 | 451 | 452 | def dinov2_vits14(*, pretrained: bool = True, **kwargs): 453 | """DINOv2 ViT-S/14 model (optionally) pretrained on the LVD-142M 454 | dataset.""" 455 | return _make_dinov2_model( 456 | arch_name='vit_small', pretrained=pretrained, **kwargs) 457 | 458 | 459 | def dinov2_vitb14(*, pretrained: bool = True, **kwargs): 460 | """DINOv2 ViT-B/14 model pretrained on the LVD-142M dataset.""" 461 | return _make_dinov2_model( 462 | arch_name='vit_base', pretrained=pretrained, **kwargs) 463 | 464 | 465 | def dinov2_vitl14(*, pretrained: bool = True, **kwargs): 466 | """DINOv2 ViT-L/14 model (optionally) pretrained on the LVD-142M 467 | dataset.""" 468 | return _make_dinov2_model( 469 | arch_name='vit_large', pretrained=pretrained, **kwargs) 470 | 471 | 472 | def dinov2_vitg14(*, pretrained: bool = True, **kwargs): 473 | """DINOv2 ViT-g/14 model (optionally) pretrained on the LVD-142M 474 | dataset.""" 475 | return _make_dinov2_model( 476 | arch_name='vit_giant2', 477 | ffn_layer='swiglufused', 478 | pretrained=pretrained, 479 | **kwargs) 480 | -------------------------------------------------------------------------------- /README.md: 
-------------------------------------------------------------------------------- 1 |
English | 简体中文
2 | 3 |
4 | 5 |

LiMoE: Mixture of LiDAR Representation Learners from Automotive Scenes

6 |
7 | 8 |
9 | Xiang Xu*,1    10 | Lingdong Kong*,2,3    11 | Hui Shuai4    12 | Liang Pan3    13 | Ziwei Liu5    14 | Qingshan Liu4 15 |

16 | 1NUAA    17 | 2NUS    18 | 3Shanghai AI Lab    19 | 4NJUPT    20 | 5S-Lab, NTU 21 |
22 | 23 |
24 | 25 |
26 | 27 | 28 |   29 | 30 | 31 |   32 | 33 | 34 |   35 | 36 | 37 |   38 | 39 | 40 | 41 |
42 | 43 | # About 44 | 45 | LiMoE is a framework that integrates the Mixture of Experts (MoE) paradigm into LiDAR data representation learning to synergistically combine multiple representations, such as range images, sparse voxels, and raw points. The framework consists of three stages: i) Image-to-LiDAR pretraining, which transfers prior knowledge from images to point clouds across different representations; ii) Contrastive Mixture Learning (CML), which uses MoE to adaptively activate relevant attributes from each representation and distills these mixed features into a unified 3D network; iii) Semantic Mixture Supervision (SMS), which combines semantic logits from multiple representations to boost downstream segmentation performance. 46 | 47 | 48 | 49 | ## :memo: Updates 50 | 51 | - \[2025.02\] - Our paper **LiMoE** has been accepted to **CVPR 2025**! :tada: 52 | - \[2025.01\] - Introducing the :family_man_boy_boy: **LiMoE** project! For more details, kindly refer to our [Project Page](https://ldkong.com/LiMoE) and [Preprint](https://arxiv.org/abs/2501.04004). :rocket: 53 | 54 | # Table of Content 55 | 56 | - [Installation](#gear-installation) 57 | - [Data Preparation](#hotsprings-data-preparation) 58 | - [Getting Started](#rocket-getting-started) 59 | - [Main Results](#bar_chart-main-results) 60 | - [License](#license) 61 | - [Citation](#citation) 62 | - [Acknowledgments](#acknowledgments) 63 | 64 | # :gear: Installation 65 | 66 | For details related to installation and environment setups, kindly refer to [INSTALL.md](./docs/INSTALL.md). 67 | 68 | # :hotsprings: Data Preparation 69 | 70 | Kindly refer to [DATA_PREPAER.md](./docs/DATA_PREPAER.md) for the details to prepare the datasets. 71 | 72 | # :rocket: Getting Started 73 | 74 | To learn more usage about this codebase, kindly refer to [GET_STARTED.md](./docs/GET_STARTED.md). 75 | 76 | # :bar_chart: Main Results 77 | 78 | ## Comparisons of State-of-the-Art Pretraining Methods 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | 188 | 189 | 190 | 191 | 192 | 193 | 194 | 195 | 196 | 197 | 198 | 199 | 200 | 201 | 202 | 203 | 204 | 205 | 206 | 207 | 208 | 209 | 210 | 211 | 212 | 213 | 214 | 215 | 216 | 217 | 218 | 219 | 220 | 221 | 222 | 223 | 224 | 225 | 226 | 227 | 228 | 229 | 230 | 231 | 232 | 233 | 234 | 235 | 236 | 237 | 238 | 239 | 240 | 241 | 242 | 243 | 244 | 245 | 246 | 247 | 248 | 249 | 250 | 251 | 252 | 253 | 254 | 255 | 256 | 257 | 258 | 259 | 260 | 261 | 262 | 263 | 264 | 265 | 266 | 267 | 268 | 269 | 270 | 271 | 272 | 273 | 274 | 275 | 276 | 277 | 278 | 279 | 280 | 281 | 282 | 283 | 284 | 285 | 286 | 287 | 288 | 289 | 290 |
| Method | Distill | nuScenes LP | nuScenes 1% | nuScenes 5% | nuScenes 10% | nuScenes 25% | nuScenes Full | KITTI 1% | Waymo 1% |
| :----- | :-----: | :---------: | :---------: | :---------: | :----------: | :----------: | :-----------: | :------: | :------: |
| Random | - | 8.10 | 30.30 | 47.84 | 56.15 | 65.48 | 74.66 | 39.50 | 39.41 |
| SLiDR | ViT-S | 44.70 | 41.16 | 53.65 | 61.47 | 66.71 | 74.20 | 44.67 | 47.57 |
| +LiMoE | ViT-S | 45.80 | 46.82 | 57.54 | 63.85 | 68.61 | 75.64 | 46.81 | 48.81 |
| Seal | ViT-S | 45.16 | 44.27 | 55.13 | 62.46 | 67.64 | 75.58 | 46.51 | 48.67 |
| SuperFlow | ViT-S | 46.44 | 47.81 | 59.44 | 64.47 | 69.20 | 76.54 | 47.97 | 49.94 |
| +LiMoE | ViT-S | 48.20 | 49.60 | 60.54 | 65.65 | 71.39 | 77.27 | 49.53 | 51.42 |
| SLiDR | ViT-B | 45.35 | 41.64 | 55.83 | 62.68 | 67.61 | 74.98 | 45.50 | 48.32 |
| +LiMoE | ViT-B | 46.56 | 46.89 | 58.09 | 63.87 | 69.02 | 75.87 | 47.96 | 49.50 |
| Seal | ViT-B | 46.59 | 45.98 | 57.15 | 62.79 | 68.18 | 75.41 | 47.24 | 48.91 |
| SuperFlow | ViT-B | 47.66 | 48.09 | 59.66 | 64.52 | 69.79 | 76.57 | 48.40 | 50.20 |
| +LiMoE | ViT-B | 49.07 | 50.23 | 61.51 | 66.17 | 71.56 | 77.81 | 50.30 | 51.77 |
| SLiDR | ViT-L | 45.70 | 42.77 | 57.45 | 63.20 | 68.13 | 75.51 | 47.01 | 48.60 |
| +LiMoE | ViT-L | 47.43 | 46.92 | 58.41 | 64.54 | 69.69 | 76.32 | 48.25 | 50.23 |
| Seal | ViT-L | 46.81 | 46.27 | 58.14 | 63.27 | 68.67 | 75.66 | 47.55 | 50.02 |
| SuperFlow | ViT-L | 48.01 | 49.95 | 60.72 | 65.09 | 70.01 | 77.19 | 49.07 | 50.67 |
| +LiMoE | ViT-L | 49.35 | 51.41 | 62.07 | 66.64 | 71.59 | 77.85 | 50.69 | 51.93 |
291 | 292 | ## Domain Generalization Study 293 | 294 | 295 | 296 | 297 | 298 | 299 | 300 | 301 | 302 | 303 | 304 | 305 | 306 | 307 | 308 | 309 | 310 | 311 | 312 | 313 | 314 | 315 | 316 | 317 | 318 | 319 | 320 | 321 | 322 | 323 | 324 | 325 | 326 | 327 | 328 | 329 | 330 | 331 | 332 | 333 | 334 | 335 | 336 | 337 | 338 | 339 | 340 | 341 | 342 | 343 | 344 | 345 | 346 | 347 | 348 | 349 | 350 | 351 | 352 | 353 | 354 | 355 | 356 | 357 | 358 | 359 | 360 | 361 | 362 | 363 | 364 | 365 | 366 | 367 | 368 | 369 | 370 | 371 | 372 | 373 | 374 | 375 | 376 | 377 | 378 | 379 | 380 | 381 | 382 | 383 | 384 | 385 | 386 | 387 | 388 | 389 | 390 | 391 | 392 | 393 | 394 | 395 | 396 | 397 | 398 | 399 | 400 | 401 | 402 | 403 | 404 | 405 | 406 | 407 | 408 | 409 | 410 | 411 | 412 | 413 | 414 | 415 | 416 | 417 | 418 | 419 | 420 | 421 | 422 | 423 | 424 | 425 | 426 | 427 | 428 | 429 | 430 | 431 | 432 | 433 | 434 | 435 | 436 | 437 | 438 | 439 | 440 |
| Method | ScriKITTI 1% | ScriKITTI 10% | Rellis-3D 1% | Rellis-3D 10% | SemPOSS Half | SemPOSS Full | SemSTF Half | SemSTF Full | SynLiDAR 1% | SynLiDAR 10% | DAPS-3D Half | DAPS-3D Full | Synth4D 1% | Synth4D 10% |
| :----- | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: |
| Random | 23.81 | 47.60 | 38.46 | 53.60 | 46.26 | 54.12 | 48.03 | 48.15 | 19.89 | 44.74 | 74.32 | 79.38 | 20.22 | 66.87 |
| PPKT | 36.50 | 51.67 | 49.71 | 54.33 | 50.18 | 56.00 | 50.92 | 54.69 | 37.57 | 46.48 | 78.90 | 84.00 | 61.10 | 62.41 |
| SLiDR | 39.60 | 50.45 | 49.75 | 54.57 | 51.56 | 55.36 | 52.01 | 54.35 | 42.05 | 47.84 | 81.00 | 85.40 | 63.10 | 62.67 |
| +LiMoE | 41.48 | 53.41 | 51.28 | 55.21 | 53.14 | 56.42 | 53.16 | 55.51 | 43.72 | 49.57 | 81.70 | 85.76 | 64.69 | 66.79 |
| Seal | 40.64 | 52.77 | 51.09 | 55.03 | 53.26 | 56.89 | 53.46 | 55.36 | 43.58 | 49.26 | 81.88 | 85.90 | 64.50 | 66.96 |
| SuperFlow | 42.70 | 54.00 | 52.83 | 55.71 | 54.41 | 57.33 | 54.72 | 56.57 | 44.85 | 51.38 | 82.43 | 86.21 | 65.31 | 69.43 |
| +LiMoE | 43.95 | 55.96 | 53.74 | 56.67 | 55.42 | 57.83 | 55.60 | 57.31 | 45.79 | 52.27 | 83.24 | 86.68 | 66.54 | 71.07 |
441 | 442 | ## Expert Activation Paths 443 | 444 | | ![paths](./docs/figs/paths.png) | 445 | | :---------------------------------------------------------------------------------------------------------------------------------------------------------------: | 446 | | Visual interpretations of the expert activation paths in Contrastive Mixture Learning (CML). The experts are #1 range view, #2 voxel, and #3 point, respectively. | 447 | 448 | ## Point-Wise Top-1 Activation 449 | 450 | | ![activation1](./docs/figs/activation1.png) | 451 | | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | 452 | | Point-wise top-1 activation path in the Semantic Mixture Supervision (SMS) stage. It highlights the most activated representation for each point during the SMS stage, illustrating how different representations contribute to semantic segmentation based on spatial and object-specific characteristics. Best viewed in colors. | 453 | 454 | ## Out-of-Distribution 3D Robustness 455 | 456 | 457 | 458 | 459 | 460 | 461 | 462 | 463 | 464 | 465 | 466 | 467 | 468 | 469 | 470 | 471 | 472 | 473 | 474 | 475 | 476 | 477 | 478 | 479 | 480 | 481 | 482 | 483 | 484 | 485 | 486 | 487 | 488 | 489 | 490 | 491 | 492 | 493 | 494 | 495 | 496 | 497 | 498 | 499 | 500 | 501 | 502 | 503 | 504 | 505 | 506 | 507 | 508 | 509 | 510 | 511 | 512 | 513 | 514 | 515 | 516 | 517 | 518 | 519 | 520 | 521 | 522 | 523 | 524 | 525 | 526 | 527 | 528 | 529 | 530 | 531 | 532 | 533 | 534 | 535 | 536 | 537 | 538 | 539 | 540 | 541 | 542 | 543 | 544 | 545 | 546 | 547 | 548 | 549 | 550 | 551 | 552 | 553 | 554 | 555 | 556 | 557 | 558 | 559 | 560 | 561 | 562 | 563 | 564 | 565 | 566 | 567 | 568 | 569 | 570 | 571 | 572 | 573 | 574 | 575 | 576 | 577 | 578 | 579 | 580 | 581 | 582 | 583 | 584 | 585 | 586 | 587 | 588 | 589 | 590 | 591 | 592 | 593 | 594 | 595 | 596 | 597 | 598 | 599 | 600 | 601 | 602 | 603 | 604 | 605 | 606 | 607 | 608 | 609 | 610 | 611 | 612 | 613 | 614 | 615 | 616 | 617 | 618 | 619 | 620 | 621 | 622 | 623 | 624 | 625 | 626 | 627 | 628 | 629 | 630 | 631 | 632 | 633 | 634 | 635 | 636 | 637 | 638 | 639 | 640 | 641 | 642 | 643 | 644 | 645 | 646 | 647 | 648 | 649 | 650 | 651 | 652 | 653 | 654 | 655 | 656 |
| # | Method | mCE | mRR | Fog | Rain | Snow | Blur | Beam | Cross | Echo | Sensor | Avg |
| :---: | :----- | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: |
| Full | Random | 112.20 | 72.57 | 62.96 | 70.65 | 55.48 | 51.71 | 62.01 | 31.56 | 59.64 | 39.41 | 54.18 |
|  | PPKT | 105.64 | 75.87 | 64.01 | 72.18 | 59.08 | 57.17 | 63.88 | 36.34 | 60.59 | 39.57 | 56.60 |
|  | SLidR | 106.08 | 75.99 | 65.41 | 72.31 | 56.01 | 56.07 | 62.87 | 41.94 | 61.16 | 38.90 | 56.83 |
|  | +LiMoE | 101.74 | 77.77 | 67.92 | 73.25 | 57.02 | 56.30 | 64.72 | 44.81 | 61.23 | 45.37 | 58.83 |
|  | Seal | 92.63 | 83.08 | 72.66 | 74.31 | 66.22 | 66.14 | 65.96 | 57.44 | 59.87 | 39.85 | 62.81 |
|  | SuperFlow | 91.67 | 83.17 | 70.32 | 75.77 | 65.41 | 61.05 | 68.09 | 60.02 | 58.36 | 50.41 | 63.68 |
|  | +LiMoE | 88.43 | 83.28 | 71.10 | 75.92 | 65.66 | 63.86 | 68.52 | 60.78 | 61.91 | 50.66 | 64.80 |
| LP | PPKT | 183.44 | 78.15 | 30.65 | 35.42 | 28.12 | 29.21 | 32.82 | 19.52 | 28.01 | 20.71 | 28.06 |
|  | SLidR | 179.38 | 77.18 | 34.88 | 38.09 | 32.64 | 26.44 | 33.73 | 20.81 | 31.54 | 21.44 | 29.95 |
|  | +LiMoE | 163.75 | 75.49 | 37.29 | 43.41 | 36.04 | 38.33 | 40.66 | 22.46 | 37.61 | 25.38 | 35.15 |
|  | Seal | 166.18 | 75.38 | 37.33 | 42.77 | 29.93 | 37.73 | 40.32 | 20.31 | 37.73 | 24.94 | 33.88 |
|  | SuperFlow | 161.78 | 75.52 | 37.59 | 43.42 | 37.60 | 39.57 | 41.40 | 23.64 | 38.03 | 26.69 | 35.99 |
|  | +LiMoE | 155.77 | 78.23 | 40.35 | 45.28 | 39.14 | 42.10 | 44.21 | 27.33 | 39.20 | 29.49 | 38.39 |
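The "Avg" column is not spelled out next to the table; from the numbers themselves it appears to be the plain arithmetic mean of the eight per-corruption scores in each row (the rows we spot-checked are consistent with this reading). A minimal check in Python on the "Random" row under full fine-tuning:

```python
from statistics import mean

# Per-corruption scores of the "Random" row (full fine-tuning), in the
# column order Fog, Rain, Snow, Blur, Beam, Cross, Echo, Sensor.
random_full = [62.96, 70.65, 55.48, 51.71, 62.01, 31.56, 59.64, 39.41]

print(f'{mean(random_full):.4f}')  # ~54.1775, i.e. the 54.18 reported in the Avg column
```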
657 | 658 | ## Cosine Similarity 659 | 660 | | ![heatmaps](./docs/figs/heatmaps.png) | 661 | | :---: | 662 | | Cosine similarity between learned features of a query point (denoted as the red dot) and: (1) the features of the image of the same scene (the first row); and (2) the features of the LiDAR points projected onto the image (the second row). Best viewed in colors. | 663 | 664 | ## Qualitative Assessment 665 | 666 | | ![qualitative1](./docs/figs/qualitative1.png) | 667 | | :---: | 668 | | Qualitative assessments of state-of-the-art pretraining methods, pretrained on nuScenes and fine-tuned on SemanticKITTI with 1% annotations. The error maps depict correct and incorrect predictions in gray and red, respectively. Best viewed in colors. | 669 | 670 | # License 671 | 672 | This work is under the [Apache License Version 2.0](https://www.apache.org/licenses/LICENSE-2.0), while some specific implementations in this codebase may be subject to other licenses. 673 | 674 | Kindly refer to [LICENSE](./LICENSE) for a more careful check if you are using our code for commercial purposes. 675 | 676 | # Citation 677 | 678 | If you find this work helpful for your research, please kindly consider citing our paper: 679 | 680 | ```bibtex 681 | @inproceedings{xu2025limoe, 682 | title = {LiMoE: Mixture of LiDAR Representation Learners from Automotive Scenes}, 683 | author = {Xu, Xiang and Kong, Lingdong and Shuai, Hui and Pan, Liang and Liu, Ziwei and Liu, Qingshan}, 684 | booktitle = {IEEE/CVF Conference on Computer Vision and Pattern Recognition}, 685 | pages = {27368--27379}, 686 | year = {2025} 687 | } 688 | ``` 689 | 690 | # Acknowledgments 691 | 692 | This work is developed based on the [MMDetection3D](https://github.com/open-mmlab/mmdetection3d) codebase. 693 | 694 | >
695 | > MMDetection3D is an open-source object detection toolbox based on PyTorch, towards the next-generation platform for general 3D perception. It is a part of the OpenMMLab project developed by MMLab. 696 | 697 | We acknowledge the use of the following public resources during the course of this work: <sup>1</sup>[nuScenes](https://www.nuscenes.org/nuscenes), <sup>2</sup>[nuScenes-devkit](https://github.com/nutonomy/nuscenes-devkit), <sup>3</sup>[SemanticKITTI](http://www.semantic-kitti.org), <sup>4</sup>[SemanticKITTI-API](https://github.com/PRBonn/semantic-kitti-api), <sup>5</sup>[WaymoOpenDataset](https://waymo.com/open), <sup>6</sup>[Synth4D](https://github.com/saltoricristiano/gipso-sfouda), <sup>7</sup>[ScribbleKITTI](https://github.com/ouenal/scribblekitti), <sup>8</sup>[RELLIS-3D](https://github.com/unmannedlab/RELLIS-3D), <sup>9</sup>[SemanticPOSS](http://www.poss.pku.edu.cn/semanticposs.html), <sup>10</sup>[SemanticSTF](https://github.com/xiaoaoran/SemanticSTF), <sup>11</sup>[SynLiDAR](https://github.com/xiaoaoran/SynLiDAR), <sup>12</sup>[DAPS-3D](https://github.com/subake/DAPS3D), <sup>13</sup>[Robo3D](https://github.com/ldkong1205/Robo3D), <sup>14</sup>[SLidR](https://github.com/valeoai/SLidR), <sup>15</sup>[DINOv2](https://github.com/facebookresearch/dinov2), <sup>16</sup>[FRNet](https://github.com/Xiangxu-0103/FRNet), <sup>17</sup>[SuperFlow](https://github.com/Xiangxu-0103/SuperFlow), <sup>18</sup>[torchsparse](https://github.com/mit-han-lab/torchsparse), <sup>19</sup>[Conv-LoRA](https://github.com/autogluon/autogluon), <sup>20</sup>[MoE-LLaVA](https://github.com/PKU-YuanGroup/MoE-LLaVA). :heart_decoration: 698 | --------------------------------------------------------------------------------
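As a usage note on the DINOv2 backbone bundled under `limoe/models/backbones/dinov2/` (the `dinov2_vision_transformer.py` source is reproduced above), the `dinov2_vit*14` factory functions can also be used on their own to extract dense ViT features. The snippet below is a minimal, untested sketch: it assumes the repository root is on `PYTHONPATH` and that the dependencies from [INSTALL.md](./docs/INSTALL.md) are set up, and it passes `pretrained=False` so no checkpoint is downloaded.

```python
# Minimal sketch (not part of the repo): build the bundled DINOv2 ViT-S/14
# backbone and pull a dense feature map from its last block.
import torch

from limoe.models.backbones.dinov2.dinov2_vision_transformer import dinov2_vits14

model = dinov2_vits14(pretrained=False).eval()  # pretrained=True would download weights

# Input sides should be multiples of the 14-pixel patch size.
images = torch.randn(1, 3, 224, 224)

with torch.no_grad():
    # Last-block patch tokens, normalized and reshaped to a (B, C, H/14, W/14) map.
    features = model.get_intermediate_layers(images, n=1, reshape=True)[0]

print(features.shape)  # torch.Size([1, 384, 16, 16]) for ViT-S at a 224x224 input
```

With `pretrained=True`, `_make_dinov2_model` fetches the official DINOv2 checkpoint into `./dinov2_weights/`, which is why that directory is listed in `.gitignore`.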