├── limoe ├── __init__.py ├── models │ ├── backbones │ │ ├── dinov2 │ │ │ ├── __init__.py │ │ │ ├── layers │ │ │ │ ├── __init__.py │ │ │ │ ├── layer_scale.py │ │ │ │ ├── drop_path.py │ │ │ │ ├── mlp.py │ │ │ │ ├── swiglu_ffn.py │ │ │ │ ├── dino_head.py │ │ │ │ ├── attention.py │ │ │ │ ├── patch_embed.py │ │ │ │ └── block.py │ │ │ └── dinov2_vision_transformer.py │ │ ├── __init__.py │ │ ├── vit.py │ │ ├── spvcnn.py │ │ └── minkunet.py │ ├── selfsup │ │ ├── __init__.py │ │ └── slidr.py │ ├── data_preprocessors │ │ ├── __init__.py │ │ └── data_preprocessor.py │ ├── seg_heads │ │ ├── __init__.py │ │ ├── upsample_head.py │ │ └── linear_head.py │ └── __init__.py └── datasets │ ├── __init__.py │ ├── transforms │ ├── __init__.py │ ├── formatting.py │ ├── loading.py │ └── transforms.py │ └── nuscenes_dataset.py ├── docs ├── figs │ ├── logo.png │ ├── paths.png │ ├── heatmaps.png │ ├── teaser.png │ ├── framework.png │ ├── activation1.png │ ├── activation2.png │ ├── qualitative1.png │ └── qualitative2.png ├── DATA_PREPAER.md ├── GET_STARTED.md └── INSTALL.md ├── .gitignore ├── configs ├── slidr │ ├── slidr_minkunet.py │ └── slidr_spvcnn.py └── _base_ │ ├── schedules │ └── pretrain.py │ ├── default_runtime.py │ ├── models │ ├── slidr_spvcnn.py │ └── slidr_minkunet.py │ └── datasets │ └── nuscenes_pretrain.py ├── setup.cfg ├── dist_train.sh ├── .pre-commit-config.yaml ├── train.py ├── LICENSE └── README.md /limoe/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /limoe/models/backbones/dinov2/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /docs/figs/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Xiangxu-0103/LiMoE/HEAD/docs/figs/logo.png -------------------------------------------------------------------------------- /docs/figs/paths.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Xiangxu-0103/LiMoE/HEAD/docs/figs/paths.png -------------------------------------------------------------------------------- /limoe/models/selfsup/__init__.py: -------------------------------------------------------------------------------- 1 | from .slidr import SLidR 2 | 3 | __all__ = ['SLidR'] 4 | -------------------------------------------------------------------------------- /docs/figs/heatmaps.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Xiangxu-0103/LiMoE/HEAD/docs/figs/heatmaps.png -------------------------------------------------------------------------------- /docs/figs/teaser.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Xiangxu-0103/LiMoE/HEAD/docs/figs/teaser.png -------------------------------------------------------------------------------- /docs/figs/framework.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Xiangxu-0103/LiMoE/HEAD/docs/figs/framework.png -------------------------------------------------------------------------------- /docs/figs/activation1.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Xiangxu-0103/LiMoE/HEAD/docs/figs/activation1.png -------------------------------------------------------------------------------- /docs/figs/activation2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Xiangxu-0103/LiMoE/HEAD/docs/figs/activation2.png -------------------------------------------------------------------------------- /docs/figs/qualitative1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Xiangxu-0103/LiMoE/HEAD/docs/figs/qualitative1.png -------------------------------------------------------------------------------- /docs/figs/qualitative2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Xiangxu-0103/LiMoE/HEAD/docs/figs/qualitative2.png -------------------------------------------------------------------------------- /limoe/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | from .nuscenes_dataset import NuScenesSegDataset 2 | 3 | __all__ = ['NuScenesSegDataset'] 4 | -------------------------------------------------------------------------------- /limoe/models/data_preprocessors/__init__.py: -------------------------------------------------------------------------------- 1 | from .data_preprocessor import LiMoEDataPreprocessor 2 | 3 | __all__ = ['LiMoEDataPreprocessor'] 4 | -------------------------------------------------------------------------------- /limoe/models/seg_heads/__init__.py: -------------------------------------------------------------------------------- 1 | from .linear_head import LinearHead 2 | from .upsample_head import UpsampleHead 3 | 4 | __all__ = ['UpsampleHead', 'LinearHead'] 5 | -------------------------------------------------------------------------------- /limoe/models/backbones/__init__.py: -------------------------------------------------------------------------------- 1 | from .minkunet import MinkUNetBackbone 2 | from .spvcnn import SPVCNNBackbone 3 | from .vit import ViT 4 | 5 | __all__ = ['ViT', 'MinkUNetBackbone', 'SPVCNNBackbone'] 6 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | 4 | # Pytorch 5 | *.pt 6 | *.pkl 7 | 8 | data/ 9 | output/ 10 | logs/ 11 | work_dirs/ 12 | dinov2_weights/ 13 | 14 | *.DS_Store 15 | -------------------------------------------------------------------------------- /configs/slidr/slidr_minkunet.py: -------------------------------------------------------------------------------- 1 | _base_ = [ 2 | '../_base_/datasets/nuscenes_pretrain.py', 3 | '../_base_/models/slidr_minkunet.py', '../_base_/schedules/pretrain.py', 4 | '../_base_/default_runtime.py' 5 | ] 6 | -------------------------------------------------------------------------------- /configs/slidr/slidr_spvcnn.py: -------------------------------------------------------------------------------- 1 | _base_ = [ 2 | '../_base_/datasets/nuscenes_pretrain.py', 3 | '../_base_/models/slidr_spvcnn.py', '../_base_/schedules/pretrain.py', 4 | '../_base_/default_runtime.py' 5 | ] 6 | -------------------------------------------------------------------------------- /limoe/models/__init__.py: -------------------------------------------------------------------------------- 1 | from 
.backbones import * # noqa: F401,F403 2 | from .data_preprocessors import * # noqa: F401,F403 3 | from .seg_heads import * # noqa: F401,F403 4 | from .selfsup import * # noqa: F401,F403 5 | -------------------------------------------------------------------------------- /limoe/datasets/transforms/__init__.py: -------------------------------------------------------------------------------- 1 | from .formatting import LiMoEInputs 2 | from .loading import LoadMultiModalityData 3 | from .transforms import FlipHorizontal, ResizedCrop 4 | 5 | __all__ = [ 6 | 'LoadMultiModalityData', 'ResizedCrop', 'FlipHorizontal', 'LiMoEInputs' 7 | ] 8 | -------------------------------------------------------------------------------- /limoe/models/backbones/dinov2/layers/__init__.py: -------------------------------------------------------------------------------- 1 | from .attention import MemEffAttention 2 | from .block import NestedTensorBlock 3 | from .dino_head import DINOHead 4 | from .mlp import Mlp 5 | from .patch_embed import PatchEmbed 6 | from .swiglu_ffn import SwiGLUFFN, SwiGLUFFNFused 7 | 8 | __all__ = [ 9 | 'DINOHead', 'Mlp', 'PatchEmbed', 'SwiGLUFFN', 'SwiGLUFFNFused', 10 | 'NestedTensorBlock', 'MemEffAttention' 11 | ] 12 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [yapf] 2 | BASED_ON_STYLE = pep8 3 | BLANK_LINE_BEFORE_NESTED_CLASS_OR_DEF = true 4 | SPLIT_BEFORE_EXPRESSION_AFTER_OPENING_PAREN = true 5 | 6 | [isort] 7 | line_length = 79 8 | multi_line_output = 0 9 | extra_standard_library = setuptools 10 | known_third_party = mmcv,mmengine,mmdet,mmdet3d,numpy,nuscenes 11 | no_lines_before = STDLIB,LOCALFOLDER 12 | default_section = THIRDPARTY 13 | 14 | [codespell] 15 | ignore-words-list = ans,refridgerator,crate,hist,formating,dout,wan,nd,fo,avod,AVOD,warmup 16 | -------------------------------------------------------------------------------- /dist_train.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | CONFIG=$1 4 | GPUS=$2 5 | NNODES=${NNODES:-1} 6 | NODE_RANK=${NODE_RANK:-0} 7 | PORT=${PORT:-29500} 8 | MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"} 9 | 10 | PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ 11 | python -m torch.distributed.launch \ 12 | --nnodes=$NNODES \ 13 | --node_rank=$NODE_RANK \ 14 | --master_addr=$MASTER_ADDR \ 15 | --nproc_per_node=$GPUS \ 16 | --master_port=$PORT \ 17 | $(dirname "$0")/train.py \ 18 | $CONFIG \ 19 | --launcher pytorch ${@:3} 20 | -------------------------------------------------------------------------------- /limoe/models/backbones/dinov2/layers/layer_scale.py: -------------------------------------------------------------------------------- 1 | from typing import Union 2 | 3 | import torch 4 | from torch import Tensor, nn 5 | 6 | 7 | class LayerScale(nn.Module): 8 | 9 | def __init__( 10 | self, 11 | dim: int, 12 | init_values: Union[float, Tensor] = 1e-5, 13 | inplace: bool = False, 14 | ) -> None: 15 | super().__init__() 16 | self.inplace = inplace 17 | self.gamma = nn.Parameter(init_values * torch.ones(dim)) 18 | 19 | def forward(self, x: Tensor) -> Tensor: 20 | return x.mul_(self.gamma) if self.inplace else x * self.gamma 21 | -------------------------------------------------------------------------------- /configs/_base_/schedules/pretrain.py: -------------------------------------------------------------------------------- 1 | lr = 0.01 2 | optim_wrapper = dict( 
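    # Pretraining optimizer: AdamW wrapped in MMEngine's OptimWrapper
    # (train.py switches the type to AmpOptimWrapper when --amp is passed).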
3 | type='OptimWrapper', 4 | optimizer=dict( 5 | type='AdamW', lr=lr, betas=(0.9, 0.999), weight_decay=0.01, eps=1e-6)) 6 | 7 | param_scheduler = [ 8 | dict( 9 | type='OneCycleLR', 10 | total_steps=100, 11 | by_epoch=True, 12 | eta_max=lr, 13 | pct_start=0.2, 14 | div_factor=25.0, 15 | final_div_factor=100.0, 16 | convert_to_iter_based=True) 17 | ] 18 | train_cfg = dict(by_epoch=True, max_epochs=100, val_interval=100) 19 | 20 | auto_scale_lr = dict(enable=False, base_batch_size=32) 21 | -------------------------------------------------------------------------------- /docs/DATA_PREPAER.md: -------------------------------------------------------------------------------- 1 | # Data Preparation 2 | 3 | ## Overall Structure 4 | 5 | ``` 6 | LiMoE 7 | ├── data 8 | │ ├── nuscenes 9 | │ │ ├── lidarseg 10 | │ │ ├── maps 11 | │ │ ├── samples 12 | │ │ ├── sweeps 13 | │ │ ├── v1.0-test 14 | | | ├── v1.0-trainval 15 | | | ├── superflow_nus_info.pkl 16 | | | ├── downstream_nuscenes_infos_train_1.pkl 17 | | | ├── downstream_nuscenes_infos_train_5.pkl 18 | | | ├── downstream_nuscenes_infos_train_10.pkl 19 | | | ├── downstream_nuscenes_infos_train_25.pkl 20 | | | ├── downstream_nuscenes_infos_train_100.pkl 21 | | | ├── downstream_nuscenes_infos_val.pkl 22 | │ ├── openseed_inst17 23 | ``` 24 | 25 | The `pkl` files and `superpixels` can be downloaded from [huggingface](https://huggingface.co/datasets/Xiangxu-0103/SuperFlow_SuperPixel). 26 | -------------------------------------------------------------------------------- /configs/_base_/default_runtime.py: -------------------------------------------------------------------------------- 1 | default_scope = 'mmdet3d' 2 | 3 | custom_imports = dict( 4 | imports=['limoe.datasets', 'limoe.datasets.transforms', 'limoe.models'], 5 | allow_failed_imports=False) 6 | 7 | default_hooks = dict( 8 | timer=dict(type='IterTimerHook'), 9 | logger=dict(type='LoggerHook', interval=50), 10 | param_scheduler=dict(type='ParamSchedulerHook'), 11 | checkpoint=dict(type='CheckpointHook', interval=1, max_keep_ckpts=1), 12 | sampler_seed=dict(type='DistSamplerSeedHook'), 13 | visualization=dict(type='Det3DVisualizationHook')) 14 | 15 | env_cfg = dict( 16 | cudnn_benchmark=False, 17 | mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), 18 | dist_cfg=dict(backend='nccl')) 19 | 20 | log_processor = dict(type='LogProcessor', window_size=50, by_epoch=True) 21 | 22 | log_level = 'INFO' 23 | load_from = None 24 | resume = False 25 | -------------------------------------------------------------------------------- /docs/GET_STARTED.md: -------------------------------------------------------------------------------- 1 | # Getting Started 2 | 3 | Before training, you may encounter an error `MMCV=={xxx} is used but incompatible. Please install mmcv>={xxx}, <{xxx}`. 
We suggest modifying `__init__.py` under the `mmdet` and `mmdet3d` packages as follows: 4 | 5 | ```python 6 | mmcv_maximum_version = '3.0.0' 7 | ``` 8 | 9 | In addition, you should modify `Lines 123-124` in `mmdet3d/datasets/seg3d_dataset.py` as follows: 10 | 11 | ```python 12 | if scene_idxs is not None: 13 | self.scene_idxs = self.get_scene_idxs(scene_idxs) 14 | self.data_list = [self.data_list[i] for i in self.scene_idxs] 15 | ``` 16 | 17 | ## Train with a single GPU 18 | 19 | ```bash 20 | python train.py ${CONFIG_FILE} 21 | ``` 22 | 23 | ## Train with multiple GPUs 24 | 25 | ```bash 26 | bash dist_train.sh ${CONFIG_FILE} ${GPU_NUM} 27 | ``` 28 | 29 | **Note**: For the pretraining phase, we suggest using 8 GPUs, and 4 GPUs for downstream tasks. 30 | -------------------------------------------------------------------------------- /limoe/models/backbones/dinov2/layers/drop_path.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | 3 | 4 | def drop_path(x, drop_prob: float = 0.0, training: bool = False): 5 | if drop_prob == 0.0 or not training: 6 | return x 7 | keep_prob = 1 - drop_prob 8 | shape = (x.shape[0], ) + (1, ) * ( 9 | x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets 10 | random_tensor = x.new_empty(shape).bernoulli_(keep_prob) 11 | if keep_prob > 0.0: 12 | random_tensor.div_(keep_prob) 13 | output = x * random_tensor 14 | return output 15 | 16 | 17 | class DropPath(nn.Module): 18 | """Drop paths (Stochastic Depth) per sample (when applied in main path of 19 | residual blocks).""" 20 | 21 | def __init__(self, drop_prob=None): 22 | super(DropPath, self).__init__() 23 | self.drop_prob = drop_prob 24 | 25 | def forward(self, x): 26 | return drop_path(x, self.drop_prob, self.training) 27 | -------------------------------------------------------------------------------- /limoe/models/seg_heads/upsample_head.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | from mmdet3d.registry import MODELS 3 | from mmdet3d.utils import OptMultiConfig 4 | from mmengine.model import BaseModule 5 | from torch import Tensor 6 | 7 | 8 | @MODELS.register_module() 9 | class UpsampleHead(BaseModule): 10 | 11 | def __init__(self, 12 | in_channels: int, 13 | out_channels: int, 14 | scale_factor: int, 15 | mode: str = 'bilinear', 16 | align_corners: bool = True, 17 | init_cfg: OptMultiConfig = None) -> None: 18 | super(UpsampleHead, self).__init__(init_cfg=init_cfg) 19 | self.head = nn.Sequential( 20 | nn.Conv2d(in_channels, out_channels, kernel_size=1), 21 | nn.Upsample( 22 | scale_factor=scale_factor, 23 | mode=mode, 24 | align_corners=align_corners)) 25 | 26 | def forward(self, x: Tensor) -> Tensor: 27 | return self.head(x) 28 | -------------------------------------------------------------------------------- /configs/_base_/models/slidr_spvcnn.py: -------------------------------------------------------------------------------- 1 | model = dict( 2 | type='SLidR', 3 | data_preprocessor=dict( 4 | type='LiMoEDataPreprocessor', 5 | H=32, 6 | W=480, 7 | fov_up=10.0, 8 | fov_down=-30.0, 9 | ignore_index=16, 10 | voxel_size=[0.1, 1, 0.1], 11 | voxel_type='cylinder', 12 | mean=[0.485, 0.456, 0.406], 13 | std=[0.229, 0.224, 0.225]), 14 | backbone_3d=dict( 15 | type='SPVCNNBackbone', 16 | in_channels=4, 17 | base_channels=32, 18 | layers=[2, 3, 4, 6, 2, 2, 2, 2], 19 | planes=[32, 64, 128, 256, 256, 128, 96, 96], 20 | block_type='basic', 21 | bn_momentum=0.05), 22 | head_3d=dict( 23 |
type='LinearHead', channels=96, num_classes=64, dropout_ratio=0), 24 | backbone_2d=dict(type='ViT', images_encoder='dinov2_vit_base_p14'), 25 | head_2d=dict( 26 | type='UpsampleHead', in_channels=768, out_channels=64, 27 | scale_factor=14), 28 | superpixel_size=150, 29 | temperature=0.07) 30 | -------------------------------------------------------------------------------- /configs/_base_/models/slidr_minkunet.py: -------------------------------------------------------------------------------- 1 | model = dict( 2 | type='SLidR', 3 | data_preprocessor=dict( 4 | type='LiMoEDataPreprocessor', 5 | H=32, 6 | W=480, 7 | fov_up=10.0, 8 | fov_down=-30.0, 9 | ignore_index=16, 10 | voxel_size=[0.1, 1, 0.1], 11 | voxel_type='cylinder', 12 | mean=[0.485, 0.456, 0.406], 13 | std=[0.229, 0.224, 0.225]), 14 | backbone_3d=dict( 15 | type='MinkUNetBackbone', 16 | in_channels=4, 17 | base_channels=32, 18 | layers=[2, 3, 4, 6, 2, 2, 2, 2], 19 | planes=[32, 64, 128, 256, 256, 128, 96, 96], 20 | block_type='basic', 21 | bn_momentum=0.05), 22 | head_3d=dict( 23 | type='LinearHead', channels=96, num_classes=64, dropout_ratio=0), 24 | backbone_2d=dict(type='ViT', images_encoder='dinov2_vit_base_p14'), 25 | head_2d=dict( 26 | type='UpsampleHead', in_channels=768, out_channels=64, 27 | scale_factor=14), 28 | superpixel_size=150, 29 | temperature=0.07) 30 | -------------------------------------------------------------------------------- /limoe/models/backbones/dinov2/layers/mlp.py: -------------------------------------------------------------------------------- 1 | from typing import Callable, Optional 2 | 3 | from torch import Tensor, nn 4 | 5 | 6 | class Mlp(nn.Module): 7 | 8 | def __init__( 9 | self, 10 | in_features: int, 11 | hidden_features: Optional[int] = None, 12 | out_features: Optional[int] = None, 13 | act_layer: Callable[..., nn.Module] = nn.GELU, 14 | drop: float = 0.0, 15 | bias: bool = True, 16 | ) -> None: 17 | super().__init__() 18 | out_features = out_features or in_features 19 | hidden_features = hidden_features or in_features 20 | self.fc1 = nn.Linear(in_features, hidden_features, bias=bias) 21 | self.act = act_layer() 22 | self.fc2 = nn.Linear(hidden_features, out_features, bias=bias) 23 | self.drop = nn.Dropout(drop) 24 | 25 | def forward(self, x: Tensor) -> Tensor: 26 | x = self.fc1(x) 27 | x = self.act(x) 28 | x = self.drop(x) 29 | x = self.fc2(x) 30 | x = self.drop(x) 31 | return x 32 | -------------------------------------------------------------------------------- /docs/INSTALL.md: -------------------------------------------------------------------------------- 1 | # Installation 2 | 3 | ## Prerequisites 4 | 5 | This codebase is tested with `torch==1.12.1`, `mmengine==0.10.4`, `mmcv==2.2.0`, `mmdet==3.3.0`, and `mmdet3d==1.4.0`, with `CUDA 11.3`. 6 | 7 | **Step 1.** Create a conda environment and activate it. 8 | 9 | ```bash 10 | conda create --name limoe python==3.8 -y 11 | conda activate limoe 12 | ``` 13 | 14 | **Step 2.** Install PyTorch following [official instructions](https://pytorch.org/get-started/locally/). 15 | 16 | ```bash 17 | conda install pytorch torchvision -c pytorch 18 | ``` 19 | 20 | **Step 3.** Install [MMEngine](https://github.com/open-mmlab/mmengine), [MMCV](https://github.com/open-mmlab/mmcv), [MMDetection](https://github.com/open-mmlab/mmdetection), and [MMDetection3D](https://github.com/open-mmlab/mmdetection3d) using [MIM](https://github.com/open-mmlab/mim).
21 | 22 | ```bash 23 | pip install -U openmim 24 | mim install mmengine 25 | mim install mmcv 26 | mim install mmdet 27 | mim install mmdet3d 28 | ``` 29 | 30 | Optionally, you can also install the above projects from the source, e.g.: 31 | 32 | ```bash 33 | git clone https://github.com/open-mmlab/mmdetection3d 34 | cd mmdetection3d 35 | pip install -v -e . 36 | ``` 37 | 38 | Meanwhile, you also need to install [`nuScenes-devkit`](https://github.com/nutonomy/nuscenes-devkit). 39 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/PyCQA/flake8 3 | rev: 5.0.4 4 | hooks: 5 | - id: flake8 6 | - repo: https://github.com/PyCQA/isort 7 | rev: 5.11.5 8 | hooks: 9 | - id: isort 10 | - repo: https://github.com/pre-commit/mirrors-yapf 11 | rev: v0.32.0 12 | hooks: 13 | - id: yapf 14 | - repo: https://github.com/pre-commit/pre-commit-hooks 15 | rev: v4.3.0 16 | hooks: 17 | - id: trailing-whitespace 18 | - id: check-yaml 19 | - id: end-of-file-fixer 20 | - id: requirements-txt-fixer 21 | - id: double-quote-string-fixer 22 | - id: check-merge-conflict 23 | - id: fix-encoding-pragma 24 | args: ["--remove"] 25 | - id: mixed-line-ending 26 | args: ["--fix=lf"] 27 | - repo: https://github.com/codespell-project/codespell 28 | rev: v2.2.1 29 | hooks: 30 | - id: codespell 31 | - repo: https://github.com/executablebooks/mdformat 32 | rev: 0.7.9 33 | hooks: 34 | - id: mdformat 35 | args: [ "--number" ] 36 | additional_dependencies: 37 | - mdformat-openmmlab 38 | - mdformat_frontmatter 39 | - linkify-it-py 40 | - repo: https://github.com/myint/docformatter 41 | rev: v1.3.1 42 | hooks: 43 | - id: docformatter 44 | args: ["--in-place", "--wrap-descriptions", "79"] 45 | -------------------------------------------------------------------------------- /limoe/datasets/transforms/formatting.py: -------------------------------------------------------------------------------- 1 | from typing import Sequence 2 | 3 | import torch 4 | from mmcv.transforms import BaseTransform 5 | from mmdet3d.registry import TRANSFORMS 6 | from mmdet3d.structures import Det3DDataSample, PointData 7 | from mmdet3d.structures.points import BasePoints 8 | 9 | 10 | @TRANSFORMS.register_module() 11 | class LiMoEInputs(BaseTransform): 12 | 13 | def __init__(self, keys: Sequence[str] = None): 14 | self.keys = keys 15 | 16 | def transform(self, results: dict) -> dict: 17 | if 'points' in results: 18 | if isinstance(results['points'], BasePoints): 19 | results['points'] = results['points'].tensor 20 | 21 | if 'pairing_points' in results: 22 | results['pairing_points'] = torch.tensor(results['pairing_points']) 23 | 24 | if 'pairing_images' in results: 25 | results['pairing_images'] = torch.tensor(results['pairing_images']) 26 | 27 | data_sample = Det3DDataSample() 28 | gt_pts_seg = PointData() 29 | 30 | inputs = {} 31 | for key in self.keys: 32 | if key in ('points', 'imgs'): 33 | inputs[key] = results[key] 34 | elif key in ('pairing_points', 'pairing_images', 'superpixels', 35 | 'pts_semantic_mask'): 36 | gt_pts_seg[key] = results[key] 37 | 38 | data_sample.gt_pts_seg = gt_pts_seg 39 | 40 | packed_results = dict() 41 | packed_results['data_samples'] = data_sample 42 | packed_results['inputs'] = inputs 43 | 44 | return packed_results 45 | -------------------------------------------------------------------------------- /limoe/models/backbones/dinov2/layers/swiglu_ffn.py: 
-------------------------------------------------------------------------------- 1 | from typing import Callable, Optional 2 | 3 | import torch.nn.functional as F 4 | from torch import Tensor, nn 5 | 6 | 7 | class SwiGLUFFN(nn.Module): 8 | 9 | def __init__( 10 | self, 11 | in_features: int, 12 | hidden_features: Optional[int] = None, 13 | out_features: Optional[int] = None, 14 | act_layer: Callable[..., nn.Module] = None, 15 | drop: float = 0.0, 16 | bias: bool = True, 17 | ) -> None: 18 | super().__init__() 19 | out_features = out_features or in_features 20 | hidden_features = hidden_features or in_features 21 | self.w12 = nn.Linear(in_features, 2 * hidden_features, bias=bias) 22 | self.w3 = nn.Linear(hidden_features, out_features, bias=bias) 23 | 24 | def forward(self, x: Tensor) -> Tensor: 25 | x12 = self.w12(x) 26 | x1, x2 = x12.chunk(2, dim=-1) 27 | hidden = F.silu(x1) * x2 28 | return self.w3(hidden) 29 | 30 | 31 | try: 32 | from xformers.ops import SwiGLU 33 | 34 | XFORMERS_AVAILABLE = True 35 | except ImportError: 36 | SwiGLU = SwiGLUFFN 37 | XFORMERS_AVAILABLE = False 38 | 39 | 40 | class SwiGLUFFNFused(SwiGLU): 41 | 42 | def __init__( 43 | self, 44 | in_features: int, 45 | hidden_features: Optional[int] = None, 46 | out_features: Optional[int] = None, 47 | act_layer: Callable[..., nn.Module] = None, 48 | drop: float = 0.0, 49 | bias: bool = True, 50 | ) -> None: 51 | out_features = out_features or in_features 52 | hidden_features = hidden_features or in_features 53 | hidden_features = (int(hidden_features * 2 / 3) + 7) // 8 * 8 54 | super().__init__( 55 | in_features=in_features, 56 | hidden_features=hidden_features, 57 | out_features=out_features, 58 | bias=bias, 59 | ) 60 | -------------------------------------------------------------------------------- /limoe/models/backbones/vit.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from mmdet3d.registry import MODELS 4 | from torch import Tensor 5 | 6 | from .dinov2 import dinov2_vision_transformer as dinov2_vit 7 | 8 | DINOv2_MODELS = { 9 | 'dinov2_vit_small_p14': ('dinov2_vits14', 14, 384), 10 | 'dinov2_vit_base_p14': ('dinov2_vitb14', 14, 768), 11 | 'dinov2_vit_large_p14': ('dinov2_vitl14', 14, 1024) 12 | } 13 | 14 | 15 | @MODELS.register_module() 16 | class ViT(nn.Module): 17 | 18 | def __init__(self, 19 | images_encoder: str, 20 | feat: str = 'x_pre_norm', 21 | height: int = 224, 22 | width: int = 448) -> None: 23 | super(ViT, self).__init__() 24 | 25 | # ViT parameters 26 | model_name, patch_size, embed_dim = DINOv2_MODELS.get(images_encoder) 27 | self.patch_size = patch_size 28 | self.embed_dim = embed_dim 29 | self.which_feature = feat 30 | 31 | # Compute feature size 32 | assert (height % self.patch_size) == 0 33 | assert (width % self.patch_size) == 0 34 | self.f_height = height // self.patch_size 35 | self.f_width = width // self.patch_size 36 | 37 | # Load ViT 38 | self.encoder = dinov2_vit.__dict__[model_name]( 39 | patch_size=patch_size, pretrained=True) 40 | 41 | # Teacher must stay frozen 42 | for param in self.encoder.parameters(): 43 | param.requires_grad = False 44 | self.encoder.eval() 45 | 46 | def forward(self, x: Tensor) -> Tensor: 47 | 48 | # Go through frozen encoder 49 | with torch.no_grad(): 50 | batch_size = x.shape[0] 51 | 52 | output = self.encoder.forward_get_last_n(x) 53 | feat = output[self.which_feature] 54 | x = torch.cat(feat, dim=2) 55 | 56 | # Remove the CLS token and reshape the patch token features. 
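        # Tokens are (batch_size, 1 + f_height * f_width, embed_dim); index 0
        # is the CLS token, so the slice below drops it and the transpose +
        # view recovers a (batch_size, embed_dim, f_height, f_width) map.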
57 | x = ( 58 | x[:, 1:, :].transpose(1, 2).view(batch_size, self.embed_dim, 59 | self.f_height, self.f_width)) 60 | 61 | return x 62 | -------------------------------------------------------------------------------- /limoe/models/backbones/dinov2/layers/dino_head.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torch.nn.init import trunc_normal_ 4 | from torch.nn.utils import weight_norm 5 | 6 | 7 | class DINOHead(nn.Module): 8 | 9 | def __init__( 10 | self, 11 | in_dim, 12 | out_dim, 13 | use_bn=False, 14 | nlayers=3, 15 | hidden_dim=2048, 16 | bottleneck_dim=256, 17 | mlp_bias=True, 18 | ): 19 | super().__init__() 20 | nlayers = max(nlayers, 1) 21 | self.mlp = _build_mlp( 22 | nlayers, 23 | in_dim, 24 | bottleneck_dim, 25 | hidden_dim=hidden_dim, 26 | use_bn=use_bn, 27 | bias=mlp_bias, 28 | ) 29 | self.apply(self._init_weights) 30 | self.last_layer = weight_norm( 31 | nn.Linear(bottleneck_dim, out_dim, bias=False)) 32 | self.last_layer.weight_g.data.fill_(1) 33 | 34 | def _init_weights(self, m): 35 | if isinstance(m, nn.Linear): 36 | trunc_normal_(m.weight, std=0.02) 37 | if isinstance(m, nn.Linear) and m.bias is not None: 38 | nn.init.constant_(m.bias, 0) 39 | 40 | def forward(self, x): 41 | x = self.mlp(x) 42 | eps = 1e-6 if x.dtype == torch.float16 else 1e-12 43 | x = nn.functional.normalize(x, dim=-1, p=2, eps=eps) 44 | x = self.last_layer(x) 45 | return x 46 | 47 | 48 | def _build_mlp(nlayers, 49 | in_dim, 50 | bottleneck_dim, 51 | hidden_dim=None, 52 | use_bn=False, 53 | bias=True): 54 | if nlayers == 1: 55 | return nn.Linear(in_dim, bottleneck_dim, bias=bias) 56 | else: 57 | layers = [nn.Linear(in_dim, hidden_dim, bias=bias)] 58 | if use_bn: 59 | layers.append(nn.BatchNorm1d(hidden_dim)) 60 | layers.append(nn.GELU()) 61 | for _ in range(nlayers - 2): 62 | layers.append(nn.Linear(hidden_dim, hidden_dim, bias=bias)) 63 | if use_bn: 64 | layers.append(nn.BatchNorm1d(hidden_dim)) 65 | layers.append(nn.GELU()) 66 | layers.append(nn.Linear(hidden_dim, bottleneck_dim, bias=bias)) 67 | return nn.Sequential(*layers) 68 | -------------------------------------------------------------------------------- /limoe/models/backbones/dinov2/layers/attention.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from torch import Tensor, nn 4 | 5 | logger = logging.getLogger('dinov2') 6 | 7 | try: 8 | from xformers.ops import memory_efficient_attention, unbind 9 | 10 | XFORMERS_AVAILABLE = True 11 | except ImportError: 12 | logger.warning('xFormers not available') 13 | XFORMERS_AVAILABLE = False 14 | 15 | 16 | class Attention(nn.Module): 17 | 18 | def __init__( 19 | self, 20 | dim: int, 21 | num_heads: int = 8, 22 | qkv_bias: bool = False, 23 | proj_bias: bool = True, 24 | attn_drop: float = 0.0, 25 | proj_drop: float = 0.0, 26 | ) -> None: 27 | super().__init__() 28 | self.num_heads = num_heads 29 | head_dim = dim // num_heads 30 | self.scale = head_dim**-0.5 31 | 32 | self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) 33 | self.attn_drop = nn.Dropout(attn_drop) 34 | self.proj = nn.Linear(dim, dim, bias=proj_bias) 35 | self.proj_drop = nn.Dropout(proj_drop) 36 | 37 | def forward(self, x: Tensor) -> Tensor: 38 | B, N, C = x.shape 39 | qkv = ( 40 | self.qkv(x).reshape(B, N, 3, self.num_heads, 41 | C // self.num_heads).permute(2, 0, 3, 1, 4)) 42 | 43 | q, k, v = qkv[0] * self.scale, qkv[1], qkv[2] 44 | attn = q @ k.transpose(-2, -1) 45 | 46 | attn = 
attn.softmax(dim=-1) 47 | attn = self.attn_drop(attn) 48 | 49 | x = (attn @ v).transpose(1, 2).reshape(B, N, C) 50 | x = self.proj(x) 51 | x = self.proj_drop(x) 52 | return x 53 | 54 | 55 | class MemEffAttention(Attention): 56 | 57 | def forward(self, x: Tensor, attn_bias=None) -> Tensor: 58 | if not XFORMERS_AVAILABLE: 59 | assert attn_bias is None, \ 60 | 'xFormers is required for nested tensors usage' 61 | return super().forward(x) 62 | 63 | B, N, C = x.shape 64 | qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads) 65 | 66 | q, k, v = unbind(qkv, 2) 67 | 68 | x = memory_efficient_attention(q, k, v, attn_bias=attn_bias) 69 | x = x.reshape([B, N, C]) 70 | 71 | x = self.proj(x) 72 | x = self.proj_drop(x) 73 | return x 74 | -------------------------------------------------------------------------------- /limoe/models/backbones/dinov2/layers/patch_embed.py: -------------------------------------------------------------------------------- 1 | from typing import Callable, Optional, Tuple, Union 2 | 3 | import torch.nn as nn 4 | from torch import Tensor 5 | 6 | 7 | def make_2tuple(x): 8 | if isinstance(x, tuple): 9 | assert len(x) == 2 10 | return x 11 | 12 | assert isinstance(x, int) 13 | return (x, x) 14 | 15 | 16 | class PatchEmbed(nn.Module): 17 | """2D image to patch embedding: (B,C,H,W) -> (B,N,D) 18 | 19 | Args: 20 | img_size: Image size. 21 | patch_size: Patch token size. 22 | in_chans: Number of input image channels. 23 | embed_dim: Number of linear projection output channels. 24 | norm_layer: Normalization layer. 25 | """ 26 | 27 | def __init__( 28 | self, 29 | img_size: Union[int, Tuple[int, int]] = 224, 30 | patch_size: Union[int, Tuple[int, int]] = 16, 31 | in_chans: int = 3, 32 | embed_dim: int = 768, 33 | norm_layer: Optional[Callable] = None, 34 | flatten_embedding: bool = True, 35 | ) -> None: 36 | super().__init__() 37 | 38 | image_HW = make_2tuple(img_size) 39 | patch_HW = make_2tuple(patch_size) 40 | patch_grid_size = ( 41 | image_HW[0] // patch_HW[0], 42 | image_HW[1] // patch_HW[1], 43 | ) 44 | 45 | self.img_size = image_HW 46 | self.patch_size = patch_HW 47 | self.patches_resolution = patch_grid_size 48 | self.num_patches = patch_grid_size[0] * patch_grid_size[1] 49 | 50 | self.in_chans = in_chans 51 | self.embed_dim = embed_dim 52 | 53 | self.flatten_embedding = flatten_embedding 54 | 55 | self.proj = nn.Conv2d( 56 | in_chans, embed_dim, kernel_size=patch_HW, stride=patch_HW) 57 | self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity() 58 | 59 | def forward(self, x: Tensor) -> Tensor: 60 | _, _, H, W = x.shape 61 | patch_H, patch_W = self.patch_size 62 | 63 | assert ( 64 | H % patch_H == 0 65 | ), f'Input image height {H} is not a multiple of patch height {patch_H}' # noqa: E501 66 | assert ( 67 | W % patch_W == 0 68 | ), f'Input image width {W} is not a multiple of patch width: {patch_W}' 69 | 70 | x = self.proj(x) # B C H W 71 | H, W = x.size(2), x.size(3) 72 | x = x.flatten(2).transpose(1, 2) # B HW C 73 | x = self.norm(x) 74 | if not self.flatten_embedding: 75 | x = x.reshape(-1, H, W, self.embed_dim) # B H W C 76 | return x 77 | 78 | def flops(self) -> float: 79 | Ho, Wo = self.patches_resolution 80 | flops = ( 81 | Ho * Wo * self.embed_dim * self.in_chans * 82 | (self.patch_size[0] * self.patch_size[1])) 83 | if self.norm is not None: 84 | flops += Ho * Wo * self.embed_dim 85 | return flops 86 | -------------------------------------------------------------------------------- /limoe/models/seg_heads/linear_head.py: 
-------------------------------------------------------------------------------- 1 | from typing import Dict, List 2 | 3 | import torch 4 | import torch.nn as nn 5 | from mmdet3d.models import Base3DDecodeHead 6 | from mmdet3d.registry import MODELS 7 | from mmdet3d.structures.det3d_data_sample import SampleList 8 | from mmdet3d.utils import ConfigType, OptConfigType 9 | from torch import Tensor 10 | 11 | 12 | @MODELS.register_module() 13 | class LinearHead(Base3DDecodeHead): 14 | 15 | def __init__(self, loss_lovasz: OptConfigType = None, **kwargs) -> None: 16 | super(LinearHead, self).__init__(**kwargs) 17 | 18 | if loss_lovasz is not None: 19 | self.loss_lovasz = MODELS.build(loss_lovasz) 20 | else: 21 | self.loss_lovasz = None 22 | 23 | def build_conv_seg(self, channels: int, num_classes: int, 24 | kernel_size: int) -> nn.Module: 25 | return nn.Linear(channels, num_classes) 26 | 27 | def forward(self, feat_dict: dict) -> dict: 28 | logits = self.cls_seg(feat_dict['voxel_feats']) 29 | feat_dict['logits'] = logits 30 | return feat_dict 31 | 32 | def loss_by_feat(self, feat_dict: dict, 33 | batch_data_samples: SampleList) -> Dict[str, Tensor]: 34 | voxel_semantic_segs = [] 35 | voxel_inds = feat_dict['voxel_inds'] 36 | for batch_idx, data_sample in enumerate(batch_data_samples): 37 | pts_semantic_mask = data_sample.gt_pts_seg.pts_semantic_mask 38 | voxel_semantic_mask = pts_semantic_mask[voxel_inds[batch_idx]] 39 | voxel_semantic_segs.append(voxel_semantic_mask) 40 | seg_label = torch.cat(voxel_semantic_segs) 41 | seg_logit_feat = feat_dict['logits'] 42 | loss = dict() 43 | loss['loss_ce'] = self.loss_decode( 44 | seg_logit_feat, seg_label, ignore_index=self.ignore_index) 45 | if self.loss_lovasz is not None: 46 | loss['loss_lovasz'] = self.loss_lovasz( 47 | seg_logit_feat, seg_label, ignore_index=self.ignore_index) 48 | return loss 49 | 50 | def predict(self, feat_dict: dict, batch_input_metas: List[dict], 51 | test_cfg: ConfigType) -> List[Tensor]: 52 | feat_dict = self.forward(feat_dict) 53 | seg_pred_list = self.predict_by_feat(feat_dict, batch_input_metas) 54 | return seg_pred_list 55 | 56 | def predict_by_feat(self, feat_dict: dict, 57 | batch_input_metas: List[dict]) -> List[Tensor]: 58 | seg_logits = feat_dict['logits'] 59 | 60 | seg_pred_list = [] 61 | coors = feat_dict['coors'] 62 | for batch_idx in range(len(batch_input_metas)): 63 | batch_mask = coors[:, -1] == batch_idx 64 | seg_logits_sample = seg_logits[batch_mask] 65 | point2voxel_map = feat_dict['point2voxel_maps'][batch_idx].long() 66 | point_seg_predicts = seg_logits_sample[point2voxel_map] 67 | seg_pred_list.append(point_seg_predicts) 68 | 69 | return seg_pred_list 70 | -------------------------------------------------------------------------------- /configs/_base_/datasets/nuscenes_pretrain.py: -------------------------------------------------------------------------------- 1 | dataset_type = 'NuScenesSegDataset' 2 | data_root = 'data/nuscenes/' 3 | class_names = [ 4 | 'barrier', 'bicycle', 'bus', 'car', 'construction_vehicle', 'motorcycle', 5 | 'pedestrian', 'traffic_cone', 'trailer', 'truck', 'driveable_surface', 6 | 'other_flat', 'sidewalk', 'terrain', 'manmade', 'vegetation' 7 | ] 8 | labels_map = { 9 | 0: 16, 10 | 1: 16, 11 | 2: 6, 12 | 3: 6, 13 | 4: 6, 14 | 5: 16, 15 | 6: 6, 16 | 7: 16, 17 | 8: 16, 18 | 9: 0, 19 | 10: 16, 20 | 11: 16, 21 | 12: 7, 22 | 13: 16, 23 | 14: 1, 24 | 15: 2, 25 | 16: 2, 26 | 17: 3, 27 | 18: 4, 28 | 19: 16, 29 | 20: 16, 30 | 21: 5, 31 | 22: 8, 32 | 23: 9, 33 | 24: 10, 34 | 25: 11, 35 | 26: 
12, 36 | 27: 13, 37 | 28: 14, 38 | 29: 16, 39 | 30: 15, 40 | 31: 16 41 | } 42 | 43 | metainfo = dict( 44 | classes=class_names, seg_label_mapping=labels_map, max_label=31) 45 | input_modality = dict(use_lidar=True, use_camera=True) 46 | 47 | data_prefix = dict( 48 | pts='samples/LIDAR_TOP', 49 | CAM_FRONT='samples/CAM_FRONT', 50 | CAM_FRONT_LEFT='samples/CAM_FRONT_LEFT', 51 | CAM_FRONT_RIGHT='samples/CAM_FRONT_RIGHT', 52 | CAM_BACK='samples/CAM_BACK', 53 | CAM_BACK_LEFT='samples/CAM_BACK_LEFT', 54 | CAM_BACK_RIGHT='samples/CAM_BACK_RIGHT', 55 | pts_semantic_mask='lidarseg/v1.0-trainval') 56 | 57 | train_pipeline = [ 58 | dict(type='LoadPointsFromFile', coord_type='LIDAR', load_dim=5, use_dim=4), 59 | dict( 60 | type='LoadMultiModalityData', 61 | superpixel_root='data/superpixels/slic/', 62 | num_cameras=6), 63 | dict( 64 | type='RandomFlip3D', 65 | sync_2d=False, 66 | flip_ratio_bev_horizontal=0.5, 67 | flip_ratio_bev_vertical=0.55), 68 | dict( 69 | type='GlobalRotScaleTrans', 70 | rot_range=[0, 3.14159265359], 71 | scale_ratio_range=[0.95, 1.05]), 72 | dict( 73 | type='ResizedCrop', 74 | image_crop_size=[224, 448], 75 | image_crop_ratio=[1.5555555555555556, 1.8888888888888888], 76 | crop_center=True), 77 | dict(type='FlipHorizontal'), 78 | dict( 79 | type='LiMoEInputs', 80 | keys=[ 81 | 'points', 'imgs', 'pairing_points', 'pairing_images', 'superpixels' 82 | ]) 83 | ] 84 | 85 | train_dataloader = dict( 86 | batch_size=4, 87 | num_workers=4, 88 | persistent_workers=True, 89 | sampler=dict(type='DefaultSampler', shuffle=True), 90 | dataset=dict( 91 | type='NuScenesSegDataset', 92 | data_root=data_root, 93 | ann_file='superflow_nus_info.pkl', 94 | data_prefix=data_prefix, 95 | pipeline=train_pipeline, 96 | metainfo=metainfo, 97 | modality=input_modality, 98 | ignore_index=16)) 99 | 100 | vis_backends = [dict(type='LocalVisBackend')] 101 | visualizer = dict( 102 | type='Det3DLocalVisualizer', vis_backends=vis_backends, name='visualizer') 103 | -------------------------------------------------------------------------------- /limoe/datasets/nuscenes_dataset.py: -------------------------------------------------------------------------------- 1 | import os.path as osp 2 | from typing import Callable, List, Optional, Union 3 | 4 | import numpy as np 5 | from mmdet3d.datasets import Seg3DDataset 6 | from mmdet3d.registry import DATASETS 7 | 8 | 9 | @DATASETS.register_module() 10 | class NuScenesSegDataset(Seg3DDataset): 11 | 12 | METAINFO = { 13 | 'classes': ('barrier', 'bicycle', 'bus', 'car', 'construction_vehicle', 14 | 'motorcycle', 'pedestrian', 'traffic_cone', 'trailer', 15 | 'truck', 'driveable_surface', 'other_flat', 'sidewalk', 16 | 'terrain', 'manmade', 'vegetation'), 17 | 'palette': [[255, 120, 50], [255, 192, 203], [255, 255, 0], 18 | [0, 150, 245], [0, 255, 255], [255, 127, 0], [255, 0, 0], 19 | [255, 240, 150], [135, 60, 0], [160, 32, 20 | 240], [255, 0, 255], 21 | [139, 137, 137], [75, 0, 75], [150, 240, 80], 22 | [230, 230, 250], [0, 175, 0]], 23 | 'seg_valid_class_ids': 24 | tuple(range(16)), 25 | 'seg_all_class_ids': 26 | tuple(range(16)), 27 | } 28 | 29 | def __init__(self, 30 | data_root: Optional[str] = None, 31 | ann_file: str = '', 32 | metainfo: Optional[dict] = None, 33 | data_prefix: dict = dict( 34 | pts='', 35 | img='', 36 | pts_instance_mask='', 37 | pts_semantic_mask=''), 38 | pipeline: List[Union[dict, Callable]] = [], 39 | modality: dict = dict(use_lidar=True, use_camera=False), 40 | ignore_index: Optional[int] = None, 41 | scene_idxs: Optional[Union[str, 
np.ndarray]] = None, 42 | test_mode: bool = False, 43 | serialize_data: bool = True, 44 | **kwargs) -> None: 45 | super(NuScenesSegDataset, self).__init__( 46 | data_root=data_root, 47 | ann_file=ann_file, 48 | metainfo=metainfo, 49 | data_prefix=data_prefix, 50 | pipeline=pipeline, 51 | modality=modality, 52 | ignore_index=ignore_index, 53 | scene_idxs=scene_idxs, 54 | test_mode=test_mode, 55 | serialize_data=serialize_data, 56 | **kwargs) 57 | 58 | def get_seg_label_mapping(self, metainfo: dict) -> np.ndarray: 59 | seg_label_mapping = np.zeros(metainfo['max_label'] + 1, dtype=np.int64) 60 | for idx in metainfo['seg_label_mapping']: 61 | seg_label_mapping[idx] = metainfo['seg_label_mapping'][idx] 62 | return seg_label_mapping 63 | 64 | def parse_data_info(self, info: dict) -> dict: 65 | if self.modality['use_lidar']: 66 | info['lidar_points']['lidar_path'] = \ 67 | osp.join( 68 | self.data_prefix.get('pts', ''), 69 | info['lidar_points']['lidar_path']) 70 | if 'num_pts_feats' in info['lidar_points']: 71 | info['num_pts_feats'] = info['lidar_points']['num_pts_feats'] 72 | info['lidar_path'] = info['lidar_points']['lidar_path'] 73 | 74 | if self.modality['use_camera']: 75 | for cam_id, img_info in info['images'].items(): 76 | if 'img_path' in img_info: 77 | if cam_id in self.data_prefix: 78 | cam_prefix = self.data_prefix[cam_id] 79 | else: 80 | cam_prefix = self.data_prefix.get('img', '') 81 | img_info['img_path'] = osp.join(cam_prefix, 82 | img_info['img_path']) 83 | 84 | if 'pts_instance_mask_path' in info: 85 | info['pts_instance_mask_path'] = \ 86 | osp.join(self.data_prefix.get('pts_instance_mask', ''), 87 | info['pts_instance_mask_path']) 88 | 89 | if 'pts_semantic_mask_path' in info: 90 | info['pts_semantic_mask_path'] = \ 91 | osp.join(self.data_prefix.get('pts_semantic_mask', ''), 92 | info['pts_semantic_mask_path']) 93 | 94 | info['seg_label_mapping'] = self.seg_label_mapping 95 | 96 | if self.test_mode and self.load_eval_anns: 97 | info['eval_ann_info'] = dict() 98 | 99 | return info 100 | -------------------------------------------------------------------------------- /limoe/datasets/transforms/loading.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import os.path as osp 3 | 4 | import numpy as np 5 | import torch 6 | from mmcv.transforms import BaseTransform 7 | from mmdet3d.registry import TRANSFORMS 8 | from nuscenes.utils.data_classes import LidarPointCloud 9 | from nuscenes.utils.geometry_utils import view_points 10 | from PIL import Image 11 | 12 | 13 | @TRANSFORMS.register_module() 14 | class LoadMultiModalityData(BaseTransform): 15 | 16 | def __init__(self, 17 | superpixel_root: str, 18 | num_cameras: int = 6, 19 | min_dist: float = 1.0) -> None: 20 | self.superpixel_root = superpixel_root 21 | self.min_dist = min_dist 22 | self.num_cameras = num_cameras 23 | 24 | def transform(self, results: dict) -> dict: 25 | points = results['points'].numpy() 26 | pc_original = LidarPointCloud(points.T) 27 | pairing_points = np.empty(0, dtype=np.int64) 28 | pairing_images = np.empty((0, 3), dtype=np.int64) 29 | 30 | images = [] 31 | superpixels = [] 32 | 33 | camera_list = [ 34 | 'CAM_FRONT', 35 | 'CAM_FRONT_LEFT', 36 | 'CAM_FRONT_RIGHT', 37 | 'CAM_BACK', 38 | 'CAM_BACK_LEFT', 39 | 'CAM_BACK_RIGHT', 40 | ] 41 | camera_list = np.random.choice( 42 | camera_list, size=self.num_cameras, replace=False) 43 | np.random.shuffle(camera_list) 44 | 45 | for i, cam in enumerate(camera_list): 46 | # load point clouds 47 | pc = 
copy.deepcopy(pc_original) 48 | 49 | # load camera images 50 | img = np.array(Image.open(results['images'][cam]['img_path'])) 51 | 52 | # load superpixels 53 | sp_path = osp.join( 54 | self.superpixel_root, 55 | results['images'][cam]['sample_data_token'] + '.png') 56 | sp = np.array(Image.open(sp_path)) 57 | 58 | # transform the point cloud to the vehicle frame for the 59 | # timestamp of the sweep. 60 | pc.rotate(results['lidar2ego_rotation']) 61 | pc.translate(results['lidar2ego_translation']) 62 | 63 | # transform from ego to the global frame. 64 | pc.rotate(results['ego2global_rotation']) 65 | pc.translate(results['ego2global_translation']) 66 | 67 | # transform from global frame to the ego vehicle frame for the 68 | # timestamp of the image. 69 | pc.translate(-results['images'][cam]['ego2global_translation']) 70 | pc.rotate(results['images'][cam]['ego2global_rotation'].T) 71 | 72 | # transform from ego to the camera. 73 | pc.translate(-results['images'][cam]['sensor2ego_translation']) 74 | pc.rotate(results['images'][cam]['sensor2ego_rotation'].T) 75 | 76 | # camera frame z axis points away from the camera 77 | depths = pc.points[2, :] 78 | 79 | # matrix multiplication with camera-matrix + renormalization. 80 | points = view_points( 81 | pc.points[:3, :], 82 | results['images'][cam]['cam_intrinsic'], 83 | normalize=True) 84 | 85 | # Remove points that are either outside or behind the camera. 86 | # Also make sure points are at least 1m in front of the camera to 87 | # avoid seeing the lidar points on the camera. 88 | points = points[:2].T 89 | mask = np.ones(depths.shape[0], dtype=bool) 90 | mask = np.logical_and(mask, depths > self.min_dist) 91 | mask = np.logical_and(mask, points[:, 0] > 0) 92 | mask = np.logical_and(mask, points[:, 0] < img.shape[1] - 1) 93 | mask = np.logical_and(mask, points[:, 1] > 0) 94 | mask = np.logical_and(mask, points[:, 1] < img.shape[0] - 1) 95 | 96 | matching_points = np.where(mask)[0] 97 | matching_pixels = np.round( 98 | np.flip(points[matching_points], axis=1)).astype(np.int64) 99 | 100 | images.append(img / 255.) 
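            # Accumulate this camera's superpixel map and the point-to-pixel
            # pairings; each row of pairing_images stores (camera index i,
            # pixel row, pixel column) for the matched entry in pairing_points.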
101 | superpixels.append(sp) 102 | pairing_points = np.concatenate((pairing_points, matching_points)) 103 | pairing_images = np.concatenate( 104 | (pairing_images, 105 | np.concatenate((np.ones( 106 | (matching_pixels.shape[0], 1), dtype=np.int64) * i, 107 | matching_pixels), 108 | axis=1))) 109 | 110 | results['imgs'] = torch.tensor( 111 | np.array(images, dtype=np.float32).transpose(0, 3, 1, 2)) 112 | results['superpixels'] = torch.tensor(np.stack(superpixels)) 113 | results['pairing_points'] = pairing_points 114 | results['pairing_images'] = pairing_images 115 | return results 116 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import os 4 | import os.path as osp 5 | 6 | from mmdet3d.utils import replace_ceph_backend 7 | from mmengine.config import Config, DictAction 8 | from mmengine.logging import print_log 9 | from mmengine.registry import RUNNERS 10 | from mmengine.runner import Runner 11 | 12 | 13 | def parse_args(): 14 | parser = argparse.ArgumentParser(description='Train a 3D model') 15 | parser.add_argument('config', help='train config file path') 16 | parser.add_argument('--work-dir', help='the dir to save logs and models') 17 | parser.add_argument( 18 | '--amp', 19 | action='store_true', 20 | default=False, 21 | help='enable automatic-mixed-precision training') 22 | parser.add_argument( 23 | '--auto-scale-lr', 24 | action='store_true', 25 | help='enable automatically scaling LR.') 26 | parser.add_argument( 27 | '--resume', 28 | nargs='?', 29 | type=str, 30 | const='auto', 31 | help='If specify checkpoint path, resume from it, while if not ' 32 | 'specify, try to auto resume from the latest checkpoint ' 33 | 'in the work directory.') 34 | parser.add_argument( 35 | '--ceph', action='store_true', help='Use ceph as data storage backend') 36 | parser.add_argument( 37 | '--cfg-options', 38 | nargs='+', 39 | action=DictAction, 40 | help='override some settings in the used config, the key-value pair ' 41 | 'in xxx=yyy format will be merged into config file. If the value to ' 42 | 'be overwritten is a list, it should be like key="[a,b]" or key=a,b ' 43 | 'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" ' 44 | 'Note that the quotation marks are necessary and that no white space ' 45 | 'is allowed.') 46 | parser.add_argument( 47 | '--launcher', 48 | choices=['none', 'pytorch', 'slurm', 'mpi'], 49 | default='none', 50 | help='job launcher') 51 | # When using PyTorch version >= 2.0.0, the `torch.distributed.launch` 52 | # will pass the `--local-rank` parameter to `tools/train.py` instead 53 | # of `--local_rank`. 
54 | parser.add_argument('--local_rank', '--local-rank', type=int, default=0) 55 | args = parser.parse_args() 56 | if 'LOCAL_RANK' not in os.environ: 57 | os.environ['LOCAL_RANK'] = str(args.local_rank) 58 | return args 59 | 60 | 61 | def main(): 62 | args = parse_args() 63 | 64 | # load config 65 | cfg = Config.fromfile(args.config) 66 | 67 | # TODO: We will unify the ceph support approach with other OpenMMLab repos 68 | if args.ceph: 69 | cfg = replace_ceph_backend(cfg) 70 | 71 | cfg.launcher = args.launcher 72 | if args.cfg_options is not None: 73 | cfg.merge_from_dict(args.cfg_options) 74 | 75 | # work_dir is determined in this priority: CLI > segment in file > filename 76 | if args.work_dir is not None: 77 | # update configs according to CLI args if args.work_dir is not None 78 | cfg.work_dir = args.work_dir 79 | elif cfg.get('work_dir', None) is None: 80 | # use config filename as default work_dir if cfg.work_dir is None 81 | cfg.work_dir = osp.join('./work_dirs', 82 | osp.splitext(osp.basename(args.config))[0]) 83 | 84 | # enable automatic-mixed-precision training 85 | if args.amp is True: 86 | optim_wrapper = cfg.optim_wrapper.type 87 | if optim_wrapper == 'AmpOptimWrapper': 88 | print_log( 89 | 'AMP training is already enabled in your config.', 90 | logger='current', 91 | level=logging.WARNING) 92 | else: 93 | assert optim_wrapper == 'OptimWrapper', ( 94 | '`--amp` is only supported when the optimizer wrapper type is ' 95 | f'`OptimWrapper` but got {optim_wrapper}.') 96 | cfg.optim_wrapper.type = 'AmpOptimWrapper' 97 | cfg.optim_wrapper.loss_scale = 'dynamic' 98 | 99 | # enable automatically scaling LR 100 | if args.auto_scale_lr: 101 | if 'auto_scale_lr' in cfg and \ 102 | 'enable' in cfg.auto_scale_lr and \ 103 | 'base_batch_size' in cfg.auto_scale_lr: 104 | cfg.auto_scale_lr.enable = True 105 | else: 106 | raise RuntimeError('Can not find "auto_scale_lr" or ' 107 | '"auto_scale_lr.enable" or ' 108 | '"auto_scale_lr.base_batch_size" in your' 109 | ' configuration file.') 110 | 111 | # resume is determined in this priority: resume from > auto_resume 112 | if args.resume == 'auto': 113 | cfg.resume = True 114 | cfg.load_from = None 115 | elif args.resume is not None: 116 | cfg.resume = True 117 | cfg.load_from = args.resume 118 | 119 | # build the runner from config 120 | if 'runner_type' not in cfg: 121 | # build the default runner 122 | runner = Runner.from_cfg(cfg) 123 | else: 124 | # build customized runner from the registry 125 | # if 'runner_type' is set in the cfg 126 | runner = RUNNERS.build(cfg) 127 | 128 | # start training 129 | runner.train() 130 | 131 | 132 | if __name__ == '__main__': 133 | main() 134 | -------------------------------------------------------------------------------- /limoe/datasets/transforms/transforms.py: -------------------------------------------------------------------------------- 1 | import random 2 | from typing import Sequence 3 | 4 | import numpy as np 5 | import torch 6 | from mmcv.transforms import BaseTransform 7 | from mmdet3d.registry import TRANSFORMS 8 | from torchvision.transforms import InterpolationMode, RandomResizedCrop 9 | from torchvision.transforms.functional import hflip, resize, resized_crop 10 | 11 | 12 | @TRANSFORMS.register_module() 13 | class ResizedCrop(BaseTransform): 14 | 15 | def __init__(self, 16 | image_crop_size: Sequence[int] = (224, 416), 17 | image_crop_range: Sequence[float] = (0.3, 1.0), 18 | image_crop_ratio: Sequence[float] = (14.0 / 9.0, 17.0 / 9.0), 19 | crop_center: bool = False) -> None: 20 | 
self.crop_size = image_crop_size 21 | self.crop_range = image_crop_range 22 | self.crop_ratio = image_crop_ratio 23 | self.crop_center = crop_center 24 | 25 | def transform(self, results: dict) -> dict: 26 | images = results['imgs'] 27 | superpixels = results['superpixels'].unsqueeze(1) 28 | pairing_points = results['pairing_points'] 29 | pairing_images = results['pairing_images'] 30 | 31 | imgs = torch.empty( 32 | (images.shape[0], 3) + tuple(self.crop_size), dtype=torch.float32) 33 | sps = torch.empty( 34 | (images.shape[0], ) + tuple(self.crop_size), dtype=torch.uint8) 35 | pairing_points_out = np.empty(0, dtype=np.int64) 36 | pairing_images_out = np.empty((0, 3), dtype=np.int64) 37 | 38 | if self.crop_center: 39 | pairing_points_out = pairing_points 40 | 41 | _, _, h, w = images.shape 42 | for id, img in enumerate(images): 43 | mask = pairing_images[:, 0] == id 44 | p2 = pairing_images[mask] 45 | p2 = np.round( 46 | np.multiply( 47 | p2, 48 | [1.0, self.crop_size[0] / h, self.crop_size[1] / w 49 | ])).astype(np.int64) 50 | imgs[id] = resize(img, self.crop_size, 51 | InterpolationMode.BILINEAR) 52 | sps[id] = resize(superpixels[id], self.crop_size, 53 | InterpolationMode.NEAREST) 54 | p2[:, 1] = np.clip(0, self.crop_size[0] - 1, p2[:, 1]) 55 | p2[:, 2] = np.clip(0, self.crop_size[1] - 1, p2[:, 2]) 56 | pairing_images_out = np.concatenate((pairing_images_out, p2)) 57 | else: 58 | for id, img in enumerate(images): 59 | successful = False 60 | mask = pairing_images[:, 0] == id 61 | P1 = pairing_points[mask] 62 | P2 = pairing_images[mask] 63 | while not successful: 64 | i, j, h, w = RandomResizedCrop.get_params( 65 | img, self.crop_range, self.crop_ratio) 66 | p1 = P1.copy() 67 | p2 = P2.copy() 68 | p2 = np.round( 69 | np.multiply(p2 - [0, i, j], [ 70 | 1.0, self.crop_size[0] / h, self.crop_size[1] / w 71 | ])).astype(np.int64) 72 | valid_indexes_0 = np.logical_and( 73 | p2[:, 1] < self.crop_size[0], p2[:, 1] >= 0) 74 | valid_indexes_1 = np.logical_and( 75 | p2[:, 2] < self.crop_size[1], p2[:, 2] >= 0) 76 | valid_indexes = np.logical_and(valid_indexes_0, 77 | valid_indexes_1) 78 | sum_indexes = valid_indexes.sum() 79 | len_indexes = len(valid_indexes) 80 | if sum_indexes > 1024 or sum_indexes / len_indexes > 0.75: 81 | successful = True 82 | imgs[id] = resized_crop(img, i, j, h, w, self.crop_size, 83 | InterpolationMode.BILINEAR) 84 | sps[id] = resized_crop(superpixels[id], i, j, h, w, 85 | self.crop_size, 86 | InterpolationMode.NEAREST) 87 | pairing_points_out = np.concatenate( 88 | (pairing_points_out, p1[valid_indexes])) 89 | pairing_images_out = np.concatenate( 90 | (pairing_images_out, p2[valid_indexes])) 91 | 92 | results['imgs'] = imgs 93 | results['superpixels'] = sps 94 | results['pairing_points'] = pairing_points_out 95 | results['pairing_images'] = pairing_images_out 96 | return results 97 | 98 | 99 | @TRANSFORMS.register_module() 100 | class FlipHorizontal(BaseTransform): 101 | 102 | def __init__(self, flip_ratio: float = 0.5) -> None: 103 | self.flip_ratio = flip_ratio 104 | 105 | def transform(self, results: dict) -> dict: 106 | images = results['imgs'] 107 | superpixels = results['superpixels'] 108 | pairing_images = results['pairing_images'] 109 | 110 | w = images.shape[3] 111 | for i, img in enumerate(images): 112 | if random.random() < self.flip_ratio: 113 | images[i] = hflip(img) 114 | superpixels[i] = hflip(superpixels[i:i + 1]) 115 | mask = pairing_images[:, 0] == i 116 | pairing_images[mask, 2] = w - 1 - pairing_images[mask, 2] 117 | 118 | results['imgs'] = images 119 
| results['superpixels'] = superpixels 120 | results['pairing_images'] = pairing_images 121 | return results 122 | -------------------------------------------------------------------------------- /limoe/models/selfsup/slidr.py: -------------------------------------------------------------------------------- 1 | from typing import Dict 2 | 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | from mmdet3d.registry import MODELS 7 | from mmdet3d.structures.det3d_data_sample import (ForwardResults, 8 | OptSampleList, SampleList) 9 | from mmdet3d.utils import ConfigType, OptConfigType, OptMultiConfig 10 | from mmengine.model import BaseModel 11 | from torch import Tensor 12 | 13 | 14 | class ContrastiveLoss(nn.Module): 15 | 16 | def __init__(self, temperature: float) -> None: 17 | super(ContrastiveLoss, self).__init__() 18 | self.temperature = temperature 19 | self.criterion = nn.CrossEntropyLoss() 20 | 21 | def forward(self, k: Tensor, q: Tensor) -> Tensor: 22 | logits = torch.mm(k, q.transpose(1, 0)) 23 | target = torch.arange(k.shape[0], device=k.device).long() 24 | out = torch.div(logits, self.temperature) 25 | out = out.contiguous() 26 | loss = self.criterion(out, target) 27 | return loss 28 | 29 | 30 | @MODELS.register_module() 31 | class SLidR(BaseModel): 32 | 33 | def __init__(self, 34 | backbone_3d: ConfigType, 35 | head_3d: ConfigType, 36 | backbone_2d: ConfigType, 37 | head_2d: ConfigType, 38 | superpixel_size: int, 39 | temperature: float, 40 | voxel_encoder_3d: OptConfigType = None, 41 | data_preprocessor: OptConfigType = None, 42 | train_cfg: ConfigType = None, 43 | init_cfg: OptMultiConfig = None): 44 | super(SLidR, self).__init__( 45 | data_preprocessor=data_preprocessor, init_cfg=init_cfg) 46 | 47 | self.backbone_2d = MODELS.build(backbone_2d) 48 | self.head_2d = MODELS.build(head_2d) 49 | 50 | self.backbone_3d = MODELS.build(backbone_3d) 51 | self.head_3d = MODELS.build(head_3d) 52 | if voxel_encoder_3d is not None: 53 | self.voxel_encoder_3d = MODELS.build(voxel_encoder_3d) 54 | self.range = True 55 | else: 56 | self.voxel_encoder_3d = None 57 | self.range = False 58 | 59 | self.superpixel_size = superpixel_size 60 | self.contrastive_loss = ContrastiveLoss(temperature) 61 | self.train_cfg = train_cfg 62 | 63 | def extract_3d_feature(self, feat_dict: dict) -> Tensor: 64 | if self.range: 65 | feat_dict = self.voxel_encoder_3d(feat_dict) 66 | feat_dict = self.backbone_3d(feat_dict) 67 | features = self.head_3d(feat_dict)['logits'] 68 | features = F.normalize(features, p=2, dim=1) 69 | return features 70 | 71 | def extract_2d_feature(self, images: Tensor) -> Tensor: 72 | features = self.backbone_2d(images) 73 | features = self.head_2d(features) 74 | features = F.normalize(features, p=2, dim=1) 75 | return features 76 | 77 | def loss(self, inputs: dict, 78 | data_samples: SampleList) -> Dict[str, Tensor]: 79 | 80 | # forward 81 | features_2d = self.extract_2d_feature(inputs['imgs']) 82 | 83 | feat_dict = inputs['ranges'].copy( 84 | ) if self.range else inputs['voxels'].copy() 85 | features_3d = self.extract_3d_feature(feat_dict) 86 | 87 | superpixels = [] 88 | pairing_images = [] 89 | pairing_points = [] 90 | offset = 0 91 | 92 | if self.range: 93 | coors = feat_dict['coors'] 94 | for i, data_sample in enumerate(data_samples): 95 | superpixel = data_sample.gt_pts_seg.superpixels 96 | pairing_image = data_sample.gt_pts_seg.pairing_images 97 | pairing_image[:, 0] += i * superpixel.shape[0] 98 | pairing_point = data_sample.gt_pts_seg.pairing_points 99 | 
pairing_point = pairing_point.long() + offset 100 | offset += sum(coors[:, 0] == i) 101 | 102 | superpixels.append(superpixel) 103 | pairing_images.append(pairing_image) 104 | pairing_points.append(pairing_point) 105 | 106 | else: 107 | for i, data_sample in enumerate(data_samples): 108 | superpixel = data_sample.gt_pts_seg.superpixels 109 | pairing_image = data_sample.gt_pts_seg.pairing_images 110 | pairing_image[:, 0] += i * superpixel.shape[0] 111 | pairing_point = data_sample.gt_pts_seg.pairing_points 112 | inverse_map = feat_dict['point2voxel_maps'][i] 113 | pairing_point = inverse_map[pairing_point].long() + offset 114 | offset += feat_dict['voxel_inds'][i].shape[0] 115 | 116 | superpixels.append(superpixel) 117 | pairing_images.append(pairing_image) 118 | pairing_points.append(pairing_point) 119 | 120 | superpixels = torch.cat(superpixels) 121 | pairing_images = torch.cat(pairing_images) 122 | pairing_points = torch.cat(pairing_points) 123 | 124 | superpixels = ( 125 | torch.arange( 126 | 0, 127 | features_2d.shape[0] * self.superpixel_size, 128 | self.superpixel_size, 129 | device=features_2d.device)[:, None, None] + superpixels) 130 | 131 | m = tuple(pairing_images.cpu().T.long()) 132 | superpixels_I = superpixels.flatten() 133 | idx_P = torch.arange( 134 | pairing_points.shape[0], device=features_2d.device) 135 | total_pixels = superpixels_I.shape[0] 136 | idx_I = torch.arange(total_pixels, device=features_2d.device) 137 | 138 | with torch.no_grad(): 139 | one_hot_P = torch.sparse_coo_tensor( 140 | torch.stack((superpixels[m], idx_P), dim=0), 141 | torch.ones(pairing_points.shape[0], device=features_2d.device), 142 | (superpixels.shape[0] * self.superpixel_size, 143 | pairing_points.shape[0])) 144 | one_hot_I = torch.sparse_coo_tensor( 145 | torch.stack((superpixels_I, idx_I), dim=0), 146 | torch.ones(total_pixels, device=features_2d.device), 147 | (superpixels.shape[0] * self.superpixel_size, total_pixels)) 148 | 149 | k = one_hot_P @ features_3d[pairing_points] 150 | k = k / (torch.sparse.sum(one_hot_P, 1).to_dense()[:, None] + 1e-6) 151 | q = one_hot_I @ features_2d.permute(0, 2, 3, 1).flatten(0, 2) 152 | q = q / (torch.sparse.sum(one_hot_I, 1).to_dense()[:, None] + 1e-6) 153 | 154 | mask = torch.where(k[:, 0] != 0) 155 | valid_k = k[mask] 156 | valid_q = q[mask] 157 | 158 | loss = dict() 159 | loss['loss_spatial'] = self.contrastive_loss(valid_k, valid_q) 160 | 161 | return loss 162 | 163 | def forward(self, 164 | inputs: dict, 165 | data_samples: OptSampleList = None, 166 | mode: str = 'tensor') -> ForwardResults: 167 | if mode == 'loss': 168 | return self.loss(inputs, data_samples) 169 | -------------------------------------------------------------------------------- /limoe/models/backbones/spvcnn.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, Sequence 2 | 3 | import torch 4 | import torch.nn as nn 5 | import torchsparse 6 | import torchsparse.nn.functional as F 7 | from mmdet3d.registry import MODELS 8 | from mmdet3d.utils import OptMultiConfig 9 | from torchsparse import PointTensor, SparseTensor 10 | from torchsparse.nn.utils import get_kernel_offsets 11 | 12 | from .minkunet import MinkUNetBackbone 13 | 14 | 15 | @MODELS.register_module() 16 | class SPVCNNBackbone(MinkUNetBackbone): 17 | 18 | def __init__(self, 19 | in_channels: int = 4, 20 | base_channels: int = 32, 21 | layers: Sequence[int] = [2, 3, 4, 6, 2, 2, 2, 2], 22 | planes: Sequence[int] = [32, 64, 128, 256, 256, 128, 96, 96], 23 | 
block_type: str = 'basic', 24 | bn_momentum: float = 0.1, 25 | drop_ratio: float = 0.3, 26 | init_cfg: OptMultiConfig = None) -> None: 27 | super(SPVCNNBackbone, self).__init__( 28 | in_channels=in_channels, 29 | base_channels=base_channels, 30 | layers=layers, 31 | planes=planes, 32 | block_type=block_type, 33 | bn_momentum=bn_momentum, 34 | init_cfg=init_cfg) 35 | 36 | self.point_transforms = nn.ModuleList([ 37 | nn.Sequential( 38 | nn.Linear(base_channels, planes[3]), nn.BatchNorm1d(planes[3]), 39 | nn.ReLU(True)), 40 | nn.Sequential( 41 | nn.Linear(planes[3], planes[5]), nn.BatchNorm1d(planes[5]), 42 | nn.ReLU(True)), 43 | nn.Sequential( 44 | nn.Linear(planes[5], planes[7]), nn.BatchNorm1d(planes[7]), 45 | nn.ReLU(True)), 46 | ]) 47 | self.dropout = nn.Dropout(drop_ratio, True) 48 | 49 | def forward(self, feat_dict: dict) -> dict: 50 | voxel_features = feat_dict['voxels'] 51 | coors = feat_dict['coors'] 52 | 53 | # x: SparseTensor z: PointTensor 54 | x = SparseTensor(voxel_features, coors) 55 | z = PointTensor(x.F, x.C.float()) 56 | x = initial_voxelize(z) 57 | 58 | out0 = self.conv0(x) 59 | z0 = voxel_to_point(out0, z) 60 | out0 = point_to_voxel(out0, z0) 61 | 62 | out1 = self.conv1(out0) 63 | out1 = self.block1(out1) 64 | 65 | out2 = self.conv2(out1) 66 | out2 = self.block2(out2) 67 | 68 | out3 = self.conv3(out2) 69 | out3 = self.block3(out3) 70 | 71 | out4 = self.conv4(out3) 72 | out4 = self.block4(out4) 73 | 74 | z1 = voxel_to_point(out4, z0, self.point_transforms[0]) 75 | out4 = point_to_voxel(out4, z1) 76 | out4.F = self.dropout(out4.F) 77 | 78 | out = self.conv5(out4) 79 | out = torchsparse.cat((out, out3)) 80 | out = self.block5(out) 81 | 82 | out = self.conv6(out) 83 | out = torchsparse.cat((out, out2)) 84 | out = self.block6(out) 85 | 86 | z2 = voxel_to_point(out, z1, self.point_transforms[1]) 87 | out = point_to_voxel(out, z2) 88 | out.F = self.dropout(out.F) 89 | 90 | out = self.conv7(out) 91 | out = torchsparse.cat((out, out1)) 92 | out = self.block7(out) 93 | 94 | out = self.conv8(out) 95 | out = torchsparse.cat((out, out0)) 96 | out = self.block8(out) 97 | 98 | out = voxel_to_point(out, z2, self.point_transforms[2]) 99 | feat_dict['voxel_feats'] = out.F 100 | return feat_dict 101 | 102 | 103 | def initial_voxelize(points: PointTensor) -> SparseTensor: 104 | """Voxelize again based on input PointTensor. 105 | 106 | Args: 107 | points (PointTensor): Input points after voxelization. 108 | 109 | Returns: 110 | SparseTensor: New voxels. 111 | """ 112 | pc_hash = F.sphash(torch.floor(points.C).int()) 113 | sparse_hash = torch.unique(pc_hash) 114 | idx_query = F.sphashquery(pc_hash, sparse_hash) 115 | counts = F.spcount(idx_query.int(), len(sparse_hash)) 116 | 117 | inserted_coords = F.spvoxelize(torch.floor(points.C), idx_query, counts) 118 | inserted_coords = torch.round(inserted_coords).int() 119 | inserted_feat = F.spvoxelize(points.F, idx_query, counts) 120 | 121 | new_tensor = SparseTensor(inserted_feat, inserted_coords, 1) 122 | new_tensor.cmaps.setdefault(new_tensor.stride, new_tensor.coords) 123 | points.additional_features['idx_query'][1] = idx_query 124 | points.additional_features['counts'][1] = counts 125 | return new_tensor 126 | 127 | 128 | def voxel_to_point(voxels: SparseTensor, 129 | points: PointTensor, 130 | point_transform: Optional[nn.Module] = None, 131 | nearest: bool = False) -> PointTensor: 132 | """Fead voxel features to points. 133 | 134 | Args: 135 | voxels (SparseTensor): Input voxels. 136 | points (PointTensor): Input points. 
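point_transform (nn.Module, optional): Optional module applied to the point features before they are added to the devoxelized voxel features. Defaults to None.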
137 | nearest (bool): Whether to use nearest neighbor interpolation. 138 | Defaults to False. 139 | 140 | Returns: 141 | PointTensor: Points with new features. 142 | """ 143 | if points.idx_query is None or points.weights is None or \ 144 | points.idx_query.get(voxels.s) is None or \ 145 | points.weights.get(voxels.s) is None: 146 | offsets = get_kernel_offsets(2, voxels.s, 1, device=points.F.device) 147 | old_hash = F.sphash( 148 | torch.cat([ 149 | torch.floor(points.C[:, :3] / voxels.s[0]).int() * voxels.s[0], 150 | points.C[:, -1].int().view(-1, 1) 151 | ], 1), offsets) 152 | pc_hash = F.sphash(voxels.C.to(points.F.device)) 153 | idx_query = F.sphashquery(old_hash, pc_hash) 154 | weights = F.calc_ti_weights( 155 | points.C, idx_query, scale=voxels.s[0]).transpose(0, 156 | 1).contiguous() 157 | idx_query = idx_query.transpose(0, 1).contiguous() 158 | if nearest: 159 | weights[:, 1:] = 0. 160 | idx_query[:, 1:] = -1 161 | new_features = F.spdevoxelize(voxels.F, idx_query, weights) 162 | new_tensor = PointTensor( 163 | new_features, 164 | points.C, 165 | idx_query=points.idx_query, 166 | weights=points.weights) 167 | new_tensor.additional_features = points.additional_features 168 | new_tensor.idx_query[voxels.s] = idx_query 169 | new_tensor.weights[voxels.s] = weights 170 | points.idx_query[voxels.s] = idx_query 171 | points.weights[voxels.s] = weights 172 | else: 173 | new_features = F.spdevoxelize(voxels.F, points.idx_query.get(voxels.s), 174 | points.weights.get(voxels.s)) 175 | new_tensor = PointTensor( 176 | new_features, 177 | points.C, 178 | idx_query=points.idx_query, 179 | weights=points.weights) 180 | new_tensor.additional_features = points.additional_features 181 | 182 | if point_transform is not None: 183 | new_tensor.F = new_tensor.F + point_transform(points.F) 184 | 185 | return new_tensor 186 | 187 | 188 | def point_to_voxel(voxels: SparseTensor, points: PointTensor) -> SparseTensor: 189 | """Feed point features to voxels. 190 | 191 | Args: 192 | voxels (SparseTensor): Input voxels. 193 | points (PointTensor): Input points. 194 | 195 | Returns: 196 | SparseTensor: Voxels with new features. 
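Note: the point-to-voxel assignment (idx_query / counts) is computed once per voxel stride with F.sphash / F.sphashquery and cached in points.additional_features, so repeated calls at the same stride reuse it.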
197 | """ 198 | if points.additional_features is None or \ 199 | points.additional_features.get('idx_query') is None or \ 200 | points.additional_features['idx_query'].get(voxels.s) is None: 201 | pc_hash = F.sphash( 202 | torch.cat([ 203 | torch.floor(points.C[:, :3] / voxels.s[0]).int() * voxels.s[0], 204 | points.C[:, -1].int().view(-1, 1) 205 | ], 1)) 206 | sparse_hash = F.sphash(voxels.C) 207 | idx_query = F.sphashquery(pc_hash, sparse_hash) 208 | counts = F.spcount(idx_query.int(), voxels.C.shape[0]) 209 | points.additional_features['idx_query'][voxels.s] = idx_query 210 | points.additional_features['counts'][voxels.s] = counts 211 | else: 212 | idx_query = points.additional_features['idx_query'][voxels.s] 213 | counts = points.additional_features['counts'][voxels.s] 214 | 215 | inserted_features = F.spvoxelize(points.F, idx_query, counts) 216 | new_tensor = SparseTensor(inserted_features, voxels.C, voxels.s) 217 | new_tensor.cmaps = voxels.cmaps 218 | new_tensor.kmaps = voxels.kmaps 219 | 220 | return new_tensor 221 | -------------------------------------------------------------------------------- /limoe/models/data_preprocessors/data_preprocessor.py: -------------------------------------------------------------------------------- 1 | from typing import List, Optional, Sequence, Union 2 | 3 | import numpy as np 4 | import torch 5 | import torch.nn.functional as F 6 | from mmdet3d.registry import MODELS 7 | from mmdet3d.structures.det3d_data_sample import SampleList 8 | from mmengine.model import ImgDataPreprocessor 9 | from mmengine.utils import is_seq_of 10 | from torch import Tensor 11 | 12 | 13 | @MODELS.register_module() 14 | class LiMoEDataPreprocessor(ImgDataPreprocessor): 15 | 16 | def __init__(self, 17 | H: int, 18 | W: int, 19 | fov_up: float, 20 | fov_down: float, 21 | ignore_index: int, 22 | voxel_size: Sequence[float], 23 | voxel_type: str = 'cubic', 24 | mean: Optional[Sequence[Union[float, int]]] = None, 25 | std: Optional[Sequence[Union[float, int]]] = None, 26 | pad_size_divisor: int = 1, 27 | pad_value: Union[float, int] = 0, 28 | to_rgb: bool = False, 29 | bgr_to_rgb: bool = False, 30 | rgb_to_bgr: bool = False, 31 | non_blocking: bool = False) -> None: 32 | super(LiMoEDataPreprocessor, self).__init__( 33 | mean=mean, 34 | std=std, 35 | pad_size_divisor=pad_size_divisor, 36 | pad_value=pad_value, 37 | bgr_to_rgb=bgr_to_rgb, 38 | rgb_to_bgr=rgb_to_bgr, 39 | non_blocking=non_blocking) 40 | self._channel_conversion = to_rgb or bgr_to_rgb or rgb_to_bgr 41 | self.H = H 42 | self.W = W 43 | self.fov_up = fov_up / 180 * np.pi 44 | self.fov_down = fov_down / 180 * np.pi 45 | self.fov = abs(self.fov_down) + abs(self.fov_up) 46 | self.ignore_index = ignore_index 47 | self.voxel_size = voxel_size 48 | self.voxel_type = voxel_type 49 | 50 | def forward(self, data: dict, training: bool = False) -> dict: 51 | data = self.collate_data(data) 52 | inputs, data_samples = data['inputs'], data['data_samples'] 53 | batch_inputs = dict() 54 | 55 | if 'points' in inputs: 56 | batch_inputs['points'] = inputs['points'] 57 | voxel_dict = self.voxelize(inputs['points'], data_samples) 58 | range_dict = self.frustum_region_group(inputs['points'], 59 | data_samples) 60 | batch_inputs['voxels'] = voxel_dict 61 | batch_inputs['ranges'] = range_dict 62 | 63 | if 'imgs' in inputs: 64 | imgs = inputs['imgs'] 65 | 66 | if data_samples is not None: 67 | batch_input_shape = tuple(imgs[0].size()[-2:]) 68 | for data_sample in data_samples: 69 | data_sample.set_metainfo( 70 | {'batch_input_shape': 
batch_input_shape}) 71 | 72 | batch_inputs['imgs'] = imgs 73 | 74 | return {'inputs': batch_inputs, 'data_samples': data_samples} 75 | 76 | def preprocess_img(self, _batch_img: Tensor) -> Tensor: 77 | if self._channel_conversion: 78 | _batch_img = _batch_img[[2, 1, 0], ...] 79 | _batch_img = _batch_img.float() 80 | if self._enable_normalize: 81 | if self.mean.shape[0] == 3: 82 | assert _batch_img.dim() == 3 and _batch_img.shape[0] == 3 83 | _batch_img = (_batch_img - self.mean) / self.std 84 | return _batch_img 85 | 86 | def collate_data(self, data: dict) -> dict: 87 | data = self.cast_data(data) 88 | 89 | if 'imgs' in data['inputs']: 90 | _batch_imgs = data['inputs']['imgs'] 91 | assert is_seq_of(_batch_imgs, Tensor) 92 | 93 | batch_imgs = [] 94 | for _batch_img in _batch_imgs: 95 | _batch_img = [self.preprocess_img(_img) for _img in _batch_img] 96 | _batch_img = torch.stack(_batch_img, dim=0) 97 | batch_imgs.append(_batch_img) 98 | 99 | batch_imgs = torch.concat(batch_imgs, dim=0) 100 | data['inputs']['imgs'] = batch_imgs 101 | 102 | data.setdefault('data_samples', None) 103 | return data 104 | 105 | @torch.no_grad() 106 | def voxelize(self, points: List[Tensor], data_samples: SampleList) -> dict: 107 | voxel_dict = dict() 108 | 109 | voxels = [] 110 | coors = [] 111 | point2voxel_maps = [] 112 | voxel_inds = [] 113 | 114 | voxel_size = points[0].new_tensor(self.voxel_size) 115 | 116 | for i, res in enumerate(points): 117 | if self.voxel_type == 'cubic': 118 | res_coors = torch.round(res[:, :3] / voxel_size).int() 119 | elif self.voxel_type == 'cylinder': 120 | rho = torch.sqrt(res[:, 0]**2 + res[:, 1]**2) 121 | phi = torch.atan2(res[:, 1], res[:, 0]) * 180 / np.pi 122 | polar_res = torch.stack((rho, phi, res[:, 2]), dim=1) 123 | res_coors = torch.round(polar_res[:, :3] / voxel_size).int() 124 | 125 | res_coors -= res_coors.min(0)[0] 126 | 127 | res_coors_numpy = res_coors.cpu().numpy() 128 | inds, point2voxel_map = self.sparse_quantize( 129 | res_coors_numpy, return_index=True, return_inverse=True) 130 | point2voxel_map = torch.from_numpy(point2voxel_map).cuda() 131 | inds = torch.from_numpy(inds).cuda() 132 | res_voxel_coors = res_coors[inds] 133 | res_voxels = res[inds] 134 | res_voxel_coors = F.pad( 135 | res_voxel_coors, (0, 1), mode='constant', value=i) 136 | voxels.append(res_voxels) 137 | coors.append(res_voxel_coors) 138 | point2voxel_maps.append(point2voxel_map) 139 | voxel_inds.append(inds) 140 | 141 | voxels = torch.cat(voxels, dim=0) 142 | coors = torch.cat(coors, dim=0) 143 | 144 | voxel_dict['voxels'] = voxels 145 | voxel_dict['coors'] = coors 146 | voxel_dict['point2voxel_maps'] = point2voxel_maps 147 | voxel_dict['voxel_inds'] = voxel_inds 148 | 149 | return voxel_dict 150 | 151 | def ravel_hash(self, x: np.ndarray) -> np.ndarray: 152 | assert x.ndim == 2, x.shape 153 | 154 | x = x - np.min(x, axis=0) 155 | x = x.astype(np.uint64, copy=False) 156 | xmax = np.max(x, axis=0).astype(np.uint64) + 1 157 | 158 | h = np.zeros(x.shape[0], dtype=np.uint64) 159 | for k in range(x.shape[1] - 1): 160 | h += x[:, k] 161 | h *= xmax[k + 1] 162 | h += x[:, -1] 163 | return h 164 | 165 | def sparse_quantize(self, 166 | coords: np.ndarray, 167 | return_index: bool = False, 168 | return_inverse: bool = False) -> List[np.ndarray]: 169 | _, indices, inverse_indices = np.unique( 170 | self.ravel_hash(coords), return_index=True, return_inverse=True) 171 | 172 | outputs = [] 173 | if return_index: 174 | outputs += [indices] 175 | if return_inverse: 176 | outputs += [inverse_indices] 177 | 
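# Illustrative example (not executed): for quantized coords
# [[0, 0, 0], [0, 0, 0], [1, 0, 0]], ravel_hash gives the two identical rows the
# same key, so np.unique returns indices == [0, 2] (the first point of each voxel,
# used above to gather voxel coords/features) and inverse_indices == [0, 0, 1]
# (the per-point point2voxel map used later to scatter voxel features back to points).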
return outputs 178 | 179 | @torch.no_grad() 180 | def frustum_region_group(self, points: List[Tensor], 181 | data_samples: SampleList) -> dict: 182 | range_dict = dict() 183 | 184 | coors = [] 185 | voxels = [] 186 | 187 | for i, res in enumerate(points): 188 | depth = torch.linalg.norm(res[:, :3], 2, dim=1) 189 | yaw = -torch.atan2(res[:, 1], res[:, 0]) 190 | pitch = torch.arcsin(res[:, 2] / depth) 191 | 192 | coors_x = 0.5 * (yaw / np.pi + 1.0) 193 | coors_y = 1.0 - (pitch + abs(self.fov_down)) / self.fov 194 | 195 | # scale to image size using angular resolution 196 | coors_x *= self.W 197 | coors_y *= self.H 198 | 199 | # round and clamp for use as index 200 | coors_x = torch.floor(coors_x) 201 | coors_x = torch.clamp( 202 | coors_x, min=0, max=self.W - 1).type(torch.int64) 203 | 204 | coors_y = torch.floor(coors_y) 205 | coors_y = torch.clamp( 206 | coors_y, min=0, max=self.H - 1).type(torch.int64) 207 | 208 | res_coors = torch.stack([coors_y, coors_x], dim=1) 209 | res_coors = F.pad(res_coors, (1, 0), mode='constant', value=i) 210 | coors.append(res_coors) 211 | voxels.append(res) 212 | 213 | if 'pts_semantic_mask' in data_samples[i].gt_pts_seg: 214 | import torch_scatter 215 | pts_semantic_mask = data_samples[ 216 | i].gt_pts_seg.pts_semantic_mask 217 | seg_label = torch.ones( 218 | (self.H, self.W), 219 | dtype=torch.long, 220 | device=pts_semantic_mask.device) * self.ignore_index 221 | res_voxel_coors, inverse_map = torch.unique( 222 | res_coors, return_inverse=True, dim=0) 223 | voxel_semantic_mask = torch_scatter.scatter_mean( 224 | F.one_hot(pts_semantic_mask).float(), inverse_map, dim=0) 225 | voxel_semantic_mask = torch.argmax(voxel_semantic_mask, dim=-1) 226 | seg_label[res_voxel_coors[:, 1], 227 | res_voxel_coors[:, 2]] = voxel_semantic_mask 228 | data_samples[i].gt_pts_seg.semantic_seg = seg_label 229 | 230 | voxels = torch.cat(voxels, dim=0) 231 | coors = torch.cat(coors, dim=0) 232 | range_dict['voxels'] = voxels 233 | range_dict['coors'] = coors 234 | 235 | return range_dict 236 | -------------------------------------------------------------------------------- /limoe/models/backbones/dinov2/layers/block.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from typing import Any, Callable, Dict, List, Tuple 3 | 4 | import torch 5 | from torch import Tensor, nn 6 | 7 | from .attention import Attention, MemEffAttention 8 | from .drop_path import DropPath 9 | from .layer_scale import LayerScale 10 | from .mlp import Mlp 11 | 12 | logger = logging.getLogger('dinov2') 13 | 14 | try: 15 | from xformers.ops import fmha, index_select_cat, scaled_index_add 16 | 17 | XFORMERS_AVAILABLE = True 18 | except ImportError: 19 | logger.warning('xFormers not available') 20 | XFORMERS_AVAILABLE = False 21 | 22 | 23 | class Block(nn.Module): 24 | 25 | def __init__( 26 | self, 27 | dim: int, 28 | num_heads: int, 29 | mlp_ratio: float = 4.0, 30 | qkv_bias: bool = False, 31 | proj_bias: bool = True, 32 | ffn_bias: bool = True, 33 | drop: float = 0.0, 34 | attn_drop: float = 0.0, 35 | init_values=None, 36 | drop_path: float = 0.0, 37 | act_layer: Callable[..., nn.Module] = nn.GELU, 38 | norm_layer: Callable[..., nn.Module] = nn.LayerNorm, 39 | attn_class: Callable[..., nn.Module] = Attention, 40 | ffn_layer: Callable[..., nn.Module] = Mlp, 41 | ) -> None: 42 | super().__init__() 43 | # print(f"biases: qkv: {qkv_bias}, proj: {proj_bias}, ffn: {ffn_bias}") 44 | self.norm1 = norm_layer(dim) 45 | self.attn = attn_class( 46 | dim, 47 | 
num_heads=num_heads, 48 | qkv_bias=qkv_bias, 49 | proj_bias=proj_bias, 50 | attn_drop=attn_drop, 51 | proj_drop=drop, 52 | ) 53 | self.ls1 = ( 54 | LayerScale(dim, init_values=init_values) 55 | if init_values else nn.Identity()) 56 | self.drop_path1 = DropPath( 57 | drop_path) if drop_path > 0.0 else nn.Identity() 58 | 59 | self.norm2 = norm_layer(dim) 60 | mlp_hidden_dim = int(dim * mlp_ratio) 61 | self.mlp = ffn_layer( 62 | in_features=dim, 63 | hidden_features=mlp_hidden_dim, 64 | act_layer=act_layer, 65 | drop=drop, 66 | bias=ffn_bias, 67 | ) 68 | self.ls2 = ( 69 | LayerScale(dim, init_values=init_values) 70 | if init_values else nn.Identity()) 71 | self.drop_path2 = DropPath( 72 | drop_path) if drop_path > 0.0 else nn.Identity() 73 | 74 | self.sample_drop_ratio = drop_path 75 | 76 | def forward(self, x: Tensor) -> Tensor: 77 | 78 | def attn_residual_func(x: Tensor) -> Tensor: 79 | return self.ls1(self.attn(self.norm1(x))) 80 | 81 | def ffn_residual_func(x: Tensor) -> Tensor: 82 | return self.ls2(self.mlp(self.norm2(x))) 83 | 84 | if self.training and self.sample_drop_ratio > 0.1: 85 | # the overhead is compensated only for a drop path rate 86 | # larger than 0.1 87 | x = drop_add_residual_stochastic_depth( 88 | x, 89 | residual_func=attn_residual_func, 90 | sample_drop_ratio=self.sample_drop_ratio, 91 | ) 92 | x = drop_add_residual_stochastic_depth( 93 | x, 94 | residual_func=ffn_residual_func, 95 | sample_drop_ratio=self.sample_drop_ratio, 96 | ) 97 | elif self.training and self.sample_drop_ratio > 0.0: 98 | x = x + self.drop_path1(attn_residual_func(x)) 99 | x = x + self.drop_path1(ffn_residual_func(x)) # FIXME: drop_path2 100 | else: 101 | x = x + attn_residual_func(x) 102 | x = x + ffn_residual_func(x) 103 | return x 104 | 105 | 106 | def drop_add_residual_stochastic_depth( 107 | x: Tensor, 108 | residual_func: Callable[[Tensor], Tensor], 109 | sample_drop_ratio: float = 0.0, 110 | ) -> Tensor: 111 | # 1) extract subset using permutation 112 | b, n, d = x.shape 113 | sample_subset_size = max(int(b * (1 - sample_drop_ratio)), 1) 114 | brange = (torch.randperm(b, device=x.device))[:sample_subset_size] 115 | x_subset = x[brange] 116 | 117 | # 2) apply residual_func to get residual 118 | residual = residual_func(x_subset) 119 | 120 | x_flat = x.flatten(1) 121 | residual = residual.flatten(1) 122 | 123 | residual_scale_factor = b / sample_subset_size 124 | 125 | # 3) add the residual 126 | x_plus_residual = torch.index_add( 127 | x_flat, 128 | 0, 129 | brange, 130 | residual.to(dtype=x.dtype), 131 | alpha=residual_scale_factor) 132 | return x_plus_residual.view_as(x) 133 | 134 | 135 | def get_branges_scales(x, sample_drop_ratio=0.0): 136 | b, n, d = x.shape 137 | sample_subset_size = max(int(b * (1 - sample_drop_ratio)), 1) 138 | brange = (torch.randperm(b, device=x.device))[:sample_subset_size] 139 | residual_scale_factor = b / sample_subset_size 140 | return brange, residual_scale_factor 141 | 142 | 143 | def add_residual(x, 144 | brange, 145 | residual, 146 | residual_scale_factor, 147 | scaling_vector=None): 148 | if scaling_vector is None: 149 | x_flat = x.flatten(1) 150 | residual = residual.flatten(1) 151 | x_plus_residual = torch.index_add( 152 | x_flat, 153 | 0, 154 | brange, 155 | residual.to(dtype=x.dtype), 156 | alpha=residual_scale_factor) 157 | else: 158 | x_plus_residual = scaled_index_add( 159 | x, 160 | brange, 161 | residual.to(dtype=x.dtype), 162 | scaling=scaling_vector, 163 | alpha=residual_scale_factor, 164 | ) 165 | return x_plus_residual 166 | 167 | 168 | 
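# Worked example for the stochastic-depth helpers above (illustrative only): with
# batch size b = 8 and sample_drop_ratio = 0.5, get_branges_scales keeps a random
# subset of 4 samples and returns residual_scale_factor = 8 / 4 = 2.0; the residual
# branch runs only on that subset, and index_add / scaled_index_add writes it back
# scaled by 2.0, so the expected update matches the full-batch residual at roughly
# half the branch compute.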
attn_bias_cache: Dict[Tuple, Any] = {} 169 | 170 | 171 | def get_attn_bias_and_cat(x_list, branges=None): 172 | """this will perform the index select, cat the tensors, and provide the 173 | attn_bias from cache.""" 174 | batch_sizes = ([b.shape[0] for b in branges] 175 | if branges is not None else [x.shape[0] for x in x_list]) 176 | all_shapes = tuple((b, x.shape[1]) for b, x in zip(batch_sizes, x_list)) 177 | if all_shapes not in attn_bias_cache.keys(): 178 | seqlens = [] 179 | for b, x in zip(batch_sizes, x_list): 180 | for _ in range(b): 181 | seqlens.append(x.shape[1]) 182 | attn_bias = fmha.BlockDiagonalMask.from_seqlens(seqlens) 183 | attn_bias._batch_sizes = batch_sizes 184 | attn_bias_cache[all_shapes] = attn_bias 185 | 186 | if branges is not None: 187 | cat_tensors = index_select_cat([x.flatten(1) for x in x_list], 188 | branges).view(1, -1, 189 | x_list[0].shape[-1]) 190 | else: 191 | tensors_bs1 = tuple(x.reshape([1, -1, *x.shape[2:]]) for x in x_list) 192 | cat_tensors = torch.cat(tensors_bs1, dim=1) 193 | 194 | return attn_bias_cache[all_shapes], cat_tensors 195 | 196 | 197 | def drop_add_residual_stochastic_depth_list( 198 | x_list: List[Tensor], 199 | residual_func: Callable[[Tensor, Any], Tensor], 200 | sample_drop_ratio: float = 0.0, 201 | scaling_vector=None, 202 | ) -> Tensor: 203 | # 1) generate random set of indices for dropping samples in the batch 204 | branges_scales = [ 205 | get_branges_scales(x, sample_drop_ratio=sample_drop_ratio) 206 | for x in x_list 207 | ] 208 | branges = [s[0] for s in branges_scales] 209 | residual_scale_factors = [s[1] for s in branges_scales] 210 | 211 | # 2) get attention bias and index+concat the tensors 212 | attn_bias, x_cat = get_attn_bias_and_cat(x_list, branges) 213 | 214 | # 3) apply residual_func to get residual, and split the result 215 | residual_list = attn_bias.split(residual_func( 216 | x_cat, attn_bias=attn_bias)) # type: ignore 217 | 218 | outputs = [] 219 | for x, brange, residual, residual_scale_factor in zip( 220 | x_list, branges, residual_list, residual_scale_factors): 221 | outputs.append( 222 | add_residual(x, brange, residual, residual_scale_factor, 223 | scaling_vector).view_as(x)) 224 | return outputs 225 | 226 | 227 | class NestedTensorBlock(Block): 228 | 229 | def forward_nested(self, x_list: List[Tensor]) -> List[Tensor]: 230 | """x_list contains a list of tensors to nest together and run.""" 231 | assert isinstance(self.attn, MemEffAttention) 232 | 233 | if self.training and self.sample_drop_ratio > 0.0: 234 | 235 | def attn_residual_func(x: Tensor, attn_bias=None) -> Tensor: 236 | return self.attn(self.norm1(x), attn_bias=attn_bias) 237 | 238 | def ffn_residual_func(x: Tensor, attn_bias=None) -> Tensor: 239 | return self.mlp(self.norm2(x)) 240 | 241 | x_list = drop_add_residual_stochastic_depth_list( 242 | x_list, 243 | residual_func=attn_residual_func, 244 | sample_drop_ratio=self.sample_drop_ratio, 245 | scaling_vector=self.ls1.gamma if isinstance( 246 | self.ls1, LayerScale) else None, 247 | ) 248 | x_list = drop_add_residual_stochastic_depth_list( 249 | x_list, 250 | residual_func=ffn_residual_func, 251 | sample_drop_ratio=self.sample_drop_ratio, 252 | scaling_vector=self.ls2.gamma if isinstance( 253 | self.ls1, LayerScale) else None, 254 | ) 255 | return x_list 256 | else: 257 | 258 | def attn_residual_func(x: Tensor, attn_bias=None) -> Tensor: 259 | return self.ls1(self.attn(self.norm1(x), attn_bias=attn_bias)) 260 | 261 | def ffn_residual_func(x: Tensor, attn_bias=None) -> Tensor: 262 | return 
self.ls2(self.mlp(self.norm2(x))) 263 | 264 | attn_bias, x = get_attn_bias_and_cat(x_list) 265 | x = x + attn_residual_func(x, attn_bias=attn_bias) 266 | x = x + ffn_residual_func(x) 267 | return attn_bias.split(x) 268 | 269 | def forward(self, x_or_x_list): 270 | if isinstance(x_or_x_list, Tensor): 271 | return super().forward(x_or_x_list) 272 | elif isinstance(x_or_x_list, list): 273 | assert (XFORMERS_AVAILABLE 274 | ), 'Please install xFormers for nested tensors usage' 275 | return self.forward_nested(x_or_x_list) 276 | else: 277 | raise AssertionError 278 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2018-2019 Open-MMLab. All rights reserved. 2 | 3 | Apache License 4 | Version 2.0, January 2004 5 | http://www.apache.org/licenses/ 6 | 7 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 8 | 9 | 1. Definitions. 10 | 11 | "License" shall mean the terms and conditions for use, reproduction, 12 | and distribution as defined by Sections 1 through 9 of this document. 13 | 14 | "Licensor" shall mean the copyright owner or entity authorized by 15 | the copyright owner that is granting the License. 16 | 17 | "Legal Entity" shall mean the union of the acting entity and all 18 | other entities that control, are controlled by, or are under common 19 | control with that entity. For the purposes of this definition, 20 | "control" means (i) the power, direct or indirect, to cause the 21 | direction or management of such entity, whether by contract or 22 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 23 | outstanding shares, or (iii) beneficial ownership of such entity. 24 | 25 | "You" (or "Your") shall mean an individual or Legal Entity 26 | exercising permissions granted by this License. 27 | 28 | "Source" form shall mean the preferred form for making modifications, 29 | including but not limited to software source code, documentation 30 | source, and configuration files. 31 | 32 | "Object" form shall mean any form resulting from mechanical 33 | transformation or translation of a Source form, including but 34 | not limited to compiled object code, generated documentation, 35 | and conversions to other media types. 36 | 37 | "Work" shall mean the work of authorship, whether in Source or 38 | Object form, made available under the License, as indicated by a 39 | copyright notice that is included in or attached to the work 40 | (an example is provided in the Appendix below). 41 | 42 | "Derivative Works" shall mean any work, whether in Source or Object 43 | form, that is based on (or derived from) the Work and for which the 44 | editorial revisions, annotations, elaborations, or other modifications 45 | represent, as a whole, an original work of authorship. For the purposes 46 | of this License, Derivative Works shall not include works that remain 47 | separable from, or merely link (or bind by name) to the interfaces of, 48 | the Work and Derivative Works thereof. 49 | 50 | "Contribution" shall mean any work of authorship, including 51 | the original version of the Work and any modifications or additions 52 | to that Work or Derivative Works thereof, that is intentionally 53 | submitted to Licensor for inclusion in the Work by the copyright owner 54 | or by an individual or Legal Entity authorized to submit on behalf of 55 | the copyright owner. 
For the purposes of this definition, "submitted" 56 | means any form of electronic, verbal, or written communication sent 57 | to the Licensor or its representatives, including but not limited to 58 | communication on electronic mailing lists, source code control systems, 59 | and issue tracking systems that are managed by, or on behalf of, the 60 | Licensor for the purpose of discussing and improving the Work, but 61 | excluding communication that is conspicuously marked or otherwise 62 | designated in writing by the copyright owner as "Not a Contribution." 63 | 64 | "Contributor" shall mean Licensor and any individual or Legal Entity 65 | on behalf of whom a Contribution has been received by Licensor and 66 | subsequently incorporated within the Work. 67 | 68 | 2. Grant of Copyright License. Subject to the terms and conditions of 69 | this License, each Contributor hereby grants to You a perpetual, 70 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 71 | copyright license to reproduce, prepare Derivative Works of, 72 | publicly display, publicly perform, sublicense, and distribute the 73 | Work and such Derivative Works in Source or Object form. 74 | 75 | 3. Grant of Patent License. Subject to the terms and conditions of 76 | this License, each Contributor hereby grants to You a perpetual, 77 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 78 | (except as stated in this section) patent license to make, have made, 79 | use, offer to sell, sell, import, and otherwise transfer the Work, 80 | where such license applies only to those patent claims licensable 81 | by such Contributor that are necessarily infringed by their 82 | Contribution(s) alone or by combination of their Contribution(s) 83 | with the Work to which such Contribution(s) was submitted. If You 84 | institute patent litigation against any entity (including a 85 | cross-claim or counterclaim in a lawsuit) alleging that the Work 86 | or a Contribution incorporated within the Work constitutes direct 87 | or contributory patent infringement, then any patent licenses 88 | granted to You under this License for that Work shall terminate 89 | as of the date such litigation is filed. 90 | 91 | 4. Redistribution. 
You may reproduce and distribute copies of the 92 | Work or Derivative Works thereof in any medium, with or without 93 | modifications, and in Source or Object form, provided that You 94 | meet the following conditions: 95 | 96 | (a) You must give any other recipients of the Work or 97 | Derivative Works a copy of this License; and 98 | 99 | (b) You must cause any modified files to carry prominent notices 100 | stating that You changed the files; and 101 | 102 | (c) You must retain, in the Source form of any Derivative Works 103 | that You distribute, all copyright, patent, trademark, and 104 | attribution notices from the Source form of the Work, 105 | excluding those notices that do not pertain to any part of 106 | the Derivative Works; and 107 | 108 | (d) If the Work includes a "NOTICE" text file as part of its 109 | distribution, then any Derivative Works that You distribute must 110 | include a readable copy of the attribution notices contained 111 | within such NOTICE file, excluding those notices that do not 112 | pertain to any part of the Derivative Works, in at least one 113 | of the following places: within a NOTICE text file distributed 114 | as part of the Derivative Works; within the Source form or 115 | documentation, if provided along with the Derivative Works; or, 116 | within a display generated by the Derivative Works, if and 117 | wherever such third-party notices normally appear. The contents 118 | of the NOTICE file are for informational purposes only and 119 | do not modify the License. You may add Your own attribution 120 | notices within Derivative Works that You distribute, alongside 121 | or as an addendum to the NOTICE text from the Work, provided 122 | that such additional attribution notices cannot be construed 123 | as modifying the License. 124 | 125 | You may add Your own copyright statement to Your modifications and 126 | may provide additional or different license terms and conditions 127 | for use, reproduction, or distribution of Your modifications, or 128 | for any such Derivative Works as a whole, provided Your use, 129 | reproduction, and distribution of the Work otherwise complies with 130 | the conditions stated in this License. 131 | 132 | 5. Submission of Contributions. Unless You explicitly state otherwise, 133 | any Contribution intentionally submitted for inclusion in the Work 134 | by You to the Licensor shall be under the terms and conditions of 135 | this License, without any additional terms or conditions. 136 | Notwithstanding the above, nothing herein shall supersede or modify 137 | the terms of any separate license agreement you may have executed 138 | with Licensor regarding such Contributions. 139 | 140 | 6. Trademarks. This License does not grant permission to use the trade 141 | names, trademarks, service marks, or product names of the Licensor, 142 | except as required for reasonable and customary use in describing the 143 | origin of the Work and reproducing the content of the NOTICE file. 144 | 145 | 7. Disclaimer of Warranty. Unless required by applicable law or 146 | agreed to in writing, Licensor provides the Work (and each 147 | Contributor provides its Contributions) on an "AS IS" BASIS, 148 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 149 | implied, including, without limitation, any warranties or conditions 150 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 151 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 152 | appropriateness of using or redistributing the Work and assume any 153 | risks associated with Your exercise of permissions under this License. 154 | 155 | 8. Limitation of Liability. In no event and under no legal theory, 156 | whether in tort (including negligence), contract, or otherwise, 157 | unless required by applicable law (such as deliberate and grossly 158 | negligent acts) or agreed to in writing, shall any Contributor be 159 | liable to You for damages, including any direct, indirect, special, 160 | incidental, or consequential damages of any character arising as a 161 | result of this License or out of the use or inability to use the 162 | Work (including but not limited to damages for loss of goodwill, 163 | work stoppage, computer failure or malfunction, or any and all 164 | other commercial damages or losses), even if such Contributor 165 | has been advised of the possibility of such damages. 166 | 167 | 9. Accepting Warranty or Additional Liability. While redistributing 168 | the Work or Derivative Works thereof, You may choose to offer, 169 | and charge a fee for, acceptance of support, warranty, indemnity, 170 | or other liability obligations and/or rights consistent with this 171 | License. However, in accepting such obligations, You may act only 172 | on Your own behalf and on Your sole responsibility, not on behalf 173 | of any other Contributor, and only if You agree to indemnify, 174 | defend, and hold each Contributor harmless for any liability 175 | incurred by, or claims asserted against, such Contributor by reason 176 | of your accepting any such warranty or additional liability. 177 | 178 | END OF TERMS AND CONDITIONS 179 | 180 | APPENDIX: How to apply the Apache License to your work. 181 | 182 | To apply the Apache License to your work, attach the following 183 | boilerplate notice, with the fields enclosed by brackets "[]" 184 | replaced with your own identifying information. (Don't include 185 | the brackets!) The text should be enclosed in the appropriate 186 | comment syntax for the file format. We also recommend that a 187 | file or class name and description of purpose be included on the 188 | same "printed page" as the copyright notice for easier 189 | identification within third-party archives. 190 | 191 | Copyright 2018-2019 Open-MMLab. 192 | 193 | Licensed under the Apache License, Version 2.0 (the "License"); 194 | you may not use this file except in compliance with the License. 195 | You may obtain a copy of the License at 196 | 197 | http://www.apache.org/licenses/LICENSE-2.0 198 | 199 | Unless required by applicable law or agreed to in writing, software 200 | distributed under the License is distributed on an "AS IS" BASIS, 201 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 202 | See the License for the specific language governing permissions and 203 | limitations under the License. 
204 | -------------------------------------------------------------------------------- /limoe/models/backbones/minkunet.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, Sequence 2 | 3 | import torch.nn as nn 4 | import torchsparse 5 | import torchsparse.nn as spnn 6 | from mmdet3d.registry import MODELS 7 | from mmdet3d.utils import OptMultiConfig 8 | from mmengine.model import BaseModule 9 | from torchsparse.tensor import SparseTensor 10 | 11 | 12 | class TorchsparseConvModule(BaseModule): 13 | 14 | def __init__(self, 15 | inplanes: int, 16 | planes: int, 17 | kernel_size: int, 18 | stride: int = 1, 19 | dilation: int = 1, 20 | bn_momentum: float = 0.1, 21 | transposed: bool = False, 22 | activate: bool = True, 23 | init_cfg: OptMultiConfig = None) -> None: 24 | super(TorchsparseConvModule, self).__init__(init_cfg=init_cfg) 25 | 26 | self.conv = spnn.Conv3d( 27 | inplanes, 28 | planes, 29 | kernel_size=kernel_size, 30 | stride=stride, 31 | dilation=dilation, 32 | bias=False, 33 | transposed=transposed) 34 | self.norm = spnn.BatchNorm(planes, momentum=bn_momentum) 35 | if activate: 36 | self.relu = spnn.ReLU(inplace=True) 37 | else: 38 | self.relu = None 39 | 40 | def forward(self, x: SparseTensor) -> SparseTensor: 41 | out = self.conv(x) 42 | out = self.norm(out) 43 | if self.relu is not None: 44 | out = self.relu(out) 45 | return out 46 | 47 | 48 | class TorchsparseBasicBlock(BaseModule): 49 | expansion = 1 50 | 51 | def __init__(self, 52 | inplanes: int, 53 | planes: int, 54 | stride: int = 1, 55 | dilation: int = 1, 56 | downsample: Optional[nn.Module] = None, 57 | bn_momentum: float = 0.1, 58 | init_cfg: OptMultiConfig = None) -> None: 59 | super(TorchsparseBasicBlock, self).__init__(init_cfg=init_cfg) 60 | 61 | self.conv1 = spnn.Conv3d( 62 | inplanes, 63 | planes, 64 | kernel_size=3, 65 | stride=stride, 66 | dilation=dilation, 67 | bias=False) 68 | self.norm1 = spnn.BatchNorm(planes, momentum=bn_momentum) 69 | 70 | self.conv2 = spnn.Conv3d( 71 | planes, 72 | planes, 73 | kernel_size=3, 74 | stride=1, 75 | dilation=dilation, 76 | bias=False) 77 | self.norm2 = spnn.BatchNorm(planes, momentum=bn_momentum) 78 | self.relu = spnn.ReLU(inplace=True) 79 | self.downsample = downsample 80 | 81 | def forward(self, x: SparseTensor) -> SparseTensor: 82 | residual = x 83 | 84 | out = self.conv1(x) 85 | out = self.norm1(out) 86 | out = self.relu(out) 87 | 88 | out = self.conv2(out) 89 | out = self.norm2(out) 90 | 91 | if self.downsample is not None: 92 | residual = self.downsample(x) 93 | 94 | out += residual 95 | out = self.relu(out) 96 | return out 97 | 98 | 99 | class TorchsparseBottleneck(BaseModule): 100 | expansion = 4 101 | 102 | def __init__(self, 103 | inplanes: int, 104 | planes: int, 105 | stride: int = 1, 106 | dilation: int = 1, 107 | downsample: Optional[nn.Module] = None, 108 | bn_momentum: float = 0.1, 109 | init_cfg: OptMultiConfig = None) -> None: 110 | super(TorchsparseBottleneck, self).__init__(init_cfg=init_cfg) 111 | 112 | self.conv1 = spnn.Conv3d(inplanes, planes, kernel_size=1, bias=False) 113 | self.norm1 = spnn.BatchNorm(planes, momentum=bn_momentum) 114 | 115 | self.conv2 = spnn.Conv3d( 116 | planes, 117 | planes, 118 | kernel_size=3, 119 | stride=stride, 120 | dilation=dilation, 121 | bias=False) 122 | self.norm2 = spnn.BatchNorm(planes, momentum=bn_momentum) 123 | 124 | self.conv3 = spnn.Conv3d( 125 | planes, planes * self.expansion, kernel_size=1, bias=False) 126 | self.norm3 = spnn.BatchNorm( 127 | 
planes * self.expansion, momentum=bn_momentum) 128 | 129 | self.relu = spnn.ReLU(inplace=True) 130 | self.downsample = downsample 131 | 132 | def forward(self, x: SparseTensor) -> SparseTensor: 133 | residual = x 134 | 135 | out = self.conv1(x) 136 | out = self.norm1(out) 137 | out = self.relu(out) 138 | 139 | out = self.conv2(out) 140 | out = self.norm2(out) 141 | out = self.relu(out) 142 | 143 | out = self.conv3(out) 144 | out = self.norm3(out) 145 | 146 | if self.downsample is not None: 147 | residual = self.downsample(x) 148 | 149 | out += residual 150 | out = self.relu(out) 151 | return out 152 | 153 | 154 | @MODELS.register_module() 155 | class MinkUNetBackbone(BaseModule): 156 | 157 | def __init__(self, 158 | in_channels: int = 4, 159 | base_channels: int = 32, 160 | layers: Sequence[int] = [2, 3, 4, 6, 2, 2, 2, 2], 161 | planes: Sequence[int] = [32, 64, 128, 256, 256, 128, 96, 96], 162 | block_type: str = 'basic', 163 | bn_momentum: float = 0.1, 164 | init_cfg: OptMultiConfig = None) -> None: 165 | super(MinkUNetBackbone, self).__init__(init_cfg=init_cfg) 166 | assert block_type in ['basic', 'bottleneck'] 167 | 168 | conv_module = TorchsparseConvModule 169 | if block_type == 'basic': 170 | block = TorchsparseBasicBlock 171 | elif block_type == 'bottleneck': 172 | block = TorchsparseBottleneck 173 | 174 | self.conv0 = nn.Sequential( 175 | conv_module( 176 | in_channels, 177 | base_channels, 178 | kernel_size=3, 179 | bn_momentum=bn_momentum), 180 | conv_module( 181 | base_channels, 182 | base_channels, 183 | kernel_size=3, 184 | bn_momentum=bn_momentum)) 185 | 186 | self.inplanes = base_channels 187 | 188 | self.conv1 = conv_module( 189 | self.inplanes, 190 | self.inplanes, 191 | kernel_size=2, 192 | stride=2, 193 | dilation=1, 194 | bn_momentum=bn_momentum) 195 | self.block1 = self._make_layer( 196 | block, conv_module, planes[0], layers[0], bn_momentum=bn_momentum) 197 | 198 | self.conv2 = conv_module( 199 | self.inplanes, 200 | self.inplanes, 201 | kernel_size=2, 202 | stride=2, 203 | dilation=1, 204 | bn_momentum=bn_momentum) 205 | self.block2 = self._make_layer( 206 | block, conv_module, planes[1], layers[1], bn_momentum=bn_momentum) 207 | 208 | self.conv3 = conv_module( 209 | self.inplanes, 210 | self.inplanes, 211 | kernel_size=2, 212 | stride=2, 213 | dilation=1, 214 | bn_momentum=bn_momentum) 215 | self.block3 = self._make_layer( 216 | block, conv_module, planes[2], layers[2], bn_momentum=bn_momentum) 217 | 218 | self.conv4 = conv_module( 219 | self.inplanes, 220 | self.inplanes, 221 | kernel_size=2, 222 | stride=2, 223 | dilation=1, 224 | bn_momentum=bn_momentum) 225 | self.block4 = self._make_layer( 226 | block, conv_module, planes[3], layers[3], bn_momentum=bn_momentum) 227 | 228 | self.conv5 = conv_module( 229 | self.inplanes, 230 | planes[4], 231 | kernel_size=2, 232 | stride=2, 233 | dilation=1, 234 | bn_momentum=bn_momentum, 235 | transposed=True) 236 | self.inplanes = planes[4] + planes[2] * block.expansion 237 | self.block5 = self._make_layer( 238 | block, conv_module, planes[4], layers[4], bn_momentum=bn_momentum) 239 | 240 | self.conv6 = conv_module( 241 | self.inplanes, 242 | planes[5], 243 | kernel_size=2, 244 | stride=2, 245 | dilation=1, 246 | bn_momentum=bn_momentum, 247 | transposed=True) 248 | self.inplanes = planes[5] + planes[1] * block.expansion 249 | self.block6 = self._make_layer( 250 | block, conv_module, planes[5], layers[5], bn_momentum=bn_momentum) 251 | 252 | self.conv7 = conv_module( 253 | self.inplanes, 254 | planes[6], 255 | kernel_size=2, 256 
| stride=2, 257 | dilation=1, 258 | bn_momentum=bn_momentum, 259 | transposed=True) 260 | self.inplanes = planes[6] + planes[0] * block.expansion 261 | self.block7 = self._make_layer( 262 | block, conv_module, planes[6], layers[6], bn_momentum=bn_momentum) 263 | 264 | self.conv8 = conv_module( 265 | self.inplanes, 266 | planes[7], 267 | kernel_size=2, 268 | stride=2, 269 | dilation=1, 270 | bn_momentum=bn_momentum, 271 | transposed=True) 272 | self.inplanes = planes[7] + base_channels 273 | self.block8 = self._make_layer( 274 | block, conv_module, planes[7], layers[7], bn_momentum=bn_momentum) 275 | 276 | def _make_layer(self, 277 | block: nn.Module, 278 | conv_module: nn.Module, 279 | planes: int, 280 | blocks: int, 281 | stride: int = 1, 282 | dilation: int = 1, 283 | bn_momentum: float = 0.1) -> nn.Module: 284 | downsample = None 285 | if stride != 1 or self.inplanes != planes * block.expansion: 286 | downsample = conv_module( 287 | self.inplanes, 288 | planes * block.expansion, 289 | kernel_size=1, 290 | stride=stride, 291 | bn_momentum=bn_momentum, 292 | activate=False) 293 | layers = [] 294 | 295 | layers.append( 296 | block( 297 | self.inplanes, 298 | planes, 299 | stride=stride, 300 | dilation=dilation, 301 | bn_momentum=bn_momentum, 302 | downsample=downsample)) 303 | 304 | self.inplanes = planes * block.expansion 305 | for i in range(1, blocks): 306 | layers.append( 307 | block( 308 | self.inplanes, 309 | planes, 310 | stride=1, 311 | dilation=dilation, 312 | bn_momentum=bn_momentum)) 313 | 314 | return nn.Sequential(*layers) 315 | 316 | def forward(self, feat_dict: dict) -> dict: 317 | voxel_features = feat_dict['voxels'] 318 | coors = feat_dict['coors'] 319 | x = torchsparse.SparseTensor(voxel_features, coors) 320 | 321 | out1 = self.conv0(x) 322 | 323 | out = self.conv1(out1) 324 | out2 = self.block1(out) 325 | 326 | out = self.conv2(out2) 327 | out3 = self.block2(out) 328 | 329 | out = self.conv3(out3) 330 | out4 = self.block3(out) 331 | 332 | out = self.conv4(out4) 333 | out5 = self.block4(out) 334 | 335 | out = self.conv5(out5) 336 | out = torchsparse.cat((out, out4)) 337 | out = self.block5(out) 338 | 339 | out = self.conv6(out) 340 | out = torchsparse.cat((out, out3)) 341 | out = self.block6(out) 342 | 343 | out = self.conv7(out) 344 | out = torchsparse.cat((out, out2)) 345 | out = self.block7(out) 346 | 347 | out = self.conv8(out) 348 | out = torchsparse.cat((out, out1)) 349 | out = self.block8(out) 350 | 351 | feat_dict['voxel_feats'] = out.F 352 | return feat_dict 353 | -------------------------------------------------------------------------------- /limoe/models/backbones/dinov2/dinov2_vision_transformer.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import math 3 | from functools import partial 4 | from typing import Callable, Sequence, Tuple, Union 5 | 6 | import torch 7 | import torch.nn as nn 8 | from torch.nn.init import trunc_normal_ 9 | 10 | from .layers import MemEffAttention, Mlp 11 | from .layers import NestedTensorBlock as Block 12 | from .layers import PatchEmbed, SwiGLUFFNFused 13 | 14 | logger = logging.getLogger('dinov2') 15 | 16 | 17 | def named_apply(fn: Callable, 18 | module: nn.Module, 19 | name='', 20 | depth_first=True, 21 | include_root=False) -> nn.Module: 22 | if not depth_first and include_root: 23 | fn(module=module, name=name) 24 | for child_name, child_module in module.named_children(): 25 | child_name = '.'.join((name, child_name)) if name else child_name 26 | named_apply( 27 | 
fn=fn, 28 | module=child_module, 29 | name=child_name, 30 | depth_first=depth_first, 31 | include_root=True, 32 | ) 33 | if depth_first and include_root: 34 | fn(module=module, name=name) 35 | return module 36 | 37 | 38 | class BlockChunk(nn.ModuleList): 39 | 40 | def forward(self, x): 41 | for b in self: 42 | x = b(x) 43 | return x 44 | 45 | 46 | class DinoVisionTransformer(nn.Module): 47 | 48 | def __init__( 49 | self, 50 | img_size=224, 51 | patch_size=16, 52 | in_chans=3, 53 | embed_dim=768, 54 | depth=12, 55 | num_heads=12, 56 | mlp_ratio=4.0, 57 | qkv_bias=True, 58 | ffn_bias=True, 59 | proj_bias=True, 60 | drop_path_rate=0.0, 61 | drop_path_uniform=False, 62 | init_values=None, # for layerscale: None or 0 => no layerscale 63 | embed_layer=PatchEmbed, 64 | act_layer=nn.GELU, 65 | block_fn=Block, 66 | ffn_layer='mlp', 67 | block_chunks=1, 68 | ): 69 | """ 70 | Args: 71 | img_size (int, tuple): input image size 72 | patch_size (int, tuple): patch size 73 | in_chans (int): number of input channels 74 | embed_dim (int): embedding dimension 75 | depth (int): depth of transformer 76 | num_heads (int): number of attention heads 77 | mlp_ratio (int): ratio of mlp hidden dim to embedding dim 78 | qkv_bias (bool): enable bias for qkv if True 79 | proj_bias (bool): enable bias for proj in attn if True 80 | ffn_bias (bool): enable bias for ffn if True 81 | drop_path_rate (float): stochastic depth rate 82 | drop_path_uniform (bool): apply uniform drop rate across blocks 83 | weight_init (str): weight init scheme 84 | init_values (float): layer-scale init values 85 | embed_layer (nn.Module): patch embedding layer 86 | act_layer (nn.Module): MLP activation layer 87 | block_fn (nn.Module): transformer block class 88 | ffn_layer (str): "mlp", "swiglu", "swiglufused" or "identity" 89 | block_chunks: (int) split block sequence into block_chunks units 90 | for FSDP wrap 91 | """ 92 | super().__init__() 93 | norm_layer = partial(nn.LayerNorm, eps=1e-6) 94 | 95 | self.num_features = ( 96 | self.embed_dim 97 | ) = embed_dim # num_features for consistency with other models 98 | self.num_tokens = 1 99 | self.n_blocks = depth 100 | self.num_heads = num_heads 101 | self.patch_size = patch_size 102 | 103 | self.patch_embed = embed_layer( 104 | img_size=img_size, 105 | patch_size=patch_size, 106 | in_chans=in_chans, 107 | embed_dim=embed_dim, 108 | ) 109 | num_patches = self.patch_embed.num_patches 110 | 111 | self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim)) 112 | self.pos_embed = nn.Parameter( 113 | torch.zeros(1, num_patches + self.num_tokens, embed_dim)) 114 | 115 | if drop_path_uniform is True: 116 | dpr = [drop_path_rate] * depth 117 | else: 118 | dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth) 119 | ] # stochastic depth decay rule 120 | 121 | if ffn_layer == 'mlp': 122 | logger.info('using MLP layer as FFN') 123 | ffn_layer = Mlp 124 | elif ffn_layer == 'swiglufused' or ffn_layer == 'swiglu': 125 | logger.info('using SwiGLU layer as FFN') 126 | ffn_layer = SwiGLUFFNFused 127 | elif ffn_layer == 'identity': 128 | logger.info('using Identity layer as FFN') 129 | 130 | def f(*args, **kwargs): 131 | return nn.Identity() 132 | 133 | ffn_layer = f 134 | else: 135 | raise NotImplementedError 136 | 137 | blocks_list = [ 138 | block_fn( 139 | dim=embed_dim, 140 | num_heads=num_heads, 141 | mlp_ratio=mlp_ratio, 142 | qkv_bias=qkv_bias, 143 | proj_bias=proj_bias, 144 | ffn_bias=ffn_bias, 145 | drop_path=dpr[i], 146 | norm_layer=norm_layer, 147 | act_layer=act_layer, 148 | 
ffn_layer=ffn_layer, 149 | init_values=init_values, 150 | ) for i in range(depth) 151 | ] 152 | if block_chunks > 0: 153 | self.chunked_blocks = True 154 | chunked_blocks = [] 155 | chunksize = depth // block_chunks 156 | for i in range(0, depth, chunksize): 157 | # this is to keep the block index consistent 158 | # if we chunk the block list 159 | chunked_blocks.append([nn.Identity()] * i + 160 | blocks_list[i:i + chunksize]) 161 | self.blocks = nn.ModuleList( 162 | [BlockChunk(p) for p in chunked_blocks]) 163 | else: 164 | self.chunked_blocks = False 165 | self.blocks = nn.ModuleList(blocks_list) 166 | 167 | self.norm = norm_layer(embed_dim) 168 | self.head = nn.Identity() 169 | 170 | self.mask_token = nn.Parameter(torch.zeros(1, embed_dim)) 171 | 172 | self.init_weights() 173 | 174 | def init_weights(self): 175 | trunc_normal_(self.pos_embed, std=0.02) 176 | nn.init.normal_(self.cls_token, std=1e-6) 177 | named_apply(init_weights_vit_timm, self) 178 | 179 | def interpolate_pos_encoding(self, x, w, h): 180 | previous_dtype = x.dtype 181 | npatch = x.shape[1] - 1 182 | N = self.pos_embed.shape[1] - 1 183 | if npatch == N and w == h: 184 | return self.pos_embed 185 | pos_embed = self.pos_embed.float() 186 | class_pos_embed = pos_embed[:, 0] 187 | patch_pos_embed = pos_embed[:, 1:] 188 | dim = x.shape[-1] 189 | w0 = w // self.patch_size 190 | h0 = h // self.patch_size 191 | # we add a small number to avoid floating point error in the 192 | # interpolation 193 | # see discussion at https://github.com/facebookresearch/dino/issues/8 194 | w0, h0 = w0 + 0.1, h0 + 0.1 195 | 196 | patch_pos_embed = nn.functional.interpolate( 197 | patch_pos_embed.reshape(1, int(math.sqrt(N)), int(math.sqrt(N)), 198 | dim).permute(0, 3, 1, 2), 199 | scale_factor=(w0 / math.sqrt(N), h0 / math.sqrt(N)), 200 | mode='bicubic', 201 | ) 202 | 203 | assert (int(w0) == patch_pos_embed.shape[-2] 204 | and int(h0) == patch_pos_embed.shape[-1]) 205 | patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim) 206 | return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), 207 | dim=1).to(previous_dtype) 208 | 209 | def prepare_tokens_with_masks(self, x, masks=None): 210 | B, nc, w, h = x.shape 211 | x = self.patch_embed(x) 212 | if masks is not None: 213 | x = torch.where( 214 | masks.unsqueeze(-1), 215 | self.mask_token.to(x.dtype).unsqueeze(0), x) 216 | 217 | x = torch.cat((self.cls_token.expand(x.shape[0], -1, -1), x), dim=1) 218 | x = x + self.interpolate_pos_encoding(x, w, h) 219 | 220 | return x 221 | 222 | def forward_features_list(self, x_list, masks_list): 223 | x = [ 224 | self.prepare_tokens_with_masks(x, masks) 225 | for x, masks in zip(x_list, masks_list) 226 | ] 227 | for blk in self.blocks: 228 | x = blk(x) 229 | 230 | all_x = x 231 | output = [] 232 | for x, masks in zip(all_x, masks_list): 233 | x_norm = self.norm(x) 234 | output.append({ 235 | 'x_norm_clstoken': x_norm[:, 0], 236 | 'x_norm_patchtokens': x_norm[:, 1:], 237 | 'x_prenorm': x, 238 | 'masks': masks, 239 | }) 240 | return output 241 | 242 | def forward_features(self, x, masks=None): 243 | if isinstance(x, list): 244 | return self.forward_features_list(x, masks) 245 | 246 | x = self.prepare_tokens_with_masks(x, masks) 247 | 248 | for blk in self.blocks: 249 | x = blk(x) 250 | 251 | x_norm = self.norm(x) 252 | return { 253 | 'x_norm_clstoken': x_norm[:, 0], 254 | 'x_norm_patchtokens': x_norm[:, 1:], 255 | 'x_prenorm': x, 256 | 'masks': masks, 257 | } 258 | 259 | def forward_get_last_n(self, x, n=1): 260 | x = 
self.prepare_tokens_with_masks(x) 261 | # If n is an int, take the n last blocks. If it's a list, take them 262 | total_block_len = len(self.blocks) 263 | blocks_to_take = ( 264 | range(total_block_len - 265 | n, total_block_len) if isinstance(n, int) else n) 266 | output = {'x': [], 'x_pre_norm': []} 267 | for i, blk in enumerate(self.blocks): 268 | x = blk(x) 269 | if i in blocks_to_take: 270 | output['x'].append(self.norm(x)) 271 | output['x_pre_norm'].append(x) 272 | 273 | return output 274 | 275 | def _get_intermediate_layers_not_chunked(self, x, n=1): 276 | x = self.prepare_tokens_with_masks(x) 277 | # If n is an int, take the n last blocks. If it's a list, take them 278 | output, total_block_len = [], len(self.blocks) 279 | blocks_to_take = ( 280 | range(total_block_len - 281 | n, total_block_len) if isinstance(n, int) else n) 282 | for i, blk in enumerate(self.blocks): 283 | x = blk(x) 284 | if i in blocks_to_take: 285 | output.append(x) 286 | assert len(output) == len( 287 | blocks_to_take 288 | ), f'only {len(output)} / {len(blocks_to_take)} blocks found' 289 | return output 290 | 291 | def _get_intermediate_layers_chunked(self, x, n=1): 292 | x = self.prepare_tokens_with_masks(x) 293 | output, i, total_block_len = [], 0, len(self.blocks[-1]) 294 | # If n is an int, take the n last blocks. If it's a list, take them 295 | blocks_to_take = ( 296 | range(total_block_len - 297 | n, total_block_len) if isinstance(n, int) else n) 298 | for block_chunk in self.blocks: 299 | for blk in block_chunk[i:]: # Passing the nn.Identity() 300 | x = blk(x) 301 | if i in blocks_to_take: 302 | output.append(x) 303 | i += 1 304 | assert len(output) == len( 305 | blocks_to_take 306 | ), f'only {len(output)} / {len(blocks_to_take)} blocks found' 307 | return output 308 | 309 | def get_intermediate_layers( 310 | self, 311 | x: torch.Tensor, 312 | n: Union[int, Sequence] = 1, # Layers or n last layers to take 313 | reshape: bool = False, 314 | return_class_token: bool = False, 315 | norm=True, 316 | ) -> Tuple[Union[torch.Tensor, Tuple[torch.Tensor]]]: 317 | if self.chunked_blocks: 318 | outputs = self._get_intermediate_layers_chunked(x, n) 319 | else: 320 | outputs = self._get_intermediate_layers_not_chunked(x, n) 321 | if norm: 322 | outputs = [self.norm(out) for out in outputs] 323 | class_tokens = [out[:, 0] for out in outputs] 324 | outputs = [out[:, 1:] for out in outputs] 325 | if reshape: 326 | B, _, w, h = x.shape 327 | outputs = [ 328 | out.reshape(B, w // self.patch_size, h // self.patch_size, 329 | -1).permute(0, 3, 1, 2).contiguous() 330 | for out in outputs 331 | ] 332 | if return_class_token: 333 | return tuple(zip(outputs, class_tokens)) 334 | return tuple(outputs) 335 | 336 | def forward(self, *args, is_training=False, **kwargs): 337 | ret = self.forward_features(*args, **kwargs) 338 | if is_training: 339 | return ret 340 | else: 341 | return self.head(ret['x_norm_clstoken']) 342 | 343 | 344 | def init_weights_vit_timm(module: nn.Module, name: str = ''): 345 | """ViT weight initialization, original timm impl (for reproducibility)""" 346 | if isinstance(module, nn.Linear): 347 | trunc_normal_(module.weight, std=0.02) 348 | if module.bias is not None: 349 | nn.init.zeros_(module.bias) 350 | 351 | 352 | def vit_small(patch_size=16, **kwargs): 353 | model = DinoVisionTransformer( 354 | patch_size=patch_size, 355 | embed_dim=384, 356 | depth=12, 357 | num_heads=6, 358 | mlp_ratio=4, 359 | block_fn=partial(Block, attn_class=MemEffAttention), 360 | **kwargs, 361 | ) 362 | return model 363 | 364 
| 365 | def vit_base(patch_size=16, **kwargs): 366 | model = DinoVisionTransformer( 367 | patch_size=patch_size, 368 | embed_dim=768, 369 | depth=12, 370 | num_heads=12, 371 | mlp_ratio=4, 372 | block_fn=partial(Block, attn_class=MemEffAttention), 373 | **kwargs, 374 | ) 375 | return model 376 | 377 | 378 | def vit_large(patch_size=16, **kwargs): 379 | model = DinoVisionTransformer( 380 | patch_size=patch_size, 381 | embed_dim=1024, 382 | depth=24, 383 | num_heads=16, 384 | mlp_ratio=4, 385 | block_fn=partial(Block, attn_class=MemEffAttention), 386 | **kwargs, 387 | ) 388 | return model 389 | 390 | 391 | def vit_giant2(patch_size=16, **kwargs): 392 | """Close to ViT-giant, with embed-dim 1536 and 24 heads => embed-dim per 393 | head 64.""" 394 | model = DinoVisionTransformer( 395 | patch_size=patch_size, 396 | embed_dim=1536, 397 | depth=40, 398 | num_heads=24, 399 | mlp_ratio=4, 400 | block_fn=partial(Block, attn_class=MemEffAttention), 401 | **kwargs, 402 | ) 403 | return model 404 | 405 | 406 | _DINOV2_BASE_URL = 'https://dl.fbaipublicfiles.com/dinov2' 407 | 408 | 409 | def _make_dinov2_model_name(arch_name: str, patch_size: int) -> str: 410 | compact_arch_name = arch_name.replace('_', '')[:4] 411 | return f'dinov2_{compact_arch_name}{patch_size}' 412 | 413 | 414 | def _make_dinov2_model( 415 | *, 416 | arch_name: str = 'vit_large', 417 | img_size: int = 518, 418 | patch_size: int = 14, 419 | init_values: float = 1.0, 420 | ffn_layer: str = 'mlp', 421 | block_chunks: int = 0, 422 | pretrained: bool = True, 423 | **kwargs, 424 | ): 425 | 426 | vits__dict__ = { 427 | 'vit_large': vit_large, 428 | 'vit_base': vit_base, 429 | 'vit_small': vit_small, 430 | } 431 | 432 | model_name = _make_dinov2_model_name(arch_name, patch_size) 433 | vit_kwargs = dict( 434 | img_size=img_size, 435 | patch_size=patch_size, 436 | init_values=init_values, 437 | ffn_layer=ffn_layer, 438 | block_chunks=block_chunks, 439 | ) 440 | vit_kwargs.update(**kwargs) 441 | model = vits__dict__[arch_name](**vit_kwargs) 442 | 443 | if pretrained: 444 | url = _DINOV2_BASE_URL + f'/{model_name}/{model_name}_pretrain.pth' 445 | state_dict = torch.hub.load_state_dict_from_url( 446 | url, model_dir='./dinov2_weights/', map_location='cpu') 447 | model.load_state_dict(state_dict, strict=False) 448 | 449 | return model 450 | 451 | 452 | def dinov2_vits14(*, pretrained: bool = True, **kwargs): 453 | """DINOv2 ViT-S/14 model (optionally) pretrained on the LVD-142M 454 | dataset.""" 455 | return _make_dinov2_model( 456 | arch_name='vit_small', pretrained=pretrained, **kwargs) 457 | 458 | 459 | def dinov2_vitb14(*, pretrained: bool = True, **kwargs): 460 | """DINOv2 ViT-B/14 model pretrained on the LVD-142M dataset.""" 461 | return _make_dinov2_model( 462 | arch_name='vit_base', pretrained=pretrained, **kwargs) 463 | 464 | 465 | def dinov2_vitl14(*, pretrained: bool = True, **kwargs): 466 | """DINOv2 ViT-L/14 model (optionally) pretrained on the LVD-142M 467 | dataset.""" 468 | return _make_dinov2_model( 469 | arch_name='vit_large', pretrained=pretrained, **kwargs) 470 | 471 | 472 | def dinov2_vitg14(*, pretrained: bool = True, **kwargs): 473 | """DINOv2 ViT-g/14 model (optionally) pretrained on the LVD-142M 474 | dataset.""" 475 | return _make_dinov2_model( 476 | arch_name='vit_giant2', 477 | ffn_layer='swiglufused', 478 | pretrained=pretrained, 479 | **kwargs) 480 | -------------------------------------------------------------------------------- /README.md: 
-------------------------------------------------------------------------------- 1 |
English | 简体中文
2 | 3 |
4 | 5 |

LiMoE: Mixture of LiDAR Representation Learners from Automotive Scenes

6 |
7 | 8 |
9 | Xiang Xu*,1    10 | Lingdong Kong*,2,3    11 | Hui Shuai4    12 | Liang Pan3    13 | Ziwei Liu5    14 | Qingshan Liu4 15 |

16 | 1NUAA    17 | 2NUS    18 | 3Shanghai AI Lab    19 | 4NJUPT    20 | 5S-Lab, NTU 21 |
22 | 23 |
24 | 25 |
26 | 27 | 28 |   29 | 30 | 31 |   32 | 33 | 34 |   35 | 36 | 37 |   38 | 39 | 40 | 41 |
42 | 43 | # About 44 | 45 | LiMoE is a framework that integrates the Mixture of Experts (MoE) paradigm into LiDAR data representation learning to synergistically combine multiple representations, such as range images, sparse voxels, and raw points. The framework consists of three stages: i) Image-to-LiDAR pretraining, which transfers prior knowledge from images to point clouds across different representations; ii) Contrastive Mixture Learning (CML), which uses MoE to adaptively activate relevant attributes from each representation and distills these mixed features into a unified 3D network; iii) Semantic Mixture Supervision (SMS), which combines semantic logits from multiple representations to boost downstream segmentation performance. 46 | 47 | 48 | 49 | ## :memo: Updates 50 | 51 | - \[2025.02\] - Our paper **LiMoE** has been accepted to **CVPR 2025**! :tada: 52 | - \[2025.01\] - Introducing the :family_man_boy_boy: **LiMoE** project! For more details, kindly refer to our [Project Page](https://ldkong.com/LiMoE) and [Preprint](https://arxiv.org/abs/2501.04004). :rocket: 53 | 54 | # Table of Content 55 | 56 | - [Installation](#gear-installation) 57 | - [Data Preparation](#hotsprings-data-preparation) 58 | - [Getting Started](#rocket-getting-started) 59 | - [Main Results](#bar_chart-main-results) 60 | - [License](#license) 61 | - [Citation](#citation) 62 | - [Acknowledgments](#acknowledgments) 63 | 64 | # :gear: Installation 65 | 66 | For details related to installation and environment setups, kindly refer to [INSTALL.md](./docs/INSTALL.md). 67 | 68 | # :hotsprings: Data Preparation 69 | 70 | Kindly refer to [DATA_PREPAER.md](./docs/DATA_PREPAER.md) for the details to prepare the datasets. 71 | 72 | # :rocket: Getting Started 73 | 74 | To learn more usage about this codebase, kindly refer to [GET_STARTED.md](./docs/GET_STARTED.md). 75 | 76 | # :bar_chart: Main Results 77 | 78 | ## Comparisons of State-of-the-Art Pretraining Methods 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | 188 | 189 | 190 | 191 | 192 | 193 | 194 | 195 | 196 | 197 | 198 | 199 | 200 | 201 | 202 | 203 | 204 | 205 | 206 | 207 | 208 | 209 | 210 | 211 | 212 | 213 | 214 | 215 | 216 | 217 | 218 | 219 | 220 | 221 | 222 | 223 | 224 | 225 | 226 | 227 | 228 | 229 | 230 | 231 | 232 | 233 | 234 | 235 | 236 | 237 | 238 | 239 | 240 | 241 | 242 | 243 | 244 | 245 | 246 | 247 | 248 | 249 | 250 | 251 | 252 | 253 | 254 | 255 | 256 | 257 | 258 | 259 | 260 | 261 | 262 | 263 | 264 | 265 | 266 | 267 | 268 | 269 | 270 | 271 | 272 | 273 | 274 | 275 | 276 | 277 | 278 | 279 | 280 | 281 | 282 | 283 | 284 | 285 | 286 | 287 | 288 | 289 | 290 |
| Method | Distill | nuScenes LP | nuScenes 1% | nuScenes 5% | nuScenes 10% | nuScenes 25% | nuScenes Full | KITTI 1% | Waymo 1% |
| :----- | :-----: | :---------: | :---------: | :---------: | :----------: | :----------: | :-----------: | :------: | :------: |
| Random | - | 8.10 | 30.30 | 47.84 | 56.15 | 65.48 | 74.66 | 39.50 | 39.41 |
| SLiDR | ViT-S | 44.70 | 41.16 | 53.65 | 61.47 | 66.71 | 74.20 | 44.67 | 47.57 |
| +LiMoE | ViT-S | 45.80 | 46.82 | 57.54 | 63.85 | 68.61 | 75.64 | 46.81 | 48.81 |
| Seal | ViT-S | 45.16 | 44.27 | 55.13 | 62.46 | 67.64 | 75.58 | 46.51 | 48.67 |
| SuperFlow | ViT-S | 46.44 | 47.81 | 59.44 | 64.47 | 69.20 | 76.54 | 47.97 | 49.94 |
| +LiMoE | ViT-S | 48.20 | 49.60 | 60.54 | 65.65 | 71.39 | 77.27 | 49.53 | 51.42 |
| SLiDR | ViT-B | 45.35 | 41.64 | 55.83 | 62.68 | 67.61 | 74.98 | 45.50 | 48.32 |
| +LiMoE | ViT-B | 46.56 | 46.89 | 58.09 | 63.87 | 69.02 | 75.87 | 47.96 | 49.50 |
| Seal | ViT-B | 46.59 | 45.98 | 57.15 | 62.79 | 68.18 | 75.41 | 47.24 | 48.91 |
| SuperFlow | ViT-B | 47.66 | 48.09 | 59.66 | 64.52 | 69.79 | 76.57 | 48.40 | 50.20 |
| +LiMoE | ViT-B | 49.07 | 50.23 | 61.51 | 66.17 | 71.56 | 77.81 | 50.30 | 51.77 |
| SLiDR | ViT-L | 45.70 | 42.77 | 57.45 | 63.20 | 68.13 | 75.51 | 47.01 | 48.60 |
| +LiMoE | ViT-L | 47.43 | 46.92 | 58.41 | 64.54 | 69.69 | 76.32 | 48.25 | 50.23 |
| Seal | ViT-L | 46.81 | 46.27 | 58.14 | 63.27 | 68.67 | 75.66 | 47.55 | 50.02 |
| SuperFlow | ViT-L | 48.01 | 49.95 | 60.72 | 65.09 | 70.01 | 77.19 | 49.07 | 50.67 |
| +LiMoE | ViT-L | 49.35 | 51.41 | 62.07 | 66.64 | 71.59 | 77.85 | 50.69 | 51.93 |
291 | 292 | ## Domain Generalization Study 293 | 294 | 295 | 296 | 297 | 298 | 299 | 300 | 301 | 302 | 303 | 304 | 305 | 306 | 307 | 308 | 309 | 310 | 311 | 312 | 313 | 314 | 315 | 316 | 317 | 318 | 319 | 320 | 321 | 322 | 323 | 324 | 325 | 326 | 327 | 328 | 329 | 330 | 331 | 332 | 333 | 334 | 335 | 336 | 337 | 338 | 339 | 340 | 341 | 342 | 343 | 344 | 345 | 346 | 347 | 348 | 349 | 350 | 351 | 352 | 353 | 354 | 355 | 356 | 357 | 358 | 359 | 360 | 361 | 362 | 363 | 364 | 365 | 366 | 367 | 368 | 369 | 370 | 371 | 372 | 373 | 374 | 375 | 376 | 377 | 378 | 379 | 380 | 381 | 382 | 383 | 384 | 385 | 386 | 387 | 388 | 389 | 390 | 391 | 392 | 393 | 394 | 395 | 396 | 397 | 398 | 399 | 400 | 401 | 402 | 403 | 404 | 405 | 406 | 407 | 408 | 409 | 410 | 411 | 412 | 413 | 414 | 415 | 416 | 417 | 418 | 419 | 420 | 421 | 422 | 423 | 424 | 425 | 426 | 427 | 428 | 429 | 430 | 431 | 432 | 433 | 434 | 435 | 436 | 437 | 438 | 439 | 440 |
| Method | ScriKITTI 1% | ScriKITTI 10% | Rellis-3D 1% | Rellis-3D 10% | SemPOSS Half | SemPOSS Full | SemSTF Half | SemSTF Full | SynLiDAR 1% | SynLiDAR 10% | DAPS-3D Half | DAPS-3D Full | Synth4D 1% | Synth4D 10% |
| :----- | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: |
| Random | 23.81 | 47.60 | 38.46 | 53.60 | 46.26 | 54.12 | 48.03 | 48.15 | 19.89 | 44.74 | 74.32 | 79.38 | 20.22 | 66.87 |
| PPKT | 36.50 | 51.67 | 49.71 | 54.33 | 50.18 | 56.00 | 50.92 | 54.69 | 37.57 | 46.48 | 78.90 | 84.00 | 61.10 | 62.41 |
| SLiDR | 39.60 | 50.45 | 49.75 | 54.57 | 51.56 | 55.36 | 52.01 | 54.35 | 42.05 | 47.84 | 81.00 | 85.40 | 63.10 | 62.67 |
| +LiMoE | 41.48 | 53.41 | 51.28 | 55.21 | 53.14 | 56.42 | 53.16 | 55.51 | 43.72 | 49.57 | 81.70 | 85.76 | 64.69 | 66.79 |
| Seal | 40.64 | 52.77 | 51.09 | 55.03 | 53.26 | 56.89 | 53.46 | 55.36 | 43.58 | 49.26 | 81.88 | 85.90 | 64.50 | 66.96 |
| SuperFlow | 42.70 | 54.00 | 52.83 | 55.71 | 54.41 | 57.33 | 54.72 | 56.57 | 44.85 | 51.38 | 82.43 | 86.21 | 65.31 | 69.43 |
| +LiMoE | 43.95 | 55.96 | 53.74 | 56.67 | 55.42 | 57.83 | 55.60 | 57.31 | 45.79 | 52.27 | 83.24 | 86.68 | 66.54 | 71.07 |
441 | 442 | ## Expert Activation Paths 443 | 444 | | ![paths](./docs/figs/paths.png) | 445 | | :---------------------------------------------------------------------------------------------------------------------------------------------------------------: | 446 | | Visual interpretations of the expert activation paths in Contrastive Mixture Learning (CML). The experts are #1 range view, #2 voxel, and #3 point, respectively. | 447 | 448 | ## Point-Wise Top-1 Activation 449 | 450 | | ![activation1](./docs/figs/activation1.png) | 451 | | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | 452 | | Point-wise top-1 activation path in the Semantic Mixture Supervision (SMS) stage. It highlights the most activated representation for each point during the SMS stage, illustrating how different representations contribute to semantic segmentation based on spatial and object-specific characteristics. Best viewed in colors. | 453 | 454 | ## Out-of-Distribution 3D Robustness 455 | 456 | 457 | 458 | 459 | 460 | 461 | 462 | 463 | 464 | 465 | 466 | 467 | 468 | 469 | 470 | 471 | 472 | 473 | 474 | 475 | 476 | 477 | 478 | 479 | 480 | 481 | 482 | 483 | 484 | 485 | 486 | 487 | 488 | 489 | 490 | 491 | 492 | 493 | 494 | 495 | 496 | 497 | 498 | 499 | 500 | 501 | 502 | 503 | 504 | 505 | 506 | 507 | 508 | 509 | 510 | 511 | 512 | 513 | 514 | 515 | 516 | 517 | 518 | 519 | 520 | 521 | 522 | 523 | 524 | 525 | 526 | 527 | 528 | 529 | 530 | 531 | 532 | 533 | 534 | 535 | 536 | 537 | 538 | 539 | 540 | 541 | 542 | 543 | 544 | 545 | 546 | 547 | 548 | 549 | 550 | 551 | 552 | 553 | 554 | 555 | 556 | 557 | 558 | 559 | 560 | 561 | 562 | 563 | 564 | 565 | 566 | 567 | 568 | 569 | 570 | 571 | 572 | 573 | 574 | 575 | 576 | 577 | 578 | 579 | 580 | 581 | 582 | 583 | 584 | 585 | 586 | 587 | 588 | 589 | 590 | 591 | 592 | 593 | 594 | 595 | 596 | 597 | 598 | 599 | 600 | 601 | 602 | 603 | 604 | 605 | 606 | 607 | 608 | 609 | 610 | 611 | 612 | 613 | 614 | 615 | 616 | 617 | 618 | 619 | 620 | 621 | 622 | 623 | 624 | 625 | 626 | 627 | 628 | 629 | 630 | 631 | 632 | 633 | 634 | 635 | 636 | 637 | 638 | 639 | 640 | 641 | 642 | 643 | 644 | 645 | 646 | 647 | 648 | 649 | 650 | 651 | 652 | 653 | 654 | 655 | 656 |
| # | Method | mCE | mRR | Fog | Rain | Snow | Blur | Beam | Cross | Echo | Sensor | Avg |
| :---: | :----- | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: |
| Full | Random | 112.20 | 72.57 | 62.96 | 70.65 | 55.48 | 51.71 | 62.01 | 31.56 | 59.64 | 39.41 | 54.18 |
|  | PPKT | 105.64 | 75.87 | 64.01 | 72.18 | 59.08 | 57.17 | 63.88 | 36.34 | 60.59 | 39.57 | 56.60 |
|  | SLidR | 106.08 | 75.99 | 65.41 | 72.31 | 56.01 | 56.07 | 62.87 | 41.94 | 61.16 | 38.90 | 56.83 |
|  | +LiMoE | 101.74 | 77.77 | 67.92 | 73.25 | 57.02 | 56.30 | 64.72 | 44.81 | 61.23 | 45.37 | 58.83 |
|  | Seal | 92.63 | 83.08 | 72.66 | 74.31 | 66.22 | 66.14 | 65.96 | 57.44 | 59.87 | 39.85 | 62.81 |
|  | SuperFlow | 91.67 | 83.17 | 70.32 | 75.77 | 65.41 | 61.05 | 68.09 | 60.02 | 58.36 | 50.41 | 63.68 |
|  | +LiMoE | 88.43 | 83.28 | 71.10 | 75.92 | 65.66 | 63.86 | 68.52 | 60.78 | 61.91 | 50.66 | 64.80 |
| LP | PPKT | 183.44 | 78.15 | 30.65 | 35.42 | 28.12 | 29.21 | 32.82 | 19.52 | 28.01 | 20.71 | 28.06 |
|  | SLidR | 179.38 | 77.18 | 34.88 | 38.09 | 32.64 | 26.44 | 33.73 | 20.81 | 31.54 | 21.44 | 29.95 |
|  | +LiMoE | 163.75 | 75.49 | 37.29 | 43.41 | 36.04 | 38.33 | 40.66 | 22.46 | 37.61 | 25.38 | 35.15 |
|  | Seal | 166.18 | 75.38 | 37.33 | 42.77 | 29.93 | 37.73 | 40.32 | 20.31 | 37.73 | 24.94 | 33.88 |
|  | SuperFlow | 161.78 | 75.52 | 37.59 | 43.42 | 37.60 | 39.57 | 41.40 | 23.64 | 38.03 | 26.69 | 35.99 |
|  | +LiMoE | 155.77 | 78.23 | 40.35 | 45.28 | 39.14 | 42.10 | 44.21 | 27.33 | 39.20 | 29.49 | 38.39 |
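The "Avg" column is not spelled out next to the table; from the numbers themselves it appears to be the plain arithmetic mean of the eight per-corruption scores in each row (the rows we spot-checked are consistent with this reading). A minimal check in Python on the "Random" row under full fine-tuning:

```python
from statistics import mean

# Per-corruption scores of the "Random" row (full fine-tuning), in the
# column order Fog, Rain, Snow, Blur, Beam, Cross, Echo, Sensor.
random_full = [62.96, 70.65, 55.48, 51.71, 62.01, 31.56, 59.64, 39.41]

print(f'{mean(random_full):.4f}')  # ~54.1775, i.e. the 54.18 reported in the Avg column
```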
657 | 658 | ## Cosine Similarity 659 | 660 | | ![heatmaps](./docs/figs/heatmaps.png) | 661 | | :---: | 662 | | Cosine similarity between learned features of a query point (denoted as the red dot) and: (1) the features of the image of the same scene (the first row); and (2) the features of the LiDAR points projected onto the image (the second row). Best viewed in colors. | 663 | 664 | ## Qualitative Assessment 665 | 666 | | ![qualitative1](./docs/figs/qualitative1.png) | 667 | | :---: | 668 | | Qualitative assessments of state-of-the-art pretraining methods, pretrained on nuScenes and fine-tuned on SemanticKITTI with 1% annotations. The error maps depict correct and incorrect predictions in gray and red, respectively. Best viewed in colors. | 669 | 670 | # License 671 | 672 | This work is under the [Apache License Version 2.0](https://www.apache.org/licenses/LICENSE-2.0), while some specific implementations in this codebase may be subject to other licenses. 673 | 674 | Kindly refer to [LICENSE](./LICENSE) for a more careful check if you are using our code for commercial purposes. 675 | 676 | # Citation 677 | 678 | If you find this work helpful for your research, please kindly consider citing our paper: 679 | 680 | ```bibtex 681 | @inproceedings{xu2025limoe, 682 | title = {LiMoE: Mixture of LiDAR Representation Learners from Automotive Scenes}, 683 | author = {Xu, Xiang and Kong, Lingdong and Shuai, Hui and Pan, Liang and Liu, Ziwei and Liu, Qingshan}, 684 | booktitle = {IEEE/CVF Conference on Computer Vision and Pattern Recognition}, 685 | pages = {27368--27379}, 686 | year = {2025} 687 | } 688 | ``` 689 | 690 | # Acknowledgments 691 | 692 | This work is developed based on the [MMDetection3D](https://github.com/open-mmlab/mmdetection3d) codebase. 693 | 694 | >
695 | > MMDetection3D is an open-source object detection toolbox based on PyTorch, towards the next-generation platform for general 3D perception. It is a part of the OpenMMLab project developed by MMLab. 696 | 697 | We acknowledge the use of the following public resources during the course of this work: <sup>1</sup>[nuScenes](https://www.nuscenes.org/nuscenes), <sup>2</sup>[nuScenes-devkit](https://github.com/nutonomy/nuscenes-devkit), <sup>3</sup>[SemanticKITTI](http://www.semantic-kitti.org), <sup>4</sup>[SemanticKITTI-API](https://github.com/PRBonn/semantic-kitti-api), <sup>5</sup>[WaymoOpenDataset](https://waymo.com/open), <sup>6</sup>[Synth4D](https://github.com/saltoricristiano/gipso-sfouda), <sup>7</sup>[ScribbleKITTI](https://github.com/ouenal/scribblekitti), <sup>8</sup>[RELLIS-3D](https://github.com/unmannedlab/RELLIS-3D), <sup>9</sup>[SemanticPOSS](http://www.poss.pku.edu.cn/semanticposs.html), <sup>10</sup>[SemanticSTF](https://github.com/xiaoaoran/SemanticSTF), <sup>11</sup>[SynLiDAR](https://github.com/xiaoaoran/SynLiDAR), <sup>12</sup>[DAPS-3D](https://github.com/subake/DAPS3D), <sup>13</sup>[Robo3D](https://github.com/ldkong1205/Robo3D), <sup>14</sup>[SLidR](https://github.com/valeoai/SLidR), <sup>15</sup>[DINOv2](https://github.com/facebookresearch/dinov2), <sup>16</sup>[FRNet](https://github.com/Xiangxu-0103/FRNet), <sup>17</sup>[SuperFlow](https://github.com/Xiangxu-0103/SuperFlow), <sup>18</sup>[torchsparse](https://github.com/mit-han-lab/torchsparse), <sup>19</sup>[Conv-LoRA](https://github.com/autogluon/autogluon), <sup>20</sup>[MoE-LLaVA](https://github.com/PKU-YuanGroup/MoE-LLaVA). :heart_decoration: 698 | --------------------------------------------------------------------------------
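As a usage note on the DINOv2 backbone bundled under `limoe/models/backbones/dinov2/` (the `dinov2_vision_transformer.py` source is reproduced above), the `dinov2_vit*14` factory functions can also be used on their own to extract dense ViT features. The snippet below is a minimal, untested sketch: it assumes the repository root is on `PYTHONPATH` and that the dependencies from [INSTALL.md](./docs/INSTALL.md) are set up, and it passes `pretrained=False` so no checkpoint is downloaded.

```python
# Minimal sketch (not part of the repo): build the bundled DINOv2 ViT-S/14
# backbone and pull a dense feature map from its last block.
import torch

from limoe.models.backbones.dinov2.dinov2_vision_transformer import dinov2_vits14

model = dinov2_vits14(pretrained=False).eval()  # pretrained=True would download weights

# Input sides should be multiples of the 14-pixel patch size.
images = torch.randn(1, 3, 224, 224)

with torch.no_grad():
    # Last-block patch tokens, normalized and reshaped to a (B, C, H/14, W/14) map.
    features = model.get_intermediate_layers(images, n=1, reshape=True)[0]

print(features.shape)  # torch.Size([1, 384, 16, 16]) for ViT-S at a 224x224 input
```

With `pretrained=True`, `_make_dinov2_model` fetches the official DINOv2 checkpoint into `./dinov2_weights/`, which is why that directory is listed in `.gitignore`.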