├── tools ├── data_converter │ ├── __init__.py │ └── create_gt_database.py ├── create_data.sh ├── dist_train.sh ├── dist_test.sh ├── test_speed.py ├── visual_utils │ ├── open3d_vis_utils.py │ └── visualize_utils.py ├── create_data.py ├── test.py └── train.py ├── projects ├── mmdet3d_plugin │ ├── core │ │ ├── __init__.py │ │ └── bbox │ │ │ ├── coders │ │ │ ├── __init__.py │ │ │ └── multi_task_bbox_coder.py │ │ │ ├── assigners │ │ │ ├── __init__.py │ │ │ └── hungarian_assigner_3d.py │ │ │ ├── match_costs │ │ │ ├── __init__.py │ │ │ └── match_cost.py │ │ │ └── util.py │ ├── mmcv_custom │ │ ├── ops │ │ │ ├── __init__.py │ │ │ └── voxel │ │ │ │ ├── __init__.py │ │ │ │ └── spconv_voxelize.py │ │ ├── runner │ │ │ ├── __init__.py │ │ │ └── hooks │ │ │ │ ├── __init__.py │ │ │ │ ├── optimizer.py │ │ │ │ ├── freeze_weight.py │ │ │ │ └── drop_gt_sampling.py │ │ └── __init__.py │ ├── models │ │ ├── necks │ │ │ ├── __init__.py │ │ │ └── cp_fpn.py │ │ ├── backbones │ │ │ ├── __init__.py │ │ │ └── vovnet.py │ │ ├── __init__.py │ │ ├── detectors │ │ │ ├── __init__.py │ │ │ ├── meformer.py │ │ │ └── mome.py │ │ ├── utils │ │ │ ├── __init__.py │ │ │ ├── grid_mask.py │ │ │ ├── moad_transformer.py │ │ │ ├── pme_transformer.py │ │ │ └── attention.py │ │ └── dense_heads │ │ │ ├── __init__.py │ │ │ └── separate_task_head.py │ ├── datasets │ │ ├── pipelines │ │ │ ├── __init__.py │ │ │ └── dbsampler.py │ │ ├── __init__.py │ │ └── custom_nuscenes_dataset.py │ └── __init__.py └── configs │ ├── meformer_voxel0075_vov_1600x640_cbgs.py │ └── mome │ └── mome.py ├── assets └── ov.png ├── requirements.txt ├── docs ├── prepare_dataset.md ├── train_eval.md └── install.md ├── LICENSE ├── .gitignore └── README.md /tools/data_converter/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/core/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /assets/ov.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/konyul/MoME/HEAD/assets/ov.png -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/mmcv_custom/ops/__init__.py: -------------------------------------------------------------------------------- 1 | from .voxel import * 2 | -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/mmcv_custom/runner/__init__.py: -------------------------------------------------------------------------------- 1 | from .hooks import * 2 | -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/mmcv_custom/__init__.py: -------------------------------------------------------------------------------- 1 | from .ops import * 2 | from .runner import * 3 | -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/models/necks/__init__.py: -------------------------------------------------------------------------------- 1 | from .cp_fpn import CPFPN 2 | 3 | __all__ = ['CPFPN'] 4 | -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/mmcv_custom/ops/voxel/__init__.py: -------------------------------------------------------------------------------- 1 | from 
.spconv_voxelize import SPConvVoxelization 2 | -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/models/backbones/__init__.py: -------------------------------------------------------------------------------- 1 | from .vovnet import VoVNet 2 | 3 | __all__ = ['VoVNet', ] 4 | -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/datasets/pipelines/__init__.py: -------------------------------------------------------------------------------- 1 | from .dbsampler import UnifiedDataBaseSampler 2 | from .transform_3d import * -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | from .custom_nuscenes_dataset import CustomNuScenesDataset 2 | from .pipelines import * 3 | -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/core/bbox/coders/__init__.py: -------------------------------------------------------------------------------- 1 | from .multi_task_bbox_coder import MultiTaskBBoxCoder 2 | 3 | __all__ = ['MultiTaskBBoxCoder'] 4 | -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/core/bbox/assigners/__init__.py: -------------------------------------------------------------------------------- 1 | from .hungarian_assigner_3d import HungarianAssigner3D 2 | 3 | __all__ = ['HungarianAssigner3D'] 4 | -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .backbones import * 2 | from .dense_heads import * 3 | from .detectors import * 4 | from .necks import * 5 | -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/models/detectors/__init__.py: -------------------------------------------------------------------------------- 1 | from .meformer import MEFormerDetector 2 | from .mome import MoME 3 | 4 | __all__ = ['MEFormerDetector', 'MoME'] 5 | -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/models/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .moad_transformer import * 2 | from .petr_transformer import * 3 | from .pme_transformer import * 4 | from .multi_expert import * -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | mmdet==2.24.0 2 | mmsegmentation==0.29.1 3 | mmdet3d==1.0.0rc5 4 | spconv-cu111 5 | flash-attn==0.2.2 6 | numpy==1.23.5 7 | setuptools==59.5.0 8 | yapf==0.40.1 9 | -------------------------------------------------------------------------------- /docs/prepare_dataset.md: -------------------------------------------------------------------------------- 1 | #### Prepare data 2 | Run [create_data.sh](https://github.com/hanchaa/MEFormer/blob/main/tools/create_data.sh) script. 
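The script exports `PYTHONPATH` and calls the nuScenes converter (see `tools/create_data.sh`); with the default `./data/nuscenes` layout, the equivalent direct call is:
```shell
python tools/create_data.py nuscenes --root-path ./data/nuscenes/ --out-dir ./data/nuscenes --extra-tag nuscenes
```
or simply run the script: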
3 | ```shell 4 | bash tools/create_data.sh 5 | ``` 6 | -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/mmcv_custom/runner/hooks/__init__.py: -------------------------------------------------------------------------------- 1 | from .drop_gt_sampling import DropGTSamplingHook 2 | from .freeze_weight import FreezeWeight 3 | from .optimizer import CustomFp16OptimizerHook 4 | -------------------------------------------------------------------------------- /tools/create_data.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -x 4 | export PYTHONPATH=`pwd`:$PYTHONPATH 5 | 6 | python tools/create_data.py nuscenes --root-path ./data/nuscenes/ --out-dir ./data/nuscenes --extra-tag nuscenes 7 | -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/__init__.py: -------------------------------------------------------------------------------- 1 | from .core.bbox.assigners import * 2 | from .core.bbox.coders import * 3 | from .core.bbox.match_costs import BBox3DL1Cost 4 | from .datasets import * 5 | from .mmcv_custom import * 6 | from .models import * 7 | -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/models/dense_heads/__init__.py: -------------------------------------------------------------------------------- 1 | from .meformer_head import MEFormerHead 2 | from .separate_task_head import SeparateTaskHead 3 | from .med import MultiExpertDecoding 4 | 5 | __all__ = ['SeparateTaskHead', 'MEFormerHead', 'MultiExpertDecoding'] 6 | -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/core/bbox/match_costs/__init__.py: -------------------------------------------------------------------------------- 1 | from mmdet.core.bbox.match_costs import build_match_cost 2 | 3 | from .match_cost import BBox3DL1Cost, BBoxBEVL1Cost, IoU3DCost 4 | 5 | __all__ = ['build_match_cost', 'BBox3DL1Cost', 'BBoxBEVL1Cost', 'IoU3DCost'] 6 | -------------------------------------------------------------------------------- /docs/train_eval.md: -------------------------------------------------------------------------------- 1 | ## Train & Inference 2 | #### Train 3 | 4 | Train 1st Stage 5 | 6 | ```shell 7 | tools/dist_train.sh ./projects/configs/moad_voxel0075_vov_1600x640_cbgs.py 8 8 | ``` 9 | 10 | Train 2nd Stage 11 | ```shell 12 | tools/dist_train.sh ./projects/configs/mome/mome.py 4 13 | ``` 14 | 15 | #### Inference 16 | ```shell 17 | tools/dist_test.sh ./projects/configs/mome/mome.py $path_to_weight$ $num_gpus --eval bbox 18 | ``` 19 | -------------------------------------------------------------------------------- /tools/dist_train.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | CONFIG=$1 4 | GPUS=$2 5 | NNODES=${NNODES:-1} 6 | NODE_RANK=${NODE_RANK:-0} 7 | PORT=${PORT:-29500} 8 | MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"} 9 | 10 | PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ 11 | python -m torch.distributed.launch \ 12 | --nnodes=$NNODES \ 13 | --node_rank=$NODE_RANK \ 14 | --master_addr=$MASTER_ADDR \ 15 | --nproc_per_node=$GPUS \ 16 | --master_port=$PORT \ 17 | $(dirname "$0")/train.py \ 18 | $CONFIG \ 19 | --launcher pytorch ${@:3} 20 | -------------------------------------------------------------------------------- /tools/dist_test.sh: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | CONFIG=$1 4 | CHECKPOINT=$2 5 | GPUS=$3 6 | NNODES=${NNODES:-1} 7 | NODE_RANK=${NODE_RANK:-0} 8 | PORT=${PORT:-29500} 9 | MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"} 10 | 11 | PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ 12 | python -m torch.distributed.launch \ 13 | --nnodes=$NNODES \ 14 | --node_rank=$NODE_RANK \ 15 | --master_addr=$MASTER_ADDR \ 16 | --nproc_per_node=$GPUS \ 17 | --master_port=$PORT \ 18 | $(dirname "$0")/test.py \ 19 | $CONFIG \ 20 | $CHECKPOINT \ 21 | --launcher pytorch \ 22 | ${@:4} 23 | -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/mmcv_custom/runner/hooks/optimizer.py: -------------------------------------------------------------------------------- 1 | from mmcv.runner.hooks import HOOKS 2 | from mmcv.runner.hooks.optimizer import Fp16OptimizerHook 3 | 4 | 5 | @HOOKS.register_module() 6 | class CustomFp16OptimizerHook(Fp16OptimizerHook): 7 | 8 | def __init__(self, 9 | custom_fp16={}, 10 | *args, 11 | **kwargs): 12 | super(CustomFp16OptimizerHook, self).__init__(*args, **kwargs) 13 | self.custom_fp16 = custom_fp16 14 | 15 | def before_run(self, runner) -> None: 16 | super().before_run(runner) 17 | for module_name, v in self.custom_fp16.items(): 18 | runner.model.module._modules[module_name].fp16_enabled = v 19 | -------------------------------------------------------------------------------- /docs/install.md: -------------------------------------------------------------------------------- 1 | # Step-by-step installation instructions 2 | 3 | Use MoME with Docker 4 | 5 | **a. We provide Docker Image.** 6 | ```shell 7 | docker pull kyparkk/mome:python3.8_torch1.11.0_cu113 8 | docker run --gpus all --shm-size=512g -it -v {DATA_DIR}:{DATA_DIR} kyparkk/mome:python3.8_torch1.11.0_cu113 /bin/bash 9 | ``` 10 | 11 | **b. Clone MoME.** 12 | ``` 13 | git clone https://github.com/konyul/MoME.git 14 | ``` 15 | 16 | **c. Install requirements** 17 | ```shell 18 | cd /path/to/MoME 19 | pip install -r requirements.txt 20 | 21 | ``` 22 | 23 | **c. 
Download pre-trained weights** 24 | Download the pretrained weight of the image backbone from https://github.com/hanchaa/MEFormer 25 | ```shell 26 | MoME 27 | ├─ ckpts 28 | │ ├─ fcos3d_vovnet_imgbackbone-remapped.pth 29 | │ └─ nuim_r50.pth 30 | ├─ figures 31 | ├─ projects 32 | └─ tools 33 | 34 | ``` -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/mmcv_custom/runner/hooks/freeze_weight.py: -------------------------------------------------------------------------------- 1 | from mmcv.runner.hooks import HOOKS, Hook 2 | 3 | 4 | @HOOKS.register_module() 5 | class FreezeWeight(Hook): 6 | def __init__(self, finetune_weight): 7 | super().__init__() 8 | self.finetune_weight = finetune_weight 9 | 10 | def before_run(self, runner): 11 | if hasattr(runner.model, "module"): 12 | model = runner.model.module 13 | else: 14 | model = runner.model 15 | 16 | freezed = [] 17 | not_freezed = [] 18 | for name, p in model.named_parameters(): 19 | flag = False 20 | for f in self.finetune_weight: 21 | if name.startswith(f) and p.requires_grad: 22 | flag = True 23 | not_freezed.append(name) 24 | 25 | if not flag: 26 | p.requires_grad = False 27 | freezed.append(name) 28 | 29 | runner.logger.info(f"Freezed parameters: {', '.join(freezed)}") 30 | runner.logger.info(f"Learned parameters: {', '.join(not_freezed)}") 31 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | MIT License 3 | 4 | Copyright (c) 2024 JuHan Cha 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in all 14 | copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | SOFTWARE. 
23 | -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/mmcv_custom/runner/hooks/drop_gt_sampling.py: -------------------------------------------------------------------------------- 1 | from mmcv.runner.hooks import HOOKS, Hook 2 | 3 | 4 | @HOOKS.register_module() 5 | class DropGTSamplingHook(Hook): 6 | 7 | def __init__(self, 8 | drop_epoch, 9 | pipeline_name="UnifiedObjectSample", 10 | *args, 11 | **kwargs): 12 | super(DropGTSamplingHook, self).__init__(*args, **kwargs) 13 | self.drop_epoch = drop_epoch 14 | self.pipeline_name = pipeline_name 15 | self.dropped = False 16 | 17 | def before_train_epoch(self, runner) -> None: 18 | if not self.dropped and runner.epoch >= self.drop_epoch: 19 | dataset = runner.data_loader.dataset.dataset 20 | if hasattr(dataset, 'datasets'): 21 | datasets = dataset.datasets 22 | else: 23 | datasets = [dataset] 24 | 25 | for d in datasets: 26 | pipeline = d.pipeline.transforms 27 | index = 0 28 | dropped = False 29 | 30 | for i, p in enumerate(pipeline): 31 | if p.__class__.__name__ == self.pipeline_name: 32 | index = i 33 | dropped = True 34 | runner.logger.info(f"{self.pipeline_name} is dropped after {self.drop_epoch} epoch training!") 35 | break 36 | 37 | if dropped: 38 | pipeline.pop(index) 39 | self.dropped = dropped 40 | -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/core/bbox/match_costs/match_cost.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from mmdet.core.bbox.match_costs.builder import MATCH_COST 3 | 4 | 5 | @MATCH_COST.register_module() 6 | class BBox3DL1Cost(object): 7 | """BBox3DL1Cost. 8 | Args: 9 | weight (int | float, optional): loss_weight 10 | """ 11 | 12 | def __init__(self, weight=1.): 13 | self.weight = weight 14 | 15 | def __call__(self, bbox_pred, gt_bboxes): 16 | """ 17 | Args: 18 | bbox_pred (Tensor): Predicted boxes with normalized coordinates 19 | (cx, cy, w, h), which are all in range [0, 1]. Shape 20 | [num_query, 4]. 21 | gt_bboxes (Tensor): Ground truth boxes with normalized 22 | coordinates (x1, y1, x2, y2). Shape [num_gt, 4]. 
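Note: in this plugin the cost is usually computed on 3D box encodings produced by `normalize_bbox` (see core/bbox/util.py), so the trailing dimension of both tensors is typically larger than 4; `torch.cdist` below is agnostic to that dimension.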
23 | Returns: 24 | torch.Tensor: bbox_cost value with weight 25 | """ 26 | bbox_cost = torch.cdist(bbox_pred, gt_bboxes, p=1) 27 | return bbox_cost * self.weight 28 | 29 | 30 | @MATCH_COST.register_module() 31 | class BBoxBEVL1Cost(object): 32 | def __init__(self, weight): 33 | self.weight = weight 34 | 35 | def __call__(self, bboxes, gt_bboxes, pc_range): 36 | pc_start = bboxes.new(pc_range[0:2]) 37 | pc_range = bboxes.new(pc_range[3:5]) - bboxes.new(pc_range[0:2]) 38 | # normalize the box center to [0, 1] 39 | normalized_bboxes_xy = (bboxes[:, :2] - pc_start) / pc_range 40 | normalized_gt_bboxes_xy = (gt_bboxes[:, :2] - pc_start) / pc_range 41 | reg_cost = torch.cdist(normalized_bboxes_xy, normalized_gt_bboxes_xy, p=1) 42 | return reg_cost * self.weight 43 | 44 | 45 | @MATCH_COST.register_module() 46 | class IoU3DCost(object): 47 | def __init__(self, weight): 48 | self.weight = weight 49 | 50 | def __call__(self, iou): 51 | iou_cost = - iou 52 | return iou_cost * self.weight 53 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | *.ipynb 6 | 7 | # C extensions 8 | *.so 9 | 10 | # Distribution / packaging 11 | .Python 12 | tmp/ 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | mmdetection3d/ 31 | mmdetection3d 32 | mmdet3d 33 | 34 | # PyInstaller 35 | # Usually these files are written by a python script from a template 36 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
37 | *.manifest 38 | *.spec 39 | 40 | # Installer logs 41 | pip-log.txt 42 | pip-delete-this-directory.txt 43 | 44 | # Unit test / coverage reports 45 | htmlcov/ 46 | .tox/ 47 | .coverage 48 | .coverage.* 49 | .cache 50 | hostfile.txt 51 | nosetests.xml 52 | coverage.xml 53 | *.cover 54 | .hypothesis/ 55 | .pytest_cache/ 56 | 57 | # Translations 58 | *.mo 59 | *.pot 60 | 61 | # Django stuff: 62 | *.log 63 | local_settings.py 64 | db.sqlite3 65 | 66 | # Flask stuff: 67 | instance/ 68 | .webassets-cache 69 | 70 | # Scrapy stuff: 71 | .scrapy 72 | 73 | # Sphinx documentation 74 | docs/_build/ 75 | 76 | # PyBuilder 77 | target/ 78 | 79 | # Jupyter Notebook 80 | .ipynb_checkpoints 81 | 82 | # pyenv 83 | .python-version 84 | 85 | # celery beat schedule file 86 | celerybeat-schedule 87 | 88 | # SageMath parsed files 89 | *.sage.py 90 | 91 | # Environments 92 | .env 93 | .venv 94 | env/ 95 | venv/ 96 | ENV/ 97 | env.bak/ 98 | venv.bak/ 99 | 100 | # Spyder project settings 101 | .spyderproject 102 | .spyproject 103 | 104 | # Rope project settings 105 | .ropeproject 106 | 107 | # mkdocs documentation 108 | /site 109 | 110 | # mypy 111 | .mypy_cache/ 112 | 113 | # cython generated cpp 114 | data 115 | ckpts 116 | .vscode 117 | .idea 118 | 119 | # custom 120 | nuscenes_gt_database 121 | nuscenes_unified_gt_database 122 | work_dirs 123 | *.pkl 124 | *.pkl.json 125 | *.log.json 126 | work_dirs/ 127 | exps/ 128 | *~ 129 | mmdet3d/.mim 130 | 131 | # Pytorch 132 | *.pth 133 | 134 | # demo 135 | # *.jpg 136 | # *.png 137 | data/s3dis/Stanford3dDataset_v1.2_Aligned_Version/ 138 | data/scannet/scans/ 139 | data/sunrgbd/OFFICIAL_SUNRGBD/ 140 | *.obj 141 | *.ply 142 | 143 | # Waymo evaluation 144 | mmdet3d/core/evaluation/waymo_utils/compute_detection_metrics_main 145 | 146 | .DS_Store 147 | -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/core/bbox/util.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def normalize_bbox(bboxes, pc_range=None): 5 | cx = bboxes[..., 0:1] 6 | cy = bboxes[..., 1:2] 7 | cz = bboxes[..., 2:3] 8 | w = bboxes[..., 3:4].log() 9 | l = bboxes[..., 4:5].log() 10 | h = bboxes[..., 5:6].log() 11 | 12 | rot = bboxes[..., 6:7] 13 | if bboxes.size(-1) > 7: 14 | vx = bboxes[..., 7:8] 15 | vy = bboxes[..., 8:9] 16 | normalized_bboxes = torch.cat( 17 | (cx, cy, w, l, cz, h, rot.sin(), rot.cos(), vx, vy), dim=-1 18 | ) 19 | else: 20 | normalized_bboxes = torch.cat( 21 | (cx, cy, w, l, cz, h, rot.sin(), rot.cos()), dim=-1 22 | ) 23 | return normalized_bboxes 24 | 25 | 26 | def denormalize_bbox(normalized_bboxes, pc_range=None): 27 | # rotation 28 | rot_sine = normalized_bboxes[..., 6:7] 29 | 30 | rot_cosine = normalized_bboxes[..., 7:8] 31 | rot = torch.atan2(rot_sine, rot_cosine) 32 | 33 | # center in the bev 34 | cx = normalized_bboxes[..., 0:1] 35 | cy = normalized_bboxes[..., 1:2] 36 | cz = normalized_bboxes[..., 4:5] 37 | 38 | # size 39 | w = normalized_bboxes[..., 2:3] 40 | l = normalized_bboxes[..., 3:4] 41 | h = normalized_bboxes[..., 5:6] 42 | 43 | w = w.exp() 44 | l = l.exp() 45 | h = h.exp() 46 | 47 | if normalized_bboxes.size(-1) > 8: 48 | # velocity 49 | vx = normalized_bboxes[..., 8:9] 50 | vy = normalized_bboxes[..., 9:10] 51 | denormalized_bboxes = torch.cat([cx, cy, cz, w, l, h, rot, vx, vy], dim=-1) 52 | else: 53 | denormalized_bboxes = torch.cat([cx, cy, cz, w, l, h, rot], dim=-1) 54 | return denormalized_bboxes 55 | 56 | 57 | def bbox3d_mapping_back(bboxes, 
rot_degree, scale_factor, flip_horizontal, flip_vertical): 58 | """Map bboxes from testing scale to original image scale. 59 | 60 | Args: 61 | bboxes (:obj:`BaseInstance3DBoxes`): Boxes to be mapped back. 62 | scale_factor (float): Scale factor. 63 | flip_horizontal (bool): Whether to flip horizontally. 64 | flip_vertical (bool): Whether to flip vertically. 65 | 66 | Returns: 67 | :obj:`BaseInstance3DBoxes`: Boxes mapped back. 68 | """ 69 | new_bboxes = bboxes.clone() 70 | if flip_horizontal: 71 | new_bboxes.flip('horizontal') 72 | if flip_vertical: 73 | new_bboxes.flip('vertical') 74 | new_bboxes.scale(1 / scale_factor) 75 | new_bboxes.rotate(-rot_degree) 76 | 77 | return new_bboxes 78 | -------------------------------------------------------------------------------- /tools/test_speed.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 megvii-model. All Rights Reserved. 2 | 3 | import importlib 4 | import os 5 | import time 6 | 7 | import torch 8 | from mmcv import Config 9 | from mmcv.parallel import MMDataParallel 10 | from mmcv.runner import load_checkpoint, wrap_fp16_model 11 | from mmdet3d.datasets import build_dataloader, build_dataset 12 | from mmdet3d.models import build_detector 13 | 14 | 15 | class Wrapper: 16 | 17 | def __init__(self, 18 | cfg, 19 | checkpoint=None) -> None: 20 | self.cfg = Config.fromfile(cfg) 21 | self.save_dir = './tmp' 22 | self.init() 23 | self.model = self._build_model(checkpoint) 24 | 25 | if self.cfg.get('fp16', None) is not None: 26 | wrap_fp16_model(self.model) 27 | if cfg.get('optimizer_config', None) is not None and cfg.optimizer_config['type'] == 'CustomFp16OptimizerHook': 28 | wrap_fp16_model(self.model) 29 | for module_name, v in cfg.optimizer_config['custom_fp16'].items(): 30 | self.model._modules[module_name].fp16_enabled = v 31 | 32 | self.dataset = self._build_dataset() 33 | 34 | def init(self): 35 | self.cfg.model.pretrained = None 36 | self.cfg.data.test.test_mode = True 37 | plugin_dir = self.cfg.plugin_dir 38 | _module_dir = os.path.dirname(plugin_dir) 39 | _module_dir = _module_dir.split('/') 40 | _module_path = _module_dir[0] 41 | for m in _module_dir[1:]: 42 | _module_path = _module_path + '.' 
+ m 43 | print(_module_path) 44 | plg_lib = importlib.import_module(_module_path) 45 | 46 | def _build_model(self, checkpoint=None): 47 | model = build_detector(self.cfg.model, test_cfg=self.cfg.get('test_cfg')) 48 | if checkpoint: 49 | load_checkpoint(model, checkpoint, map_location='cpu') 50 | model = MMDataParallel(model, device_ids=[0]) 51 | model.eval() 52 | return model 53 | 54 | def _build_dataset(self): 55 | dataset = build_dataset(self.cfg.data.val) 56 | return dataset 57 | 58 | def test_speed(self, num_iters=100): 59 | data_loader = build_dataloader( 60 | self.dataset, 61 | samples_per_gpu=1, 62 | workers_per_gpu=self.cfg.data.workers_per_gpu, 63 | dist=False, 64 | shuffle=False) 65 | loader = iter(data_loader) 66 | total_time = 0 67 | warmup_iter = 10 68 | 69 | with torch.no_grad(): 70 | for _ in range(num_iters): 71 | data = next(loader) 72 | t1 = time.time() 73 | self.model(**data, return_loss=False) 74 | 75 | if _ >= warmup_iter: 76 | total_time += time.time() - t1 77 | 78 | print(f'Average time: {total_time / (num_iters - warmup_iter)}') 79 | 80 | 81 | if __name__ == '__main__': 82 | wrapper = Wrapper( 83 | cfg='your path to config file', 84 | ) 85 | wrapper.test_speed() 86 | -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/mmcv_custom/ops/voxel/spconv_voxelize.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 megvii-model. All Rights Reserved. 2 | 3 | import numpy as np 4 | import torch 5 | from spconv.pytorch.utils import PointToVoxel # spconv-cu111 2.1.21 6 | from torch import nn 7 | from torch.nn.modules.utils import _pair 8 | 9 | 10 | class SPConvVoxelization(nn.Module): 11 | def __init__(self, voxel_size, point_cloud_range, max_num_points, max_voxels, num_point_features, 12 | device=torch.device("cuda")): 13 | super().__init__() 14 | assert len(voxel_size) == 3 15 | assert len(point_cloud_range) == 6 16 | self.voxel_size = np.array(voxel_size) 17 | self.point_cloud_range = np.array(point_cloud_range) 18 | self.max_num_points = max_num_points 19 | self.num_point_features = num_point_features 20 | self.device = device 21 | if isinstance(max_voxels, tuple): 22 | self.max_voxels = max_voxels 23 | else: 24 | self.max_voxels = _pair(max_voxels) 25 | self.voxel_generator = PointToVoxel( 26 | vsize_xyz=voxel_size, 27 | coors_range_xyz=point_cloud_range, 28 | max_num_points_per_voxel=max_num_points, 29 | max_num_voxels=self.max_voxels[0], 30 | num_point_features=num_point_features, 31 | device=device, 32 | ) 33 | grid_size = (self.point_cloud_range[3:6] - self.point_cloud_range[0:3]) / np.array(voxel_size) 34 | self.grid_size = np.round(grid_size).astype(np.int64) 35 | 36 | def train(self, mode: bool = True): 37 | if mode: 38 | self.voxel_generator = PointToVoxel( 39 | vsize_xyz=self.voxel_size.tolist(), 40 | coors_range_xyz=self.point_cloud_range.tolist(), 41 | max_num_points_per_voxel=self.max_num_points, 42 | max_num_voxels=self.max_voxels[0], 43 | num_point_features=self.num_point_features, 44 | device=self.device, 45 | ) 46 | else: 47 | self.voxel_generator = PointToVoxel( 48 | vsize_xyz=self.voxel_size.tolist(), 49 | coors_range_xyz=self.point_cloud_range.tolist(), 50 | max_num_points_per_voxel=self.max_num_points, 51 | max_num_voxels=self.max_voxels[1], 52 | num_point_features=self.num_point_features, 53 | device=self.device, 54 | ) 55 | 56 | return super().train(mode) 57 | 58 | def forward(self, points): 59 | voxel_output = self.voxel_generator(points) 60 
| voxels, coordinates, num_points = voxel_output 61 | return torch.clone(voxels), torch.clone(coordinates), torch.clone(num_points) 62 | 63 | def __repr__(self): 64 | tmpstr = self.__class__.__name__ + '(' 65 | tmpstr += 'voxel_size=' + str(self.voxel_size) 66 | tmpstr += ', point_cloud_range=' + str(self.point_cloud_range) 67 | tmpstr += ', max_num_points=' + str(self.max_num_points) 68 | tmpstr += ', max_voxels=' + str(self.max_voxels) 69 | tmpstr += ', num_point_features=' + str(self.num_point_features) 70 | tmpstr += ')' 71 | return tmpstr 72 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
2 | # [CVPR2025] MoME
3 | ### Resilient Sensor Fusion under Adverse Sensor Failures via Multi-Modal Expert Fusion
4 | 5 | Konyul Park1 \*, Yecheol Kim2,3 \*, Daehun Kim2, Jun Won Choi1 \** 6 | 7 | 1 Seoul National University, Korea 2 Hanyang University, Korea, 3 LG AI Research, Korea 8 | 9 | (\*) equal contribution, (\**) corresponding author. 10 | 11 | ArXiv Preprint ([arXiv 2407.13517](https://arxiv.org/abs/2503.19776)) 12 |
13 | 14 | ![overall](assets/ov.png "framework") 15 | 16 | ## Introduction 17 | In this study, we introduce an efficient and robust LiDAR-camera 3D object detector, referred to as MoME, which can achieve robust performance through a mixture of experts approach. Our MoME fully decouples modality dependencies using three parallel expert decoders, which use camera features, LiDAR features, or a combination of both to decode object queries, respectively. We propose Multi-Expert Decoding (MED) framework, where each query is decoded selectively using one of three expert decoders. MoME utilizes an Adaptive Query Router (AQR) to select the most appropriate expert decoder for each query based on the quality of camera and LiDAR features. This ensures that each query is processed by the best-suited expert, resulting in robust performance across diverse sensor failure scenarios. We evaluated the performance of MoME on the nuScenes-R benchmark. Our MoME achieved state-of-the-art performance in extreme weather and sensor failure conditions, significantly outperforming the existing models across various sensor failure scenarios. 18 | 19 | ## Qualitative results (NDS) on nuScenes and nuScenes-R dataset 20 | 21 | 22 | |Method|Training Schedule|Clean|Beam Reduction|LiDAR Drop|Limited FOV|Object Failure|View Drop|Occlusion| config | weight | 23 | |:----|:----|:----|:----|:----|:----|:----|:----|:----|:----|:----| 24 | | | | | 4 beams | all | ±60 | 0.5 | all | | | | 25 | | MoME | 2 Epochs | 73.6 | 63.0 | 48.2 | 58.3 | 71.0 | 69.5 | 70.5 | [config](https://github.com/konyul/MoME/blob/main/projects/configs/mome/mome.py) | [weight](https://drive.google.com/file/d/1dFwy-eUrTMVJkoufT58rwvqis5lfOoEH/view?usp=sharing) | 26 | 27 | ## Notes 28 | We Evaluate MoME on [nuScenes-R](https://github.com/ADLab-AutoDrive/lidar-camera-robust-benchmark) and [nuScenes-C](https://github.com/thu-ml/3D_Corruptions_AD) 29 | 30 | ## Getting Started 31 | - [Installation](docs/install.md) 32 | - [Prepare Dataset](docs/prepare_dataset.md) 33 | - [Train and Eval](docs/train_eval.md) 34 | 35 | ## Acknowledgements 36 | 37 | MoME is based on [MEFormer](https://github.com/hanchaa/MEFormer). It is also greatly inspired by the following outstanding contributions to the open-source community: [mmdetection3d](https://github.com/open-mmlab/mmdetection3d), [CMT](https://github.com/junjie18/CMT). 38 | 39 | ## Citation 40 | If you find MoME is useful in your research or applications, please consider giving us a star 🌟 and citing it by the following BibTeX entry. 41 | ```bibtex 42 | @article{MoME, 43 | title={Resilient Sensor Fusion under Adverse Sensor Failures via Multi-Modal Expert Fusion}, 44 | author={Park, Konyul and Kim, Yecheol and Kim, Daehun and Choi, Jun Won}, 45 | journal={arXiv preprint arXiv:2503.19776}, 46 | year={2025} 47 | } 48 | ``` 49 | -------------------------------------------------------------------------------- /tools/visual_utils/open3d_vis_utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Open3d visualization tool box 3 | Written by Jihan YANG 4 | All rights preserved from 2021 - present. 5 | """ 6 | import matplotlib 7 | import numpy as np 8 | import open3d 9 | import torch 10 | 11 | box_colormap = [ 12 | [1, 1, 1], 13 | [0, 1, 0], 14 | [0, 1, 1], 15 | [1, 1, 0], 16 | ] 17 | 18 | 19 | def get_coor_colors(obj_labels): 20 | """ 21 | Args: 22 | obj_labels: 1 is ground, labels > 1 indicates different instance cluster 23 | 24 | Returns: 25 | rgb: [N, 3]. color for each point. 
26 | """ 27 | colors = matplotlib.colors.XKCD_COLORS.values() 28 | max_color_num = obj_labels.max() 29 | 30 | color_list = list(colors)[:max_color_num + 1] 31 | colors_rgba = [matplotlib.colors.to_rgba_array(color) for color in color_list] 32 | label_rgba = np.array(colors_rgba)[obj_labels] 33 | label_rgba = label_rgba.squeeze()[:, :3] 34 | 35 | return label_rgba 36 | 37 | 38 | def draw_scenes(points, gt_boxes=None, ref_boxes=None, ref_labels=None, ref_scores=None, point_colors=None, 39 | draw_origin=True): 40 | if isinstance(points, torch.Tensor): 41 | points = points.cpu().numpy() 42 | if isinstance(gt_boxes, torch.Tensor): 43 | gt_boxes = gt_boxes.cpu().numpy() 44 | if isinstance(ref_boxes, torch.Tensor): 45 | ref_boxes = ref_boxes.cpu().numpy() 46 | 47 | vis = open3d.visualization.Visualizer() 48 | vis.create_window() 49 | 50 | vis.get_render_option().point_size = 1.0 51 | vis.get_render_option().background_color = np.zeros(3) 52 | 53 | # draw origin 54 | if draw_origin: 55 | axis_pcd = open3d.geometry.TriangleMesh.create_coordinate_frame(size=1.0, origin=[0, 0, 0]) 56 | vis.add_geometry(axis_pcd) 57 | 58 | pts = open3d.geometry.PointCloud() 59 | pts.points = open3d.utility.Vector3dVector(points[:, :3]) 60 | 61 | vis.add_geometry(pts) 62 | if point_colors is None: 63 | pts.colors = open3d.utility.Vector3dVector(np.ones((points.shape[0], 3))) 64 | else: 65 | pts.colors = open3d.utility.Vector3dVector(point_colors) 66 | 67 | if gt_boxes is not None: 68 | vis = draw_box(vis, gt_boxes, (0, 0, 1)) 69 | 70 | if ref_boxes is not None: 71 | vis = draw_box(vis, ref_boxes, (0, 1, 0), ref_labels, ref_scores) 72 | 73 | vis.run() 74 | vis.destroy_window() 75 | 76 | 77 | def translate_boxes_to_open3d_instance(gt_boxes): 78 | """ 79 | 4-------- 6 80 | /| /| 81 | 5 -------- 3 . 82 | | | | | 83 | . 7 -------- 1 84 | |/ |/ 85 | 2 -------- 0 86 | """ 87 | center = gt_boxes[0:3] 88 | lwh = gt_boxes[3:6] 89 | axis_angles = np.array([0, 0, gt_boxes[6] + 1e-10]) 90 | rot = open3d.geometry.get_rotation_matrix_from_axis_angle(axis_angles) 91 | box3d = open3d.geometry.OrientedBoundingBox(center, rot, lwh) 92 | 93 | line_set = open3d.geometry.LineSet.create_from_oriented_bounding_box(box3d) 94 | 95 | # import ipdb; ipdb.set_trace(context=20) 96 | lines = np.asarray(line_set.lines) 97 | lines = np.concatenate([lines, np.array([[1, 4], [7, 6]])], axis=0) 98 | 99 | line_set.lines = open3d.utility.Vector2iVector(lines) 100 | 101 | return line_set, box3d 102 | 103 | 104 | def draw_box(vis, gt_boxes, color=(0, 1, 0), ref_labels=None, score=None): 105 | for i in range(gt_boxes.shape[0]): 106 | line_set, box3d = translate_boxes_to_open3d_instance(gt_boxes[i]) 107 | if ref_labels is None: 108 | line_set.paint_uniform_color(color) 109 | else: 110 | line_set.paint_uniform_color(box_colormap[ref_labels[i]]) 111 | 112 | vis.add_geometry(line_set) 113 | 114 | # if score is not None: 115 | # corners = box3d.get_box_points() 116 | # vis.add_3d_label(corners[5], '%.2f' % score[i]) 117 | return vis 118 | -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/datasets/custom_nuscenes_dataset.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Copyright (c) 2023 megvii-model. All Rights Reserved. 
3 | # ------------------------------------------------------------------------ 4 | # Modified from DETR3D (https://github.com/WangYueFt/detr3d) 5 | # Copyright (c) 2021 Wang, Yue 6 | # ------------------------------------------------------------------------ 7 | # Modified from mmdetection3d (https://github.com/open-mmlab/mmdetection3d) 8 | # Copyright (c) OpenMMLab. All rights reserved. 9 | # ------------------------------------------------------------------------ 10 | 11 | import numpy as np 12 | from mmdet.datasets import DATASETS 13 | from mmdet3d.datasets import NuScenesDataset 14 | 15 | 16 | @DATASETS.register_module() 17 | class CustomNuScenesDataset(NuScenesDataset): 18 | r"""NuScenes Dataset. 19 | 20 | This datset only add camera intrinsics and extrinsics to the results. 21 | """ 22 | 23 | def __init__(self, *args, return_gt_info=False, **kwargs): 24 | super(CustomNuScenesDataset, self).__init__(*args, **kwargs) 25 | self.return_gt_info = return_gt_info 26 | 27 | def get_data_info(self, index): 28 | """Get data info according to the given index. 29 | 30 | Args: 31 | index (int): Index of the sample data to get. 32 | 33 | Returns: 34 | dict: Data information that will be passed to the data \ 35 | preprocessing pipelines. It includes the following keys: 36 | 37 | - sample_idx (str): Sample index. 38 | - pts_filename (str): Filename of point clouds. 39 | - sweeps (list[dict]): Infos of sweeps. 40 | - timestamp (float): Sample timestamp. 41 | - img_filename (str, optional): Image filename. 42 | - lidar2img (list[np.ndarray], optional): Transformations \ 43 | from lidar to different cameras. 44 | - ann_info (dict): Annotation info. 45 | """ 46 | info = self.data_infos[index] 47 | # standard protocal modified from SECOND.Pytorch 48 | input_dict = dict( 49 | sample_idx=info['token'], 50 | pts_filename=info['lidar_path'], 51 | sweeps=info['sweeps'], 52 | timestamp=info['timestamp'] / 1e6, 53 | img_sweeps=None if 'img_sweeps' not in info else info['img_sweeps'], 54 | radar_info=None if 'radars' not in info else info['radars'] 55 | ) 56 | 57 | if self.return_gt_info: 58 | input_dict['info'] = info 59 | 60 | if self.modality['use_camera']: 61 | image_paths = [] 62 | lidar2img_rts = [] 63 | lidar2cam_rts = [] 64 | cam_intrinsics = [] 65 | img_timestamp = [] 66 | for cam_type, cam_info in info['cams'].items(): 67 | img_timestamp.append(cam_info['timestamp'] / 1e6) 68 | image_paths.append(cam_info['data_path']) 69 | # obtain lidar to image transformation matrix 70 | lidar2cam_r = np.linalg.inv(cam_info['sensor2lidar_rotation']) 71 | lidar2cam_t = cam_info[ 72 | 'sensor2lidar_translation'] @ lidar2cam_r.T 73 | lidar2cam_rt = np.eye(4) 74 | lidar2cam_rt[:3, :3] = lidar2cam_r.T 75 | lidar2cam_rt[3, :3] = -lidar2cam_t 76 | intrinsic = cam_info['cam_intrinsic'] 77 | viewpad = np.eye(4) 78 | viewpad[:intrinsic.shape[0], :intrinsic.shape[1]] = intrinsic 79 | lidar2img_rt = (viewpad @ lidar2cam_rt.T) 80 | lidar2img_rts.append(lidar2img_rt) 81 | 82 | cam_intrinsics.append(viewpad) 83 | lidar2cam_rts.append(lidar2cam_rt.T) 84 | 85 | input_dict.update( 86 | dict( 87 | img_timestamp=img_timestamp, 88 | img_filename=image_paths, 89 | lidar2img=lidar2img_rts, 90 | cam_intrinsic=cam_intrinsics, 91 | lidar2cam=lidar2cam_rts, 92 | )) 93 | if not self.test_mode: 94 | annos = self.get_ann_info(index) 95 | input_dict['ann_info'] = annos 96 | 97 | return input_dict 98 | -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/models/utils/grid_mask.py: 
-------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn as nn 4 | from PIL import Image 5 | 6 | 7 | class Grid(object): 8 | def __init__(self, use_h, use_w, rotate=1, offset=False, ratio=0.5, mode=0, prob=1.): 9 | self.use_h = use_h 10 | self.use_w = use_w 11 | self.rotate = rotate 12 | self.offset = offset 13 | self.ratio = ratio 14 | self.mode = mode 15 | self.st_prob = prob 16 | self.prob = prob 17 | 18 | def set_prob(self, epoch, max_epoch): 19 | self.prob = self.st_prob * epoch / max_epoch 20 | 21 | def __call__(self, img, label): 22 | if np.random.rand() > self.prob: 23 | return img, label 24 | h = img.size(1) 25 | w = img.size(2) 26 | self.d1 = 2 27 | self.d2 = min(h, w) 28 | hh = int(1.5 * h) 29 | ww = int(1.5 * w) 30 | d = np.random.randint(self.d1, self.d2) 31 | if self.ratio == 1: 32 | self.l = np.random.randint(1, d) 33 | else: 34 | self.l = min(max(int(d * self.ratio + 0.5), 1), d - 1) 35 | mask = np.ones((hh, ww), np.float32) 36 | st_h = np.random.randint(d) 37 | st_w = np.random.randint(d) 38 | if self.use_h: 39 | for i in range(hh // d): 40 | s = d * i + st_h 41 | t = min(s + self.l, hh) 42 | mask[s:t, :] *= 0 43 | if self.use_w: 44 | for i in range(ww // d): 45 | s = d * i + st_w 46 | t = min(s + self.l, ww) 47 | mask[:, s:t] *= 0 48 | 49 | r = np.random.randint(self.rotate) 50 | mask = Image.fromarray(np.uint8(mask)) 51 | mask = mask.rotate(r) 52 | mask = np.asarray(mask) 53 | mask = mask[(hh - h) // 2:(hh - h) // 2 + h, (ww - w) // 2:(ww - w) // 2 + w] 54 | 55 | mask = torch.from_numpy(mask).float() 56 | if self.mode == 1: 57 | mask = 1 - mask 58 | 59 | mask = mask.expand_as(img) 60 | if self.offset: 61 | offset = torch.from_numpy(2 * (np.random.rand(h, w) - 0.5)).float() 62 | offset = (1 - mask) * offset 63 | img = img * mask + offset 64 | else: 65 | img = img * mask 66 | 67 | return img, label 68 | 69 | 70 | class GridMask(nn.Module): 71 | def __init__(self, use_h, use_w, rotate=1, offset=False, ratio=0.5, mode=0, prob=1.): 72 | super(GridMask, self).__init__() 73 | self.use_h = use_h 74 | self.use_w = use_w 75 | self.rotate = rotate 76 | self.offset = offset 77 | self.ratio = ratio 78 | self.mode = mode 79 | self.st_prob = prob 80 | self.prob = prob 81 | 82 | def set_prob(self, epoch, max_epoch): 83 | self.prob = self.st_prob * epoch / max_epoch # + 1.#0.5 84 | 85 | def forward(self, x): 86 | if np.random.rand() > self.prob or not self.training: 87 | return x 88 | n, c, h, w = x.size() 89 | x = x.view(-1, h, w) 90 | hh = int(1.5 * h) 91 | ww = int(1.5 * w) 92 | d = np.random.randint(2, h) 93 | self.l = min(max(int(d * self.ratio + 0.5), 1), d - 1) 94 | mask = np.ones((hh, ww), np.float32) 95 | st_h = np.random.randint(d) 96 | st_w = np.random.randint(d) 97 | if self.use_h: 98 | for i in range(hh // d): 99 | s = d * i + st_h 100 | t = min(s + self.l, hh) 101 | mask[s:t, :] *= 0 102 | if self.use_w: 103 | for i in range(ww // d): 104 | s = d * i + st_w 105 | t = min(s + self.l, ww) 106 | mask[:, s:t] *= 0 107 | 108 | r = np.random.randint(self.rotate) 109 | mask = Image.fromarray(np.uint8(mask)) 110 | mask = mask.rotate(r) 111 | mask = np.asarray(mask) 112 | mask = mask[(hh - h) // 2:(hh - h) // 2 + h, (ww - w) // 2:(ww - w) // 2 + w] 113 | 114 | mask = torch.from_numpy(mask).float().cuda() 115 | if self.mode == 1: 116 | mask = 1 - mask 117 | mask = mask.expand_as(x) 118 | if self.offset: 119 | offset = torch.from_numpy(2 * (np.random.rand(h, w) - 0.5)).float().cuda() 120 | x = x * mask 
+ offset * (1 - mask) 121 | else: 122 | x = x * mask 123 | 124 | return x.view(n, c, h, w) 125 | -------------------------------------------------------------------------------- /tools/create_data.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | import argparse 3 | import os.path as osp 4 | 5 | from data_converter import nuscenes_converter 6 | from data_converter.create_gt_database import create_groundtruth_database 7 | 8 | 9 | def nuscenes_data_prep(root_path, 10 | info_prefix, 11 | version, 12 | dataset_name, 13 | out_dir, 14 | max_sweeps=10): 15 | """Prepare data related to nuScenes dataset. 16 | 17 | Related data consists of '.pkl' files recording basic infos, 18 | 2D annotations and groundtruth database. 19 | 20 | Args: 21 | root_path (str): Path of dataset root. 22 | info_prefix (str): The prefix of info filenames. 23 | version (str): Dataset version. 24 | dataset_name (str): The dataset class name. 25 | out_dir (str): Output directory of the groundtruth database info. 26 | max_sweeps (int, optional): Number of input consecutive frames. 27 | Default: 10 28 | """ 29 | nuscenes_converter.create_nuscenes_infos( 30 | root_path, info_prefix, version=version, max_sweeps=max_sweeps) 31 | 32 | if version == 'v1.0-test': 33 | info_test_path = osp.join(root_path, f'{info_prefix}_infos_test.pkl') 34 | nuscenes_converter.export_2d_annotation( 35 | root_path, info_test_path, version=version) 36 | return 37 | 38 | info_train_path = osp.join(root_path, f'{info_prefix}_infos_train.pkl') 39 | info_val_path = osp.join(root_path, f'{info_prefix}_infos_val.pkl') 40 | nuscenes_converter.export_2d_annotation( 41 | root_path, info_train_path, version=version) 42 | nuscenes_converter.export_2d_annotation( 43 | root_path, info_val_path, version=version) 44 | create_groundtruth_database(dataset_name, root_path, info_prefix, 45 | f'{out_dir}/{info_prefix}_infos_train.pkl') 46 | 47 | 48 | parser = argparse.ArgumentParser(description='Data converter arg parser') 49 | parser.add_argument('dataset', metavar='kitti', help='name of the dataset') 50 | parser.add_argument( 51 | '--root-path', 52 | type=str, 53 | default='./data/kitti', 54 | help='specify the root path of dataset') 55 | parser.add_argument( 56 | '--version', 57 | type=str, 58 | default='v1.0', 59 | required=False, 60 | help='specify the dataset version, no need for kitti') 61 | parser.add_argument( 62 | '--max-sweeps', 63 | type=int, 64 | default=10, 65 | required=False, 66 | help='specify sweeps of lidar per example') 67 | parser.add_argument( 68 | '--with-plane', 69 | action='store_true', 70 | help='Whether to use plane information for kitti.') 71 | parser.add_argument( 72 | '--num-points', 73 | type=int, 74 | default=-1, 75 | help='Number of points to sample for indoor datasets.') 76 | parser.add_argument( 77 | '--out-dir', 78 | type=str, 79 | default='./data/kitti', 80 | required=False, 81 | help='name of info pkl') 82 | parser.add_argument('--extra-tag', type=str, default='kitti') 83 | parser.add_argument( 84 | '--workers', type=int, default=4, help='number of threads to be used') 85 | args = parser.parse_args() 86 | 87 | if __name__ == '__main__': 88 | import importlib 89 | 90 | importlib.import_module('projects.mmdet3d_plugin') 91 | 92 | if args.dataset == 'nuscenes' and args.version != 'v1.0-mini': 93 | train_version = f'{args.version}-trainval' 94 | nuscenes_data_prep( 95 | root_path=args.root_path, 96 | info_prefix=args.extra_tag, 97 | 
version=train_version, 98 | dataset_name='CustomNuScenesDataset', 99 | out_dir=args.out_dir, 100 | max_sweeps=args.max_sweeps) 101 | test_version = f'{args.version}-test' 102 | nuscenes_data_prep( 103 | root_path=args.root_path, 104 | info_prefix=args.extra_tag, 105 | version=test_version, 106 | dataset_name='CustomNuScenesDataset', 107 | out_dir=args.out_dir, 108 | max_sweeps=args.max_sweeps) 109 | elif args.dataset == 'nuscenes' and args.version == 'v1.0-mini': 110 | train_version = f'{args.version}' 111 | nuscenes_data_prep( 112 | root_path=args.root_path, 113 | info_prefix=args.extra_tag, 114 | version=train_version, 115 | dataset_name='CustomNuScenesDataset', 116 | out_dir=args.out_dir, 117 | max_sweeps=args.max_sweeps) 118 | -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/models/utils/moad_transformer.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Copyright (c) 2022 megvii-model. All Rights Reserved. 3 | # ------------------------------------------------------------------------ 4 | # Modified from DETR3D (https://github.com/WangYueFt/detr3d) 5 | # Copyright (c) 2021 Wang, Yue 6 | # ------------------------------------------------------------------------ 7 | # Modified from mmdetection3d (https://github.com/open-mmlab/mmdetection3d) 8 | # Copyright (c) OpenMMLab. All rights reserved. 9 | # ------------------------------------------------------------------------ 10 | import numpy as np 11 | import torch 12 | import torch.nn as nn 13 | from einops import rearrange 14 | from mmcv.cnn import xavier_init 15 | from mmcv.cnn.bricks.transformer import build_transformer_layer_sequence 16 | from mmcv.runner.base_module import BaseModule 17 | from mmdet.models.utils.builder import TRANSFORMER 18 | 19 | 20 | @TRANSFORMER.register_module() 21 | class MOADTransformer(BaseModule): 22 | def __init__( 23 | self, 24 | use_type_embed=True, 25 | use_cam_embed=False, 26 | encoder=None, 27 | decoder=None, 28 | init_cfg=None, 29 | cross=False 30 | ): 31 | super(MOADTransformer, self).__init__(init_cfg=init_cfg) 32 | 33 | if encoder is not None: 34 | self.encoder = build_transformer_layer_sequence(encoder) 35 | else: 36 | self.encoder = None 37 | self.decoder = build_transformer_layer_sequence(decoder) 38 | self.embed_dims = self.decoder.embed_dims 39 | self.use_type_embed = use_type_embed 40 | self.use_cam_embed = use_cam_embed 41 | 42 | if self.use_type_embed: 43 | self.bev_type_embed = nn.Parameter(torch.randn(self.embed_dims)) 44 | self.rv_type_embed = nn.Parameter(torch.randn(self.embed_dims)) 45 | else: 46 | self.bev_type_embed = None 47 | self.rv_type_embed = None 48 | 49 | if self.use_cam_embed: 50 | self.cam_embed = nn.Sequential( 51 | nn.Conv1d(16, self.embed_dims, kernel_size=1), 52 | nn.BatchNorm1d(self.embed_dims), 53 | nn.Conv1d(self.embed_dims, self.embed_dims, kernel_size=1), 54 | nn.BatchNorm1d(self.embed_dims), 55 | nn.Conv1d(self.embed_dims, self.embed_dims, kernel_size=1), 56 | nn.BatchNorm1d(self.embed_dims) 57 | ) 58 | else: 59 | self.cam_embed = None 60 | 61 | self.cross = cross 62 | 63 | def init_weights(self): 64 | # follow the official DETR to init parameters 65 | for m in self.modules(): 66 | if hasattr(m, 'weight') and m.weight.dim() > 1: 67 | xavier_init(m, distribution='uniform') 68 | self._is_init = True 69 | 70 | def forward(self, x, x_img, bev_query_embed, rv_query_embed, bev_pos_embed, 
rv_pos_embed, img_metas, 71 | attn_masks=None, modalities=None, reg_branch=None): 72 | bs, c, h, w = x.shape 73 | bev_memory = rearrange(x, "bs c h w -> (h w) bs c") # [bs, n, c, h, w] -> [n*h*w, bs, c] 74 | rv_memory = rearrange(x_img, "(bs v) c h w -> (v h w) bs c", bs=bs) 75 | 76 | bev_pos_embed = bev_pos_embed.unsqueeze(1).repeat(1, bs, 1) # [bs, n, c, h, w] -> [n*h*w, bs, c] 77 | rv_pos_embed = rearrange(rv_pos_embed, "(bs v) h w c -> (v h w) bs c", bs=bs) 78 | 79 | if self.use_type_embed: 80 | bev_query_embed = bev_query_embed + self.bev_type_embed 81 | rv_query_embed = rv_query_embed + self.rv_type_embed 82 | 83 | if self.use_cam_embed: 84 | imgs2lidars = np.stack([np.linalg.inv(meta['lidar2img']) for meta in img_metas]) 85 | imgs2lidars = torch.from_numpy(imgs2lidars).float().to(x.device) 86 | imgs2lidars = imgs2lidars.flatten(-2).permute(0, 2, 1) 87 | imgs2lidars = self.cam_embed(imgs2lidars) 88 | imgs2lidars = imgs2lidars.permute(0, 2, 1).reshape(-1, self.embed_dims, 1, 1) 89 | imgs2lidars = imgs2lidars.repeat(1, 1, *x_img.shape[-2:]) 90 | imgs2lidars = rearrange(imgs2lidars, '(bs v) c h w -> (v h w) bs c', bs=bs) 91 | 92 | out_decs = [] 93 | for modality in modalities: 94 | if modality == "fused": 95 | memory, pos_embed = (torch.cat([bev_memory, rv_memory], dim=0), 96 | torch.cat([bev_pos_embed, rv_pos_embed], dim=0)) 97 | memory_v = memory 98 | query_embed = bev_query_embed + rv_query_embed 99 | elif modality == "bev": 100 | memory, pos_embed = bev_memory, bev_pos_embed 101 | memory_v = memory 102 | query_embed = bev_query_embed 103 | else: 104 | memory, pos_embed = rv_memory, rv_pos_embed 105 | memory_v = memory 106 | if self.cam_embed is not None: 107 | memory_v = memory_v * imgs2lidars 108 | query_embed = rv_query_embed 109 | 110 | query_embed = query_embed.transpose(0, 1) # [bs, num_query, dim] -> [num_query, bs, dim] 111 | target = torch.zeros_like(query_embed) 112 | 113 | # out_dec: [num_layers, num_query, bs, dim] 114 | out_dec = self.decoder( 115 | query=target, 116 | key=memory, 117 | value=memory_v, 118 | query_pos=query_embed, 119 | key_pos=pos_embed, 120 | attn_masks=[attn_masks, None], 121 | reg_branch=reg_branch, 122 | ) 123 | out_decs.append(out_dec.transpose(1, 2)) 124 | 125 | return out_decs 126 | -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/models/utils/pme_transformer.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Copyright (c) 2022 megvii-model. All Rights Reserved. 3 | # ------------------------------------------------------------------------ 4 | # Modified from DETR3D (https://github.com/WangYueFt/detr3d) 5 | # Copyright (c) 2021 Wang, Yue 6 | # ------------------------------------------------------------------------ 7 | # Modified from mmdetection3d (https://github.com/open-mmlab/mmdetection3d) 8 | # Copyright (c) OpenMMLab. All rights reserved. 
9 | # ------------------------------------------------------------------------ 10 | import copy 11 | 12 | import torch 13 | import torch.nn as nn 14 | from mmcv.cnn import xavier_init 15 | from mmcv.cnn.bricks.transformer import build_transformer_layer_sequence 16 | from mmcv.runner.base_module import BaseModule 17 | from mmdet.models.utils.builder import TRANSFORMER 18 | from mmdet3d.models import builder 19 | 20 | from projects.mmdet3d_plugin.models.dense_heads.meformer_head import pos2embed 21 | 22 | 23 | @TRANSFORMER.register_module() 24 | class PMETransformer(BaseModule): 25 | def __init__( 26 | self, 27 | decoder=None, 28 | heads=None, 29 | separate_head=None, 30 | num_classes=None, 31 | init_cfg=None 32 | ): 33 | super(PMETransformer, self).__init__(init_cfg=init_cfg) 34 | self.dist_scaler = nn.Parameter(torch.randn(1), requires_grad=True) 35 | self.dist_bias = nn.Parameter(torch.randn(1), requires_grad=True) 36 | 37 | self.decoder = build_transformer_layer_sequence(decoder) 38 | 39 | self.embed_dims = self.decoder.embed_dims 40 | self.num_layers = decoder["num_layers"] 41 | self.num_heads = decoder["transformerlayers"]["attn_cfgs"][0]["num_heads"] 42 | self.box_pos_embedding = nn.Sequential( 43 | nn.Linear(self.embed_dims * 2, self.embed_dims), 44 | nn.ReLU(inplace=True), 45 | nn.Linear(self.embed_dims, self.embed_dims) 46 | ) 47 | self.modality_proj = nn.ModuleDict({ 48 | "fused": nn.Sequential( 49 | nn.Linear(self.embed_dims, self.embed_dims), 50 | nn.LayerNorm(self.embed_dims) 51 | ), 52 | "bev": nn.Sequential( 53 | nn.Linear(self.embed_dims, self.embed_dims), 54 | nn.LayerNorm(self.embed_dims) 55 | ), 56 | "img": nn.Sequential( 57 | nn.Linear(self.embed_dims, self.embed_dims), 58 | nn.LayerNorm(self.embed_dims) 59 | ) 60 | }) 61 | 62 | self.task_heads = nn.ModuleList() 63 | for num_cls in num_classes: 64 | heads = copy.deepcopy(heads) 65 | heads.update(dict(cls_logits=(num_cls, 2))) 66 | separate_head.update( 67 | in_channels=self.embed_dims, 68 | heads=heads, num_cls=num_cls, 69 | groups=decoder.num_layers 70 | ) 71 | self.task_heads.append(builder.build_head(separate_head)) 72 | 73 | def init_weights(self): 74 | # follow the official DETR to init parameters 75 | for m in self.decoder.modules(): 76 | if hasattr(m, 'weight') and m.weight.dim() > 1: 77 | xavier_init(m, distribution='uniform') 78 | 79 | self._is_init = True 80 | 81 | def forward( 82 | self, 83 | x, 84 | reference, 85 | outs, 86 | modalities, 87 | num_queries_per_modality, 88 | task_id, 89 | pc_range, 90 | attn_masks=None 91 | ): 92 | x = x[-1].transpose(0, 1) 93 | x = list(x.split(num_queries_per_modality, dim=0)) 94 | x_proj = [] 95 | 96 | for i, modality in enumerate(modalities): 97 | x_proj.append(self.modality_proj[modality](x[i])) 98 | 99 | target = x_proj[modalities.index("fused")] 100 | memory = torch.cat(x_proj, dim=0) 101 | 102 | center = outs["center"][-1] 103 | 104 | box_pos_embed = pos2embed(center, self.embed_dims) 105 | box_pos_embed = self.box_pos_embedding(box_pos_embed).transpose(0, 1) 106 | box_pos_embed = list(box_pos_embed.split(num_queries_per_modality, dim=0)) 107 | 108 | query_box_pos_embed = box_pos_embed[modalities.index("fused")] 109 | key_box_pos_embed = torch.cat(box_pos_embed, dim=0) 110 | 111 | center = list(center.split(num_queries_per_modality, dim=1)) 112 | center_q = center[modalities.index("fused")] 113 | center_kv = torch.cat(center, dim=1) 114 | dist = (center_q.unsqueeze(2) - center_kv.unsqueeze(1)).norm(p=2, dim=-1) 115 | dist_mask = dist * self.dist_scaler + self.dist_bias 
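# `dist_mask` turns the BEV distance between each fused query and every modality-specific
# query into a learnable additive attention bias; it is added to `attn_masks` below (after
# the boolean mask is converted to 0 / -inf), so cross-branch attention is modulated by
# how far apart the predicted centers are.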
116 | 117 | if attn_masks is None: 118 | attn_masks = torch.zeros((target.shape[0], target.shape[0]), dtype=torch.bool, device=target.device) 119 | 120 | attn_masks = torch.zeros_like(attn_masks, dtype=torch.float).float().masked_fill(attn_masks, float("-inf")) 121 | attn_masks = attn_masks.repeat(1, len(x_proj)) 122 | attn_masks = attn_masks + dist_mask 123 | attn_masks = attn_masks.unsqueeze(1).repeat(1, self.num_heads, 1, 1).flatten(0, 1) 124 | 125 | outs_dec = self.decoder( 126 | query=target, 127 | key=memory, 128 | value=memory, 129 | query_pos=query_box_pos_embed, 130 | key_pos=key_box_pos_embed, 131 | attn_masks=[attn_masks] 132 | ) 133 | 134 | outs_dec = outs_dec.transpose(1, 2) 135 | outs = self.task_heads[task_id](outs_dec) 136 | 137 | reference = reference.split(num_queries_per_modality, dim=1)[modalities.index("fused")] 138 | 139 | center = (outs['center'] + reference[None, :, :, :2]).sigmoid() 140 | height = (outs['height'] + reference[None, :, :, 2:3]).sigmoid() 141 | _center, _height = center.new_zeros(center.shape), height.new_zeros(height.shape) 142 | _center[..., 0:1] = center[..., 0:1] * (pc_range[3] - pc_range[0]) + pc_range[0] 143 | _center[..., 1:2] = center[..., 1:2] * (pc_range[4] - pc_range[1]) + pc_range[1] 144 | _height[..., 0:1] = height[..., 0:1] * (pc_range[5] - pc_range[2]) + pc_range[2] 145 | outs['center'] = _center 146 | outs['height'] = _height 147 | 148 | return outs 149 | -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/models/utils/attention.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 megvii-model. All Rights Reserved. 2 | 3 | import torch 4 | import torch.nn as nn 5 | from einops import rearrange 6 | from flash_attn.bert_padding import unpad_input 7 | from flash_attn.flash_attn_interface import flash_attn_unpadded_kvpacked_func 8 | from mmcv.runner import auto_fp16 9 | from torch.nn.functional import linear 10 | from torch.nn.init import xavier_uniform_, constant_ 11 | 12 | 13 | def _in_projection_packed(q, k, v, w, b=None): 14 | w_q, w_k, w_v = w.chunk(3) 15 | if b is None: 16 | b_q = b_k = b_v = None 17 | else: 18 | b_q, b_k, b_v = b.chunk(3) 19 | return linear(q, w_q, b_q), linear(k, w_k, b_k), linear(v, w_v, b_v) 20 | 21 | 22 | class FlashAttention(nn.Module): 23 | """Implement the scaled dot product attention with softmax. 24 | Arguments 25 | --------- 26 | softmax_scale: The temperature to use for the softmax attention. 27 | (default: 1/sqrt(d_keys) where d_keys is computed at 28 | runtime) 29 | attention_dropout: The dropout rate to apply to the attention 30 | (default: 0.1) 31 | """ 32 | 33 | def __init__(self, softmax_scale=None, attention_dropout=0.0, device=None, dtype=None): 34 | super().__init__() 35 | self.softmax_scale = softmax_scale 36 | self.dropout_p = attention_dropout 37 | self.fp16_enabled = True 38 | 39 | @auto_fp16(apply_to=('q', 'kv'), out_fp32=True) 40 | def forward(self, q, kv, 41 | causal=False, 42 | key_padding_mask=None): 43 | """Implements the multihead softmax attention. 44 | Arguments 45 | --------- 46 | q: The tensor containing the query. (B, T, H, D) 47 | kv: The tensor containing the key, and value. 
(B, S, 2, H, D) 48 | key_padding_mask: a bool tensor of shape (B, S) 49 | """ 50 | assert q.dtype in [torch.float16, torch.bfloat16] and kv.dtype in [torch.float16, torch.bfloat16] 51 | assert q.is_cuda and kv.is_cuda 52 | assert q.shape[0] == kv.shape[0] and q.shape[-2] == kv.shape[-2] and q.shape[-1] == kv.shape[-1] 53 | 54 | batch_size = q.shape[0] 55 | seqlen_q, seqlen_k = q.shape[1], kv.shape[1] 56 | if key_padding_mask is None: 57 | q, kv = rearrange(q, 'b s ... -> (b s) ...'), rearrange(kv, 'b s ... -> (b s) ...') 58 | max_sq, max_sk = seqlen_q, seqlen_k 59 | cu_seqlens_q = torch.arange(0, (batch_size + 1) * seqlen_q, step=seqlen_q, dtype=torch.int32, 60 | device=q.device) 61 | cu_seqlens_k = torch.arange(0, (batch_size + 1) * seqlen_k, step=seqlen_k, dtype=torch.int32, 62 | device=kv.device) 63 | output = flash_attn_unpadded_kvpacked_func( 64 | q, kv, cu_seqlens_q, cu_seqlens_k, max_sq, max_sk, 65 | self.dropout_p if self.training else 0.0, 66 | softmax_scale=self.softmax_scale, causal=causal 67 | ) 68 | output = rearrange(output, '(b s) ... -> b s ...', b=batch_size) 69 | else: 70 | nheads = kv.shape[-2] 71 | q = rearrange(q, 'b s ... -> (b s) ...') 72 | max_sq = seqlen_q 73 | cu_seqlens_q = torch.arange(0, (batch_size + 1) * seqlen_q, step=seqlen_q, dtype=torch.int32, 74 | device=q.device) 75 | x = rearrange(kv, 'b s two h d -> b s (two h d)') 76 | x_unpad, indices, cu_seqlens_k, max_sk = unpad_input(x, key_padding_mask) 77 | x_unpad = rearrange(x_unpad, 'nnz (two h d) -> nnz two h d', two=2, h=nheads) 78 | output_unpad = flash_attn_unpadded_kvpacked_func( 79 | q, x_unpad, cu_seqlens_q, cu_seqlens_k, max_sq, max_sk, 80 | self.dropout_p if self.training else 0.0, 81 | softmax_scale=self.softmax_scale, causal=causal 82 | ) 83 | output = rearrange(output_unpad, '(b s) ... -> b s ...', b=batch_size) 84 | 85 | return output, None 86 | 87 | 88 | class FlashMHA(nn.Module): 89 | 90 | def __init__(self, embed_dim, num_heads, bias=True, batch_first=True, attention_dropout=0.0, 91 | causal=False, device=None, dtype=None, **kwargs) -> None: 92 | assert batch_first 93 | factory_kwargs = {'device': device, 'dtype': dtype} 94 | super().__init__() 95 | self.embed_dim = embed_dim 96 | self.causal = causal 97 | self.bias = bias 98 | 99 | self.num_heads = num_heads 100 | assert self.embed_dim % num_heads == 0, "self.kdim must be divisible by num_heads" 101 | self.head_dim = self.embed_dim // num_heads 102 | assert self.head_dim % 8 == 0 and self.head_dim <= 128, "Only support head_dim <= 128 and divisible by 8" 103 | 104 | self.in_proj_weight = nn.Parameter(torch.empty((3 * embed_dim, embed_dim))) 105 | if bias: 106 | self.in_proj_bias = nn.Parameter(torch.empty(3 * embed_dim)) 107 | else: 108 | self.register_parameter('in_proj_bias', None) 109 | self.inner_attn = FlashAttention(attention_dropout=attention_dropout, **factory_kwargs) 110 | self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias) 111 | self._reset_parameters() 112 | 113 | def _reset_parameters(self) -> None: 114 | xavier_uniform_(self.in_proj_weight) 115 | if self.in_proj_bias is not None: 116 | constant_(self.in_proj_bias, 0.) 117 | constant_(self.out_proj.bias, 0.) 
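    # Forward path (see below): q, k, v are projected with the packed in_proj_weight via
    # _in_projection_packed, reshaped to (batch, seqlen, num_heads, head_dim), k and v are
    # stacked into a (batch, seqlen, 2, num_heads, head_dim) tensor, and FlashAttention is
    # applied (its auto_fp16 wrapper handles the cast to half precision); the heads are then
    # merged and passed through out_proj.
    #
    # Illustrative usage sketch (an assumption for clarity, not part of this module; it
    # presumes a CUDA device and the flash-attn package being installed):
    #   mha = FlashMHA(embed_dim=256, num_heads=8).cuda()
    #   q = k = v = torch.randn(2, 900, 256, device='cuda')
    #   out, _ = mha(q, k, v)   # out: (2, 900, 256)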
118 | 119 | def forward(self, q, k, v, key_padding_mask=None): 120 | """x: (batch, seqlen, hidden_dim) (where hidden_dim = num heads * head dim) 121 | key_padding_mask: bool tensor of shape (batch, seqlen) 122 | """ 123 | # q, k, v = self.Wq(q), self.Wk(k), self.Wv(v) 124 | q, k, v = _in_projection_packed(q, k, v, self.in_proj_weight, self.in_proj_bias) 125 | q = rearrange(q, 'b s (h d) -> b s h d', h=self.num_heads) 126 | k = rearrange(k, 'b s (h d) -> b s h d', h=self.num_heads) 127 | v = rearrange(v, 'b s (h d) -> b s h d', h=self.num_heads) 128 | kv = torch.stack([k, v], dim=2) 129 | 130 | context, attn_weights = self.inner_attn(q, kv, key_padding_mask=key_padding_mask, causal=self.causal) 131 | return self.out_proj(rearrange(context, 'b s h d -> b s (h d)')), attn_weights 132 | -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/models/dense_heads/separate_task_head.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Copyright (c) 2023 megvii-model. All Rights Reserved. 3 | # ------------------------------------------------------------------------ 4 | # Modified from mmdetection3d (https://github.com/open-mmlab/mmdetection3d) 5 | # Copyright (c) OpenMMLab. All rights reserved. 6 | # ------------------------------------------------------------------------ 7 | 8 | import torch 9 | import torch.nn as nn 10 | from einops import rearrange 11 | from mmcv.runner import BaseModule 12 | from mmdet.models import HEADS 13 | 14 | 15 | class LayerNormFunction(torch.autograd.Function): 16 | 17 | @staticmethod 18 | def forward(ctx, x, weight, bias, groups, eps): 19 | ctx.groups = groups 20 | ctx.eps = eps 21 | N, C, L = x.size() 22 | x = x.view(N, groups, C // groups, L) 23 | mu = x.mean(2, keepdim=True) 24 | var = (x - mu).pow(2).mean(2, keepdim=True) 25 | y = (x - mu) / (var + eps).sqrt() 26 | ctx.save_for_backward(y, var, weight) 27 | y = weight.view(1, C, 1) * y.view(N, C, L) + bias.view(1, C, 1) 28 | return y 29 | 30 | @staticmethod 31 | def backward(ctx, grad_output): 32 | groups = ctx.groups 33 | eps = ctx.eps 34 | 35 | N, C, L = grad_output.size() 36 | y, var, weight = ctx.saved_variables 37 | g = grad_output * weight.view(1, C, 1) 38 | g = g.view(N, groups, C // groups, L) 39 | mean_g = g.mean(dim=2, keepdim=True) 40 | mean_gy = (g * y).mean(dim=2, keepdim=True) 41 | gx = 1. / torch.sqrt(var + eps) * (g - y * mean_gy - mean_g) 42 | return gx.view(N, C, L), (grad_output * y.view(N, C, L)).sum(dim=2).sum(dim=0), grad_output.sum(dim=2).sum( 43 | dim=0), None, None 44 | 45 | 46 | class GroupLayerNorm1d(nn.Module): 47 | 48 | def __init__(self, channels, groups=1, eps=1e-6): 49 | super(GroupLayerNorm1d, self).__init__() 50 | self.register_parameter('weight', nn.Parameter(torch.ones(channels))) 51 | self.register_parameter('bias', nn.Parameter(torch.zeros(channels))) 52 | self.groups = groups 53 | self.eps = eps 54 | 55 | def forward(self, x): 56 | return LayerNormFunction.apply(x, self.weight, self.bias, self.groups, self.eps) 57 | 58 | 59 | @HEADS.register_module() 60 | class SeparateTaskHead(BaseModule): 61 | """SeparateHead for CenterHead. 62 | 63 | Args: 64 | in_channels (int): Input channels for conv_layer. 65 | heads (dict): Conv information. 66 | head_conv (int): Output channels. 67 | Default: 64. 68 | final_kernal (int): Kernal size for the last conv layer. 69 | Deafult: 1. 70 | init_bias (float): Initial bias. 
Default: -2.19. 71 | conv_cfg (dict): Config of conv layer. 72 | Default: dict(type='Conv2d') 73 | norm_cfg (dict): Config of norm layer. 74 | Default: dict(type='BN2d'). 75 | bias (str): Type of bias. Default: 'auto'. 76 | """ 77 | 78 | def __init__(self, 79 | in_channels, 80 | heads, 81 | groups=1, 82 | head_conv=64, 83 | final_kernel=1, 84 | init_bias=-2.19, 85 | init_cfg=None, 86 | **kwargs): 87 | assert init_cfg is None, 'To prevent abnormal initialization ' \ 88 | 'behavior, init_cfg is not allowed to be set' 89 | super(SeparateTaskHead, self).__init__(init_cfg=init_cfg) 90 | self.heads = heads 91 | self.groups = groups 92 | self.init_bias = init_bias 93 | for head in self.heads: 94 | classes, num_conv = self.heads[head] 95 | 96 | conv_layers = [] 97 | c_in = in_channels 98 | for i in range(num_conv - 1): 99 | conv_layers.extend([ 100 | nn.Conv1d( 101 | c_in * groups, 102 | head_conv * groups, 103 | kernel_size=final_kernel, 104 | stride=1, 105 | padding=final_kernel // 2, 106 | groups=groups, 107 | bias=False), 108 | GroupLayerNorm1d(head_conv * groups, groups=groups), 109 | nn.ReLU(inplace=True) 110 | ]) 111 | c_in = head_conv 112 | 113 | conv_layers.append( 114 | nn.Conv1d( 115 | head_conv * groups, 116 | classes * groups, 117 | kernel_size=final_kernel, 118 | stride=1, 119 | padding=final_kernel // 2, 120 | groups=groups, 121 | bias=True)) 122 | conv_layers = nn.Sequential(*conv_layers) 123 | 124 | self.__setattr__(head, conv_layers) 125 | 126 | if init_cfg is None: 127 | self.init_cfg = dict(type='Kaiming', layer='Conv1d') 128 | 129 | def init_weights(self): 130 | """Initialize weights.""" 131 | super().init_weights() 132 | for head in self.heads: 133 | if head == 'cls_logits': 134 | self.__getattr__(head)[-1].bias.data.fill_(self.init_bias) 135 | 136 | def forward(self, x): 137 | """Forward function for SepHead. 138 | 139 | Args: 140 | x (torch.Tensor): Input feature map with the shape of 141 | [N, B, query, C]. 142 | 143 | Returns: 144 | dict[str: torch.Tensor]: contains the following keys: 145 | 146 | -reg (torch.Tensor): 2D regression value with the \ 147 | shape of [N, B, query, 2]. 148 | -height (torch.Tensor): Height value with the \ 149 | shape of [N, B, query, 1]. 150 | -dim (torch.Tensor): Size value with the shape \ 151 | of [N, B, query, 3]. 152 | -rot (torch.Tensor): Rotation value with the \ 153 | shape of [N, B, query, 2]. 154 | -vel (torch.Tensor): Velocity value with the \ 155 | shape of [N, B, query, 2]. 156 | """ 157 | N, B, query_num, c1 = x.shape 158 | x = rearrange(x, "n b q c -> b (n c) q") 159 | ret_dict = dict() 160 | 161 | for head in self.heads: 162 | head_output = self.__getattr__(head)(x) 163 | ret_dict[head] = rearrange(head_output, "b (n c) q -> n b q c", n=N) 164 | 165 | return ret_dict 166 | -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/core/bbox/coders/multi_task_bbox_coder.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Copyright (c) 2023 megvii-model. All Rights Reserved. 3 | # ------------------------------------------------------------------------ 4 | # Modified from mmdetection (https://github.com/open-mmlab/mmdetection) 5 | # Copyright (c) OpenMMLab. All rights reserved. 
6 | # ------------------------------------------------------------------------ 7 | 8 | import torch 9 | from mmdet.core.bbox import BaseBBoxCoder 10 | from mmdet.core.bbox.builder import BBOX_CODERS 11 | 12 | from projects.mmdet3d_plugin.core.bbox.util import denormalize_bbox 13 | 14 | 15 | @BBOX_CODERS.register_module() 16 | class MultiTaskBBoxCoder(BaseBBoxCoder): 17 | """Bbox coder for NMS-free detector. 18 | Args: 19 | pc_range (list[float]): Range of point cloud. 20 | post_center_range (list[float]): Limit of the center. 21 | Default: None. 22 | max_num (int): Max number to be kept. Default: 100. 23 | score_threshold (float): Threshold to filter boxes based on score. 24 | Default: None. 25 | code_size (int): Code size of bboxes. Default: 9 26 | """ 27 | 28 | def __init__(self, 29 | pc_range, 30 | voxel_size=None, 31 | post_center_range=None, 32 | max_num=100, 33 | score_threshold=None, 34 | num_classes=10): 35 | 36 | self.pc_range = pc_range 37 | self.voxel_size = voxel_size 38 | self.post_center_range = post_center_range 39 | self.max_num = max_num 40 | self.score_threshold = score_threshold 41 | self.num_classes = num_classes 42 | 43 | def encode(self): 44 | pass 45 | 46 | def decode_single(self, cls_scores, bbox_preds, task_ids): 47 | """Decode bboxes. 48 | Args: 49 | cls_scores (Tensor): Outputs from the classification head, \ 50 | shape [num_query, cls_out_channels]. Note \ 51 | cls_out_channels should includes background. 52 | bbox_preds (Tensor): Outputs from the regression \ 53 | head with normalized coordinate format (cx, cy, w, l, cz, h, rot_sine, rot_cosine, vx, vy). \ 54 | Shape [num_query, 9]. 55 | Returns: 56 | list[dict]: Decoded boxes. 57 | """ 58 | max_num = self.max_num 59 | num_query = cls_scores.shape[0] 60 | 61 | cls_scores = cls_scores.sigmoid() 62 | scores, indexs = cls_scores.view(-1).topk(max_num) 63 | labels = indexs % self.num_classes 64 | bbox_index = indexs // self.num_classes 65 | task_index = torch.gather(task_ids, 1, labels.unsqueeze(1)).squeeze() 66 | 67 | bbox_preds = bbox_preds[task_index * num_query + bbox_index] 68 | boxes3d = denormalize_bbox(bbox_preds, self.pc_range) 69 | 70 | # use score threshold 71 | if self.score_threshold is not None: 72 | thresh_mask = scores > self.score_threshold 73 | if self.post_center_range is not None: 74 | self.post_center_range = torch.tensor(self.post_center_range, device=scores.device) 75 | mask = (boxes3d[..., :3] >= 76 | self.post_center_range[:3]).all(1) 77 | mask &= (boxes3d[..., :3] <= 78 | self.post_center_range[3:]).all(1) 79 | 80 | if self.score_threshold: 81 | mask &= thresh_mask 82 | 83 | boxes3d = boxes3d[mask] 84 | scores = scores[mask] 85 | labels = labels[mask] 86 | 87 | predictions_dict = { 88 | 'bboxes': boxes3d, 89 | 'scores': scores, 90 | 'labels': labels 91 | } 92 | return predictions_dict 93 | 94 | def decode(self, preds_dicts): 95 | """Decode bboxes. 96 | Args: 97 | all_cls_scores (Tensor): Outputs from the classification head, \ 98 | shape [nb_dec, bs, num_query, cls_out_channels]. Note \ 99 | cls_out_channels should includes background. 100 | all_bbox_preds (Tensor): Sigmoid outputs from the regression \ 101 | head with normalized coordinate format (cx, cy, w, l, cz, h, rot_sine, rot_cosine, vx, vy). \ 102 | Shape [nb_dec, bs, num_query, 9]. 103 | Returns: 104 | list[dict]: Decoded boxes. 
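                Each per-sample dict contains at most ``max_num`` predictions with keys
                ``bboxes`` (denormalized 3D boxes), ``scores`` and ``labels``, optionally
                filtered by ``score_threshold`` and ``post_center_range``.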
105 | """ 106 | task_num = len(preds_dicts) 107 | 108 | pred_bbox_list, pred_logits_list, task_ids_list, rv_box_mask_lists = [], [], [], [] 109 | for task_id in range(task_num): 110 | task_pred_dict = preds_dicts[task_id][0] 111 | task_pred_bbox = [task_pred_dict['center'][-1], task_pred_dict['height'][-1], 112 | task_pred_dict['dim'][-1], task_pred_dict['rot'][-1]] 113 | if 'vel' in task_pred_dict: 114 | task_pred_bbox.append(task_pred_dict['vel'][-1]) 115 | task_pred_bbox = torch.cat(task_pred_bbox, dim=-1) 116 | task_pred_logits = task_pred_dict['cls_logits'][-1] 117 | pred_bbox_list.append(task_pred_bbox) 118 | pred_logits_list.append(task_pred_logits) 119 | 120 | if "rv_box_mask" in task_pred_dict: 121 | rv_box_mask_lists.append(task_pred_dict["rv_box_mask"]) 122 | else: 123 | rv_box_mask_lists.append(task_pred_dict["cls_logits"].new_ones(task_pred_dict["cls_logits"].shape[1], 6, 124 | task_pred_dict["cls_logits"].shape[ 125 | 2]).to(torch.bool)) 126 | 127 | task_ids = task_pred_logits.new_ones(task_pred_logits.shape).int() * task_id 128 | task_ids_list.append(task_ids) 129 | 130 | all_pred_logits = torch.cat(pred_logits_list, dim=-1) # bs * nq * 10 131 | all_pred_bbox = torch.cat(pred_bbox_list, dim=1) # bs * (task nq) * 10 132 | all_task_ids = torch.cat(task_ids_list, dim=-1) # bs * nq * 10 133 | all_rv_box_masks = torch.cat(rv_box_mask_lists, dim=-1) 134 | 135 | batch_size = all_pred_logits.shape[0] 136 | predictions_list = [] 137 | for i in range(batch_size): 138 | rv_box_mask = all_rv_box_masks[i].sum(dim=0) != 0 139 | if rv_box_mask.shape[0] != all_pred_bbox[i].shape[0]: 140 | box_mask = torch.cat([torch.ones_like(rv_box_mask), rv_box_mask]) 141 | else: 142 | box_mask = rv_box_mask 143 | 144 | pred_logits = all_pred_logits[i][box_mask] 145 | pred_bbox = all_pred_bbox[i][box_mask] 146 | task_ids = all_task_ids[i][box_mask] 147 | 148 | predictions_list.append( 149 | self.decode_single(pred_logits, pred_bbox, task_ids)) 150 | return predictions_list 151 | -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/core/bbox/assigners/hungarian_assigner_3d.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Copyright (c) 2023 megvii-model. All Rights Reserved. 3 | # ------------------------------------------------------------------------ 4 | # Modified from DETR3D (https://github.com/WangYueFt/detr3d) 5 | # Copyright (c) 2021 Wang, Yue 6 | # ------------------------------------------------------------------------ 7 | # Modified from mmdetection (https://github.com/open-mmlab/mmdetection) 8 | # Copyright (c) OpenMMLab. All rights reserved. 9 | # ------------------------------------------------------------------------ 10 | 11 | import torch 12 | from mmdet.core.bbox.assigners import AssignResult 13 | from mmdet.core.bbox.assigners import BaseAssigner 14 | from mmdet.core.bbox.builder import BBOX_ASSIGNERS 15 | from mmdet.core.bbox.match_costs import build_match_cost 16 | from scipy.optimize import linear_sum_assignment 17 | 18 | from projects.mmdet3d_plugin.core.bbox.util import normalize_bbox 19 | 20 | 21 | @BBOX_ASSIGNERS.register_module() 22 | class HungarianAssigner3D(BaseAssigner): 23 | """Computes one-to-one matching between predictions and ground truth. 24 | This class computes an assignment between the targets and the predictions 25 | based on the costs. 
The costs are weighted sum of three components: 26 | classification cost, regression L1 cost and regression iou cost. The 27 | targets don't include the no_object, so generally there are more 28 | predictions than targets. After the one-to-one matching, the un-matched 29 | are treated as backgrounds. Thus each query prediction will be assigned 30 | with `0` or a positive integer indicating the ground truth index: 31 | - 0: negative sample, no assigned gt 32 | - positive integer: positive sample, index (1-based) of assigned gt 33 | Args: 34 | cls_weight (int | float, optional): The scale factor for classification 35 | cost. Default 1.0. 36 | bbox_weight (int | float, optional): The scale factor for regression 37 | L1 cost. Default 1.0. 38 | iou_weight (int | float, optional): The scale factor for regression 39 | iou cost. Default 1.0. 40 | iou_calculator (dict | optional): The config for the iou calculation. 41 | Default type `BboxOverlaps2D`. 42 | iou_mode (str | optional): "iou" (intersection over union), "iof" 43 | (intersection over foreground), or "giou" (generalized 44 | intersection over union). Default "giou". 45 | """ 46 | 47 | def __init__(self, 48 | cls_cost=dict(type='ClassificationCost', weight=1.), 49 | reg_cost=dict(type='BBoxL1Cost', weight=1.0), 50 | iou_cost=dict(type='IoUCost', weight=0.0), 51 | pc_range=None, 52 | code_weights=None): 53 | self.cls_cost = build_match_cost(cls_cost) 54 | self.reg_cost = build_match_cost(reg_cost) 55 | self.iou_cost = build_match_cost(iou_cost) 56 | self.pc_range = pc_range 57 | self.code_weights = code_weights 58 | if self.code_weights: 59 | self.code_weights = torch.tensor(self.code_weights)[None, :].cuda() 60 | 61 | def assign(self, 62 | bbox_pred, 63 | cls_pred, 64 | gt_bboxes, 65 | gt_labels, 66 | gt_bboxes_ignore=None, 67 | eps=1e-7, 68 | code_weights=None): 69 | """Computes one-to-one matching based on the weighted costs. 70 | This method assign each query prediction to a ground truth or 71 | background. The `assigned_gt_inds` with -1 means don't care, 72 | 0 means negative sample, and positive number is the index (1-based) 73 | of assigned gt. 74 | The assignment is done in the following steps, the order matters. 75 | 1. assign every prediction to -1 76 | 2. compute the weighted costs 77 | 3. do Hungarian matching on CPU based on the costs 78 | 4. assign all to 0 (background) first, then for each matched pair 79 | between predictions and gts, treat this prediction as foreground 80 | and assign the corresponding gt index (plus 1) to it. 81 | Args: 82 | bbox_pred (Tensor): Predicted boxes with normalized coordinates 83 | (cx, cy, w, h), which are all in range [0, 1]. Shape 84 | [num_query, 4]. 85 | cls_pred (Tensor): Predicted classification logits, shape 86 | [num_query, num_class]. 87 | gt_bboxes (Tensor): Ground truth boxes with unnormalized 88 | coordinates (x1, y1, x2, y2). Shape [num_gt, 4]. 89 | gt_labels (Tensor): Label of `gt_bboxes`, shape (num_gt,). 90 | gt_bboxes_ignore (Tensor, optional): Ground truth bboxes that are 91 | labelled as `ignored`. Default None. 92 | eps (int | float, optional): A value added to the denominator for 93 | numerical stability. Default 1e-7. 94 | Returns: 95 | :obj:`AssignResult`: The assigned result. 96 | """ 97 | assert gt_bboxes_ignore is None, \ 98 | 'Only case when gt_bboxes_ignore is None is supported.' 99 | num_gts, num_bboxes = gt_bboxes.size(0), bbox_pred.size(0) 100 | 101 | # 1. 
assign -1 by default 102 | assigned_gt_inds = bbox_pred.new_full((num_bboxes,), 103 | -1, 104 | dtype=torch.long) 105 | assigned_labels = bbox_pred.new_full((num_bboxes,), 106 | -1, 107 | dtype=torch.long) 108 | if num_gts == 0 or num_bboxes == 0: 109 | # No ground truth or boxes, return empty assignment 110 | if num_gts == 0: 111 | # No ground truth, assign all to background 112 | assigned_gt_inds[:] = 0 113 | return AssignResult( 114 | num_gts, assigned_gt_inds, None, labels=assigned_labels) 115 | 116 | # 2. compute the weighted costs 117 | # classification and bboxcost. 118 | cls_cost = self.cls_cost(cls_pred, gt_labels) 119 | # regression L1 cost 120 | normalized_gt_bboxes = normalize_bbox(gt_bboxes, self.pc_range) 121 | 122 | if self.code_weights is not None: 123 | bbox_pred = bbox_pred * self.code_weights 124 | normalized_gt_bboxes = normalized_gt_bboxes * self.code_weights 125 | 126 | reg_cost = self.reg_cost(bbox_pred[:, :8], normalized_gt_bboxes[:, :8]) 127 | 128 | # weighted sum of above two costs 129 | cost = cls_cost + reg_cost 130 | 131 | # 3. do Hungarian matching on CPU using linear_sum_assignment 132 | cost = cost.detach().cpu() 133 | if linear_sum_assignment is None: 134 | raise ImportError('Please run "pip install scipy" ' 135 | 'to install scipy first.') 136 | matched_row_inds, matched_col_inds = linear_sum_assignment(cost) 137 | matched_row_inds = torch.from_numpy(matched_row_inds).to( 138 | bbox_pred.device) 139 | matched_col_inds = torch.from_numpy(matched_col_inds).to( 140 | bbox_pred.device) 141 | 142 | # 4. assign backgrounds and foregrounds 143 | # assign all indices to backgrounds first 144 | assigned_gt_inds[:] = 0 145 | # assign foregrounds based on matching results 146 | assigned_gt_inds[matched_row_inds] = matched_col_inds + 1 147 | assigned_labels[matched_row_inds] = gt_labels[matched_col_inds] 148 | return AssignResult( 149 | num_gts, assigned_gt_inds, None, labels=assigned_labels) 150 | -------------------------------------------------------------------------------- /tools/visual_utils/visualize_utils.py: -------------------------------------------------------------------------------- 1 | import mayavi.mlab as mlab 2 | import numpy as np 3 | import torch 4 | 5 | box_colormap = [ 6 | [1, 1, 1], 7 | [0, 1, 0], 8 | [0, 1, 1], 9 | [1, 1, 0], 10 | ] 11 | 12 | 13 | def check_numpy_to_torch(x): 14 | if isinstance(x, np.ndarray): 15 | return torch.from_numpy(x).float(), True 16 | return x, False 17 | 18 | 19 | def rotate_points_along_z(points, angle): 20 | """ 21 | Args: 22 | points: (B, N, 3 + C) 23 | angle: (B), angle along z-axis, angle increases x ==> y 24 | Returns: 25 | 26 | """ 27 | points, is_numpy = check_numpy_to_torch(points) 28 | angle, _ = check_numpy_to_torch(angle) 29 | 30 | cosa = torch.cos(angle) 31 | sina = torch.sin(angle) 32 | zeros = angle.new_zeros(points.shape[0]) 33 | ones = angle.new_ones(points.shape[0]) 34 | rot_matrix = torch.stack(( 35 | cosa, sina, zeros, 36 | -sina, cosa, zeros, 37 | zeros, zeros, ones 38 | ), dim=1).view(-1, 3, 3).float() 39 | points_rot = torch.matmul(points[:, :, 0:3], rot_matrix) 40 | points_rot = torch.cat((points_rot, points[:, :, 3:]), dim=-1) 41 | return points_rot.numpy() if is_numpy else points_rot 42 | 43 | 44 | def boxes_to_corners_3d(boxes3d): 45 | """ 46 | 7 -------- 4 47 | /| /| 48 | 6 -------- 5 . 49 | | | | | 50 | . 
3 -------- 0 51 | |/ |/ 52 | 2 -------- 1 53 | Args: 54 | boxes3d: (N, 7) [x, y, z, dx, dy, dz, heading], (x, y, z) is the box center 55 | 56 | Returns: 57 | """ 58 | boxes3d, is_numpy = check_numpy_to_torch(boxes3d) 59 | 60 | template = boxes3d.new_tensor(( 61 | [1, 1, -1], [1, -1, -1], [-1, -1, -1], [-1, 1, -1], 62 | [1, 1, 1], [1, -1, 1], [-1, -1, 1], [-1, 1, 1], 63 | )) / 2 64 | 65 | corners3d = boxes3d[:, None, 3:6].repeat(1, 8, 1) * template[None, :, :] 66 | corners3d = rotate_points_along_z(corners3d.view(-1, 8, 3), boxes3d[:, 6]).view(-1, 8, 3) 67 | corners3d += boxes3d[:, None, 0:3] 68 | 69 | return corners3d.numpy() if is_numpy else corners3d 70 | 71 | 72 | def visualize_pts(pts, fig=None, bgcolor=(0, 0, 0), fgcolor=(1.0, 1.0, 1.0), 73 | show_intensity=False, size=(600, 600), draw_origin=True): 74 | if not isinstance(pts, np.ndarray): 75 | pts = pts.cpu().numpy() 76 | if fig is None: 77 | fig = mlab.figure(figure=None, bgcolor=bgcolor, fgcolor=fgcolor, engine=None, size=size) 78 | 79 | if show_intensity: 80 | G = mlab.points3d(pts[:, 0], pts[:, 1], pts[:, 2], pts[:, 3], mode='point', 81 | colormap='gnuplot', scale_factor=1, figure=fig) 82 | else: 83 | G = mlab.points3d(pts[:, 0], pts[:, 1], pts[:, 2], mode='point', 84 | colormap='gnuplot', scale_factor=1, figure=fig) 85 | if draw_origin: 86 | mlab.points3d(0, 0, 0, color=(1, 1, 1), mode='cube', scale_factor=0.2) 87 | mlab.plot3d([0, 3], [0, 0], [0, 0], color=(0, 0, 1), tube_radius=0.1) 88 | mlab.plot3d([0, 0], [0, 3], [0, 0], color=(0, 1, 0), tube_radius=0.1) 89 | mlab.plot3d([0, 0], [0, 0], [0, 3], color=(1, 0, 0), tube_radius=0.1) 90 | 91 | return fig 92 | 93 | 94 | def draw_sphere_pts(pts, color=(0, 1, 0), fig=None, bgcolor=(0, 0, 0), scale_factor=0.2): 95 | if not isinstance(pts, np.ndarray): 96 | pts = pts.cpu().numpy() 97 | 98 | if fig is None: 99 | fig = mlab.figure(figure=None, bgcolor=bgcolor, fgcolor=None, engine=None, size=(600, 600)) 100 | 101 | if isinstance(color, np.ndarray) and color.shape[0] == 1: 102 | color = color[0] 103 | color = (color[0] / 255.0, color[1] / 255.0, color[2] / 255.0) 104 | 105 | if isinstance(color, np.ndarray): 106 | pts_color = np.zeros((pts.__len__(), 4), dtype=np.uint8) 107 | pts_color[:, 0:3] = color 108 | pts_color[:, 3] = 255 109 | G = mlab.points3d(pts[:, 0], pts[:, 1], pts[:, 2], np.arange(0, pts_color.__len__()), mode='sphere', 110 | scale_factor=scale_factor, figure=fig) 111 | G.glyph.color_mode = 'color_by_scalar' 112 | G.glyph.scale_mode = 'scale_by_vector' 113 | G.module_manager.scalar_lut_manager.lut.table = pts_color 114 | else: 115 | mlab.points3d(pts[:, 0], pts[:, 1], pts[:, 2], mode='sphere', color=color, 116 | colormap='gnuplot', scale_factor=scale_factor, figure=fig) 117 | 118 | mlab.points3d(0, 0, 0, color=(1, 1, 1), mode='cube', scale_factor=0.2) 119 | mlab.plot3d([0, 3], [0, 0], [0, 0], color=(0, 0, 1), line_width=3, tube_radius=None, figure=fig) 120 | mlab.plot3d([0, 0], [0, 3], [0, 0], color=(0, 1, 0), line_width=3, tube_radius=None, figure=fig) 121 | mlab.plot3d([0, 0], [0, 0], [0, 3], color=(1, 0, 0), line_width=3, tube_radius=None, figure=fig) 122 | 123 | return fig 124 | 125 | 126 | def draw_grid(x1, y1, x2, y2, fig, tube_radius=None, color=(0.5, 0.5, 0.5)): 127 | mlab.plot3d([x1, x1], [y1, y2], [0, 0], color=color, tube_radius=tube_radius, line_width=1, figure=fig) 128 | mlab.plot3d([x2, x2], [y1, y2], [0, 0], color=color, tube_radius=tube_radius, line_width=1, figure=fig) 129 | mlab.plot3d([x1, x2], [y1, y1], [0, 0], color=color, tube_radius=tube_radius, 
line_width=1, figure=fig) 130 | mlab.plot3d([x1, x2], [y2, y2], [0, 0], color=color, tube_radius=tube_radius, line_width=1, figure=fig) 131 | return fig 132 | 133 | 134 | def draw_multi_grid_range(fig, grid_size=20, bv_range=(-60, -60, 60, 60)): 135 | for x in range(bv_range[0], bv_range[2], grid_size): 136 | for y in range(bv_range[1], bv_range[3], grid_size): 137 | fig = draw_grid(x, y, x + grid_size, y + grid_size, fig) 138 | 139 | return fig 140 | 141 | 142 | def draw_scenes(points, gt_boxes=None, ref_boxes=None, ref_scores=None, ref_labels=None): 143 | if not isinstance(points, np.ndarray): 144 | points = points.cpu().numpy() 145 | if ref_boxes is not None and not isinstance(ref_boxes, np.ndarray): 146 | ref_boxes = ref_boxes.cpu().numpy() 147 | if gt_boxes is not None and not isinstance(gt_boxes, np.ndarray): 148 | gt_boxes = gt_boxes.cpu().numpy() 149 | if ref_scores is not None and not isinstance(ref_scores, np.ndarray): 150 | ref_scores = ref_scores.cpu().numpy() 151 | if ref_labels is not None and not isinstance(ref_labels, np.ndarray): 152 | ref_labels = ref_labels.cpu().numpy() 153 | 154 | fig = visualize_pts(points) 155 | fig = draw_multi_grid_range(fig, bv_range=(0, -40, 80, 40)) 156 | if gt_boxes is not None: 157 | corners3d = boxes_to_corners_3d(gt_boxes) 158 | fig = draw_corners3d(corners3d, fig=fig, color=(0, 0, 1), max_num=100) 159 | 160 | if ref_boxes is not None and len(ref_boxes) > 0: 161 | ref_corners3d = boxes_to_corners_3d(ref_boxes) 162 | if ref_labels is None: 163 | fig = draw_corners3d(ref_corners3d, fig=fig, color=(0, 1, 0), cls=ref_scores, max_num=100) 164 | else: 165 | for k in range(ref_labels.min(), ref_labels.max() + 1): 166 | cur_color = tuple(box_colormap[k % len(box_colormap)]) 167 | mask = (ref_labels == k) 168 | fig = draw_corners3d(ref_corners3d[mask], fig=fig, color=cur_color, cls=ref_scores[mask], max_num=100) 169 | mlab.view(azimuth=-179, elevation=54.0, distance=104.0, roll=90.0) 170 | return fig 171 | 172 | 173 | def draw_corners3d(corners3d, fig, color=(1, 1, 1), line_width=2, cls=None, tag='', max_num=500, tube_radius=None): 174 | """ 175 | :param corners3d: (N, 8, 3) 176 | :param fig: 177 | :param color: 178 | :param line_width: 179 | :param cls: 180 | :param tag: 181 | :param max_num: 182 | :return: 183 | """ 184 | import mayavi.mlab as mlab 185 | num = min(max_num, len(corners3d)) 186 | for n in range(num): 187 | b = corners3d[n] # (8, 3) 188 | 189 | if cls is not None: 190 | if isinstance(cls, np.ndarray): 191 | mlab.text3d(b[6, 0], b[6, 1], b[6, 2], '%.2f' % cls[n], scale=(0.3, 0.3, 0.3), color=color, figure=fig) 192 | else: 193 | mlab.text3d(b[6, 0], b[6, 1], b[6, 2], '%s' % cls[n], scale=(0.3, 0.3, 0.3), color=color, figure=fig) 194 | 195 | for k in range(0, 4): 196 | i, j = k, (k + 1) % 4 197 | mlab.plot3d([b[i, 0], b[j, 0]], [b[i, 1], b[j, 1]], [b[i, 2], b[j, 2]], color=color, 198 | tube_radius=tube_radius, 199 | line_width=line_width, figure=fig) 200 | 201 | i, j = k + 4, (k + 1) % 4 + 4 202 | mlab.plot3d([b[i, 0], b[j, 0]], [b[i, 1], b[j, 1]], [b[i, 2], b[j, 2]], color=color, 203 | tube_radius=tube_radius, 204 | line_width=line_width, figure=fig) 205 | 206 | i, j = k, k + 4 207 | mlab.plot3d([b[i, 0], b[j, 0]], [b[i, 1], b[j, 1]], [b[i, 2], b[j, 2]], color=color, 208 | tube_radius=tube_radius, 209 | line_width=line_width, figure=fig) 210 | 211 | i, j = 0, 5 212 | mlab.plot3d([b[i, 0], b[j, 0]], [b[i, 1], b[j, 1]], [b[i, 2], b[j, 2]], color=color, tube_radius=tube_radius, 213 | line_width=line_width, figure=fig) 214 | i, j = 1, 4 215 
| mlab.plot3d([b[i, 0], b[j, 0]], [b[i, 1], b[j, 1]], [b[i, 2], b[j, 2]], color=color, tube_radius=tube_radius, 216 | line_width=line_width, figure=fig) 217 | 218 | return fig 219 | -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/models/necks/cp_fpn.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from mmcv.cnn import ConvModule 5 | from mmcv.runner import BaseModule, auto_fp16 6 | 7 | from mmdet.models import NECKS 8 | 9 | 10 | ####This FPN remove the unused parameters which can used with checkpoint (with_cp = True in Backbone) 11 | @NECKS.register_module() 12 | class CPFPN(BaseModule): 13 | r"""Feature Pyramid Network. 14 | 15 | This is an implementation of paper `Feature Pyramid Networks for Object 16 | Detection `_. 17 | 18 | Args: 19 | in_channels (List[int]): Number of input channels per scale. 20 | out_channels (int): Number of output channels (used at each scale) 21 | num_outs (int): Number of output scales. 22 | start_level (int): Index of the start input backbone level used to 23 | build the feature pyramid. Default: 0. 24 | end_level (int): Index of the end input backbone level (exclusive) to 25 | build the feature pyramid. Default: -1, which means the last level. 26 | add_extra_convs (bool | str): If bool, it decides whether to add conv 27 | layers on top of the original feature maps. Default to False. 28 | If True, it is equivalent to `add_extra_convs='on_input'`. 29 | If str, it specifies the source feature map of the extra convs. 30 | Only the following options are allowed 31 | 32 | - 'on_input': Last feat map of neck inputs (i.e. backbone feature). 33 | - 'on_lateral': Last feature map after lateral convs. 34 | - 'on_output': The last output feature map after fpn convs. 35 | relu_before_extra_convs (bool): Whether to apply relu before the extra 36 | conv. Default: False. 37 | no_norm_on_lateral (bool): Whether to apply norm on lateral. 38 | Default: False. 39 | conv_cfg (dict): Config dict for convolution layer. Default: None. 40 | norm_cfg (dict): Config dict for normalization layer. Default: None. 41 | act_cfg (str): Config dict for activation layer in ConvModule. 42 | Default: None. 43 | upsample_cfg (dict): Config dict for interpolate layer. 44 | Default: `dict(mode='nearest')` 45 | init_cfg (dict or list[dict], optional): Initialization config dict. 46 | 47 | Example: 48 | >>> import torch 49 | >>> in_channels = [2, 3, 5, 7] 50 | >>> scales = [340, 170, 84, 43] 51 | >>> inputs = [torch.rand(1, c, s, s) 52 | ... for c, s in zip(in_channels, scales)] 53 | >>> self = FPN(in_channels, 11, len(in_channels)).eval() 54 | >>> outputs = self.forward(inputs) 55 | >>> for i in range(len(outputs)): 56 | ... 
print(f'outputs[{i}].shape = {outputs[i].shape}') 57 | outputs[0].shape = torch.Size([1, 11, 340, 340]) 58 | outputs[1].shape = torch.Size([1, 11, 170, 170]) 59 | outputs[2].shape = torch.Size([1, 11, 84, 84]) 60 | outputs[3].shape = torch.Size([1, 11, 43, 43]) 61 | """ 62 | 63 | def __init__(self, 64 | in_channels, 65 | out_channels, 66 | num_outs, 67 | start_level=0, 68 | end_level=-1, 69 | add_extra_convs=False, 70 | relu_before_extra_convs=False, 71 | no_norm_on_lateral=False, 72 | conv_cfg=None, 73 | norm_cfg=None, 74 | act_cfg=None, 75 | upsample_cfg=dict(mode='nearest'), 76 | init_cfg=dict( 77 | type='Xavier', layer='Conv2d', distribution='uniform')): 78 | super(CPFPN, self).__init__(init_cfg) 79 | assert isinstance(in_channels, list) 80 | self.in_channels = in_channels 81 | self.out_channels = out_channels 82 | self.num_ins = len(in_channels) 83 | self.num_outs = num_outs 84 | self.relu_before_extra_convs = relu_before_extra_convs 85 | self.no_norm_on_lateral = no_norm_on_lateral 86 | self.fp16_enabled = False 87 | self.upsample_cfg = upsample_cfg.copy() 88 | 89 | if end_level == -1: 90 | self.backbone_end_level = self.num_ins 91 | assert num_outs >= self.num_ins - start_level 92 | else: 93 | # if end_level < inputs, no extra level is allowed 94 | self.backbone_end_level = end_level 95 | assert end_level <= len(in_channels) 96 | assert num_outs == end_level - start_level 97 | self.start_level = start_level 98 | self.end_level = end_level 99 | self.add_extra_convs = add_extra_convs 100 | assert isinstance(add_extra_convs, (str, bool)) 101 | if isinstance(add_extra_convs, str): 102 | # Extra_convs_source choices: 'on_input', 'on_lateral', 'on_output' 103 | assert add_extra_convs in ('on_input', 'on_lateral', 'on_output') 104 | elif add_extra_convs: # True 105 | self.add_extra_convs = 'on_input' 106 | 107 | self.lateral_convs = nn.ModuleList() 108 | self.fpn_convs = nn.ModuleList() 109 | 110 | for i in range(self.start_level, self.backbone_end_level): 111 | l_conv = ConvModule( 112 | in_channels[i], 113 | out_channels, 114 | 1, 115 | conv_cfg=conv_cfg, 116 | norm_cfg=norm_cfg if not self.no_norm_on_lateral else None, 117 | act_cfg=act_cfg, 118 | inplace=False) 119 | self.lateral_convs.append(l_conv) 120 | if i == 0: 121 | fpn_conv = ConvModule( 122 | out_channels, 123 | out_channels, 124 | 3, 125 | padding=1, 126 | conv_cfg=conv_cfg, 127 | norm_cfg=norm_cfg, 128 | act_cfg=act_cfg, 129 | inplace=False) 130 | self.fpn_convs.append(fpn_conv) 131 | 132 | # add extra conv layers (e.g., RetinaNet) 133 | extra_levels = num_outs - self.backbone_end_level + self.start_level 134 | if self.add_extra_convs and extra_levels >= 1: 135 | for i in range(extra_levels): 136 | if i == 0 and self.add_extra_convs == 'on_input': 137 | in_channels = self.in_channels[self.backbone_end_level - 1] 138 | else: 139 | in_channels = out_channels 140 | extra_fpn_conv = ConvModule( 141 | in_channels, 142 | out_channels, 143 | 3, 144 | stride=2, 145 | padding=1, 146 | conv_cfg=conv_cfg, 147 | norm_cfg=norm_cfg, 148 | act_cfg=act_cfg, 149 | inplace=False) 150 | self.fpn_convs.append(extra_fpn_conv) 151 | 152 | @auto_fp16() 153 | def forward(self, inputs): 154 | """Forward function.""" 155 | assert len(inputs) == len(self.in_channels) 156 | 157 | # build laterals 158 | laterals = [ 159 | lateral_conv(inputs[i + self.start_level]) 160 | for i, lateral_conv in enumerate(self.lateral_convs) 161 | ] 162 | 163 | # build top-down path 164 | used_backbone_levels = len(laterals) 165 | for i in range(used_backbone_levels - 1, 
0, -1): 166 | # In some cases, fixing `scale factor` (e.g. 2) is preferred, but 167 | # it cannot co-exist with `size` in `F.interpolate`. 168 | if 'scale_factor' in self.upsample_cfg: 169 | laterals[i - 1] += F.interpolate(laterals[i], 170 | **self.upsample_cfg) 171 | else: 172 | prev_shape = laterals[i - 1].shape[2:] 173 | laterals[i - 1] += F.interpolate( 174 | laterals[i], size=prev_shape, **self.upsample_cfg) 175 | 176 | # build outputs 177 | # part 1: from original levels 178 | outs = [ 179 | self.fpn_convs[i](laterals[i]) if i == 0 else laterals[i] for i in range(used_backbone_levels) 180 | ] 181 | # part 2: add extra levels 182 | if self.num_outs > len(outs): 183 | # use max pool to get more levels on top of outputs 184 | # (e.g., Faster R-CNN, Mask R-CNN) 185 | if not self.add_extra_convs: 186 | for i in range(self.num_outs - used_backbone_levels): 187 | outs.append(F.max_pool2d(outs[-1], 1, stride=2)) 188 | # add conv layers on top of original feature maps (RetinaNet) 189 | else: 190 | if self.add_extra_convs == 'on_input': 191 | extra_source = inputs[self.backbone_end_level - 1] 192 | elif self.add_extra_convs == 'on_lateral': 193 | extra_source = laterals[-1] 194 | elif self.add_extra_convs == 'on_output': 195 | extra_source = outs[-1] 196 | else: 197 | raise NotImplementedError 198 | outs.append(self.fpn_convs[used_backbone_levels](extra_source)) 199 | for i in range(used_backbone_levels + 1, self.num_outs): 200 | if self.relu_before_extra_convs: 201 | outs.append(self.fpn_convs[i](F.relu(outs[-1]))) 202 | else: 203 | outs.append(self.fpn_convs[i](outs[-1])) 204 | return tuple(outs) 205 | -------------------------------------------------------------------------------- /tools/data_converter/create_gt_database.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Copyright (c) 2023 megvii-model. All Rights Reserved. 3 | # ------------------------------------------------------------------------ 4 | # Modified from UVTR (https://github.com/dvlab-research/UVTR) 5 | # Copyright (c) 2022 Li, Yanwei 6 | # ------------------------------------------------------------------------ 7 | # Modified from mmdetection3d (https://github.com/open-mmlab/mmdetection3d) 8 | # Copyright (c) OpenMMLab. All rights reserved. 9 | # ------------------------------------------------------------------------ 10 | 11 | import pickle 12 | from os import path as osp 13 | 14 | import mmcv 15 | import numpy as np 16 | from mmcv import track_iter_progress 17 | from mmdet3d.core.bbox import box_np_ops as box_np_ops 18 | from mmdet3d.datasets import build_dataset 19 | 20 | 21 | def create_groundtruth_database(dataset_class_name, 22 | data_path, 23 | info_prefix, 24 | info_path=None, 25 | mask_anno_path=None, 26 | used_classes=None, 27 | database_save_path=None, 28 | db_info_save_path=None, 29 | relative_path=True, 30 | add_rgb=False, 31 | lidar_only=False, 32 | bev_only=False, 33 | coors_range=None, 34 | with_mask=False): 35 | """Given the raw data, generate the ground truth database. 36 | 37 | Args: 38 | dataset_class_name (str): Name of the input dataset. 39 | data_path (str): Path of the data. 40 | info_prefix (str): Prefix of the info file. 41 | info_path (str, optional): Path of the info file. 42 | Default: None. 43 | mask_anno_path (str, optional): Path of the mask_anno. 44 | Default: None. 45 | used_classes (list[str], optional): Classes have been used. 46 | Default: None. 
47 | database_save_path (str, optional): Path to save database. 48 | Default: None. 49 | db_info_save_path (str, optional): Path to save db_info. 50 | Default: None. 51 | relative_path (bool, optional): Whether to use relative path. 52 | Default: True. 53 | with_mask (bool, optional): Whether to use mask. 54 | Default: False. 55 | """ 56 | print(f'Create GT Database of {dataset_class_name}') 57 | dataset_cfg = dict( 58 | type=dataset_class_name, data_root=data_path, ann_file=info_path, return_gt_info=True) 59 | 60 | if dataset_class_name == 'CustomNuScenesDataset': 61 | dataset_cfg.update( 62 | use_valid_flag=True, 63 | pipeline=[ 64 | dict( 65 | type='LoadPointsFromFile', 66 | coord_type='LIDAR', 67 | load_dim=5, 68 | use_dim=5), 69 | dict( 70 | type='LoadPointsFromMultiSweeps', 71 | sweeps_num=10, 72 | use_dim=[0, 1, 2, 3, 4], 73 | pad_empty_sweeps=True, 74 | remove_close=True), 75 | dict( 76 | type='LoadAnnotations3D', 77 | with_bbox_3d=True, 78 | with_label_3d=True) 79 | ]) 80 | 81 | dataset = build_dataset(dataset_cfg) 82 | 83 | if database_save_path is None: 84 | database_save_path = osp.join(data_path, f'{info_prefix}_gt_database') 85 | if db_info_save_path is None: 86 | db_info_save_path = osp.join(data_path, 87 | f'{info_prefix}_dbinfos_train.pkl') 88 | 89 | database_pts_path = osp.join(database_save_path, 'pts_dir') 90 | database_img_path = osp.join(database_save_path, 'img_dir') 91 | mmcv.mkdir_or_exist(database_save_path) 92 | mmcv.mkdir_or_exist(database_pts_path) 93 | mmcv.mkdir_or_exist(database_img_path) 94 | all_db_infos = dict() 95 | 96 | group_counter = 0 97 | for j in track_iter_progress(list(range(len(dataset)))): 98 | input_dict = dataset.get_data_info(j) 99 | dataset.pre_pipeline(input_dict) 100 | example = dataset.pipeline(input_dict) 101 | annos = example['ann_info'] 102 | image_idx = example['sample_idx'] 103 | points = example['points'].tensor.numpy() 104 | gt_boxes_3d = annos['gt_bboxes_3d'].tensor.numpy() 105 | names = annos['gt_names'] 106 | group_dict = dict() 107 | if 'group_ids' in annos: 108 | group_ids = annos['group_ids'] 109 | else: 110 | group_ids = np.arange(gt_boxes_3d.shape[0], dtype=np.int64) 111 | difficulty = np.zeros(gt_boxes_3d.shape[0], dtype=np.int32) 112 | if 'difficulty' in annos: 113 | difficulty = annos['difficulty'] 114 | 115 | num_obj = gt_boxes_3d.shape[0] 116 | point_indices = box_np_ops.points_in_rbbox(points, gt_boxes_3d) 117 | 118 | # load multi-view image 119 | input_img = {} 120 | input_info = {} 121 | for _cam in example['info']['cams']: 122 | cam_info = example['info']['cams'][_cam] 123 | _path = cam_info['data_path'] 124 | _img = mmcv.imread(_path, 'unchanged') 125 | input_img[_cam] = _img 126 | 127 | # obtain lidar to image transformation matrix 128 | lidar2cam_r = np.linalg.inv(cam_info['sensor2lidar_rotation']) 129 | lidar2cam_t = cam_info[ 130 | 'sensor2lidar_translation'] @ lidar2cam_r.T 131 | lidar2cam_rt = np.eye(4) 132 | lidar2cam_rt[:3, :3] = lidar2cam_r.T 133 | lidar2cam_rt[3, :3] = -lidar2cam_t 134 | intrinsic = cam_info['cam_intrinsic'] 135 | viewpad = np.eye(4) 136 | viewpad[:intrinsic.shape[0], :intrinsic.shape[1]] = intrinsic 137 | lidar2img_rt = (viewpad @ lidar2cam_rt.T) 138 | 139 | input_info[_cam] = { 140 | 'lidar2img': lidar2img_rt, 141 | 'lidar2cam': lidar2cam_rt, 142 | 'cam_intrinsic': viewpad} 143 | 144 | for i in range(num_obj): 145 | pts_filename = f'{image_idx}_{names[i]}_{i}.bin' 146 | img_filename = f'{image_idx}_{names[i]}_{i}.png' 147 | abs_filepath = osp.join(database_pts_path, pts_filename) 148 
| abs_img_filepath = osp.join(database_img_path, img_filename) 149 | rel_filepath = osp.join(f'{info_prefix}_gt_database', 'pts_dir', pts_filename) 150 | rel_img_filepath = osp.join(f'{info_prefix}_gt_database', 'img_dir', img_filename) 151 | 152 | # save point clouds and image patches for each object 153 | gt_points = points[point_indices[:, i]] 154 | gt_points[:, :3] -= gt_boxes_3d[i, :3] 155 | 156 | with open(abs_filepath, 'w') as f: 157 | gt_points.tofile(f) 158 | 159 | img_crop, crop_key, crop_depth = find_img_crop(annos['gt_bboxes_3d'][i].corners.numpy(), input_img, 160 | input_info, points[point_indices[:, i]]) 161 | if img_crop is not None: 162 | mmcv.imwrite(img_crop, abs_img_filepath) 163 | 164 | if (used_classes is None) or names[i] in used_classes: 165 | db_info = { 166 | 'name': names[i], 167 | 'path': rel_filepath, 168 | 'image_idx': image_idx, 169 | 'image_path': rel_img_filepath if img_crop is not None else '', 170 | 'image_crop_key': crop_key if img_crop is not None else '', 171 | 'image_crop_depth': crop_depth, 172 | 'gt_idx': i, 173 | 'box3d_lidar': gt_boxes_3d[i], 174 | 'num_points_in_gt': gt_points.shape[0], 175 | 'difficulty': difficulty[i], 176 | } 177 | local_group_id = group_ids[i] 178 | # if local_group_id >= 0: 179 | if local_group_id not in group_dict: 180 | group_dict[local_group_id] = group_counter 181 | group_counter += 1 182 | db_info['group_id'] = group_dict[local_group_id] 183 | if 'score' in annos: 184 | db_info['score'] = annos['score'][i] 185 | if names[i] in all_db_infos: 186 | all_db_infos[names[i]].append(db_info) 187 | else: 188 | all_db_infos[names[i]] = [db_info] 189 | 190 | for k, v in all_db_infos.items(): 191 | print(f'load {len(v)} {k} database infos') 192 | 193 | with open(db_info_save_path, 'wb') as f: 194 | pickle.dump(all_db_infos, f) 195 | 196 | 197 | def find_img_crop(gt_boxes_3d, input_img, input_info, points): 198 | coord_3d = np.concatenate([gt_boxes_3d, np.ones_like(gt_boxes_3d[..., :1])], -1) 199 | coord_3d = coord_3d.squeeze(0) 200 | max_crop, crop_key = None, None 201 | crop_area, crop_depth = 0, 0 202 | 203 | for _key in input_img: 204 | lidar2img = np.array(input_info[_key]['lidar2img']) 205 | coord_img = coord_3d @ lidar2img.T 206 | coord_img[:, :2] /= coord_img[:, 2, None] 207 | image_shape = input_img[_key].shape 208 | if (coord_img[2] <= 0).any(): 209 | continue 210 | 211 | avg_depth = coord_img[:, 2].mean() 212 | minxy = np.min(coord_img[:, :2], axis=-2) 213 | maxxy = np.max(coord_img[:, :2], axis=-2) 214 | bbox = np.concatenate([minxy, maxxy], axis=-1) 215 | bbox[0::2] = np.clip(bbox[0::2], a_min=0, a_max=image_shape[1] - 1) 216 | bbox[1::2] = np.clip(bbox[1::2], a_min=0, a_max=image_shape[0] - 1) 217 | bbox = bbox.astype(int) 218 | if ((bbox[2:] - bbox[:2]) <= 10).any(): 219 | continue 220 | 221 | img_crop = input_img[_key][bbox[1]:bbox[3], bbox[0]:bbox[2]] 222 | if img_crop.shape[0] * img_crop.shape[1] > crop_area: 223 | max_crop = img_crop 224 | crop_key = _key 225 | crop_depth = avg_depth 226 | 227 | return max_crop, crop_key, crop_depth 228 | -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/models/detectors/meformer.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Copyright (c) 2022 megvii-model. All Rights Reserved. 
3 | # ------------------------------------------------------------------------ 4 | # Modified from mmdetection3d (https://github.com/open-mmlab/mmdetection3d) 5 | # Copyright (c) OpenMMLab. All rights reserved. 6 | # ------------------------------------------------------------------------ 7 | import torch 8 | import torch.nn.functional as F 9 | from mmcv.runner import force_fp32, auto_fp16 10 | from mmdet.models import DETECTORS 11 | from mmdet3d.core import bbox3d2result 12 | from mmdet3d.models.detectors.mvx_two_stage import MVXTwoStageDetector 13 | 14 | from projects.mmdet3d_plugin import SPConvVoxelization 15 | from projects.mmdet3d_plugin.models.utils.grid_mask import GridMask 16 | 17 | 18 | @DETECTORS.register_module() 19 | class MEFormerDetector(MVXTwoStageDetector): 20 | def __init__(self, 21 | use_grid_mask=False, 22 | **kwargs): 23 | pts_voxel_cfg = kwargs.get('pts_voxel_layer', None) 24 | kwargs['pts_voxel_layer'] = None 25 | super(MEFormerDetector, self).__init__(**kwargs) 26 | 27 | self.use_grid_mask = use_grid_mask 28 | self.grid_mask = GridMask(True, True, rotate=1, offset=False, ratio=0.5, mode=1, prob=0.7) 29 | if pts_voxel_cfg: 30 | self.pts_voxel_layer = SPConvVoxelization(**pts_voxel_cfg) 31 | 32 | def init_weights(self): 33 | """Initialize model weights.""" 34 | super(MEFormerDetector, self).init_weights() 35 | 36 | @auto_fp16(apply_to=('img'), out_fp32=True) 37 | def extract_img_feat(self, img, img_metas): 38 | """Extract features of images.""" 39 | if self.with_img_backbone and img is not None: 40 | input_shape = img.shape[-2:] 41 | # update real input shape of each single img 42 | for img_meta in img_metas: 43 | img_meta.update(input_shape=input_shape) 44 | 45 | if img.dim() == 5 and img.size(0) == 1: 46 | img.squeeze_(0) 47 | elif img.dim() == 5 and img.size(0) > 1: 48 | B, N, C, H, W = img.size() 49 | img = img.view(B * N, C, H, W) 50 | if self.use_grid_mask: 51 | img = self.grid_mask(img) 52 | img_feats = self.img_backbone(img.float()) 53 | if isinstance(img_feats, dict): 54 | img_feats = list(img_feats.values()) 55 | else: 56 | return None 57 | if self.with_img_neck: 58 | img_feats = self.img_neck(img_feats) 59 | return img_feats 60 | 61 | @force_fp32(apply_to=('pts', 'img_feats')) 62 | def extract_pts_feat(self, pts, img_feats, img_metas): 63 | """Extract features of points.""" 64 | if not self.with_pts_bbox: 65 | return None 66 | if pts is None: 67 | return None 68 | voxels, num_points, coors = self.voxelize(pts) 69 | voxel_features = self.pts_voxel_encoder(voxels, num_points, coors) 70 | batch_size = coors[-1, 0] + 1 71 | x = self.pts_middle_encoder(voxel_features, coors, batch_size) 72 | x = self.pts_backbone(x) 73 | if self.with_pts_neck: 74 | x = self.pts_neck(x) 75 | return x 76 | 77 | @torch.no_grad() 78 | @force_fp32() 79 | def voxelize(self, points): 80 | """Apply dynamic voxelization to points. 81 | 82 | Args: 83 | points (list[torch.Tensor]): Points of each sample. 84 | 85 | Returns: 86 | tuple[torch.Tensor]: Concatenated points, number of points 87 | per voxel, and coordinates. 
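                ``coors`` has the per-sample batch index prepended as its first column
                (via the ``F.pad`` call below); with ``M`` voxels in the batch, the returned
                tensors are typically (M, max_points, point_dim), (M,) and (M, 4).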
88 | """ 89 | voxels, coors, num_points = [], [], [] 90 | for res in points: 91 | res_voxels, res_coors, res_num_points = self.pts_voxel_layer(res) 92 | voxels.append(res_voxels) 93 | coors.append(res_coors) 94 | num_points.append(res_num_points) 95 | voxels = torch.cat(voxels, dim=0) 96 | num_points = torch.cat(num_points, dim=0) 97 | coors_batch = [] 98 | for i, coor in enumerate(coors): 99 | coor_pad = F.pad(coor, (1, 0), mode='constant', value=i) 100 | coors_batch.append(coor_pad) 101 | coors_batch = torch.cat(coors_batch, dim=0) 102 | return voxels, num_points, coors_batch 103 | 104 | def forward_train(self, 105 | points=None, 106 | img_metas=None, 107 | gt_bboxes_3d=None, 108 | gt_labels_3d=None, 109 | gt_labels=None, 110 | gt_bboxes=None, 111 | img=None, 112 | proposals=None, 113 | gt_bboxes_ignore=None): 114 | """Forward training function. 115 | 116 | Args: 117 | points (list[torch.Tensor], optional): Points of each sample. 118 | Defaults to None. 119 | img_metas (list[dict], optional): Meta information of each sample. 120 | Defaults to None. 121 | gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`], optional): 122 | Ground truth 3D boxes. Defaults to None. 123 | gt_labels_3d (list[torch.Tensor], optional): Ground truth labels 124 | of 3D boxes. Defaults to None. 125 | gt_labels (list[torch.Tensor], optional): Ground truth labels 126 | of 2D boxes in images. Defaults to None. 127 | gt_bboxes (list[torch.Tensor], optional): Ground truth 2D boxes in 128 | images. Defaults to None. 129 | img (torch.Tensor optional): Images of each sample with shape 130 | (N, C, H, W). Defaults to None. 131 | proposals ([list[torch.Tensor], optional): Predicted proposals 132 | used for training Fast RCNN. Defaults to None. 133 | gt_bboxes_ignore (list[torch.Tensor], optional): Ground truth 134 | 2D boxes in images to be ignored. Defaults to None. 135 | 136 | Returns: 137 | dict: Losses of different branches. 138 | """ 139 | 140 | img_feats, pts_feats = self.extract_feat( 141 | points, img=img, img_metas=img_metas) 142 | losses = dict() 143 | if pts_feats or img_feats: 144 | losses_pts = self.forward_pts_train( 145 | pts_feats, img_feats, gt_bboxes_3d, gt_labels_3d, img_metas, gt_bboxes_ignore 146 | ) 147 | losses.update(losses_pts) 148 | return losses 149 | 150 | @force_fp32(apply_to=('pts_feats', 'img_feats')) 151 | def forward_pts_train(self, 152 | pts_feats, 153 | img_feats, 154 | gt_bboxes_3d, 155 | gt_labels_3d, 156 | img_metas, 157 | gt_bboxes_ignore=None): 158 | """Forward function for point cloud branch. 159 | 160 | Args: 161 | pts_feats (list[torch.Tensor]): Features of point cloud branch 162 | gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth 163 | boxes for each sample. 164 | gt_labels_3d (list[torch.Tensor]): Ground truth labels for 165 | boxes of each sampole 166 | img_metas (list[dict]): Meta information of samples. 167 | gt_bboxes_ignore (list[torch.Tensor], optional): Ground truth 168 | boxes to be ignored. Defaults to None. 169 | 170 | Returns: 171 | dict: Losses of each branch. 
172 | """ 173 | if pts_feats is None: 174 | pts_feats = [None] 175 | if img_feats is None: 176 | img_feats = [None] 177 | outs = self.pts_bbox_head(pts_feats, img_feats, img_metas) 178 | loss_inputs = [gt_bboxes_3d, gt_labels_3d, outs] 179 | losses = self.pts_bbox_head.loss(*loss_inputs) 180 | return losses 181 | 182 | def forward_test(self, 183 | points=None, 184 | img_metas=None, 185 | img=None, **kwargs): 186 | """ 187 | Args: 188 | points (list[torch.Tensor]): the outer list indicates test-time 189 | augmentations and inner torch.Tensor should have a shape NxC, 190 | which contains all points in the batch. 191 | img_metas (list[list[dict]]): the outer list indicates test-time 192 | augs (multiscale, flip, etc.) and the inner list indicates 193 | images in a batch 194 | img (list[torch.Tensor], optional): the outer 195 | list indicates test-time augmentations and inner 196 | torch.Tensor should have a shape NxCxHxW, which contains 197 | all images in the batch. Defaults to None. 198 | """ 199 | if points is None: 200 | points = [None] 201 | if img is None: 202 | img = [None] 203 | for var, name in [(points, 'points'), (img, 'img'), (img_metas, 'img_metas')]: 204 | if not isinstance(var, list): 205 | raise TypeError('{} must be a list, but got {}'.format( 206 | name, type(var))) 207 | 208 | return self.simple_test(points[0], img_metas[0], img[0], **kwargs) 209 | 210 | @force_fp32(apply_to=('x', 'x_img')) 211 | def simple_test_pts(self, x, x_img, img_metas, rescale=False): 212 | """Test function of point cloud branch.""" 213 | outs = self.pts_bbox_head(x, x_img, img_metas) 214 | bbox_list = self.pts_bbox_head.get_bboxes( 215 | outs, img_metas, rescale=rescale) 216 | bbox_results = [ 217 | bbox3d2result(bboxes, scores, labels) 218 | for bboxes, scores, labels in bbox_list 219 | ] 220 | return bbox_results 221 | 222 | def simple_test(self, points, img_metas, img=None, rescale=False): 223 | img_feats, pts_feats = self.extract_feat( 224 | points, img=img, img_metas=img_metas) 225 | if pts_feats is None: 226 | pts_feats = [None] 227 | if img_feats is None: 228 | img_feats = [None] 229 | 230 | bbox_list = [dict() for i in range(len(img_metas))] 231 | if (pts_feats or img_feats) and self.with_pts_bbox: 232 | bbox_pts = self.simple_test_pts( 233 | pts_feats, img_feats, img_metas, rescale=rescale) 234 | for result_dict, pts_bbox in zip(bbox_list, bbox_pts): 235 | result_dict['pts_bbox'] = pts_bbox 236 | if img_feats and self.with_img_bbox: 237 | bbox_img = self.simple_test_img( 238 | img_feats, img_metas, rescale=rescale) 239 | for result_dict, img_bbox in zip(bbox_list, bbox_img): 240 | result_dict['img_bbox'] = img_bbox 241 | return bbox_list 242 | -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/models/detectors/mome.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Copyright (c) 2022 megvii-model. All Rights Reserved. 3 | # ------------------------------------------------------------------------ 4 | # Modified from mmdetection3d (https://github.com/open-mmlab/mmdetection3d) 5 | # Copyright (c) OpenMMLab. All rights reserved. 
6 | # ------------------------------------------------------------------------ 7 | 8 | import mmcv 9 | import copy 10 | import torch 11 | import torch.nn as nn 12 | import torch.nn.functional as F 13 | import numpy as np 14 | 15 | from mmcv.runner import force_fp32, auto_fp16 16 | from mmdet.core import multi_apply 17 | from mmdet.models import DETECTORS 18 | from mmdet.models.builder import build_backbone 19 | from mmdet3d.core import (Box3DMode, Coord3DMode, bbox3d2result, 20 | merge_aug_bboxes_3d, show_result) 21 | from mmdet3d.models.detectors.mvx_two_stage import MVXTwoStageDetector 22 | 23 | from projects.mmdet3d_plugin.models.utils.grid_mask import GridMask 24 | from projects.mmdet3d_plugin import SPConvVoxelization 25 | 26 | 27 | @DETECTORS.register_module() 28 | class MoME(MVXTwoStageDetector): 29 | 30 | def __init__(self, 31 | use_grid_mask=False, 32 | **kwargs): 33 | pts_voxel_cfg = kwargs.get('pts_voxel_layer', None) 34 | kwargs['pts_voxel_layer'] = None 35 | super(MoME, self).__init__(**kwargs) 36 | 37 | self.use_grid_mask = use_grid_mask 38 | self.grid_mask = GridMask(True, True, rotate=1, offset=False, ratio=0.5, mode=1, prob=0.7) 39 | if pts_voxel_cfg: 40 | self.pts_voxel_layer = SPConvVoxelization(**pts_voxel_cfg) 41 | 42 | def init_weights(self): 43 | """Initialize model weights.""" 44 | super(MoME, self).init_weights() 45 | 46 | @auto_fp16(apply_to=('img'), out_fp32=True) 47 | def extract_img_feat(self, img, img_metas): 48 | """Extract features of images.""" 49 | if self.with_img_backbone and img is not None: 50 | input_shape = img.shape[-2:] 51 | # update real input shape of each single img 52 | for img_meta in img_metas: 53 | img_meta.update(input_shape=input_shape) 54 | 55 | if img.dim() == 5 and img.size(0) == 1: 56 | img.squeeze_(0) 57 | elif img.dim() == 5 and img.size(0) > 1: 58 | B, N, C, H, W = img.size() 59 | img = img.view(B * N, C, H, W) 60 | if self.use_grid_mask: 61 | img = self.grid_mask(img) 62 | img_feats = self.img_backbone(img.float()) 63 | if isinstance(img_feats, dict): 64 | img_feats = list(img_feats.values()) 65 | else: 66 | return None 67 | if self.with_img_neck: 68 | img_feats = self.img_neck(img_feats) 69 | return img_feats 70 | 71 | @force_fp32(apply_to=('pts', 'img_feats')) 72 | def extract_pts_feat(self, pts, img_feats, img_metas): 73 | """Extract features of points.""" 74 | if not self.with_pts_bbox: 75 | return None 76 | if pts is None: 77 | return None 78 | voxels, num_points, coors = self.voxelize(pts) 79 | voxel_features = self.pts_voxel_encoder(voxels, num_points, coors, 80 | ) 81 | batch_size = coors[-1, 0] + 1 82 | x = self.pts_middle_encoder(voxel_features, coors, batch_size) 83 | x = self.pts_backbone(x) 84 | if self.with_pts_neck: 85 | x = self.pts_neck(x) 86 | return x 87 | 88 | @torch.no_grad() 89 | @force_fp32() 90 | def voxelize(self, points): 91 | """Apply dynamic voxelization to points. 92 | 93 | Args: 94 | points (list[torch.Tensor]): Points of each sample. 95 | 96 | Returns: 97 | tuple[torch.Tensor]: Concatenated points, number of points 98 | per voxel, and coordinates. 
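Like MEFormerDetector, MoME leaves extract_feat to the MVXTwoStageDetector base class, which chains the two branch extractors overridden above and returns them as an (img_feats, pts_feats) pair. A stub sketch of that contract follows; the class and tensor shapes are made up for illustration, and the real method lives in mmdet3d:

import torch

class TinyFusionDetector:
    # Stub mirroring the (img_feats, pts_feats) contract of MVXTwoStageDetector.

    def extract_img_feat(self, img, img_metas):
        return None if img is None else [torch.randn(1, 256, 20, 50)]

    def extract_pts_feat(self, pts, img_feats, img_metas):
        return None if pts is None else [torch.randn(1, 512, 180, 180)]

    def extract_feat(self, points, img, img_metas):
        img_feats = self.extract_img_feat(img, img_metas)
        pts_feats = self.extract_pts_feat(points, img_feats, img_metas)
        return img_feats, pts_feats

det = TinyFusionDetector()
img_feats, pts_feats = det.extract_feat(points=None, img=torch.zeros(1), img_metas=[{}])
print(img_feats is not None, pts_feats)  # True None -> camera-only sample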
99 | """ 100 | voxels, coors, num_points = [], [], [] 101 | for res in points: 102 | res_voxels, res_coors, res_num_points = self.pts_voxel_layer(res) 103 | voxels.append(res_voxels) 104 | coors.append(res_coors) 105 | num_points.append(res_num_points) 106 | voxels = torch.cat(voxels, dim=0) 107 | num_points = torch.cat(num_points, dim=0) 108 | coors_batch = [] 109 | for i, coor in enumerate(coors): 110 | coor_pad = F.pad(coor, (1, 0), mode='constant', value=i) 111 | coors_batch.append(coor_pad) 112 | coors_batch = torch.cat(coors_batch, dim=0) 113 | return voxels, num_points, coors_batch 114 | 115 | def forward_train(self, 116 | points=None, 117 | img_metas=None, 118 | gt_bboxes_3d=None, 119 | gt_labels_3d=None, 120 | gt_labels=None, 121 | gt_bboxes=None, 122 | img=None, 123 | proposals=None, 124 | gt_bboxes_ignore=None): 125 | """Forward training function. 126 | 127 | Args: 128 | points (list[torch.Tensor], optional): Points of each sample. 129 | Defaults to None. 130 | img_metas (list[dict], optional): Meta information of each sample. 131 | Defaults to None. 132 | gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`], optional): 133 | Ground truth 3D boxes. Defaults to None. 134 | gt_labels_3d (list[torch.Tensor], optional): Ground truth labels 135 | of 3D boxes. Defaults to None. 136 | gt_labels (list[torch.Tensor], optional): Ground truth labels 137 | of 2D boxes in images. Defaults to None. 138 | gt_bboxes (list[torch.Tensor], optional): Ground truth 2D boxes in 139 | images. Defaults to None. 140 | img (torch.Tensor optional): Images of each sample with shape 141 | (N, C, H, W). Defaults to None. 142 | proposals ([list[torch.Tensor], optional): Predicted proposals 143 | used for training Fast RCNN. Defaults to None. 144 | gt_bboxes_ignore (list[torch.Tensor], optional): Ground truth 145 | 2D boxes in images to be ignored. Defaults to None. 146 | 147 | Returns: 148 | dict: Losses of different branches. 149 | """ 150 | 151 | img_feats, pts_feats = self.extract_feat( 152 | points, img=img, img_metas=img_metas) 153 | losses = dict() 154 | if pts_feats or img_feats: 155 | losses_pts = self.forward_pts_train(pts_feats, img_feats, gt_bboxes_3d, 156 | gt_labels_3d, img_metas, 157 | gt_bboxes_ignore) 158 | losses.update(losses_pts) 159 | return losses 160 | 161 | @force_fp32(apply_to=('pts_feats', 'img_feats')) 162 | def forward_pts_train(self, 163 | pts_feats, 164 | img_feats, 165 | gt_bboxes_3d, 166 | gt_labels_3d, 167 | img_metas, 168 | gt_bboxes_ignore=None): 169 | """Forward function for point cloud branch. 170 | 171 | Args: 172 | pts_feats (list[torch.Tensor]): Features of point cloud branch 173 | gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth 174 | boxes for each sample. 175 | gt_labels_3d (list[torch.Tensor]): Ground truth labels for 176 | boxes of each sampole 177 | img_metas (list[dict]): Meta information of samples. 178 | gt_bboxes_ignore (list[torch.Tensor], optional): Ground truth 179 | boxes to be ignored. Defaults to None. 180 | 181 | Returns: 182 | dict: Losses of each branch. 
183 | """ 184 | if pts_feats is None: 185 | pts_feats = [None] 186 | if img_feats is None: 187 | img_feats = [None] 188 | outs = self.pts_bbox_head(pts_feats, img_feats, img_metas) 189 | loss_inputs = [gt_bboxes_3d, gt_labels_3d, outs] 190 | losses = self.pts_bbox_head.loss(*loss_inputs) 191 | return losses 192 | 193 | def forward_test(self, 194 | points=None, 195 | img_metas=None, 196 | img=None, **kwargs): 197 | """ 198 | Args: 199 | points (list[torch.Tensor]): the outer list indicates test-time 200 | augmentations and inner torch.Tensor should have a shape NxC, 201 | which contains all points in the batch. 202 | img_metas (list[list[dict]]): the outer list indicates test-time 203 | augs (multiscale, flip, etc.) and the inner list indicates 204 | images in a batch 205 | img (list[torch.Tensor], optional): the outer 206 | list indicates test-time augmentations and inner 207 | torch.Tensor should have a shape NxCxHxW, which contains 208 | all images in the batch. Defaults to None. 209 | """ 210 | if points is None: 211 | points = [None] 212 | if img is None: 213 | img = [None] 214 | for var, name in [(points, 'points'), (img, 'img'), (img_metas, 'img_metas')]: 215 | if not isinstance(var, list): 216 | raise TypeError('{} must be a list, but got {}'.format( 217 | name, type(var))) 218 | 219 | return self.simple_test(points[0], img_metas[0], img[0], **kwargs) 220 | 221 | @force_fp32(apply_to=('x', 'x_img')) 222 | def simple_test_pts(self, x, x_img, img_metas, rescale=False): 223 | """Test function of point cloud branch.""" 224 | outs = self.pts_bbox_head(x, x_img, img_metas) 225 | bbox_list = self.pts_bbox_head.get_bboxes( 226 | outs, img_metas, rescale=rescale) 227 | bbox_results = [ 228 | bbox3d2result(bboxes, scores, labels) 229 | for bboxes, scores, labels in bbox_list 230 | ] 231 | return bbox_results 232 | 233 | def simple_test(self, points, img_metas, img=None, rescale=False): 234 | img_feats, pts_feats = self.extract_feat( 235 | points, img=img, img_metas=img_metas) 236 | if pts_feats is None: 237 | pts_feats = [None] 238 | if img_feats is None: 239 | img_feats = [None] 240 | 241 | bbox_list = [dict() for i in range(len(img_metas))] 242 | if (pts_feats or img_feats) and self.with_pts_bbox: 243 | bbox_pts = self.simple_test_pts( 244 | pts_feats, img_feats, img_metas, rescale=rescale) 245 | for result_dict, pts_bbox in zip(bbox_list, bbox_pts): 246 | result_dict['pts_bbox'] = pts_bbox 247 | if img_feats and self.with_img_bbox: 248 | bbox_img = self.simple_test_img( 249 | img_feats, img_metas, rescale=rescale) 250 | for result_dict, img_bbox in zip(bbox_list, bbox_img): 251 | result_dict['img_bbox'] = img_bbox 252 | return bbox_list 253 | -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/datasets/pipelines/dbsampler.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | import copy 3 | import os 4 | 5 | import mmcv 6 | import numpy as np 7 | from mmdet3d.core.bbox import box_np_ops 8 | from mmdet3d.datasets import PIPELINES 9 | from mmdet3d.datasets.builder import OBJECTSAMPLERS 10 | from mmdet3d.datasets.pipelines import data_augment_utils 11 | from mmdet3d.datasets.pipelines.dbsampler import BatchSampler 12 | 13 | 14 | @OBJECTSAMPLERS.register_module() 15 | class UnifiedDataBaseSampler(object): 16 | """Class for sampling data from the ground truth database. 17 | 18 | Args: 19 | info_path (str): Path of groundtruth database info. 
20 | data_root (str): Path of groundtruth database. 21 | rate (float): Rate of actual sampled over maximum sampled number. 22 | prepare (dict): Name of preparation functions and the input value. 23 | sample_groups (dict): Sampled classes and numbers. 24 | classes (list[str]): List of classes. Default: None. 25 | points_loader(dict): Config of points loader. Default: dict( 26 | type='LoadPointsFromFile', load_dim=4, use_dim=[0,1,2,3]) 27 | """ 28 | 29 | def __init__(self, 30 | info_path, 31 | data_root, 32 | rate, 33 | prepare, 34 | sample_groups, 35 | classes=None, 36 | points_loader=dict( 37 | type='LoadPointsFromFile', 38 | coord_type='LIDAR', 39 | load_dim=4, 40 | use_dim=[0, 1, 2, 3])): 41 | super().__init__() 42 | self.data_root = data_root 43 | self.info_path = info_path 44 | self.rate = rate 45 | self.prepare = prepare 46 | self.classes = classes 47 | self.cat2label = {name: i for i, name in enumerate(classes)} 48 | self.label2cat = {i: name for i, name in enumerate(classes)} 49 | self.points_loader = mmcv.build_from_cfg(points_loader, PIPELINES) 50 | 51 | db_infos = mmcv.load(info_path) 52 | 53 | # filter database infos 54 | from mmdet3d.utils import get_root_logger 55 | logger = get_root_logger() 56 | for k, v in db_infos.items(): 57 | logger.info(f'load {len(v)} {k} database infos') 58 | for prep_func, val in prepare.items(): 59 | db_infos = getattr(self, prep_func)(db_infos, val) 60 | logger.info('After filter database:') 61 | for k, v in db_infos.items(): 62 | logger.info(f'load {len(v)} {k} database infos') 63 | 64 | self.db_infos = db_infos 65 | 66 | # load sample groups 67 | # TODO: more elegant way to load sample groups 68 | self.sample_groups = [] 69 | for name, num in sample_groups.items(): 70 | self.sample_groups.append({name: int(num)}) 71 | 72 | self.group_db_infos = self.db_infos # just use db_infos 73 | self.sample_classes = [] 74 | self.sample_max_nums = [] 75 | for group_info in self.sample_groups: 76 | self.sample_classes += list(group_info.keys()) 77 | self.sample_max_nums += list(group_info.values()) 78 | 79 | self.sampler_dict = {} 80 | for k, v in self.group_db_infos.items(): 81 | self.sampler_dict[k] = BatchSampler(v, k, shuffle=True) 82 | # TODO: No group_sampling currently 83 | 84 | @staticmethod 85 | def filter_by_difficulty(db_infos, removed_difficulty): 86 | """Filter ground truths by difficulties. 87 | 88 | Args: 89 | db_infos (dict): Info of groundtruth database. 90 | removed_difficulty (list): Difficulties that are not qualified. 91 | 92 | Returns: 93 | dict: Info of database after filtering. 94 | """ 95 | new_db_infos = {} 96 | for key, dinfos in db_infos.items(): 97 | new_db_infos[key] = [ 98 | info for info in dinfos 99 | if info['difficulty'] not in removed_difficulty 100 | ] 101 | return new_db_infos 102 | 103 | @staticmethod 104 | def filter_by_min_points(db_infos, min_gt_points_dict): 105 | """Filter ground truths by number of points in the bbox. 106 | 107 | Args: 108 | db_infos (dict): Info of groundtruth database. 109 | min_gt_points_dict (dict): Different number of minimum points 110 | needed for different categories of ground truths. 111 | 112 | Returns: 113 | dict: Info of database after filtering. 
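A sampler of this kind is normally built from a config dict through the OBJECTSAMPLERS registry. The sketch below shows the shape such a config might take; the paths, class list, minimum point counts and per-class sample numbers are placeholders rather than values taken from this repository's configs:

# Hypothetical GT-sampling config (illustrative only).
db_sampler = dict(
    type='UnifiedDataBaseSampler',
    data_root='data/nuscenes/',
    info_path='data/nuscenes/nuscenes_dbinfos_train.pkl',
    rate=1.0,
    prepare=dict(
        filter_by_difficulty=[-1],
        filter_by_min_points=dict(car=5, pedestrian=5)),
    classes=['car', 'pedestrian'],
    sample_groups=dict(car=2, pedestrian=2),
    points_loader=dict(
        type='LoadPointsFromFile',
        coord_type='LIDAR',
        load_dim=5,
        use_dim=[0, 1, 2, 3, 4]))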
114 | """ 115 | for name, min_num in min_gt_points_dict.items(): 116 | min_num = int(min_num) 117 | if min_num > 0: 118 | filtered_infos = [] 119 | for info in db_infos[name]: 120 | if info['num_points_in_gt'] >= min_num: 121 | filtered_infos.append(info) 122 | db_infos[name] = filtered_infos 123 | return db_infos 124 | 125 | def sample_all(self, gt_bboxes, gt_labels, with_img=False): 126 | """Sampling all categories of bboxes. 127 | 128 | Args: 129 | gt_bboxes (np.ndarray): Ground truth bounding boxes. 130 | gt_labels (np.ndarray): Ground truth labels of boxes. 131 | 132 | Returns: 133 | dict: Dict of sampled 'pseudo ground truths'. 134 | 135 | - gt_labels_3d (np.ndarray): ground truths labels \ 136 | of sampled objects. 137 | - gt_bboxes_3d (:obj:`BaseInstance3DBoxes`): \ 138 | sampled ground truth 3D bounding boxes 139 | - points (np.ndarray): sampled points 140 | - group_ids (np.ndarray): ids of sampled ground truths 141 | """ 142 | sampled_num_dict = {} 143 | sample_num_per_class = [] 144 | 145 | for class_name, max_sample_num in zip(self.sample_classes, 146 | self.sample_max_nums): 147 | class_label = self.cat2label[class_name] 148 | sampled_num = int(max_sample_num - 149 | np.sum([n == class_label for n in gt_labels])) 150 | sampled_num = np.round(self.rate * sampled_num).astype(np.int64) 151 | sampled_num_dict[class_name] = sampled_num 152 | sample_num_per_class.append(sampled_num) 153 | 154 | sampled = [] 155 | sampled_gt_bboxes = [] 156 | avoid_coll_boxes = gt_bboxes 157 | 158 | for class_name, sampled_num in zip(self.sample_classes, 159 | sample_num_per_class): 160 | if sampled_num > 0: 161 | sampled_cls = self.sample_class_v2(class_name, sampled_num, 162 | avoid_coll_boxes) 163 | 164 | sampled += sampled_cls 165 | if len(sampled_cls) > 0: 166 | if len(sampled_cls) == 1: 167 | sampled_gt_box = sampled_cls[0]['box3d_lidar'][ 168 | np.newaxis, ...] 
169 | else: 170 | sampled_gt_box = np.stack( 171 | [s['box3d_lidar'] for s in sampled_cls], axis=0) 172 | 173 | sampled_gt_bboxes += [sampled_gt_box] 174 | avoid_coll_boxes = np.concatenate( 175 | [avoid_coll_boxes, sampled_gt_box], axis=0) 176 | 177 | ret = None 178 | if len(sampled) > 0: 179 | sampled_gt_bboxes = np.concatenate(sampled_gt_bboxes, axis=0) 180 | s_points_list = [] 181 | s_idx_list = [] 182 | s_imgs_list = [] 183 | count = 0 184 | for info in sampled: 185 | file_path = os.path.join( 186 | self.data_root, 187 | info['path']) if self.data_root else info['path'] 188 | results = dict(pts_filename=file_path) 189 | if 'nori_id' in info: 190 | results['pts_nori_path'] = info['nori_id'] 191 | s_points = self.points_loader(results)['points'] 192 | s_points.translate(info['box3d_lidar'][:3]) 193 | idx_points = count * np.ones(len(s_points), dtype=np.int) 194 | s_points_list.append(s_points) 195 | s_idx_list.append(idx_points) 196 | count += 1 197 | if with_img: 198 | if len(info['image_path']) > 0: 199 | img_path = os.path.join( 200 | self.data_root, 201 | info['image_path']) if self.data_root else info['image_path'] 202 | s_img = mmcv.imread(img_path, 'unchanged') 203 | else: 204 | s_img = [] 205 | s_imgs_list.append(s_img) 206 | 207 | gt_labels = np.array([self.cat2label[s['name']] for s in sampled], 208 | dtype=np.long) 209 | ret = { 210 | 'gt_labels_3d': 211 | gt_labels, 212 | 'gt_bboxes_3d': 213 | sampled_gt_bboxes, 214 | 'points': 215 | s_points_list[0].cat(s_points_list), 216 | "points_idx": 217 | np.concatenate(s_idx_list, axis=0), 218 | 'images': 219 | s_imgs_list, 220 | 'group_ids': 221 | np.arange(gt_bboxes.shape[0], 222 | gt_bboxes.shape[0] + len(sampled)) 223 | } 224 | 225 | return ret 226 | 227 | def sample_class_v2(self, name, num, gt_bboxes): 228 | """Sampling specific categories of bounding boxes. 229 | 230 | Args: 231 | name (str): Class of objects to be sampled. 232 | num (int): Number of sampled bboxes. 233 | gt_bboxes (np.ndarray): Ground truth boxes. 234 | 235 | Returns: 236 | list[dict]: Valid samples after collision test. 237 | """ 238 | sampled = self.sampler_dict[name].sample(num) 239 | sampled = copy.deepcopy(sampled) 240 | num_gt = gt_bboxes.shape[0] 241 | num_sampled = len(sampled) 242 | gt_bboxes_bv = box_np_ops.center_to_corner_box2d( 243 | gt_bboxes[:, 0:2], gt_bboxes[:, 3:5], gt_bboxes[:, 6]) 244 | 245 | sp_boxes = np.stack([i['box3d_lidar'] for i in sampled], axis=0) 246 | boxes = np.concatenate([gt_bboxes, sp_boxes], axis=0).copy() 247 | 248 | sp_boxes_new = boxes[gt_bboxes.shape[0]:] 249 | sp_boxes_bv = box_np_ops.center_to_corner_box2d( 250 | sp_boxes_new[:, 0:2], sp_boxes_new[:, 3:5], sp_boxes_new[:, 6]) 251 | 252 | total_bv = np.concatenate([gt_bboxes_bv, sp_boxes_bv], axis=0) 253 | coll_mat = data_augment_utils.box_collision_test(total_bv, total_bv) 254 | diag = np.arange(total_bv.shape[0]) 255 | coll_mat[diag, diag] = False 256 | 257 | valid_samples = [] 258 | for i in range(num_gt, num_gt + num_sampled): 259 | if coll_mat[i].any(): 260 | coll_mat[i] = False 261 | coll_mat[:, i] = False 262 | else: 263 | valid_samples.append(sampled[i - num_gt]) 264 | return valid_samples 265 | -------------------------------------------------------------------------------- /tools/test.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
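sample_class_v2 above keeps only those pasted boxes whose bird's-eye-view footprints do not collide with the boxes already in the scene, using rotated corners from box_np_ops and data_augment_utils. The rejection logic can be conveyed with a much simpler axis-aligned stand-in; this sketch only illustrates the masking idea, not the actual rotated-box test:

import numpy as np

def aabb_overlap(a, b):
    # Axis-aligned BEV overlap test for boxes given as (cx, cy, w, l).
    return (abs(a[0] - b[0]) * 2 < a[2] + b[2]) and (abs(a[1] - b[1]) * 2 < a[3] + b[3])

gt = np.array([[0.0, 0.0, 2.0, 4.0]])            # one existing box
candidates = np.array([[0.5, 0.5, 2.0, 4.0],     # overlaps the GT -> rejected
                       [10.0, 10.0, 2.0, 4.0]])  # far away -> kept

kept = [c for c in candidates if not any(aabb_overlap(c, g) for g in gt)]
print(len(kept))  # 1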
2 | import argparse 3 | import os 4 | import warnings 5 | 6 | import mmcv 7 | import mmdet 8 | import torch 9 | from mmcv import Config, DictAction 10 | from mmcv.cnn import fuse_conv_bn 11 | from mmcv.parallel import MMDataParallel, MMDistributedDataParallel 12 | from mmcv.runner import (get_dist_info, init_dist, load_checkpoint, 13 | wrap_fp16_model) 14 | from mmdet.apis import multi_gpu_test, set_random_seed 15 | from mmdet.datasets import replace_ImageToTensor 16 | from mmdet3d.apis import single_gpu_test 17 | from mmdet3d.datasets import build_dataloader, build_dataset 18 | from mmdet3d.models import build_model 19 | 20 | if mmdet.__version__ > '2.23.0': 21 | # If mmdet version > 2.23.0, setup_multi_processes would be imported and 22 | # used from mmdet instead of mmdet3d. 23 | from mmdet.utils import setup_multi_processes 24 | else: 25 | from mmdet3d.utils import setup_multi_processes 26 | 27 | try: 28 | # If mmdet version > 2.23.0, compat_cfg would be imported and 29 | # used from mmdet instead of mmdet3d. 30 | from mmdet.utils import compat_cfg 31 | except ImportError: 32 | from mmdet3d.utils import compat_cfg 33 | 34 | 35 | def parse_args(): 36 | parser = argparse.ArgumentParser( 37 | description='MMDet test (and eval) a model') 38 | parser.add_argument('config', help='test config file path') 39 | parser.add_argument('checkpoint', help='checkpoint file') 40 | parser.add_argument('--out', help='output result file in pickle format') 41 | parser.add_argument( 42 | '--fuse-conv-bn', 43 | action='store_true', 44 | help='Whether to fuse conv and bn, this will slightly increase' 45 | 'the inference speed') 46 | parser.add_argument( 47 | '--gpu-ids', 48 | type=int, 49 | nargs='+', 50 | help='(Deprecated, please use --gpu-id) ids of gpus to use ' 51 | '(only applicable to non-distributed training)') 52 | parser.add_argument( 53 | '--gpu-id', 54 | type=int, 55 | default=0, 56 | help='id of gpu to use ' 57 | '(only applicable to non-distributed testing)') 58 | parser.add_argument( 59 | '--format-only', 60 | action='store_true', 61 | help='Format the output results without perform evaluation. It is' 62 | 'useful when you want to format the result to a specific format and ' 63 | 'submit it to the test server') 64 | parser.add_argument( 65 | '--eval', 66 | type=str, 67 | nargs='+', 68 | help='evaluation metrics, which depends on the dataset, e.g., "bbox",' 69 | ' "segm", "proposal" for COCO, and "mAP", "recall" for PASCAL VOC') 70 | parser.add_argument('--show', action='store_true', help='show results') 71 | parser.add_argument( 72 | '--show-dir', help='directory where results will be saved') 73 | parser.add_argument( 74 | '--gpu-collect', 75 | action='store_true', 76 | help='whether to use gpu to collect results.') 77 | parser.add_argument( 78 | '--tmpdir', 79 | help='tmp directory used for collecting results from multiple ' 80 | 'workers, available when gpu-collect is not specified') 81 | parser.add_argument('--seed', type=int, default=0, help='random seed') 82 | parser.add_argument( 83 | '--deterministic', 84 | action='store_true', 85 | help='whether to set deterministic options for CUDNN backend.') 86 | parser.add_argument( 87 | '--cfg-options', 88 | nargs='+', 89 | action=DictAction, 90 | help='override some settings in the used config, the key-value pair ' 91 | 'in xxx=yyy format will be merged into config file. If the value to ' 92 | 'be overwritten is a list, it should be like key="[a,b]" or key=a,b ' 93 | 'It also allows nested list/tuple values, e.g. 
key="[(a,b),(c,d)]" ' 94 | 'Note that the quotation marks are necessary and that no white space ' 95 | 'is allowed.') 96 | parser.add_argument( 97 | '--options', 98 | nargs='+', 99 | action=DictAction, 100 | help='custom options for evaluation, the key-value pair in xxx=yyy ' 101 | 'format will be kwargs for dataset.evaluate() function (deprecate), ' 102 | 'change to --eval-options instead.') 103 | parser.add_argument( 104 | '--eval-options', 105 | nargs='+', 106 | action=DictAction, 107 | help='custom options for evaluation, the key-value pair in xxx=yyy ' 108 | 'format will be kwargs for dataset.evaluate() function') 109 | parser.add_argument( 110 | '--launcher', 111 | choices=['none', 'pytorch', 'slurm', 'mpi'], 112 | default='none', 113 | help='job launcher') 114 | parser.add_argument('--local_rank', type=int, default=0) 115 | args = parser.parse_args() 116 | if 'LOCAL_RANK' not in os.environ: 117 | os.environ['LOCAL_RANK'] = str(args.local_rank) 118 | 119 | if args.options and args.eval_options: 120 | raise ValueError( 121 | '--options and --eval-options cannot be both specified, ' 122 | '--options is deprecated in favor of --eval-options') 123 | if args.options: 124 | warnings.warn('--options is deprecated in favor of --eval-options') 125 | args.eval_options = args.options 126 | return args 127 | 128 | 129 | def main(): 130 | args = parse_args() 131 | 132 | assert args.out or args.eval or args.format_only or args.show \ 133 | or args.show_dir, \ 134 | ('Please specify at least one operation (save/eval/format/show the ' 135 | 'results / save the results) with the argument "--out", "--eval"' 136 | ', "--format-only", "--show" or "--show-dir"') 137 | 138 | if args.eval and args.format_only: 139 | raise ValueError('--eval and --format_only cannot be both specified') 140 | 141 | if args.out is not None and not args.out.endswith(('.pkl', '.pickle')): 142 | raise ValueError('The output file must be a pkl file.') 143 | 144 | cfg = Config.fromfile(args.config) 145 | if args.cfg_options is not None: 146 | cfg.merge_from_dict(args.cfg_options) 147 | 148 | # import modules from string list. 149 | if cfg.get('custom_imports', None): 150 | from mmcv.utils import import_modules_from_strings 151 | import_modules_from_strings(**cfg['custom_imports']) 152 | 153 | # import modules from plguin/xx, registry will be updated 154 | if hasattr(cfg, 'plugin'): 155 | if cfg.plugin: 156 | import importlib 157 | if hasattr(cfg, 'plugin_dir'): 158 | plugin_dir = cfg.plugin_dir 159 | _module_dir = os.path.dirname(plugin_dir) 160 | _module_dir = _module_dir.split('/') 161 | _module_path = _module_dir[0] 162 | 163 | for m in _module_dir[1:]: 164 | _module_path = _module_path + '.' + m 165 | print(_module_path) 166 | plg_lib = importlib.import_module(_module_path) 167 | else: 168 | # import dir is the dirpath for the config file 169 | _module_dir = os.path.dirname(args.config) 170 | _module_dir = _module_dir.split('/') 171 | _module_path = _module_dir[0] 172 | for m in _module_dir[1:]: 173 | _module_path = _module_path + '.' 
+ m 174 | print(_module_path) 175 | plg_lib = importlib.import_module(_module_path) 176 | 177 | cfg = compat_cfg(cfg) 178 | 179 | # set multi-process settings 180 | setup_multi_processes(cfg) 181 | 182 | # set cudnn_benchmark 183 | if cfg.get('cudnn_benchmark', False): 184 | torch.backends.cudnn.benchmark = True 185 | 186 | cfg.model.pretrained = None 187 | 188 | if args.gpu_ids is not None: 189 | cfg.gpu_ids = args.gpu_ids[0:1] 190 | warnings.warn('`--gpu-ids` is deprecated, please use `--gpu-id`. ' 191 | 'Because we only support single GPU mode in ' 192 | 'non-distributed testing. Use the first GPU ' 193 | 'in `gpu_ids` now.') 194 | else: 195 | cfg.gpu_ids = [args.gpu_id] 196 | 197 | # init distributed env first, since logger depends on the dist info. 198 | if args.launcher == 'none': 199 | distributed = False 200 | else: 201 | distributed = True 202 | init_dist(args.launcher, **cfg.dist_params) 203 | 204 | test_dataloader_default_args = dict( 205 | samples_per_gpu=1, workers_per_gpu=2, dist=distributed, shuffle=False) 206 | 207 | # in case the test dataset is concatenated 208 | if isinstance(cfg.data.test, dict): 209 | cfg.data.test.test_mode = True 210 | if cfg.data.test_dataloader.get('samples_per_gpu', 1) > 1: 211 | # Replace 'ImageToTensor' to 'DefaultFormatBundle' 212 | cfg.data.test.pipeline = replace_ImageToTensor( 213 | cfg.data.test.pipeline) 214 | elif isinstance(cfg.data.test, list): 215 | for ds_cfg in cfg.data.test: 216 | ds_cfg.test_mode = True 217 | if cfg.data.test_dataloader.get('samples_per_gpu', 1) > 1: 218 | for ds_cfg in cfg.data.test: 219 | ds_cfg.pipeline = replace_ImageToTensor(ds_cfg.pipeline) 220 | 221 | test_loader_cfg = { 222 | **test_dataloader_default_args, 223 | **cfg.data.get('test_dataloader', {}) 224 | } 225 | 226 | # set random seeds 227 | if args.seed is not None: 228 | set_random_seed(args.seed, deterministic=args.deterministic) 229 | 230 | # build the dataloader 231 | dataset = build_dataset(cfg.data.test) 232 | data_loader = build_dataloader(dataset, **test_loader_cfg) 233 | 234 | # build the model and load checkpoint 235 | cfg.model.train_cfg = None 236 | model = build_model(cfg.model, test_cfg=cfg.get('test_cfg')) 237 | fp16_cfg = cfg.get('fp16', None) 238 | if fp16_cfg is not None: 239 | wrap_fp16_model(model) 240 | if cfg.get('optimizer_config', None) is not None and cfg.optimizer_config['type'] == 'CustomFp16OptimizerHook': 241 | wrap_fp16_model(model) 242 | for module_name, v in cfg.optimizer_config['custom_fp16'].items(): 243 | model._modules[module_name].fp16_enabled = v 244 | 245 | checkpoint = load_checkpoint(model, args.checkpoint, map_location='cpu') 246 | if args.fuse_conv_bn: 247 | model = fuse_conv_bn(model) 248 | # old versions did not save class info in checkpoints, this walkaround is 249 | # for backward compatibility 250 | if 'CLASSES' in checkpoint.get('meta', {}): 251 | model.CLASSES = checkpoint['meta']['CLASSES'] 252 | else: 253 | model.CLASSES = dataset.CLASSES 254 | # palette for visualization in segmentation tasks 255 | if 'PALETTE' in checkpoint.get('meta', {}): 256 | model.PALETTE = checkpoint['meta']['PALETTE'] 257 | elif hasattr(dataset, 'PALETTE'): 258 | # segmentation dataset has `PALETTE` attribute 259 | model.PALETTE = dataset.PALETTE 260 | 261 | if not distributed: 262 | model = MMDataParallel(model, device_ids=cfg.gpu_ids) 263 | outputs = single_gpu_test(model, data_loader, args.show, args.show_dir) 264 | else: 265 | model = MMDistributedDataParallel( 266 | model.cuda(), 267 | 
device_ids=[torch.cuda.current_device()], 268 | broadcast_buffers=False) 269 | outputs = multi_gpu_test(model, data_loader, args.tmpdir, 270 | args.gpu_collect) 271 | 272 | rank, _ = get_dist_info() 273 | if rank == 0: 274 | if args.out: 275 | print(f'\nwriting results to {args.out}') 276 | mmcv.dump(outputs, args.out) 277 | kwargs = {} if args.eval_options is None else args.eval_options 278 | if args.format_only: 279 | dataset.format_results(outputs, **kwargs) 280 | if args.eval: 281 | eval_kwargs = cfg.get('evaluation', {}).copy() 282 | # hard-code way to remove EvalHook args 283 | for key in [ 284 | 'interval', 'tmpdir', 'start', 'gpu_collect', 'save_best', 285 | 'rule' 286 | ]: 287 | eval_kwargs.pop(key, None) 288 | eval_kwargs.update(dict(metric=args.eval, **kwargs)) 289 | print(dataset.evaluate(outputs, **eval_kwargs)) 290 | 291 | 292 | if __name__ == '__main__': 293 | main() 294 | -------------------------------------------------------------------------------- /tools/train.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from __future__ import division 3 | 4 | import argparse 5 | import copy 6 | import datetime 7 | import os 8 | import time 9 | import warnings 10 | from os import path as osp 11 | 12 | import mmcv 13 | import torch 14 | import torch.distributed as dist 15 | from mmcv import Config, DictAction 16 | from mmcv.runner import get_dist_info, init_dist 17 | from mmdet import __version__ as mmdet_version 18 | from mmdet.apis import set_random_seed 19 | from mmdet3d import __version__ as mmdet3d_version 20 | from mmdet3d.apis import init_random_seed, train_model 21 | from mmdet3d.datasets import build_dataset 22 | from mmdet3d.models import build_model 23 | from mmdet3d.utils import collect_env, get_root_logger 24 | from mmseg import __version__ as mmseg_version 25 | 26 | try: 27 | # If mmdet version > 2.20.0, setup_multi_processes would be imported and 28 | # used from mmdet instead of mmdet3d. 
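Both tools/test.py and tools/train.py turn the configured plugin_dir (for example 'projects/mmdet3d_plugin/') into a dotted module path and import it, which is what registers the custom detectors, heads and datasets. The essential conversion can be sketched on its own; the helper name below is made up for illustration, and the commented call assumes the repository root is the working directory:

import importlib
import os

def import_plugin(plugin_dir):
    # Turn a path such as 'projects/mmdet3d_plugin/' into 'projects.mmdet3d_plugin' and import it.
    module_path = os.path.dirname(plugin_dir).replace('/', '.')
    return importlib.import_module(module_path)

# Example:
# plg_lib = import_plugin('projects/mmdet3d_plugin/')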
29 | from mmdet.utils import setup_multi_processes 30 | except ImportError: 31 | from mmdet3d.utils import setup_multi_processes 32 | 33 | 34 | def parse_args(): 35 | parser = argparse.ArgumentParser(description='Train a detector') 36 | parser.add_argument('config', help='train config file path') 37 | parser.add_argument('--work-dir', help='the dir to save logs and models') 38 | parser.add_argument( 39 | '--resume-from', help='the checkpoint file to resume from') 40 | parser.add_argument( 41 | '--auto-resume', 42 | action='store_true', 43 | help='resume from the latest checkpoint automatically') 44 | parser.add_argument( 45 | '--no-validate', 46 | action='store_true', 47 | help='whether not to evaluate the checkpoint during training') 48 | group_gpus = parser.add_mutually_exclusive_group() 49 | group_gpus.add_argument( 50 | '--gpus', 51 | type=int, 52 | help='(Deprecated, please use --gpu-id) number of gpus to use ' 53 | '(only applicable to non-distributed training)') 54 | group_gpus.add_argument( 55 | '--gpu-ids', 56 | type=int, 57 | nargs='+', 58 | help='(Deprecated, please use --gpu-id) ids of gpus to use ' 59 | '(only applicable to non-distributed training)') 60 | group_gpus.add_argument( 61 | '--gpu-id', 62 | type=int, 63 | default=0, 64 | help='number of gpus to use ' 65 | '(only applicable to non-distributed training)') 66 | parser.add_argument('--seed', type=int, default=None, help='random seed') 67 | parser.add_argument( 68 | '--diff-seed', 69 | action='store_true', 70 | help='Whether or not set different seeds for different ranks') 71 | parser.add_argument( 72 | '--deterministic', 73 | action='store_true', 74 | help='whether to set deterministic options for CUDNN backend.') 75 | parser.add_argument( 76 | '--options', 77 | nargs='+', 78 | action=DictAction, 79 | help='override some settings in the used config, the key-value pair ' 80 | 'in xxx=yyy format will be merged into config file (deprecate), ' 81 | 'change to --cfg-options instead.') 82 | parser.add_argument( 83 | '--cfg-options', 84 | nargs='+', 85 | action=DictAction, 86 | help='override some settings in the used config, the key-value pair ' 87 | 'in xxx=yyy format will be merged into config file. If the value to ' 88 | 'be overwritten is a list, it should be like key="[a,b]" or key=a,b ' 89 | 'It also allows nested list/tuple values, e.g. 
key="[(a,b),(c,d)]" ' 90 | 'Note that the quotation marks are necessary and that no white space ' 91 | 'is allowed.') 92 | parser.add_argument( 93 | '--launcher', 94 | choices=['none', 'pytorch', 'slurm', 'mpi'], 95 | default='none', 96 | help='job launcher') 97 | parser.add_argument('--local_rank', type=int, default=0) 98 | parser.add_argument( 99 | '--autoscale-lr', 100 | action='store_true', 101 | help='automatically scale lr with the number of gpus') 102 | parser.add_argument( 103 | '--debug', 104 | action='store_true', 105 | default=False, 106 | help='flag for debugging') 107 | parser.add_argument( 108 | '--batch-size', 109 | type=int, 110 | default=None, 111 | required=False, 112 | help='batch size for training') 113 | args = parser.parse_args() 114 | if 'LOCAL_RANK' not in os.environ: 115 | os.environ['LOCAL_RANK'] = str(args.local_rank) 116 | 117 | if args.options and args.cfg_options: 118 | raise ValueError( 119 | '--options and --cfg-options cannot be both specified, ' 120 | '--options is deprecated in favor of --cfg-options') 121 | if args.options: 122 | warnings.warn('--options is deprecated in favor of --cfg-options') 123 | args.cfg_options = args.options 124 | 125 | return args 126 | 127 | 128 | def main(): 129 | args = parse_args() 130 | 131 | cfg = Config.fromfile(args.config) 132 | if args.cfg_options is not None: 133 | cfg.merge_from_dict(args.cfg_options) 134 | 135 | # set multi-process settings 136 | setup_multi_processes(cfg) 137 | 138 | if cfg.get('custom_imports', None): 139 | from mmcv.utils import import_modules_from_strings 140 | import_modules_from_strings(**cfg['custom_imports']) 141 | 142 | # import modules from plguin/xx, registry will be updated 143 | if hasattr(cfg, 'plugin'): 144 | if cfg.plugin: 145 | import importlib 146 | if hasattr(cfg, 'plugin_dir'): 147 | plugin_dir = cfg.plugin_dir 148 | _module_dir = os.path.dirname(plugin_dir) 149 | _module_dir = _module_dir.split('/') 150 | _module_path = _module_dir[0] 151 | 152 | for m in _module_dir[1:]: 153 | _module_path = _module_path + '.' + m 154 | print(_module_path) 155 | plg_lib = importlib.import_module(_module_path) 156 | else: 157 | # import dir is the dirpath for the config file 158 | _module_dir = os.path.dirname(args.config) 159 | _module_dir = _module_dir.split('/') 160 | _module_path = _module_dir[0] 161 | for m in _module_dir[1:]: 162 | _module_path = _module_path + '.' 
+ m 163 | print(_module_path) 164 | plg_lib = importlib.import_module(_module_path) 165 | 166 | plg_lib = importlib.import_module('mmdet3d') 167 | 168 | # set cudnn_benchmark 169 | if cfg.get('cudnn_benchmark', False): 170 | torch.backends.cudnn.benchmark = True 171 | 172 | # work_dir is determined in this priority: CLI > segment in file > filename 173 | if args.work_dir is not None: 174 | # update configs according to CLI args if args.work_dir is not None 175 | cfg.work_dir = args.work_dir 176 | elif cfg.get('work_dir', None) is None: 177 | # use config filename as default work_dir if cfg.work_dir is None 178 | cfg.work_dir = osp.join('./work_dirs', 179 | "debug" if args.debug else osp.splitext(osp.basename(args.config))[0]) 180 | if args.resume_from is not None: 181 | cfg.resume_from = args.resume_from 182 | 183 | if args.auto_resume: 184 | cfg.auto_resume = args.auto_resume 185 | warnings.warn('`--auto-resume` is only supported when mmdet' 186 | 'version >= 2.20.0 for 3D detection model or' 187 | 'mmsegmentation verision >= 0.21.0 for 3D' 188 | 'segmentation model') 189 | 190 | if args.gpus is not None: 191 | cfg.gpu_ids = range(1) 192 | warnings.warn('`--gpus` is deprecated because we only support ' 193 | 'single GPU mode in non-distributed training. ' 194 | 'Use `gpus=1` now.') 195 | if args.gpu_ids is not None: 196 | cfg.gpu_ids = args.gpu_ids[0:1] 197 | warnings.warn('`--gpu-ids` is deprecated, please use `--gpu-id`. ' 198 | 'Because we only support single GPU mode in ' 199 | 'non-distributed training. Use the first GPU ' 200 | 'in `gpu_ids` now.') 201 | if args.gpus is None and args.gpu_ids is None: 202 | cfg.gpu_ids = [args.gpu_id] 203 | 204 | if args.autoscale_lr: 205 | # apply the linear scaling rule (https://arxiv.org/abs/1706.02677) 206 | cfg.optimizer['lr'] = cfg.optimizer['lr'] * len(cfg.gpu_ids) / 8 207 | 208 | # init distributed env first, since logger depends on the dist info. 
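The --autoscale-lr branch above applies the linear scaling rule (https://arxiv.org/abs/1706.02677): the learning rate in the config is treated as tuned for 8 GPUs and is rescaled in proportion to the number of GPUs actually used. A worked example with a hypothetical base learning rate:

base_lr = 2e-4                       # hypothetical config value tuned for 8 GPUs
for num_gpus in (1, 4, 8, 16):
    print(num_gpus, base_lr * num_gpus / 8)
# 1 -> 2.5e-05, 4 -> 0.0001, 8 -> 0.0002, 16 -> 0.0004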
209 | if args.launcher == 'none': 210 | distributed = False 211 | cfg.work_dir = osp.join(cfg.work_dir, datetime.datetime.now().strftime("%Y%m%d-%H%M%S")) 212 | world_size = 1 213 | else: 214 | distributed = True 215 | init_dist(args.launcher, **cfg.dist_params) 216 | # re-set gpu_ids with distributed training mode 217 | _, world_size = get_dist_info() 218 | cfg.gpu_ids = range(world_size) 219 | 220 | date = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") 221 | date_gather = [None for _ in range(world_size)] 222 | dist.all_gather_object(date_gather, date) 223 | cfg.work_dir = osp.join(cfg.work_dir, str(date_gather[0])) 224 | 225 | if args.batch_size is not None: 226 | cfg.data["samples_per_gpu"] = args.batch_size // world_size 227 | 228 | # create work_dir 229 | mmcv.mkdir_or_exist(osp.abspath(cfg.work_dir)) 230 | # init the logger before other steps 231 | timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime()) 232 | log_file = osp.join(cfg.work_dir, f'{timestamp}.log') 233 | # specify logger name, if we still use 'mmdet', the output info will be 234 | # filtered and won't be saved in the log_file 235 | # TODO: ugly workaround to judge whether we are training det or seg model 236 | if cfg.model.type in ['EncoderDecoder3D']: 237 | logger_name = 'mmseg' 238 | else: 239 | logger_name = 'mmdet' 240 | logger = get_root_logger( 241 | log_file=log_file, log_level=cfg.log_level, name=logger_name) 242 | 243 | # init the meta dict to record some important information such as 244 | # environment info and seed, which will be logged 245 | meta = dict() 246 | # log env info 247 | env_info_dict = collect_env() 248 | env_info = '\n'.join([(f'{k}: {v}') for k, v in env_info_dict.items()]) 249 | dash_line = '-' * 60 + '\n' 250 | logger.info('Environment info:\n' + dash_line + env_info + '\n' + 251 | dash_line) 252 | meta['env_info'] = env_info 253 | meta['config'] = cfg.pretty_text 254 | 255 | # log some basic info 256 | logger.info(f'Distributed training: {distributed}') 257 | logger.info(f'Config:\n{cfg.pretty_text}') 258 | 259 | # set random seeds 260 | seed = init_random_seed(args.seed) 261 | seed = seed + dist.get_rank() if args.diff_seed else seed 262 | logger.info(f'Set random seed to {seed}, ' 263 | f'deterministic: {args.deterministic}') 264 | set_random_seed(seed, deterministic=args.deterministic) 265 | cfg.seed = seed 266 | meta['seed'] = seed 267 | meta['exp_name'] = osp.basename(args.config) 268 | 269 | # dump config 270 | cfg.dump(osp.join(cfg.work_dir, osp.basename(args.config))) 271 | 272 | model = build_model( 273 | cfg.model, 274 | train_cfg=cfg.get('train_cfg'), 275 | test_cfg=cfg.get('test_cfg')) 276 | model.init_weights() 277 | 278 | logger.info(f'Model:\n{model}') 279 | datasets = [build_dataset(cfg.data.train)] 280 | if len(cfg.workflow) == 2: 281 | val_dataset = copy.deepcopy(cfg.data.val) 282 | # in case we use a dataset wrapper 283 | if 'dataset' in cfg.data.train: 284 | val_dataset.pipeline = cfg.data.train.dataset.pipeline 285 | else: 286 | val_dataset.pipeline = cfg.data.train.pipeline 287 | # set test_mode=False here in deep copied config 288 | # which do not affect AP/AR calc ulation later 289 | # refer to https://mmdetection3d.readthedocs.io/en/latest/tutorials/customize_runtime.html#customize-workflow # noqa 290 | val_dataset.test_mode = False 291 | datasets.append(build_dataset(val_dataset)) 292 | if cfg.checkpoint_config is not None: 293 | # save mmdet version, config file content and class names in 294 | # checkpoints as meta data 295 | cfg.checkpoint_config.meta = 
dict( 296 | mmdet_version=mmdet_version, 297 | mmseg_version=mmseg_version, 298 | mmdet3d_version=mmdet3d_version, 299 | config=cfg.pretty_text, 300 | CLASSES=datasets[0].CLASSES, 301 | PALETTE=datasets[0].PALETTE # for segmentors 302 | if hasattr(datasets[0], 'PALETTE') else None) 303 | # add an attribute for visualization convenience 304 | model.CLASSES = datasets[0].CLASSES 305 | train_model( 306 | model, 307 | datasets, 308 | cfg, 309 | distributed=distributed, 310 | validate=(not args.no_validate), 311 | timestamp=timestamp, 312 | meta=meta) 313 | 314 | 315 | if __name__ == '__main__': 316 | main() 317 | -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/models/backbones/vovnet.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Copyright (c) 2023 megvii-model. All Rights Reserved. 3 | # ------------------------------------------------------------------------ 4 | # Modified from DETR3D (https://github.com/WangYueFt/detr3d) 5 | # Copyright (c) 2021 Wang, Yue 6 | # ------------------------------------------------------------------------ 7 | # Copyright (c) Youngwan Lee (ETRI) All Rights Reserved. 8 | # Copyright 2021 Toyota Research Institute. All rights reserved. 9 | # ------------------------------------------------------------------------ 10 | import warnings 11 | from collections import OrderedDict 12 | 13 | import torch 14 | import torch.nn as nn 15 | import torch.nn.functional as F 16 | from mmcv.runner import BaseModule 17 | from mmdet.models.builder import BACKBONES 18 | from torch.nn.modules.batchnorm import _BatchNorm 19 | 20 | VoVNet19_slim_dw_eSE = { 21 | 'stem': [64, 64, 64], 22 | 'stage_conv_ch': [64, 80, 96, 112], 23 | 'stage_out_ch': [112, 256, 384, 512], 24 | "layer_per_block": 3, 25 | "block_per_stage": [1, 1, 1, 1], 26 | "eSE": True, 27 | "dw": True 28 | } 29 | 30 | VoVNet19_dw_eSE = { 31 | 'stem': [64, 64, 64], 32 | "stage_conv_ch": [128, 160, 192, 224], 33 | "stage_out_ch": [256, 512, 768, 1024], 34 | "layer_per_block": 3, 35 | "block_per_stage": [1, 1, 1, 1], 36 | "eSE": True, 37 | "dw": True 38 | } 39 | 40 | VoVNet19_slim_eSE = { 41 | 'stem': [64, 64, 128], 42 | 'stage_conv_ch': [64, 80, 96, 112], 43 | 'stage_out_ch': [112, 256, 384, 512], 44 | 'layer_per_block': 3, 45 | 'block_per_stage': [1, 1, 1, 1], 46 | 'eSE': True, 47 | "dw": False 48 | } 49 | 50 | VoVNet19_eSE = { 51 | 'stem': [64, 64, 128], 52 | "stage_conv_ch": [128, 160, 192, 224], 53 | "stage_out_ch": [256, 512, 768, 1024], 54 | "layer_per_block": 3, 55 | "block_per_stage": [1, 1, 1, 1], 56 | "eSE": True, 57 | "dw": False 58 | } 59 | 60 | VoVNet39_eSE = { 61 | 'stem': [64, 64, 128], 62 | "stage_conv_ch": [128, 160, 192, 224], 63 | "stage_out_ch": [256, 512, 768, 1024], 64 | "layer_per_block": 5, 65 | "block_per_stage": [1, 1, 2, 2], 66 | "eSE": True, 67 | "dw": False 68 | } 69 | 70 | VoVNet57_eSE = { 71 | 'stem': [64, 64, 128], 72 | "stage_conv_ch": [128, 160, 192, 224], 73 | "stage_out_ch": [256, 512, 768, 1024], 74 | "layer_per_block": 5, 75 | "block_per_stage": [1, 1, 4, 3], 76 | "eSE": True, 77 | "dw": False 78 | } 79 | 80 | VoVNet99_eSE = { 81 | 'stem': [64, 64, 128], 82 | "stage_conv_ch": [128, 160, 192, 224], 83 | "stage_out_ch": [256, 512, 768, 1024], 84 | "layer_per_block": 5, 85 | "block_per_stage": [1, 3, 9, 3], 86 | "eSE": True, 87 | "dw": False 88 | } 89 | 90 | _STAGE_SPECS = { 91 | "V-19-slim-dw-eSE": 
VoVNet19_slim_dw_eSE, 92 | "V-19-dw-eSE": VoVNet19_dw_eSE, 93 | "V-19-slim-eSE": VoVNet19_slim_eSE, 94 | "V-19-eSE": VoVNet19_eSE, 95 | "V-39-eSE": VoVNet39_eSE, 96 | "V-57-eSE": VoVNet57_eSE, 97 | "V-99-eSE": VoVNet99_eSE, 98 | } 99 | 100 | 101 | def dw_conv3x3(in_channels, out_channels, module_name, postfix, stride=1, kernel_size=3, padding=1): 102 | """3x3 convolution with padding""" 103 | return [ 104 | ( 105 | '{}_{}/dw_conv3x3'.format(module_name, postfix), 106 | nn.Conv2d( 107 | in_channels, 108 | out_channels, 109 | kernel_size=kernel_size, 110 | stride=stride, 111 | padding=padding, 112 | groups=out_channels, 113 | bias=False 114 | ) 115 | ), 116 | ( 117 | '{}_{}/pw_conv1x1'.format(module_name, postfix), 118 | nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, padding=0, groups=1, bias=False) 119 | ), 120 | ('{}_{}/pw_norm'.format(module_name, postfix), nn.BatchNorm2d(out_channels)), 121 | ('{}_{}/pw_relu'.format(module_name, postfix), nn.ReLU(inplace=True)), 122 | ] 123 | 124 | 125 | def conv3x3(in_channels, out_channels, module_name, postfix, stride=1, groups=1, kernel_size=3, padding=1): 126 | """3x3 convolution with padding""" 127 | return [ 128 | ( 129 | f"{module_name}_{postfix}/conv", 130 | nn.Conv2d( 131 | in_channels, 132 | out_channels, 133 | kernel_size=kernel_size, 134 | stride=stride, 135 | padding=padding, 136 | groups=groups, 137 | bias=False, 138 | ), 139 | ), 140 | (f"{module_name}_{postfix}/norm", nn.BatchNorm2d(out_channels)), 141 | (f"{module_name}_{postfix}/relu", nn.ReLU(inplace=True)), 142 | ] 143 | 144 | 145 | def conv1x1(in_channels, out_channels, module_name, postfix, stride=1, groups=1, kernel_size=1, padding=0): 146 | """1x1 convolution with padding""" 147 | return [ 148 | ( 149 | f"{module_name}_{postfix}/conv", 150 | nn.Conv2d( 151 | in_channels, 152 | out_channels, 153 | kernel_size=kernel_size, 154 | stride=stride, 155 | padding=padding, 156 | groups=groups, 157 | bias=False, 158 | ), 159 | ), 160 | (f"{module_name}_{postfix}/norm", nn.BatchNorm2d(out_channels)), 161 | (f"{module_name}_{postfix}/relu", nn.ReLU(inplace=True)), 162 | ] 163 | 164 | 165 | class Hsigmoid(nn.Module): 166 | def __init__(self, inplace=True): 167 | super(Hsigmoid, self).__init__() 168 | self.inplace = inplace 169 | 170 | def forward(self, x): 171 | return F.relu6(x + 3.0, inplace=self.inplace) / 6.0 172 | 173 | 174 | class eSEModule(nn.Module): 175 | def __init__(self, channel, reduction=4): 176 | super(eSEModule, self).__init__() 177 | self.avg_pool = nn.AdaptiveAvgPool2d(1) 178 | self.fc = nn.Conv2d(channel, channel, kernel_size=1, padding=0) 179 | self.hsigmoid = Hsigmoid() 180 | 181 | def forward(self, x): 182 | input = x 183 | x = self.avg_pool(x) 184 | x = self.fc(x) 185 | x = self.hsigmoid(x) 186 | return input * x 187 | 188 | 189 | class _OSA_module(nn.Module): 190 | def __init__( 191 | self, in_ch, stage_ch, concat_ch, layer_per_block, module_name, SE=False, identity=False, depthwise=False 192 | ): 193 | 194 | super(_OSA_module, self).__init__() 195 | 196 | self.identity = identity 197 | self.depthwise = depthwise 198 | self.isReduced = False 199 | self.layers = nn.ModuleList() 200 | in_channel = in_ch 201 | if self.depthwise and in_channel != stage_ch: 202 | self.isReduced = True 203 | self.conv_reduction = nn.Sequential( 204 | OrderedDict(conv1x1(in_channel, stage_ch, "{}_reduction".format(module_name), "0")) 205 | ) 206 | for i in range(layer_per_block): 207 | if self.depthwise: 208 | self.layers.append(nn.Sequential(OrderedDict(dw_conv3x3(stage_ch, 
stage_ch, module_name, i)))) 209 | else: 210 | self.layers.append(nn.Sequential(OrderedDict(conv3x3(in_channel, stage_ch, module_name, i)))) 211 | in_channel = stage_ch 212 | 213 | # feature aggregation 214 | in_channel = in_ch + layer_per_block * stage_ch 215 | self.concat = nn.Sequential(OrderedDict(conv1x1(in_channel, concat_ch, module_name, "concat"))) 216 | 217 | self.ese = eSEModule(concat_ch) 218 | 219 | def forward(self, x): 220 | 221 | identity_feat = x 222 | 223 | output = [] 224 | output.append(x) 225 | if self.depthwise and self.isReduced: 226 | x = self.conv_reduction(x) 227 | for layer in self.layers: 228 | x = layer(x) 229 | output.append(x) 230 | 231 | x = torch.cat(output, dim=1) 232 | xt = self.concat(x) 233 | 234 | xt = self.ese(xt) 235 | 236 | if self.identity: 237 | xt = xt + identity_feat 238 | 239 | return xt 240 | 241 | 242 | class _OSA_stage(nn.Sequential): 243 | def __init__( 244 | self, in_ch, stage_ch, concat_ch, block_per_stage, layer_per_block, stage_num, SE=False, depthwise=False 245 | ): 246 | 247 | super(_OSA_stage, self).__init__() 248 | 249 | if not stage_num == 2: 250 | self.add_module("Pooling", nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True)) 251 | 252 | if block_per_stage != 1: 253 | SE = False 254 | module_name = f"OSA{stage_num}_1" 255 | self.add_module( 256 | module_name, _OSA_module(in_ch, stage_ch, concat_ch, layer_per_block, module_name, SE, depthwise=depthwise) 257 | ) 258 | for i in range(block_per_stage - 1): 259 | if i != block_per_stage - 2: # last block 260 | SE = False 261 | module_name = f"OSA{stage_num}_{i + 2}" 262 | self.add_module( 263 | module_name, 264 | _OSA_module( 265 | concat_ch, 266 | stage_ch, 267 | concat_ch, 268 | layer_per_block, 269 | module_name, 270 | SE, 271 | identity=True, 272 | depthwise=depthwise 273 | ), 274 | ) 275 | 276 | 277 | @BACKBONES.register_module() 278 | class VoVNet(BaseModule): 279 | def __init__(self, spec_name, input_ch=3, out_features=None, 280 | frozen_stages=-1, norm_eval=True, pretrained=None, init_cfg=None): 281 | """ 282 | Args: 283 | input_ch(int) : the number of input channel 284 | out_features (list[str]): name of the layers whose outputs should 285 | be returned in forward. Can be anything in "stem", "stage2" ... 
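The config at the end of this listing instantiates this backbone with spec_name='V-99-eSE' and out_features=('stage4', 'stage5'), whose channel widths (768 and 1024) match the CPFPN neck's in_channels. A short usage sketch, assuming the repository root is on PYTHONPATH and the pinned mmcv/mmdet/mmdet3d stack from requirements.txt is installed; the input resolution is an arbitrary small example, not the training resolution:

import torch
from projects.mmdet3d_plugin.models.backbones import VoVNet

backbone = VoVNet(spec_name='V-99-eSE', input_ch=3, out_features=('stage4', 'stage5'))
backbone.eval()

with torch.no_grad():
    feats = backbone(torch.randn(1, 3, 256, 704))

for name, feat in feats.items():
    print(name, tuple(feat.shape))
# stage4 (1, 768, 16, 44)  -> stride 16
# stage5 (1, 1024, 8, 22)  -> stride 32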
286 | """ 287 | super(VoVNet, self).__init__(init_cfg) 288 | self.fp16_enabled = False 289 | self.frozen_stages = frozen_stages 290 | self.norm_eval = norm_eval 291 | 292 | if isinstance(pretrained, str): 293 | warnings.warn('DeprecationWarning: pretrained is deprecated, ' 294 | 'please use "init_cfg" instead') 295 | self.init_cfg = dict(type='Pretrained', checkpoint=pretrained) 296 | stage_specs = _STAGE_SPECS[spec_name] 297 | 298 | stem_ch = stage_specs["stem"] 299 | config_stage_ch = stage_specs["stage_conv_ch"] 300 | config_concat_ch = stage_specs["stage_out_ch"] 301 | block_per_stage = stage_specs["block_per_stage"] 302 | layer_per_block = stage_specs["layer_per_block"] 303 | SE = stage_specs["eSE"] 304 | depthwise = stage_specs["dw"] 305 | 306 | self._out_features = out_features 307 | 308 | # Stem module 309 | conv_type = dw_conv3x3 if depthwise else conv3x3 310 | stem = conv3x3(input_ch, stem_ch[0], "stem", "1", 2) 311 | stem += conv_type(stem_ch[0], stem_ch[1], "stem", "2", 1) 312 | stem += conv_type(stem_ch[1], stem_ch[2], "stem", "3", 2) 313 | self.add_module("stem", nn.Sequential((OrderedDict(stem)))) 314 | current_stirde = 4 315 | self._out_feature_strides = {"stem": current_stirde, "stage2": current_stirde} 316 | self._out_feature_channels = {"stem": stem_ch[2]} 317 | 318 | stem_out_ch = [stem_ch[2]] 319 | in_ch_list = stem_out_ch + config_concat_ch[:-1] 320 | # OSA stages 321 | self.stage_names = [] 322 | for i in range(4): # num_stages 323 | name = "stage%d" % (i + 2) # stage 2 ... stage 5 324 | self.stage_names.append(name) 325 | self.add_module( 326 | name, 327 | _OSA_stage( 328 | in_ch_list[i], 329 | config_stage_ch[i], 330 | config_concat_ch[i], 331 | block_per_stage[i], 332 | layer_per_block, 333 | i + 2, 334 | SE, 335 | depthwise, 336 | ), 337 | ) 338 | 339 | self._out_feature_channels[name] = config_concat_ch[i] 340 | if not i == 0: 341 | self._out_feature_strides[name] = current_stirde = int(current_stirde * 2) 342 | 343 | # initialize weights 344 | # self._initialize_weights() 345 | 346 | def _initialize_weights(self): 347 | for m in self.modules(): 348 | if isinstance(m, nn.Conv2d): 349 | nn.init.kaiming_normal_(m.weight) 350 | 351 | def init_weights(self): 352 | super().init_weights() 353 | self._freeze_stages() 354 | 355 | def forward(self, x): 356 | outputs = {} 357 | x = self.stem(x) 358 | if "stem" in self._out_features: 359 | outputs["stem"] = x 360 | for name in self.stage_names: 361 | x = getattr(self, name)(x) 362 | if name in self._out_features: 363 | outputs[name] = x 364 | 365 | return outputs 366 | 367 | def _freeze_stages(self): 368 | if self.frozen_stages >= 0: 369 | m = getattr(self, 'stem') 370 | m.eval() 371 | for param in m.parameters(): 372 | param.requires_grad = False 373 | 374 | for i in range(1, self.frozen_stages + 1): 375 | m = getattr(self, f'stage{i + 1}') 376 | m.eval() 377 | for param in m.parameters(): 378 | param.requires_grad = False 379 | 380 | def train(self, mode=True): 381 | """Convert the model into training mode while keep normalization layer 382 | freezed.""" 383 | super(VoVNet, self).train(mode) 384 | # self._freeze_stages() 385 | if mode and self.norm_eval: 386 | for m in self.modules(): 387 | # trick: eval have effect on BatchNorm only 388 | if isinstance(m, _BatchNorm): 389 | m.eval() 390 | -------------------------------------------------------------------------------- /projects/configs/meformer_voxel0075_vov_1600x640_cbgs.py: -------------------------------------------------------------------------------- 1 | plugin = 
True 2 | plugin_dir = 'projects/mmdet3d_plugin/' 3 | 4 | point_cloud_range = [-54.0, -54.0, -5.0, 54.0, 54.0, 3.0] 5 | class_names = [ 6 | 'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier', 7 | 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone' 8 | ] 9 | voxel_size = [0.075, 0.075, 0.2] 10 | out_size_factor = 8 11 | evaluation = dict(interval=1) 12 | dataset_type = 'CustomNuScenesDataset' 13 | data_root = 'data/nuscenes/' 14 | input_modality = dict( 15 | use_lidar=True, 16 | use_camera=True, 17 | use_radar=False, 18 | use_map=False, 19 | use_external=False) 20 | 21 | img_norm_cfg = dict( 22 | mean=[103.530, 116.280, 123.675], std=[57.375, 57.120, 58.395], to_rgb=False) 23 | 24 | ida_aug_conf = { 25 | "resize_lim": (0.94, 1.25), 26 | "final_dim": (640, 1600), 27 | "bot_pct_lim": (0.0, 0.0), 28 | "rot_lim": (0.0, 0.0), 29 | "H": 900, 30 | "W": 1600, 31 | "rand_flip": True, 32 | } 33 | 34 | train_pipeline = [ 35 | dict( 36 | type='LoadPointsFromFile', 37 | coord_type='LIDAR', 38 | load_dim=5, 39 | use_dim=[0, 1, 2, 3, 4], 40 | ), 41 | dict( 42 | type='LoadPointsFromMultiSweeps', 43 | sweeps_num=10, 44 | use_dim=[0, 1, 2, 3, 4], 45 | ), 46 | dict(type='LoadMultiViewImageFromFiles'), 47 | dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True), 48 | dict( 49 | type='GlobalRotScaleTransAll', 50 | rot_range=[-0.3925 * 2, 0.3925 * 2], 51 | scale_ratio_range=[0.9, 1.1], 52 | translation_std=[0.5, 0.5, 0.5]), 53 | dict( 54 | type='CustomRandomFlip3D', 55 | sync_2d=False, 56 | flip_ratio_bev_horizontal=0.5, 57 | flip_ratio_bev_vertical=0.5), 58 | dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), 59 | dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), 60 | dict(type='ObjectNameFilter', classes=class_names), 61 | dict(type='PointShuffle'), 62 | dict(type='ResizeCropFlipImage', data_aug_conf=ida_aug_conf, training=True), 63 | dict(type='NormalizeMultiviewImage', **img_norm_cfg), 64 | dict(type='PadMultiViewImage', size_divisor=32), 65 | dict(type='DefaultFormatBundle3D', class_names=class_names), 66 | dict(type='Collect3D', keys=['points', 'img', 'gt_bboxes_3d', 'gt_labels_3d'], 67 | meta_keys=('filename', 'ori_shape', 'img_shape', 'lidar2img', 68 | 'depth2img', 'cam2img', 'pad_shape', 69 | 'scale_factor', 'flip', 'pcd_horizontal_flip', 70 | 'pcd_vertical_flip', 'box_mode_3d', 'box_type_3d', 71 | 'img_norm_cfg', 'pcd_trans', 'sample_idx', 72 | 'pcd_scale_factor', 'pcd_rotation', 'pts_filename', 73 | 'transformation_3d_flow', 'rot_degree', 74 | 'gt_bboxes_3d', 'gt_labels_3d')) 75 | ] 76 | test_pipeline = [ 77 | dict( 78 | type='LoadPointsFromFile', 79 | coord_type='LIDAR', 80 | load_dim=5, 81 | use_dim=[0, 1, 2, 3, 4], 82 | ), 83 | dict( 84 | type='LoadPointsFromMultiSweeps', 85 | sweeps_num=10, 86 | use_dim=[0, 1, 2, 3, 4], 87 | ), 88 | dict(type='LoadMultiViewImageFromFiles'), 89 | dict( 90 | type='MultiScaleFlipAug3D', 91 | img_scale=(1333, 800), 92 | pts_scale_ratio=1, 93 | flip=False, 94 | transforms=[ 95 | dict( 96 | type='GlobalRotScaleTrans', 97 | rot_range=[0, 0], 98 | scale_ratio_range=[1.0, 1.0], 99 | translation_std=[0, 0, 0]), 100 | dict(type='RandomFlip3D'), 101 | dict(type='ResizeCropFlipImage', data_aug_conf=ida_aug_conf, training=False), 102 | dict(type='NormalizeMultiviewImage', **img_norm_cfg), 103 | dict(type='PadMultiViewImage', size_divisor=32), 104 | dict( 105 | type='DefaultFormatBundle3D', 106 | class_names=class_names, 107 | with_label=False), 108 | dict(type='Collect3D', keys=['points', 'img']) 109 | ]) 110 
| ] 111 | data = dict( 112 | samples_per_gpu=4, 113 | workers_per_gpu=6, 114 | train=dict( 115 | type='CBGSDataset', 116 | dataset=dict( 117 | type=dataset_type, 118 | data_root=data_root, 119 | ann_file=data_root + '/nuscenes_infos_train.pkl', 120 | load_interval=1, 121 | pipeline=train_pipeline, 122 | classes=class_names, 123 | modality=input_modality, 124 | test_mode=False, 125 | box_type_3d='LiDAR')), 126 | val=dict( 127 | type=dataset_type, 128 | data_root=data_root, 129 | ann_file=data_root + '/nuscenes_infos_val.pkl', 130 | load_interval=1, 131 | pipeline=test_pipeline, 132 | classes=class_names, 133 | modality=input_modality, 134 | test_mode=True, 135 | box_type_3d='LiDAR'), 136 | test=dict( 137 | type=dataset_type, 138 | data_root=data_root, 139 | ann_file=data_root + '/nuscenes_infos_val.pkl', 140 | load_interval=1, 141 | pipeline=test_pipeline, 142 | classes=class_names, 143 | modality=input_modality, 144 | test_mode=True, 145 | box_type_3d='LiDAR')) 146 | model = dict( 147 | type='MEFormerDetector', 148 | use_grid_mask=True, 149 | img_backbone=dict( 150 | type='VoVNet', 151 | spec_name='V-99-eSE', 152 | norm_eval=True, 153 | frozen_stages=-1, 154 | input_ch=3, 155 | out_features=('stage4', 'stage5',)), 156 | img_neck=dict( 157 | type='CPFPN', 158 | in_channels=[768, 1024], 159 | out_channels=256, 160 | num_outs=2), 161 | pts_voxel_layer=dict( 162 | num_point_features=5, 163 | max_num_points=10, 164 | voxel_size=voxel_size, 165 | max_voxels=(120000, 160000), 166 | point_cloud_range=point_cloud_range), 167 | pts_voxel_encoder=dict( 168 | type='HardSimpleVFE', 169 | num_features=5, 170 | ), 171 | pts_middle_encoder=dict( 172 | type='SparseEncoder', 173 | in_channels=5, 174 | sparse_shape=[41, 1440, 1440], 175 | output_channels=128, 176 | order=('conv', 'norm', 'act'), 177 | encoder_channels=((16, 16, 32), (32, 32, 64), (64, 64, 128), (128, 128)), 178 | encoder_paddings=((0, 0, 1), (0, 0, 1), (0, 0, [0, 1, 1]), (0, 0)), 179 | block_type='basicblock'), 180 | pts_backbone=dict( 181 | type='SECOND', 182 | in_channels=256, 183 | out_channels=[128, 256], 184 | layer_nums=[5, 5], 185 | layer_strides=[1, 2], 186 | norm_cfg=dict(type='BN', eps=0.001, momentum=0.01), 187 | conv_cfg=dict(type='Conv2d', bias=False)), 188 | pts_neck=dict( 189 | type='SECONDFPN', 190 | in_channels=[128, 256], 191 | out_channels=[256, 256], 192 | upsample_strides=[1, 2], 193 | norm_cfg=dict(type='BN', eps=0.001, momentum=0.01), 194 | upsample_cfg=dict(type='deconv', bias=False), 195 | use_conv_for_no_stride=True), 196 | pts_bbox_head=dict( 197 | type='MEFormerHead', 198 | in_channels=512, 199 | hidden_dim=256, 200 | downsample_scale=8, 201 | pc_range=point_cloud_range, 202 | use_ensemble=True, 203 | modalities=dict( 204 | train=["fused", "bev", "img"], 205 | test=["fused", "bev", "img"] 206 | ), 207 | common_heads=dict(center=(2, 2), height=(1, 2), dim=(3, 2), rot=(2, 2), vel=(2, 2)), 208 | tasks=[ 209 | dict(num_class=10, class_names=[ 210 | 'car', 'truck', 'construction_vehicle', 211 | 'bus', 'trailer', 'barrier', 212 | 'motorcycle', 'bicycle', 213 | 'pedestrian', 'traffic_cone' 214 | ]), 215 | ], 216 | bbox_coder=dict( 217 | type='MultiTaskBBoxCoder', 218 | post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], 219 | pc_range=point_cloud_range, 220 | max_num=300, 221 | voxel_size=voxel_size, 222 | num_classes=10), 223 | separate_head=dict( 224 | type='SeparateTaskHead', init_bias=-2.19, final_kernel=1), 225 | transformer=dict( 226 | type='MOADTransformer', 227 | use_cam_embed=True, 228 | decoder=dict( 229 
| type='PETRTransformerDecoder', 230 | return_intermediate=True, 231 | num_layers=6, 232 | transformerlayers=dict( 233 | type='PETRTransformerDecoderLayer', 234 | with_cp=False, 235 | attn_cfgs=[ 236 | dict( 237 | type='MultiheadAttention', 238 | embed_dims=256, 239 | num_heads=8, 240 | dropout=0.1), 241 | dict( 242 | type='PETRMultiheadFlashAttention', 243 | embed_dims=256, 244 | num_heads=8, 245 | dropout=0.1), 246 | ], 247 | ffn_cfgs=dict( 248 | type='FFN', 249 | embed_dims=256, 250 | feedforward_channels=1024, 251 | num_fcs=2, 252 | ffn_drop=0., 253 | act_cfg=dict(type='ReLU', inplace=True), 254 | ), 255 | 256 | feedforward_channels=1024, 257 | operation_order=('self_attn', 'norm', 'cross_attn', 'norm', 258 | 'ffn', 'norm')), 259 | ), 260 | ), 261 | ensemble=dict( 262 | type='PMETransformer', 263 | decoder=dict( 264 | type="PETRTransformerDecoder", 265 | return_intermediate=True, 266 | num_layers=1, 267 | transformerlayers=dict( 268 | type='PETRTransformerDecoderLayer', 269 | with_cp=False, 270 | attn_cfgs=[ 271 | dict( 272 | type='MultiheadAttention', 273 | embed_dims=256, 274 | num_heads=8, 275 | dropout=0.1), 276 | ], 277 | ffn_cfgs=dict( 278 | type='FFN', 279 | embed_dims=256, 280 | feedforward_channels=1024, 281 | num_fcs=2, 282 | ffn_drop=0., 283 | act_cfg=dict(type='ReLU', inplace=True), 284 | ), 285 | 286 | feedforward_channels=1024, 287 | operation_order=('cross_attn', 'norm', 'ffn', 'norm') 288 | ), 289 | ), 290 | ), 291 | loss_cls=dict(type='FocalLoss', use_sigmoid=True, gamma=2, alpha=0.25, reduction='mean', loss_weight=2.0), 292 | loss_bbox=dict(type='L1Loss', reduction='mean', loss_weight=0.25) 293 | ), 294 | train_cfg=dict( 295 | pts=dict( 296 | dataset='nuScenes', 297 | assigner=dict( 298 | type='HungarianAssigner3D', 299 | cls_cost=dict(type='FocalLossCost', weight=2.0), 300 | reg_cost=dict(type='BBox3DL1Cost', weight=0.25), 301 | iou_cost=dict(type='IoUCost', weight=0.0), 302 | pc_range=point_cloud_range, 303 | code_weights=[2.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2], 304 | ), 305 | pos_weight=-1, 306 | gaussian_overlap=0.1, 307 | min_radius=2, 308 | grid_size=[1440, 1440, 40], 309 | voxel_size=voxel_size, 310 | out_size_factor=out_size_factor, 311 | code_weights=[2.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2], 312 | point_cloud_range=point_cloud_range)), 313 | test_cfg=dict( 314 | pts=dict( 315 | dataset='nuScenes', 316 | grid_size=[1440, 1440, 40], 317 | out_size_factor=out_size_factor, 318 | pc_range=point_cloud_range, 319 | voxel_size=voxel_size, 320 | nms_type=None, 321 | nms_thr=0.2, 322 | use_rotate_nms=True, 323 | max_num=200 324 | ))) 325 | optimizer = dict( 326 | type='AdamW', 327 | lr=0.0001, 328 | paramwise_cfg=dict( 329 | custom_keys={ 330 | 'img_backbone': dict(lr_mult=0.01, decay_mult=5), 331 | 'img_neck': dict(lr_mult=0.1), 332 | }), 333 | weight_decay=0.01) 334 | optimizer_config = dict( 335 | type='CustomFp16OptimizerHook', 336 | loss_scale='dynamic', 337 | grad_clip=dict(max_norm=35, norm_type=2), 338 | custom_fp16=dict(pts_voxel_encoder=False, pts_middle_encoder=False, pts_bbox_head=False)) 339 | lr_config = dict( 340 | policy='CosineAnnealing', 341 | by_epoch=False, 342 | min_lr_ratio=0.001, 343 | warmup="linear", 344 | warmup_iters=1000) 345 | momentum_config = dict( 346 | policy='cyclic', 347 | target_ratio=(0.8947368421052632, 1), 348 | cyclic_times=1, 349 | step_ratio_up=0.4) 350 | total_epochs = 6 351 | checkpoint_config = dict(interval=1) 352 | log_config = dict( 353 | interval=50, 354 | hooks=[dict(type='TextLoggerHook'), 355 | 
dict(type='TensorboardLoggerHook')]) 356 | dist_params = dict(backend='nccl') 357 | log_level = 'INFO' 358 | work_dir = None 359 | load_from = 'ckpts/moad_voxel0075_vov_1600x640_cbgs.pth' 360 | resume_from = None 361 | workflow = [('train', 1)] 362 | gpu_ids = range(0, 8) 363 | 364 | custom_hooks = [ 365 | dict( 366 | type="FreezeWeight", 367 | finetune_weight=["pts_bbox_head.ensemble"] 368 | ) 369 | ] 370 | 371 | find_unused_parameters = True 372 | -------------------------------------------------------------------------------- /projects/configs/mome/mome.py: -------------------------------------------------------------------------------- 1 | plugin = True 2 | plugin_dir = 'projects/mmdet3d_plugin/' 3 | 4 | point_cloud_range = [-54.0, -54.0, -5.0, 54.0, 54.0, 3.0] 5 | class_names = [ 6 | 'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier', 7 | 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone' 8 | ] 9 | voxel_size = [0.075, 0.075, 0.2] 10 | out_size_factor = 8 11 | evaluation = dict(interval=1) 12 | dataset_type = 'CustomNuScenesDataset' 13 | data_root = 'data/nuscenes/' 14 | input_modality = dict( 15 | use_lidar=True, 16 | use_camera=True, 17 | use_radar=False, 18 | use_map=False, 19 | use_external=False) 20 | 21 | img_norm_cfg = dict( 22 | mean=[103.530, 116.280, 123.675], std=[57.375, 57.120, 58.395], to_rgb=False) 23 | 24 | ida_aug_conf = { 25 | "resize_lim": (0.94, 1.25), 26 | "final_dim": (640, 1600), 27 | "bot_pct_lim": (0.0, 0.0), 28 | "rot_lim": (0.0, 0.0), 29 | "H": 900, 30 | "W": 1600, 31 | "rand_flip": True, 32 | } 33 | 34 | train_pipeline = [ 35 | dict( 36 | type='LoadPointsFromFile', 37 | coord_type='LIDAR', 38 | load_dim=5, 39 | use_dim=[0, 1, 2, 3, 4], 40 | ), 41 | dict( 42 | type='LoadPointsFromMultiSweeps', 43 | sweeps_num=10, 44 | use_dim=[0, 1, 2, 3, 4], 45 | ), 46 | dict(type='LoadMultiViewImageFromFiles'), 47 | dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True), 48 | dict(type='ModalMask3D', 49 | mode='train'), 50 | dict( 51 | type='GlobalRotScaleTransAll', 52 | rot_range=[-0.3925 * 2, 0.3925 * 2], 53 | scale_ratio_range=[0.9, 1.1], 54 | translation_std=[0.5, 0.5, 0.5]), 55 | dict( 56 | type='CustomRandomFlip3D', 57 | sync_2d=False, 58 | flip_ratio_bev_horizontal=0.5, 59 | flip_ratio_bev_vertical=0.5), 60 | dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), 61 | dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), 62 | dict(type='ObjectNameFilter', classes=class_names), 63 | dict(type='PointShuffle'), 64 | dict(type='ResizeCropFlipImage', data_aug_conf=ida_aug_conf, training=True), 65 | dict(type='NormalizeMultiviewImage', **img_norm_cfg), 66 | dict(type='PadMultiViewImage', size_divisor=32), 67 | dict(type='DefaultFormatBundle3D', class_names=class_names), 68 | dict(type='Collect3D', keys=['points', 'img', 'gt_bboxes_3d', 'gt_labels_3d'], 69 | meta_keys=('filename', 'ori_shape', 'img_shape', 'lidar2img', 70 | 'depth2img', 'cam2img', 'pad_shape', 71 | 'scale_factor', 'flip', 'pcd_horizontal_flip', 72 | 'pcd_vertical_flip', 'box_mode_3d', 'box_type_3d', 73 | 'img_norm_cfg', 'pcd_trans', 'sample_idx', 74 | 'pcd_scale_factor', 'pcd_rotation', 'pts_filename', 75 | 'transformation_3d_flow', 'rot_degree', 76 | 'gt_bboxes_3d', 'gt_labels_3d', 'modalmask')) 77 | ] 78 | test_pipeline = [ 79 | dict( 80 | type='LoadPointsFromFile', 81 | coord_type='LIDAR', 82 | load_dim=5, 83 | use_dim=[0, 1, 2, 3, 4], 84 | ), 85 | dict( 86 | type='LoadPointsFromMultiSweeps', 87 | sweeps_num=10, 88 | use_dim=[0, 1, 2, 3, 
4], 89 | ), 90 | dict(type='LoadMultiViewImageFromFiles'), 91 | dict( 92 | type='MultiScaleFlipAug3D', 93 | img_scale=(1333, 800), 94 | pts_scale_ratio=1, 95 | flip=False, 96 | transforms=[ 97 | dict( 98 | type='GlobalRotScaleTrans', 99 | rot_range=[0, 0], 100 | scale_ratio_range=[1.0, 1.0], 101 | translation_std=[0, 0, 0]), 102 | dict(type='RandomFlip3D'), 103 | dict(type='ResizeCropFlipImage', data_aug_conf=ida_aug_conf, training=False), 104 | dict(type='NormalizeMultiviewImage', **img_norm_cfg), 105 | dict(type='PadMultiViewImage', size_divisor=32), 106 | dict( 107 | type='DefaultFormatBundle3D', 108 | class_names=class_names, 109 | with_label=False), 110 | dict(type='Collect3D', keys=['points', 'img']) 111 | ]) 112 | ] 113 | data = dict( 114 | samples_per_gpu=2, 115 | workers_per_gpu=6, 116 | train=dict( 117 | type='CBGSDataset', 118 | dataset=dict( 119 | type=dataset_type, 120 | data_root=data_root, 121 | ann_file=data_root + '/nuscenes_infos_train.pkl', 122 | load_interval=1, 123 | pipeline=train_pipeline, 124 | classes=class_names, 125 | modality=input_modality, 126 | test_mode=False, 127 | box_type_3d='LiDAR')), 128 | val=dict( 129 | type=dataset_type, 130 | data_root=data_root, 131 | ann_file=data_root + '/nuscenes_infos_val.pkl', 132 | load_interval=1, 133 | pipeline=test_pipeline, 134 | classes=class_names, 135 | modality=input_modality, 136 | test_mode=True, 137 | box_type_3d='LiDAR'), 138 | test=dict( 139 | type=dataset_type, 140 | data_root=data_root, 141 | ann_file=data_root + '/nuscenes_infos_val.pkl', 142 | load_interval=1, 143 | pipeline=test_pipeline, 144 | classes=class_names, 145 | modality=input_modality, 146 | test_mode=True, 147 | box_type_3d='LiDAR')) 148 | model = dict( 149 | type='MoME', 150 | use_grid_mask=True, 151 | img_backbone=dict( 152 | type='VoVNet', 153 | spec_name='V-99-eSE', 154 | norm_eval=True, 155 | frozen_stages=-1, 156 | input_ch=3, 157 | out_features=('stage4', 'stage5',)), 158 | img_neck=dict( 159 | type='CPFPN', 160 | in_channels=[768, 1024], 161 | out_channels=256, 162 | num_outs=2), 163 | pts_voxel_layer=dict( 164 | num_point_features=5, 165 | max_num_points=10, 166 | voxel_size=voxel_size, 167 | max_voxels=(120000, 160000), 168 | point_cloud_range=point_cloud_range), 169 | pts_voxel_encoder=dict( 170 | type='HardSimpleVFE', 171 | num_features=5, 172 | ), 173 | pts_middle_encoder=dict( 174 | type='SparseEncoder', 175 | in_channels=5, 176 | sparse_shape=[41, 1440, 1440], 177 | output_channels=128, 178 | order=('conv', 'norm', 'act'), 179 | encoder_channels=((16, 16, 32), (32, 32, 64), (64, 64, 128), (128, 128)), 180 | encoder_paddings=((0, 0, 1), (0, 0, 1), (0, 0, [0, 1, 1]), (0, 0)), 181 | block_type='basicblock'), 182 | pts_backbone=dict( 183 | type='SECOND', 184 | in_channels=256, 185 | out_channels=[128, 256], 186 | layer_nums=[5, 5], 187 | layer_strides=[1, 2], 188 | norm_cfg=dict(type='BN', eps=0.001, momentum=0.01), 189 | conv_cfg=dict(type='Conv2d', bias=False)), 190 | pts_neck=dict( 191 | type='SECONDFPN', 192 | in_channels=[128, 256], 193 | out_channels=[256, 256], 194 | upsample_strides=[1, 2], 195 | norm_cfg=dict(type='BN', eps=0.001, momentum=0.01), 196 | upsample_cfg=dict(type='deconv', bias=False), 197 | use_conv_for_no_stride=True), 198 | pts_bbox_head=dict( 199 | type='MultiExpertDecoding', 200 | in_channels=512, 201 | hidden_dim=256, 202 | downsample_scale=8, 203 | pc_range=point_cloud_range, 204 | modalities=dict( 205 | train=["fused", "bev", "img"], 206 | test=["fused", "bev", "img"] 207 | ), 208 | 
common_heads=dict(center=(2, 2), height=(1, 2), dim=(3, 2), rot=(2, 2), vel=(2, 2)), 209 | tasks=[ 210 | dict(num_class=10, class_names=[ 211 | 'car', 'truck', 'construction_vehicle', 212 | 'bus', 'trailer', 'barrier', 213 | 'motorcycle', 'bicycle', 214 | 'pedestrian', 'traffic_cone' 215 | ]), 216 | ], 217 | bbox_coder=dict( 218 | type='MultiTaskBBoxCoder', 219 | post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], 220 | pc_range=point_cloud_range, 221 | max_num=300, 222 | voxel_size=voxel_size, 223 | num_classes=10), 224 | separate_head=dict( 225 | type='SeparateTaskHead', init_bias=-2.19, final_kernel=1), 226 | transformer=dict( 227 | type='MultiExpert', 228 | use_cam_embed=True, 229 | window_sizes=[15,5], 230 | encoder=dict( 231 | type="PETRTransformerDecoder", 232 | return_intermediate=True, 233 | num_layers=1, # same with len(ensemble.modal_seq) 234 | transformerlayers=dict( 235 | type='PETRTransformerDecoderLayer', 236 | with_cp=False, 237 | attn_cfgs=[ 238 | dict( 239 | type='MultiheadAttention', 240 | embed_dims=256, 241 | num_heads=4, 242 | dropout=0.1), 243 | ], 244 | ffn_cfgs=dict( 245 | type='FFN', 246 | embed_dims=256, 247 | feedforward_channels=1024, 248 | num_fcs=2, 249 | ffn_drop=0., 250 | act_cfg=dict(type='ReLU', inplace=True), 251 | ), 252 | 253 | feedforward_channels=1024, 254 | operation_order=('cross_attn', 'norm', 'ffn', 'norm') 255 | ), 256 | ), 257 | decoder=dict( 258 | type='PETRTransformerDecoder', 259 | return_intermediate=True, 260 | num_layers=6, 261 | transformerlayers=dict( 262 | type='PETRTransformerDecoderLayer', 263 | with_cp=False, 264 | attn_cfgs=[ 265 | dict( 266 | type='MultiheadAttention', 267 | embed_dims=256, 268 | num_heads=8, 269 | dropout=0.1), 270 | dict( 271 | type='PETRMultiheadFlashAttention', 272 | embed_dims=256, 273 | num_heads=8, 274 | dropout=0.1), 275 | ], 276 | ffn_cfgs=dict( 277 | type='FFN', 278 | embed_dims=256, 279 | feedforward_channels=1024, 280 | num_fcs=2, 281 | ffn_drop=0., 282 | act_cfg=dict(type='ReLU', inplace=True), 283 | ), 284 | 285 | feedforward_channels=1024, 286 | operation_order=('self_attn', 'norm', 'cross_attn', 'norm', 287 | 'ffn', 'norm')), 288 | ), 289 | ), 290 | loss_cls=dict(type='FocalLoss', use_sigmoid=True, gamma=2, alpha=0.25, reduction='mean', loss_weight=2.0), 291 | loss_bbox=dict(type='L1Loss', reduction='mean', loss_weight=0.25) 292 | ), 293 | train_cfg=dict( 294 | pts=dict( 295 | dataset='nuScenes', 296 | assigner=dict( 297 | type='HungarianAssigner3D', 298 | cls_cost=dict(type='FocalLossCost', weight=2.0), 299 | reg_cost=dict(type='BBox3DL1Cost', weight=0.25), 300 | iou_cost=dict(type='IoUCost', weight=0.0), 301 | pc_range=point_cloud_range, 302 | code_weights=[2.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2], 303 | ), 304 | pos_weight=-1, 305 | gaussian_overlap=0.1, 306 | min_radius=2, 307 | grid_size=[1440, 1440, 40], 308 | voxel_size=voxel_size, 309 | out_size_factor=out_size_factor, 310 | code_weights=[2.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2], 311 | point_cloud_range=point_cloud_range)), 312 | test_cfg=dict( 313 | pts=dict( 314 | dataset='nuScenes', 315 | grid_size=[1440, 1440, 40], 316 | out_size_factor=out_size_factor, 317 | pc_range=point_cloud_range, 318 | voxel_size=voxel_size, 319 | nms_type=None, 320 | nms_thr=0.2, 321 | use_rotate_nms=True, 322 | max_num=200 323 | ))) 324 | optimizer = dict( 325 | type='AdamW', 326 | lr=0.0001, 327 | paramwise_cfg=dict( 328 | custom_keys={ 329 | 'img_backbone': dict(lr_mult=0.01, decay_mult=5), 330 | 'img_neck': dict(lr_mult=0.1), 331 | 
}), 332 | weight_decay=0.01) 333 | optimizer_config = dict( 334 | type='CustomFp16OptimizerHook', 335 | loss_scale='dynamic', 336 | grad_clip=dict(max_norm=35, norm_type=2), 337 | custom_fp16=dict(pts_voxel_encoder=False, pts_middle_encoder=False, pts_bbox_head=False)) 338 | lr_config = dict( 339 | policy='CosineAnnealing', 340 | by_epoch=False, 341 | min_lr_ratio=0.001, 342 | warmup="linear", 343 | warmup_iters=1000) 344 | momentum_config = dict( 345 | policy='cyclic', 346 | target_ratio=(0.8947368421052632, 1), 347 | cyclic_times=1, 348 | step_ratio_up=0.4) 349 | total_epochs = 6 350 | checkpoint_config = dict(interval=1) 351 | log_config = dict( 352 | interval=50, 353 | hooks=[dict(type='TextLoggerHook'), 354 | dict(type='TensorboardLoggerHook')]) 355 | dist_params = dict(backend='nccl') 356 | log_level = 'INFO' 357 | work_dir = None 358 | load_from = 'ckpts/moad_voxel0075_vov_1600x640_cbgs.pth' 359 | resume_from = None 360 | workflow = [('train', 1)] 361 | gpu_ids = range(0, 8) 362 | 363 | custom_hooks = [ 364 | dict( 365 | type="FreezeWeight", 366 | finetune_weight=["pts_bbox_head.transformer.encoder", "pts_bbox_head.transformer.selected_cls"] 367 | ) 368 | ] 369 | 370 | find_unused_parameters = True --------------------------------------------------------------------------------
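Both configs above rely on the `plugin` / `plugin_dir` flags to pull in the custom classes they reference (`MoME`, `MEFormerDetector`, `MultiExpertDecoding`, `CustomNuScenesDataset`, the `FreezeWeight` hook, and so on). As a rough illustration only — a minimal sketch assuming the entry scripts follow the usual mmdet3d plugin-loading pattern (the real logic lives in `tools/train.py`, which is not shown here) — loading one of these configs and building the model looks roughly like this:

```python
# Hypothetical usage sketch, NOT part of the repository: how a plugin-style
# config such as projects/configs/mome/mome.py is typically consumed by an
# mmdet3d-based entry script.
import importlib

from mmcv import Config

cfg = Config.fromfile('projects/configs/mome/mome.py')

# `plugin = True` / `plugin_dir` ask the entry script to import the project
# package so that the custom detectors, heads, datasets, and hooks register
# themselves with the mmdet/mmdet3d registries before anything is built.
if cfg.get('plugin', False):
    plugin_dir = cfg.get('plugin_dir', 'projects/mmdet3d_plugin/')
    module_path = plugin_dir.rstrip('/').replace('/', '.')
    importlib.import_module(module_path)

# With the registries populated, the detector described by cfg.model can be
# assembled through the standard mmdet3d 1.0.x builder.
from mmdet3d.models import build_model

model = build_model(
    cfg.model,
    train_cfg=cfg.get('train_cfg'),
    test_cfg=cfg.get('test_cfg'),
)
```

Note that both configs start from the same checkpoint (`load_from = 'ckpts/moad_voxel0075_vov_1600x640_cbgs.pth'`) and differ mainly in what the `FreezeWeight` hook leaves trainable: the MEFormer config fine-tunes only `pts_bbox_head.ensemble`, while `mome.py` fine-tunes `pts_bbox_head.transformer.encoder` and `pts_bbox_head.transformer.selected_cls`.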
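The actual hook implementation lives in `projects/mmdet3d_plugin/mmcv_custom/runner/hooks/freeze_weight.py` and is not reproduced here; the sketch below only illustrates, under that assumption, how a name-prefix freezing hook of this kind is typically written against the mmcv runner API (the class name is deliberately different from the real one):

```python
from mmcv.runner import HOOKS, Hook


@HOOKS.register_module()
class FreezeWeightSketch(Hook):
    """Illustrative stand-in for the repository's FreezeWeight hook.

    Freezes every parameter whose name does not start with one of the
    `finetune_weight` prefixes listed in `custom_hooks`.
    """

    def __init__(self, finetune_weight=()):
        self.finetune_weight = tuple(finetune_weight)

    def before_run(self, runner):
        # Unwrap (Distributed)DataParallel so parameter names match the
        # prefixes used in the config (e.g. 'pts_bbox_head.ensemble').
        model = runner.model.module if hasattr(runner.model, 'module') else runner.model
        for name, param in model.named_parameters():
            param.requires_grad = name.startswith(self.finetune_weight)
```

Freezing most of the network in this way is also why both configs set `find_unused_parameters = True`: distributed data parallel must tolerate parameters that never receive a gradient during the fine-tuning stage.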