├── projects ├── mmdet3d_plugin │ ├── core │ │ ├── __init__.py │ │ └── bbox │ │ │ ├── coders │ │ │ ├── __init__.py │ │ │ └── multi_task_bbox_coder.py │ │ │ ├── assigners │ │ │ ├── __init__.py │ │ │ └── hungarian_assigner_3d.py │ │ │ ├── match_costs │ │ │ ├── __init__.py │ │ │ └── match_cost.py │ │ │ └── util.py │ ├── mmcv_custom │ │ ├── ops │ │ │ ├── __init__.py │ │ │ └── voxel │ │ │ │ ├── __init__.py │ │ │ │ └── spconv_voxelize.py │ │ ├── runner │ │ │ ├── __init__.py │ │ │ └── hooks │ │ │ │ ├── __init__.py │ │ │ │ └── optimizer.py │ │ └── __init__.py │ ├── datasets │ │ ├── __init__.py │ │ └── custom_nuscenes_dataset.py │ ├── models │ │ ├── detectors │ │ │ ├── __init__.py │ │ │ └── fstr.py │ │ ├── utils │ │ │ ├── __init__.py │ │ │ ├── attention.py │ │ │ ├── cmt_transformer.py │ │ │ └── petr_transformer.py │ │ ├── backbones │ │ │ ├── __init__.py │ │ │ └── voxelnext.py │ │ ├── __init__.py │ │ └── dense_heads │ │ │ ├── __init__.py │ │ │ └── fstr_head.py │ └── __init__.py └── configs │ └── lidar │ ├── fstr_voxel0075_cbgs_20e.py │ ├── fstr_large_voxel0075_cbgs_20e.py │ └── fstr_xlarge_voxel0050_cbgs_20e.py ├── tools ├── dist_train.sh ├── dist_test.sh ├── test.py └── train.py ├── .gitignore ├── README.md └── LICENSE /projects/mmdet3d_plugin/core/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/mmcv_custom/ops/__init__.py: -------------------------------------------------------------------------------- 1 | from .voxel import * -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/mmcv_custom/runner/__init__.py: -------------------------------------------------------------------------------- 1 | from .hooks import * -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/mmcv_custom/__init__.py: -------------------------------------------------------------------------------- 1 | from .runner import * 2 | from .ops import * -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | from .custom_nuscenes_dataset import CustomNuScenesDataset -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/mmcv_custom/ops/voxel/__init__.py: -------------------------------------------------------------------------------- 1 | from .spconv_voxelize import SPConvVoxelization -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/mmcv_custom/runner/hooks/__init__.py: -------------------------------------------------------------------------------- 1 | from .optimizer import CustomFp16OptimizerHook -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/models/detectors/__init__.py: -------------------------------------------------------------------------------- 1 | from .fstr import FSTRDetector 2 | __all__ = ['FSTRDetector'] -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/models/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .cmt_transformer import * 2 | from .petr_transformer import * 3 | 
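The plugin `__init__.py` files above and below exist so that a single import of `projects.mmdet3d_plugin` registers every custom dataset, detector, head, backbone, hook and match cost with the mmdet/mmdet3d registries. A minimal sketch of how a config can trigger that import, assuming mmcv's `custom_imports` mechanism (the shipped configs are not reproduced here and may instead rely on the `PYTHONPATH` exported in `tools/dist_train.sh`):

```python
# Hypothetical config fragment (illustrative only; not one of the shipped configs).
# Importing the plugin package runs the register_module() decorators, so types such as
# 'FSTRDetector', 'VoxelNextEncoder', 'FSTRHead' and 'CustomNuScenesDataset' can then
# be referenced by name elsewhere in the config.
custom_imports = dict(
    imports=['projects.mmdet3d_plugin'],
    allow_failed_imports=False,
)
```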
-------------------------------------------------------------------------------- /projects/mmdet3d_plugin/models/backbones/__init__.py: -------------------------------------------------------------------------------- 1 | from .voxelnext import VoxelNextEncoder 2 | __all__ = ['VoxelNextEncoder'] -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/core/bbox/coders/__init__.py: -------------------------------------------------------------------------------- 1 | from .multi_task_bbox_coder import MultiTaskBBoxCoder 2 | 3 | __all__ = ['MultiTaskBBoxCoder'] -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/core/bbox/assigners/__init__.py: -------------------------------------------------------------------------------- 1 | from .hungarian_assigner_3d import HungarianAssigner3D 2 | 3 | __all__ = ['HungarianAssigner3D'] -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .backbones import * 2 | from .detectors import * 3 | from .dense_heads import * 4 | from .utils import * 5 | 6 | -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/models/dense_heads/__init__.py: -------------------------------------------------------------------------------- 1 | from .fstr_head import ( 2 | FSTRHead, 3 | SeparateTaskHead, 4 | ) 5 | 6 | __all__ = ['SeparateTaskHead', 'FSTRHead'] -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/__init__.py: -------------------------------------------------------------------------------- 1 | from .core.bbox.assigners import * 2 | from .core.bbox.coders import * 3 | from .core.bbox.match_costs import BBox3DL1Cost 4 | from .datasets import * 5 | from .mmcv_custom import * 6 | from .models import * -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/core/bbox/match_costs/__init__.py: -------------------------------------------------------------------------------- 1 | from mmdet.core.bbox.match_costs import build_match_cost 2 | from .match_cost import BBox3DL1Cost, BBoxBEVL1Cost, IoU3DCost 3 | 4 | __all__ = ['build_match_cost', 'BBox3DL1Cost', 'BBoxBEVL1Cost', 'IoU3DCost'] 5 | -------------------------------------------------------------------------------- /tools/dist_train.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | CONFIG=$1 4 | GPUS=$2 5 | NNODES=${NNODES:-1} 6 | NODE_RANK=${NODE_RANK:-0} 7 | PORT=${PORT:-29500} 8 | MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"} 9 | 10 | PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ 11 | python -m torch.distributed.launch \ 12 | --nnodes=$NNODES \ 13 | --node_rank=$NODE_RANK \ 14 | --master_addr=$MASTER_ADDR \ 15 | --nproc_per_node=$GPUS \ 16 | --master_port=$PORT \ 17 | $(dirname "$0")/train.py \ 18 | $CONFIG \ 19 | --seed 0 \ 20 | --launcher pytorch ${@:3} 21 | -------------------------------------------------------------------------------- /tools/dist_test.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | CONFIG=$1 4 | CHECKPOINT=$2 5 | GPUS=$3 6 | NNODES=${NNODES:-1} 7 | NODE_RANK=${NODE_RANK:-0} 8 | PORT=${PORT:-29500} 9 | MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"} 10 | 11 | 
PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ 12 | python -m torch.distributed.launch \ 13 | --nnodes=$NNODES \ 14 | --node_rank=$NODE_RANK \ 15 | --master_addr=$MASTER_ADDR \ 16 | --nproc_per_node=$GPUS \ 17 | --master_port=$PORT \ 18 | $(dirname "$0")/test.py \ 19 | $CONFIG \ 20 | $CHECKPOINT \ 21 | --launcher pytorch \ 22 | ${@:4} 23 | -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/mmcv_custom/runner/hooks/optimizer.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | from mmcv.runner.hooks.optimizer import Fp16OptimizerHook 6 | from mmcv.runner.hooks import HOOKS 7 | 8 | 9 | @HOOKS.register_module() 10 | class CustomFp16OptimizerHook(Fp16OptimizerHook): 11 | 12 | def __init__(self, 13 | custom_fp16={}, 14 | *args, 15 | **kwargs): 16 | super(CustomFp16OptimizerHook, self).__init__(*args, **kwargs) 17 | self.custom_fp16 = custom_fp16 18 | 19 | def before_run(self, runner) -> None: 20 | super().before_run(runner) 21 | for module_name, v in self.custom_fp16.items(): 22 | runner.model.module._modules[module_name].fp16_enabled = v 23 | -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/core/bbox/match_costs/match_cost.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from mmdet.core.bbox.match_costs.builder import MATCH_COST 3 | 4 | 5 | @MATCH_COST.register_module() 6 | class BBox3DL1Cost(object): 7 | """BBox3DL1Cost. 8 | Args: 9 | weight (int | float, optional): loss_weight 10 | """ 11 | 12 | def __init__(self, weight=1.): 13 | self.weight = weight 14 | 15 | def __call__(self, bbox_pred, gt_bboxes): 16 | """ 17 | Args: 18 | bbox_pred (Tensor): Predicted boxes with normalized coordinates 19 | (cx, cy, w, h), which are all in range [0, 1]. Shape 20 | [num_query, 4]. 21 | gt_bboxes (Tensor): Ground truth boxes with normalized 22 | coordinates (x1, y1, x2, y2). Shape [num_gt, 4]. 
23 | Returns: 24 | torch.Tensor: bbox_cost value with weight 25 | """ 26 | bbox_cost = torch.cdist(bbox_pred, gt_bboxes, p=1) 27 | return bbox_cost * self.weight 28 | 29 | 30 | @MATCH_COST.register_module() 31 | class BBoxBEVL1Cost(object): 32 | def __init__(self, weight): 33 | self.weight = weight 34 | 35 | def __call__(self, bboxes, gt_bboxes, pc_range): 36 | pc_start = bboxes.new(pc_range[0:2]) 37 | pc_range = bboxes.new(pc_range[3:5]) - bboxes.new(pc_range[0:2]) 38 | # normalize the box center to [0, 1] 39 | normalized_bboxes_xy = (bboxes[:, :2] - pc_start) / pc_range 40 | normalized_gt_bboxes_xy = (gt_bboxes[:, :2] - pc_start) / pc_range 41 | reg_cost = torch.cdist(normalized_bboxes_xy, normalized_gt_bboxes_xy, p=1) 42 | return reg_cost * self.weight 43 | 44 | 45 | @MATCH_COST.register_module() 46 | class IoU3DCost(object): 47 | def __init__(self, weight): 48 | self.weight = weight 49 | 50 | def __call__(self, iou): 51 | iou_cost = - iou 52 | return iou_cost * self.weight -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | *.ipynb 6 | 7 | # C extensions 8 | *.so 9 | 10 | # Distribution / packaging 11 | .Python 12 | tmp/ 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | .figs 30 | 31 | mmdetection3d/ 32 | mmdetection3d 33 | mmdet3d 34 | 35 | # PyInstaller 36 | # Usually these files are written by a python script from a template 37 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
38 | *.manifest 39 | *.spec 40 | 41 | # Installer logs 42 | pip-log.txt 43 | pip-delete-this-directory.txt 44 | 45 | # Unit test / coverage reports 46 | htmlcov/ 47 | .tox/ 48 | .coverage 49 | .coverage.* 50 | .cache 51 | hostfile.txt 52 | nosetests.xml 53 | coverage.xml 54 | *.cover 55 | .hypothesis/ 56 | .pytest_cache/ 57 | 58 | # Translations 59 | *.mo 60 | *.pot 61 | 62 | # Django stuff: 63 | *.log 64 | local_settings.py 65 | db.sqlite3 66 | 67 | # Flask stuff: 68 | instance/ 69 | .webassets-cache 70 | 71 | # Scrapy stuff: 72 | .scrapy 73 | 74 | # Sphinx documentation 75 | docs/_build/ 76 | 77 | # PyBuilder 78 | target/ 79 | 80 | # Jupyter Notebook 81 | .ipynb_checkpoints 82 | 83 | # pyenv 84 | .python-version 85 | 86 | # celery beat schedule file 87 | celerybeat-schedule 88 | 89 | # SageMath parsed files 90 | *.sage.py 91 | 92 | # Environments 93 | .env 94 | .venv 95 | env/ 96 | venv/ 97 | ENV/ 98 | env.bak/ 99 | venv.bak/ 100 | 101 | # Spyder project settings 102 | .spyderproject 103 | .spyproject 104 | 105 | # Rope project settings 106 | .ropeproject 107 | 108 | # mkdocs documentation 109 | /site 110 | 111 | # mypy 112 | .mypy_cache/ 113 | 114 | # cython generated cpp 115 | data 116 | ckpts 117 | .vscode 118 | .idea 119 | 120 | # custom 121 | nuscenes_gt_database 122 | nuscenes_unified_gt_database 123 | work_dirs 124 | *.pkl 125 | *.pkl.json 126 | *.log.json 127 | work_dirs/ 128 | exps/ 129 | *~ 130 | mmdet3d/.mim 131 | 132 | # Pytorch 133 | *.pth 134 | 135 | 136 | # demo 137 | figs 138 | 139 | -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/core/bbox/util.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | import numpy as np 4 | import mmdet3d 5 | from mmdet3d.core import limit_period 6 | 7 | 8 | def normalize_bbox(bboxes, pc_range=None): 9 | 10 | cx = bboxes[..., 0:1] 11 | cy = bboxes[..., 1:2] 12 | cz = bboxes[..., 2:3] 13 | w = bboxes[..., 3:4].log() 14 | l = bboxes[..., 4:5].log() 15 | h = bboxes[..., 5:6].log() 16 | 17 | rot = bboxes[..., 6:7] 18 | if bboxes.size(-1) > 7: 19 | vx = bboxes[..., 7:8] 20 | vy = bboxes[..., 8:9] 21 | normalized_bboxes = torch.cat( 22 | (cx, cy, w, l, cz, h, rot.sin(), rot.cos(), vx, vy), dim=-1 23 | ) 24 | else: 25 | normalized_bboxes = torch.cat( 26 | (cx, cy, w, l, cz, h, rot.sin(), rot.cos()), dim=-1 27 | ) 28 | return normalized_bboxes 29 | 30 | 31 | def denormalize_bbox(normalized_bboxes, pc_range=None): 32 | # rotation 33 | rot_sine = normalized_bboxes[..., 6:7] 34 | 35 | rot_cosine = normalized_bboxes[..., 7:8] 36 | rot = torch.atan2(rot_sine, rot_cosine) 37 | 38 | # center in the bev 39 | cx = normalized_bboxes[..., 0:1] 40 | cy = normalized_bboxes[..., 1:2] 41 | cz = normalized_bboxes[..., 4:5] 42 | 43 | # size 44 | w = normalized_bboxes[..., 2:3] 45 | l = normalized_bboxes[..., 3:4] 46 | h = normalized_bboxes[..., 5:6] 47 | 48 | w = w.exp() 49 | l = l.exp() 50 | h = h.exp() 51 | 52 | if normalized_bboxes.size(-1) > 8: 53 | # velocity 54 | vx = normalized_bboxes[..., 8:9] 55 | vy = normalized_bboxes[..., 9:10] 56 | denormalized_bboxes = torch.cat([cx, cy, cz, w, l, h, rot, vx, vy], dim=-1) 57 | else: 58 | denormalized_bboxes = torch.cat([cx, cy, cz, w, l, h, rot], dim=-1) 59 | return denormalized_bboxes 60 | 61 | 62 | def bbox3d_mapping_back(bboxes, rot_degree, scale_factor, flip_horizontal, flip_vertical): 63 | """Map bboxes from testing scale to original image scale. 
64 | 65 | Args: 66 | bboxes (:obj:`BaseInstance3DBoxes`): Boxes to be mapped back. 67 | scale_factor (float): Scale factor. 68 | flip_horizontal (bool): Whether to flip horizontally. 69 | flip_vertical (bool): Whether to flip vertically. 70 | 71 | Returns: 72 | :obj:`BaseInstance3DBoxes`: Boxes mapped back. 73 | """ 74 | new_bboxes = bboxes.clone() 75 | if flip_horizontal: 76 | new_bboxes.flip('horizontal') 77 | if flip_vertical: 78 | new_bboxes.flip('vertical') 79 | new_bboxes.scale(1 / scale_factor) 80 | new_bboxes.rotate(-rot_degree) 81 | 82 | return new_bboxes -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/mmcv_custom/ops/voxel/spconv_voxelize.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 megvii-model. All Rights Reserved. 2 | 3 | import numpy as np 4 | from torch import nn 5 | from spconv.pytorch.utils import PointToVoxel # spconv-cu111 2.1.21 6 | import torch 7 | import torch.nn.functional as F 8 | from torch.nn.modules.utils import _pair 9 | 10 | 11 | class SPConvVoxelization(nn.Module): 12 | def __init__(self, voxel_size, point_cloud_range, max_num_points, max_voxels, num_point_features, device=torch.device("cuda")): 13 | super().__init__() 14 | assert len(voxel_size) == 3 15 | assert len(point_cloud_range) == 6 16 | self.voxel_size = np.array(voxel_size) 17 | self.point_cloud_range = np.array(point_cloud_range) 18 | self.max_num_points = max_num_points 19 | self.num_point_features = num_point_features 20 | self.device = device 21 | if isinstance(max_voxels, tuple): 22 | self.max_voxels = max_voxels 23 | else: 24 | self.max_voxels = _pair(max_voxels) 25 | self.voxel_generator = PointToVoxel( 26 | vsize_xyz=voxel_size, 27 | coors_range_xyz=point_cloud_range, 28 | max_num_points_per_voxel=max_num_points, 29 | max_num_voxels=self.max_voxels[0], 30 | num_point_features=num_point_features, 31 | device=device, 32 | ) 33 | grid_size = (self.point_cloud_range[3:6] - self.point_cloud_range[0:3]) / np.array(voxel_size) 34 | self.grid_size = np.round(grid_size).astype(np.int64) 35 | 36 | def train(self, mode: bool = True): 37 | if mode: 38 | self.voxel_generator = PointToVoxel( 39 | vsize_xyz=self.voxel_size.tolist(), 40 | coors_range_xyz=self.point_cloud_range.tolist(), 41 | max_num_points_per_voxel=self.max_num_points, 42 | max_num_voxels=self.max_voxels[0], 43 | num_point_features=self.num_point_features, 44 | device=self.device, 45 | ) 46 | else: 47 | self.voxel_generator = PointToVoxel( 48 | vsize_xyz=self.voxel_size.tolist(), 49 | coors_range_xyz=self.point_cloud_range.tolist(), 50 | max_num_points_per_voxel=self.max_num_points, 51 | max_num_voxels=self.max_voxels[1], 52 | num_point_features=self.num_point_features, 53 | device=self.device, 54 | ) 55 | 56 | return super().train(mode) 57 | 58 | def forward(self, points): 59 | voxel_output = self.voxel_generator(points) 60 | voxels, coordinates, num_points = voxel_output 61 | return torch.clone(voxels), torch.clone(coordinates), torch.clone(num_points) 62 | 63 | def __repr__(self): 64 | tmpstr = self.__class__.__name__ + '(' 65 | tmpstr += 'voxel_size=' + str(self.voxel_size) 66 | tmpstr += ', point_cloud_range=' + str(self.point_cloud_range) 67 | tmpstr += ', max_num_points=' + str(self.max_num_points) 68 | tmpstr += ', max_voxels=' + str(self.max_voxels) 69 | tmpstr += ', num_point_features=' + str(self.num_point_features) 70 | tmpstr += ')' 71 | return tmpstr 72 | 
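A short usage sketch of `SPConvVoxelization` follows; the voxel size, range and caps are illustrative assumptions (the 0.075 m value only mirrors the `fstr_voxel0075_*` config naming), not values taken from the shipped configs:

```python
# Illustrative only: parameter values below are assumptions, not the shipped config values.
import torch
from projects.mmdet3d_plugin import SPConvVoxelization

voxelizer = SPConvVoxelization(
    voxel_size=[0.075, 0.075, 0.2],
    point_cloud_range=[-54.0, -54.0, -5.0, 54.0, 54.0, 3.0],
    max_num_points=10,
    max_voxels=(120000, 160000),  # (train, test) caps; train()/eval() rebuild the generator accordingly
    num_point_features=5,
)

points = torch.rand(20000, 5, device='cuda')   # e.g. x, y, z, intensity, sweep time offset
voxels, coors, num_points = voxelizer(points)  # per-voxel point features, voxel coordinates, point counts
```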
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | 
 2 | # Fully Sparse Transformer 3D Detector for LiDAR Point Cloud
 3 | 
 4 | [Paper](https://ieeexplore.ieee.org/document/10302363), [nuScenes LeaderBoard](https://www.nuscenes.org/object-detection?externalData=all&mapData=all&modalities=Lidar)
 5 | 
 6 | 
 7 | 
 8 | All statistics are measured on a single Tesla A100 GPU using the best models from the official repositories. Some sparse modules in the model are supported.
 9 | 
10 | 
11 | FSTR is a fully sparse LiDAR-based detector that achieves a better accuracy-efficiency trade-off compared with other popular LiDAR-based detectors. A lightweight DETR-like framework with a single decoder layer is designed for LiDAR-only detection, which obtains **73.6%** NDS (**FSTR-XLarge with TTA**) on the nuScenes benchmark and **31.5%** CDS (**FSTR-Large**) on the Argoverse2 validation set.
12 | 
13 | ## Currently Supported Features
14 | - [x] Support nuScenes dataset
15 | - [ ] Support Argoverse2 dataset
16 | ## Preparation
17 | 
18 | * Environments
19 | Python == 3.8 \
20 | CUDA == 11.1 \
21 | pytorch == 1.9.0 \
22 | mmcv-full == 1.6.0 \
23 | mmdet == 2.24.0 \
24 | mmsegmentation == 0.29.1 \
25 | mmdet3d == 1.0.0rc5 \
26 | [flash-attn](https://github.com/HazyResearch/flash-attention) == 0.2.2 \
27 | [Spconv-plus](https://github.com/dvlab-research/spconv-plus) == 2.1.21
28 | 
29 | * Data
30 | Follow the [mmdet3d data preparation guide](https://github.com/open-mmlab/mmdetection3d/blob/master/docs/en/data_preparation.md) to process the nuScenes dataset.
31 | 
32 | ## Train & inference
33 | ```bash
34 | # train
35 | bash tools/dist_train.sh /path_to_your_config 8
36 | # inference
37 | bash tools/dist_test.sh /path_to_your_config /path_to_your_pth 8 --eval bbox
38 | ```
39 | ## Main Results
40 | Results on nuScenes **val set**. The default batch size is 2 on each GPU. FPS is evaluated on a single Tesla A100 GPU. (15e + 5e means the last 5 epochs should be trained without [GT sampling](https://github.com/Poley97/FSTR/blob/master/projects/configs/lidar/fstr_voxel0075_cbgs_20e.py.py#L33-L69).)
41 | 
42 | | Config | mAP | NDS | Schedule | Inference FPS |
43 | |:--------:|:----------:|:---------:|:--------:|:--------:|
44 | | [FSTR](./projects/configs/lidar/fstr_voxel0075_cbgs_20e.py) | 64.2% | 69.1% | 15e+5e | 15.4 |
45 | | [FSTR-Large](./projects/configs/lidar/fstr_large_voxel0075_cbgs_20e.py) | 65.5% | 70.3% | 15e+5e | 9.5 |
46 | 
47 | 
48 | Results on nuScenes **test set**. To reproduce our result, replace `ann_file=data_root + '/nuscenes_infos_train.pkl'` in the [training config](./projects/configs/lidar/fstr_large_voxel0075_cbgs_20e.py) with `ann_file=[data_root + '/nuscenes_infos_train.pkl', data_root + '/nuscenes_infos_val.pkl']` (a minimal config snippet is given at the end of this README):
49 | 
50 | | Config | mAP | NDS | Schedule | Inference FPS |
51 | |:--------:|:----------:|:---------:|:--------:|:--------:|
52 | | [FSTR](./projects/configs/lidar/fstr_voxel0075_cbgs_20e.py) | 66.2% | 70.4% | 15e+5e | 15.4 |
53 | | [FSTR](./projects/configs/lidar/fstr_voxel0075_cbgs_20e.py) + TTA | 67.6% | 71.5% | 15e+5e | - |
54 | | [FSTR-Large](./projects/configs/lidar/fstr_large_voxel0075_cbgs_20e.py) + TTA | 69.5% | 73.0% | 15e+5e | - |
55 | | [FSTR-XLarge](./projects/configs/lidar/fstr_xlarge_voxel0050_cbgs_20e.py) + TTA | 70.2% | 73.5% | 15e+5e | - |
56 | 
57 | ## Citation
58 | If you find our FSTR helpful in your research, please consider citing:
59 | ```bibtex
60 | @article{zhang2023fully,
61 |   title={Fully Sparse Transformer 3D Detector for LiDAR Point Cloud},
62 |   author={Zhang, Diankun and Zheng, Zhijie and Niu, Haoyu and Wang, Xueqing and Liu, Xiaojun},
63 |   journal={IEEE Transactions on Geoscience and Remote Sensing},
64 |   year={2023},
65 |   publisher={IEEE}
66 | }
67 | ```
68 | 
69 | ## Contact
70 | If you have any questions, feel free to open an issue or contact us at zhangdiankun19@mails.ucas.edu.cn or tanfeiyang@megvii.com.
71 | 
72 | ## Acknowledgement
73 | Parts of our code refer to the recent work [CMT](https://github.com/junjie18/CMT).
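The config change referenced in the test-set note above amounts to the following fragment. This is a sketch only: the `data_root` value and the nesting assume the common CBGSDataset-wrapped mmdet3d layout and are not copied verbatim from the shipped configs.

```python
# Sketch of the ann_file change for test-set training; the nesting below is an assumption
# (typical CBGS-wrapped dataset config), not an excerpt from the shipped configs.
data_root = 'data/nuscenes'  # assumed dataset path

data = dict(
    train=dict(
        dataset=dict(
            ann_file=[
                data_root + '/nuscenes_infos_train.pkl',
                data_root + '/nuscenes_infos_val.pkl',
            ],
        ),
    ),
)
```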
74 | 
--------------------------------------------------------------------------------
/projects/mmdet3d_plugin/datasets/custom_nuscenes_dataset.py:
--------------------------------------------------------------------------------
 1 | # ------------------------------------------------------------------------
 2 | # Copyright (c) 2023 megvii-model. All Rights Reserved.
 3 | # ------------------------------------------------------------------------
 4 | # Modified from DETR3D (https://github.com/WangYueFt/detr3d)
 5 | # Copyright (c) 2021 Wang, Yue
 6 | # ------------------------------------------------------------------------
 7 | # Modified from mmdetection3d (https://github.com/open-mmlab/mmdetection3d)
 8 | # Copyright (c) OpenMMLab. All rights reserved.
 9 | # ------------------------------------------------------------------------
10 | 
11 | import numpy as np
12 | from mmdet.datasets import DATASETS
13 | from mmdet3d.datasets import NuScenesDataset
14 | 
15 | 
16 | @DATASETS.register_module()
17 | class CustomNuScenesDataset(NuScenesDataset):
18 |     r"""NuScenes Dataset.
19 | 
20 |     This dataset only adds camera intrinsics and extrinsics to the results.
21 |     """
22 | 
23 |     def __init__(self, *args, return_gt_info=False, **kwargs):
24 |         super(CustomNuScenesDataset, self).__init__(*args, **kwargs)
25 |         self.return_gt_info = return_gt_info
26 | 
27 |     def get_data_info(self, index):
28 |         """Get data info according to the given index.
29 | 
30 |         Args:
31 |             index (int): Index of the sample data to get.
32 | 
33 |         Returns:
34 |             dict: Data information that will be passed to the data \
35 |                 preprocessing pipelines. It includes the following keys:
36 | 
37 |                 - sample_idx (str): Sample index.
38 |                 - pts_filename (str): Filename of point clouds.
39 |                 - sweeps (list[dict]): Infos of sweeps.
40 |                 - timestamp (float): Sample timestamp.
41 |                 - img_filename (str, optional): Image filename.
42 |                 - lidar2img (list[np.ndarray], optional): Transformations \
43 |                     from lidar to different cameras.
44 |                 - ann_info (dict): Annotation info.
45 | """ 46 | info = self.data_infos[index] 47 | # standard protocal modified from SECOND.Pytorch 48 | input_dict = dict( 49 | sample_idx=info['token'], 50 | pts_filename=info['lidar_path'], 51 | sweeps=info['sweeps'], 52 | timestamp=info['timestamp'] / 1e6, 53 | img_sweeps=None if 'img_sweeps' not in info else info['img_sweeps'], 54 | radar_info=None if 'radars' not in info else info['radars'] 55 | ) 56 | 57 | if self.return_gt_info: 58 | input_dict['info'] = info 59 | 60 | if self.modality['use_camera']: 61 | image_paths = [] 62 | lidar2img_rts = [] 63 | lidar2cam_rts = [] 64 | cam_intrinsics = [] 65 | img_timestamp = [] 66 | for cam_type, cam_info in info['cams'].items(): 67 | img_timestamp.append(cam_info['timestamp'] / 1e6) 68 | image_paths.append(cam_info['data_path']) 69 | # obtain lidar to image transformation matrix 70 | lidar2cam_r = np.linalg.inv(cam_info['sensor2lidar_rotation']) 71 | lidar2cam_t = cam_info[ 72 | 'sensor2lidar_translation'] @ lidar2cam_r.T 73 | lidar2cam_rt = np.eye(4) 74 | lidar2cam_rt[:3, :3] = lidar2cam_r.T 75 | lidar2cam_rt[3, :3] = -lidar2cam_t 76 | intrinsic = cam_info['cam_intrinsic'] 77 | viewpad = np.eye(4) 78 | viewpad[:intrinsic.shape[0], :intrinsic.shape[1]] = intrinsic 79 | lidar2img_rt = (viewpad @ lidar2cam_rt.T) 80 | lidar2img_rts.append(lidar2img_rt) 81 | 82 | cam_intrinsics.append(viewpad) 83 | lidar2cam_rts.append(lidar2cam_rt.T) 84 | 85 | input_dict.update( 86 | dict( 87 | img_timestamp=img_timestamp, 88 | img_filename=image_paths, 89 | lidar2img=lidar2img_rts, 90 | cam_intrinsic=cam_intrinsics, 91 | lidar2cam=lidar2cam_rts, 92 | )) 93 | 94 | if not self.test_mode: 95 | annos = self.get_ann_info(index) 96 | input_dict['ann_info'] = annos 97 | 98 | return input_dict 99 | -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/core/bbox/coders/multi_task_bbox_coder.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Copyright (c) 2023 megvii-model. All Rights Reserved. 3 | # ------------------------------------------------------------------------ 4 | # Modified from mmdetection (https://github.com/open-mmlab/mmdetection) 5 | # Copyright (c) OpenMMLab. All rights reserved. 6 | # ------------------------------------------------------------------------ 7 | 8 | import torch 9 | 10 | from mmdet.core.bbox import BaseBBoxCoder 11 | from mmdet.core.bbox.builder import BBOX_CODERS 12 | from projects.mmdet3d_plugin.core.bbox.util import denormalize_bbox 13 | 14 | 15 | @BBOX_CODERS.register_module() 16 | class MultiTaskBBoxCoder(BaseBBoxCoder): 17 | """Bbox coder for NMS-free detector. 18 | Args: 19 | pc_range (list[float]): Range of point cloud. 20 | post_center_range (list[float]): Limit of the center. 21 | Default: None. 22 | max_num (int): Max number to be kept. Default: 100. 23 | score_threshold (float): Threshold to filter boxes based on score. 24 | Default: None. 25 | code_size (int): Code size of bboxes. 
Default: 9 26 | """ 27 | 28 | def __init__(self, 29 | pc_range, 30 | voxel_size=None, 31 | post_center_range=None, 32 | max_num=100, 33 | score_threshold=None, 34 | num_classes=10): 35 | 36 | self.pc_range = pc_range 37 | self.voxel_size = voxel_size 38 | self.post_center_range = post_center_range 39 | self.max_num = max_num 40 | self.score_threshold = score_threshold 41 | self.num_classes = num_classes 42 | 43 | def encode(self): 44 | pass 45 | 46 | def decode_single(self, cls_scores, bbox_preds, task_ids): 47 | """Decode bboxes. 48 | Args: 49 | cls_scores (Tensor): Outputs from the classification head, \ 50 | shape [num_query, cls_out_channels]. Note \ 51 | cls_out_channels should includes background. 52 | bbox_preds (Tensor): Outputs from the regression \ 53 | head with normalized coordinate format (cx, cy, w, l, cz, h, rot_sine, rot_cosine, vx, vy). \ 54 | Shape [num_query, 9]. 55 | Returns: 56 | list[dict]: Decoded boxes. 57 | """ 58 | max_num = self.max_num 59 | num_query = cls_scores.shape[0] 60 | 61 | cls_scores = cls_scores.sigmoid() 62 | scores, indexs = cls_scores.view(-1).topk(max_num) 63 | labels = indexs % self.num_classes 64 | bbox_index = indexs // self.num_classes 65 | task_index = torch.gather(task_ids, 1, labels.unsqueeze(1)).squeeze() 66 | 67 | bbox_preds = bbox_preds[task_index * num_query + bbox_index] 68 | 69 | final_box_preds = denormalize_bbox(bbox_preds, self.pc_range) 70 | final_scores = scores 71 | final_preds = labels 72 | 73 | # use score threshold 74 | if self.score_threshold is not None: 75 | thresh_mask = final_scores > self.score_threshold 76 | if self.post_center_range is not None: 77 | self.post_center_range = torch.tensor( 78 | self.post_center_range, device=scores.device) 79 | mask = (final_box_preds[..., :3] >= 80 | self.post_center_range[:3]).all(1) 81 | mask &= (final_box_preds[..., :3] <= 82 | self.post_center_range[3:]).all(1) 83 | 84 | if self.score_threshold: 85 | mask &= thresh_mask 86 | 87 | boxes3d = final_box_preds[mask] 88 | scores = final_scores[mask] 89 | labels = final_preds[mask] 90 | predictions_dict = { 91 | 'bboxes': boxes3d, 92 | 'scores': scores, 93 | 'labels': labels 94 | } 95 | 96 | else: 97 | raise NotImplementedError( 98 | 'Need to reorganize output as a batch, only ' 99 | 'support post_center_range is not None for now!') 100 | return predictions_dict 101 | 102 | def decode(self, preds_dicts): 103 | """Decode bboxes. 104 | Args: 105 | all_cls_scores (Tensor): Outputs from the classification head, \ 106 | shape [nb_dec, bs, num_query, cls_out_channels]. Note \ 107 | cls_out_channels should includes background. 108 | all_bbox_preds (Tensor): Sigmoid outputs from the regression \ 109 | head with normalized coordinate format (cx, cy, w, l, cz, h, rot_sine, rot_cosine, vx, vy). \ 110 | Shape [nb_dec, bs, num_query, 9]. 111 | Returns: 112 | list[dict]: Decoded boxes. 
113 | """ 114 | task_num = len(preds_dicts) 115 | 116 | pred_bbox_list, pred_logits_list, task_ids_list = [], [], [] 117 | for task_id in range(task_num): 118 | task_pred_dict = preds_dicts[task_id][0] 119 | task_pred_bbox = torch.cat( 120 | (task_pred_dict['center'][-1], task_pred_dict['height'][-1], 121 | task_pred_dict['dim'][-1], task_pred_dict['rot'][-1], 122 | task_pred_dict['vel'][-1]), 123 | dim=-1 124 | ) 125 | task_pred_logits = task_pred_dict['cls_logits'][-1] 126 | pred_bbox_list.append(task_pred_bbox) 127 | pred_logits_list.append(task_pred_logits) 128 | 129 | task_ids = task_pred_logits.new_ones(task_pred_logits.shape).int() * task_id 130 | task_ids_list.append(task_ids) 131 | 132 | 133 | all_pred_logits = torch.cat(pred_logits_list, dim=-1) # bs * nq * 10 134 | all_pred_bbox = torch.cat(pred_bbox_list, dim=1) # bs * (task nq) * 10 135 | all_task_ids = torch.cat(task_ids_list, dim=-1) # bs * nq * 10 136 | 137 | batch_size = all_pred_logits.shape[0] 138 | predictions_list = [] 139 | for i in range(batch_size): 140 | predictions_list.append( 141 | self.decode_single(all_pred_logits[i], all_pred_bbox[i], all_task_ids[i])) 142 | return predictions_list -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/models/utils/attention.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 megvii-model. All Rights Reserved. 2 | 3 | import math 4 | import torch 5 | import torch.nn as nn 6 | from torch.nn.init import ( 7 | xavier_uniform_, 8 | constant_, 9 | xavier_normal_ 10 | ) 11 | from torch.nn.functional import linear 12 | 13 | from einops import rearrange 14 | from mmcv.runner import auto_fp16 15 | from mmcv.runner.base_module import BaseModule 16 | 17 | from flash_attn.flash_attn_interface import flash_attn_unpadded_kvpacked_func 18 | from flash_attn.bert_padding import unpad_input, pad_input, index_first_axis 19 | 20 | 21 | def _in_projection_packed(q, k, v, w, b = None): 22 | w_q, w_k, w_v = w.chunk(3) 23 | if b is None: 24 | b_q = b_k = b_v = None 25 | else: 26 | b_q, b_k, b_v = b.chunk(3) 27 | return linear(q, w_q, b_q), linear(k, w_k, b_k), linear(v, w_v, b_v) 28 | 29 | 30 | class FlashAttention(nn.Module): 31 | """Implement the scaled dot product attention with softmax. 32 | Arguments 33 | --------- 34 | softmax_scale: The temperature to use for the softmax attention. 35 | (default: 1/sqrt(d_keys) where d_keys is computed at 36 | runtime) 37 | attention_dropout: The dropout rate to apply to the attention 38 | (default: 0.1) 39 | """ 40 | def __init__(self, softmax_scale=None, attention_dropout=0.0, device=None, dtype=None): 41 | super().__init__() 42 | self.softmax_scale = softmax_scale 43 | self.dropout_p = attention_dropout 44 | self.fp16_enabled = True 45 | 46 | @auto_fp16(apply_to=('q', 'kv'), out_fp32=True) 47 | def forward(self, q, kv, 48 | causal=False, 49 | key_padding_mask=None): 50 | """Implements the multihead softmax attention. 51 | Arguments 52 | --------- 53 | q: The tensor containing the query. (B, T, H, D) 54 | kv: The tensor containing the key, and value. 
(B, S, 2, H, D) 55 | key_padding_mask: a bool tensor of shape (B, S) 56 | """ 57 | assert q.dtype in [torch.float16, torch.bfloat16] and kv.dtype in [torch.float16, torch.bfloat16] 58 | assert q.is_cuda and kv.is_cuda 59 | assert q.shape[0] == kv.shape[0] and q.shape[-2] == kv.shape[-2] and q.shape[-1] == kv.shape[-1] 60 | 61 | batch_size = q.shape[0] 62 | seqlen_q, seqlen_k = q.shape[1], kv.shape[1] 63 | if key_padding_mask is None: 64 | q, kv = rearrange(q, 'b s ... -> (b s) ...'), rearrange(kv, 'b s ... -> (b s) ...') 65 | max_sq, max_sk = seqlen_q, seqlen_k 66 | cu_seqlens_q = torch.arange(0, (batch_size + 1) * seqlen_q, step=seqlen_q, dtype=torch.int32, 67 | device=q.device) 68 | cu_seqlens_k = torch.arange(0, (batch_size + 1) * seqlen_k, step=seqlen_k, dtype=torch.int32, 69 | device=kv.device) 70 | output = flash_attn_unpadded_kvpacked_func( 71 | q, kv, cu_seqlens_q, cu_seqlens_k, max_sq, max_sk, 72 | self.dropout_p if self.training else 0.0, 73 | softmax_scale=self.softmax_scale, causal=causal 74 | ) 75 | output = rearrange(output, '(b s) ... -> b s ...', b=batch_size) 76 | else: 77 | nheads = kv.shape[-2] 78 | q = rearrange(q, 'b s ... -> (b s) ...') 79 | max_sq = seqlen_q 80 | cu_seqlens_q = torch.arange(0, (batch_size + 1) * seqlen_q, step=seqlen_q, dtype=torch.int32, 81 | device=q.device) 82 | x = rearrange(kv, 'b s two h d -> b s (two h d)') 83 | x_unpad, indices, cu_seqlens_k, max_sk = unpad_input(x, key_padding_mask) 84 | x_unpad = rearrange(x_unpad, 'nnz (two h d) -> nnz two h d', two=2, h=nheads) 85 | output_unpad = flash_attn_unpadded_kvpacked_func( 86 | q, x_unpad, cu_seqlens_q, cu_seqlens_k, max_sq, max_sk, 87 | self.dropout_p if self.training else 0.0, 88 | softmax_scale=self.softmax_scale, causal=causal 89 | ) 90 | output = rearrange(output_unpad, '(b s) ... -> b s ...', b=batch_size) 91 | 92 | return output, None 93 | 94 | 95 | class FlashMHA(nn.Module): 96 | 97 | def __init__(self, embed_dim, num_heads, bias=True, batch_first=True, attention_dropout=0.0, 98 | causal=False, device=None, dtype=None, **kwargs) -> None: 99 | assert batch_first 100 | factory_kwargs = {'device': device, 'dtype': dtype} 101 | super().__init__() 102 | self.embed_dim = embed_dim 103 | self.causal = causal 104 | self.bias = bias 105 | 106 | self.num_heads = num_heads 107 | assert self.embed_dim % num_heads == 0, "self.kdim must be divisible by num_heads" 108 | self.head_dim = self.embed_dim // num_heads 109 | assert self.head_dim % 8 == 0 and self.head_dim <= 128, "Only support head_dim <= 128 and divisible by 8" 110 | 111 | self.in_proj_weight = nn.Parameter(torch.empty((3 * embed_dim, embed_dim))) 112 | if bias: 113 | self.in_proj_bias = nn.Parameter(torch.empty(3 * embed_dim)) 114 | else: 115 | self.register_parameter('in_proj_bias', None) 116 | self.inner_attn = FlashAttention(attention_dropout=attention_dropout, **factory_kwargs) 117 | self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias) 118 | self._reset_parameters() 119 | 120 | def _reset_parameters(self) -> None: 121 | xavier_uniform_(self.in_proj_weight) 122 | if self.in_proj_bias is not None: 123 | constant_(self.in_proj_bias, 0.) 124 | constant_(self.out_proj.bias, 0.) 
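    # Usage sketch (illustrative assumption, not taken from the original source):
    # FlashMHA consumes (batch, seqlen, embed_dim) CUDA tensors; the inner FlashAttention
    # casts q/kv to fp16 via auto_fp16 and returns fp32, so the surrounding module can
    # stay in fp32.
    #     mha = FlashMHA(embed_dim=256, num_heads=8).cuda()
    #     out, _ = mha(q, k, v)   # q: (B, Tq, 256); k, v: (B, Tk, 256)
    # Constraint from __init__: head_dim = embed_dim // num_heads must be a multiple
    # of 8 and at most 128.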
125 | 126 | def forward(self, q, k, v, key_padding_mask=None): 127 | """x: (batch, seqlen, hidden_dim) (where hidden_dim = num heads * head dim) 128 | key_padding_mask: bool tensor of shape (batch, seqlen) 129 | """ 130 | # q, k, v = self.Wq(q), self.Wk(k), self.Wv(v) 131 | q, k, v = _in_projection_packed(q, k, v, self.in_proj_weight, self.in_proj_bias) 132 | q = rearrange(q, 'b s (h d) -> b s h d', h=self.num_heads) 133 | k = rearrange(k, 'b s (h d) -> b s h d', h=self.num_heads) 134 | v = rearrange(v, 'b s (h d) -> b s h d', h=self.num_heads) 135 | kv = torch.stack([k, v], dim=2) 136 | 137 | context, attn_weights = self.inner_attn(q, kv, key_padding_mask=key_padding_mask, causal=self.causal) 138 | return self.out_proj(rearrange(context, 'b s h d -> b s (h d)')), attn_weights 139 | -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/core/bbox/assigners/hungarian_assigner_3d.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Copyright (c) 2023 megvii-model. All Rights Reserved. 3 | # ------------------------------------------------------------------------ 4 | # Modified from DETR3D (https://github.com/WangYueFt/detr3d) 5 | # Copyright (c) 2021 Wang, Yue 6 | # ------------------------------------------------------------------------ 7 | # Modified from mmdetection (https://github.com/open-mmlab/mmdetection) 8 | # Copyright (c) OpenMMLab. All rights reserved. 9 | # ------------------------------------------------------------------------ 10 | 11 | import torch 12 | 13 | from mmdet.core.bbox.builder import BBOX_ASSIGNERS 14 | from mmdet.core.bbox.assigners import AssignResult 15 | from mmdet.core.bbox.assigners import BaseAssigner 16 | from mmdet.core.bbox.match_costs import build_match_cost 17 | from mmdet.core.bbox.match_costs.builder import MATCH_COST 18 | from mmdet.core.bbox.iou_calculators import build_iou_calculator 19 | from mmdet.models.utils.transformer import inverse_sigmoid 20 | from scipy.optimize import linear_sum_assignment 21 | 22 | from projects.mmdet3d_plugin.core.bbox.util import ( 23 | normalize_bbox, 24 | denormalize_bbox 25 | ) 26 | 27 | 28 | @BBOX_ASSIGNERS.register_module() 29 | class HungarianAssigner3D(BaseAssigner): 30 | """Computes one-to-one matching between predictions and ground truth. 31 | This class computes an assignment between the targets and the predictions 32 | based on the costs. The costs are weighted sum of three components: 33 | classification cost, regression L1 cost and regression iou cost. The 34 | targets don't include the no_object, so generally there are more 35 | predictions than targets. After the one-to-one matching, the un-matched 36 | are treated as backgrounds. Thus each query prediction will be assigned 37 | with `0` or a positive integer indicating the ground truth index: 38 | - 0: negative sample, no assigned gt 39 | - positive integer: positive sample, index (1-based) of assigned gt 40 | Args: 41 | cls_weight (int | float, optional): The scale factor for classification 42 | cost. Default 1.0. 43 | bbox_weight (int | float, optional): The scale factor for regression 44 | L1 cost. Default 1.0. 45 | iou_weight (int | float, optional): The scale factor for regression 46 | iou cost. Default 1.0. 47 | iou_calculator (dict | optional): The config for the iou calculation. 48 | Default type `BboxOverlaps2D`. 
49 | iou_mode (str | optional): "iou" (intersection over union), "iof" 50 | (intersection over foreground), or "giou" (generalized 51 | intersection over union). Default "giou". 52 | """ 53 | 54 | def __init__(self, 55 | cls_cost=dict(type='ClassificationCost', weight=1.), 56 | reg_cost=dict(type='BBoxL1Cost', weight=1.0), 57 | iou_cost=dict(type='IoUCost', weight=0.0), 58 | pc_range=None, 59 | code_weights=None): 60 | self.cls_cost = build_match_cost(cls_cost) 61 | self.reg_cost = build_match_cost(reg_cost) 62 | self.iou_cost = build_match_cost(iou_cost) 63 | self.pc_range = pc_range 64 | self.code_weights = code_weights 65 | if self.code_weights: 66 | self.code_weights = torch.tensor(self.code_weights)[None, :].cuda() 67 | 68 | def assign(self, 69 | bbox_pred, 70 | cls_pred, 71 | gt_bboxes, 72 | gt_labels, 73 | gt_bboxes_ignore=None, 74 | eps=1e-7, 75 | code_weights=None): 76 | """Computes one-to-one matching based on the weighted costs. 77 | This method assign each query prediction to a ground truth or 78 | background. The `assigned_gt_inds` with -1 means don't care, 79 | 0 means negative sample, and positive number is the index (1-based) 80 | of assigned gt. 81 | The assignment is done in the following steps, the order matters. 82 | 1. assign every prediction to -1 83 | 2. compute the weighted costs 84 | 3. do Hungarian matching on CPU based on the costs 85 | 4. assign all to 0 (background) first, then for each matched pair 86 | between predictions and gts, treat this prediction as foreground 87 | and assign the corresponding gt index (plus 1) to it. 88 | Args: 89 | bbox_pred (Tensor): Predicted boxes with normalized coordinates 90 | (cx, cy, w, h), which are all in range [0, 1]. Shape 91 | [num_query, 4]. 92 | cls_pred (Tensor): Predicted classification logits, shape 93 | [num_query, num_class]. 94 | gt_bboxes (Tensor): Ground truth boxes with unnormalized 95 | coordinates (x1, y1, x2, y2). Shape [num_gt, 4]. 96 | gt_labels (Tensor): Label of `gt_bboxes`, shape (num_gt,). 97 | gt_bboxes_ignore (Tensor, optional): Ground truth bboxes that are 98 | labelled as `ignored`. Default None. 99 | eps (int | float, optional): A value added to the denominator for 100 | numerical stability. Default 1e-7. 101 | Returns: 102 | :obj:`AssignResult`: The assigned result. 103 | """ 104 | assert gt_bboxes_ignore is None, \ 105 | 'Only case when gt_bboxes_ignore is None is supported.' 106 | num_gts, num_bboxes = gt_bboxes.size(0), bbox_pred.size(0) 107 | 108 | # 1. assign -1 by default 109 | assigned_gt_inds = bbox_pred.new_full((num_bboxes, ), 110 | -1, 111 | dtype=torch.long) 112 | assigned_labels = bbox_pred.new_full((num_bboxes, ), 113 | -1, 114 | dtype=torch.long) 115 | if num_gts == 0 or num_bboxes == 0: 116 | # No ground truth or boxes, return empty assignment 117 | if num_gts == 0: 118 | # No ground truth, assign all to background 119 | assigned_gt_inds[:] = 0 120 | return AssignResult( 121 | num_gts, assigned_gt_inds, None, labels=assigned_labels) 122 | 123 | # 2. compute the weighted costs 124 | # classification and bboxcost. 
125 | cls_cost = self.cls_cost(cls_pred, gt_labels) 126 | # regression L1 cost 127 | normalized_gt_bboxes = normalize_bbox(gt_bboxes, self.pc_range) 128 | 129 | if self.code_weights is not None: 130 | bbox_pred = bbox_pred * self.code_weights 131 | normalized_gt_bboxes = normalized_gt_bboxes * self.code_weights 132 | 133 | reg_cost = self.reg_cost(bbox_pred[:, :8], normalized_gt_bboxes[:, :8]) 134 | 135 | # weighted sum of above two costs 136 | cost = cls_cost + reg_cost 137 | 138 | # 3. do Hungarian matching on CPU using linear_sum_assignment 139 | cost = cost.detach().cpu() 140 | if linear_sum_assignment is None: 141 | raise ImportError('Please run "pip install scipy" ' 142 | 'to install scipy first.') 143 | matched_row_inds, matched_col_inds = linear_sum_assignment(cost) 144 | matched_row_inds = torch.from_numpy(matched_row_inds).to( 145 | bbox_pred.device) 146 | matched_col_inds = torch.from_numpy(matched_col_inds).to( 147 | bbox_pred.device) 148 | 149 | # 4. assign backgrounds and foregrounds 150 | # assign all indices to backgrounds first 151 | assigned_gt_inds[:] = 0 152 | # assign foregrounds based on matching results 153 | assigned_gt_inds[matched_row_inds] = matched_col_inds + 1 154 | assigned_labels[matched_row_inds] = gt_labels[matched_col_inds] 155 | return AssignResult( 156 | num_gts, assigned_gt_inds, None, labels=assigned_labels) 157 | 158 | -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/models/detectors/fstr.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Copyright (c) 2022 megvii-model. All Rights Reserved. 3 | # ------------------------------------------------------------------------ 4 | # Modified from mmdetection3d (https://github.com/open-mmlab/mmdetection3d) 5 | # Copyright (c) OpenMMLab. All rights reserved. 
6 | # ------------------------------------------------------------------------ 7 | 8 | import torch 9 | import torch.nn.functional as F 10 | 11 | from mmcv.runner import force_fp32 12 | from mmdet.models import DETECTORS 13 | from mmdet3d.core import bbox3d2result 14 | from mmdet3d.models.detectors.mvx_two_stage import MVXTwoStageDetector 15 | 16 | from projects.mmdet3d_plugin import SPConvVoxelization 17 | 18 | @DETECTORS.register_module() 19 | class FSTRDetector(MVXTwoStageDetector): 20 | 21 | def __init__(self, 22 | **kwargs): 23 | pts_voxel_cfg = kwargs.get('pts_voxel_layer', None) 24 | kwargs['pts_voxel_layer'] = None 25 | super(FSTRDetector, self).__init__(**kwargs) 26 | if pts_voxel_cfg: 27 | self.pts_voxel_layer = SPConvVoxelization(**pts_voxel_cfg) 28 | 29 | def init_weights(self): 30 | """Initialize model weights.""" 31 | super(FSTRDetector, self).init_weights() 32 | 33 | def extract_feat(self, points, img_metas): 34 | """Extract features from images and points.""" 35 | pts_feats = self.extract_pts_feat(points, img_metas) 36 | return pts_feats 37 | 38 | @force_fp32(apply_to=('pts')) 39 | def extract_pts_feat(self, pts, img_metas): 40 | """Extract features of points.""" 41 | if not self.with_pts_bbox: 42 | return None 43 | if pts is None: 44 | return None 45 | voxels, num_points, coors = self.voxelize(pts) 46 | voxel_features = self.pts_voxel_encoder(voxels, num_points, coors, 47 | ) 48 | batch_size = coors[-1, 0] + 1 49 | x = self.pts_middle_encoder(voxel_features, coors, batch_size) 50 | return x 51 | 52 | @torch.no_grad() 53 | @force_fp32() 54 | def voxelize(self, points): 55 | """Apply dynamic voxelization to points. 56 | 57 | Args: 58 | points (list[torch.Tensor]): Points of each sample. 59 | 60 | Returns: 61 | tuple[torch.Tensor]: Concatenated points, number of points 62 | per voxel, and coordinates. 63 | """ 64 | voxels, coors, num_points = [], [], [] 65 | for res in points: 66 | res_voxels, res_coors, res_num_points = self.pts_voxel_layer(res) 67 | voxels.append(res_voxels) 68 | coors.append(res_coors) 69 | num_points.append(res_num_points) 70 | voxels = torch.cat(voxels, dim=0) 71 | num_points = torch.cat(num_points, dim=0) 72 | coors_batch = [] 73 | for i, coor in enumerate(coors): 74 | coor_pad = F.pad(coor, (1, 0), mode='constant', value=i) 75 | coors_batch.append(coor_pad) 76 | coors_batch = torch.cat(coors_batch, dim=0) 77 | return voxels, num_points, coors_batch 78 | 79 | def forward_train(self, 80 | points=None, 81 | img_metas=None, 82 | gt_bboxes_3d=None, 83 | gt_labels_3d=None, 84 | gt_labels=None, 85 | gt_bboxes=None, 86 | proposals=None, 87 | gt_bboxes_ignore=None, 88 | **kwargs): 89 | """Forward training function. 90 | 91 | Args: 92 | points (list[torch.Tensor], optional): Points of each sample. 93 | Defaults to None. 94 | img_metas (list[dict], optional): Meta information of each sample. 95 | Defaults to None. 96 | gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`], optional): 97 | Ground truth 3D boxes. Defaults to None. 98 | gt_labels_3d (list[torch.Tensor], optional): Ground truth labels 99 | of 3D boxes. Defaults to None. 100 | gt_labels (list[torch.Tensor], optional): Ground truth labels 101 | of 2D boxes in images. Defaults to None. 102 | gt_bboxes (list[torch.Tensor], optional): Ground truth 2D boxes in 103 | images. Defaults to None. 104 | img (torch.Tensor optional): Images of each sample with shape 105 | (N, C, H, W). Defaults to None. 106 | proposals ([list[torch.Tensor], optional): Predicted proposals 107 | used for training Fast RCNN. 
Defaults to None. 108 | gt_bboxes_ignore (list[torch.Tensor], optional): Ground truth 109 | 2D boxes in images to be ignored. Defaults to None. 110 | 111 | Returns: 112 | dict: Losses of different branches. 113 | """ 114 | # nvtx.range_push('forward') 115 | # nvtx.range_push('voxel_backbone') 116 | pts_feats = self.extract_feat( 117 | points=points, img_metas=img_metas) 118 | # nvtx.range_pop() 119 | # nvtx.range_push('fstr_head') 120 | losses = dict() 121 | if pts_feats : 122 | losses_pts = self.forward_pts_train(pts_feats, gt_bboxes_3d, 123 | gt_labels_3d, img_metas, 124 | gt_bboxes_ignore) 125 | losses.update(losses_pts) 126 | # nvtx.range_pop() 127 | # nvtx.range_pop() 128 | return losses 129 | 130 | @force_fp32(apply_to=('pts_feats')) 131 | def forward_pts_train(self, 132 | pts_feats, 133 | gt_bboxes_3d, 134 | gt_labels_3d, 135 | img_metas, 136 | gt_bboxes_ignore=None, 137 | ): 138 | """Forward function for point cloud branch. 139 | 140 | Args: 141 | pts_feats (list[torch.Tensor]): Features of point cloud branch 142 | gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth 143 | boxes for each sample. 144 | gt_labels_3d (list[torch.Tensor]): Ground truth labels for 145 | boxes of each sampole 146 | img_metas (list[dict]): Meta information of samples. 147 | gt_bboxes_ignore (list[torch.Tensor], optional): Ground truth 148 | boxes to be ignored. Defaults to None. 149 | 150 | Returns: 151 | dict: Losses of each branch. 152 | """ 153 | outs = self.pts_bbox_head(pts_feats, img_metas) 154 | loss_inputs = [gt_bboxes_3d, gt_labels_3d, outs] 155 | losses = self.pts_bbox_head.loss(*loss_inputs) 156 | return losses 157 | 158 | def forward_test(self, 159 | points=None, 160 | img_metas=None, 161 | **kwargs): 162 | """ 163 | Args: 164 | points (list[torch.Tensor]): the outer list indicates test-time 165 | augmentations and inner torch.Tensor should have a shape NxC, 166 | which contains all points in the batch. 167 | img_metas (list[list[dict]]): the outer list indicates test-time 168 | augs (multiscale, flip, etc.) and the inner list indicates 169 | images in a batch 170 | img (list[torch.Tensor], optional): the outer 171 | list indicates test-time augmentations and inner 172 | torch.Tensor should have a shape NxCxHxW, which contains 173 | all images in the batch. Defaults to None. 
174 | """ 175 | if points is None: 176 | points = [None] 177 | for var, name in [(points, 'points'), (img_metas, 'img_metas')]: 178 | if not isinstance(var, list): 179 | raise TypeError('{} must be a list, but got {}'.format( 180 | name, type(var))) 181 | 182 | num_augs = len(points) 183 | if num_augs != len(img_metas): 184 | raise ValueError( 185 | 'num of augmentations ({}) != num of image meta ({})'.format( 186 | len(points), len(img_metas))) 187 | 188 | if num_augs == 1: 189 | return self.simple_test(points[0], img_metas[0],**kwargs) 190 | else: 191 | return self.aug_test(points, img_metas, **kwargs) 192 | 193 | @force_fp32(apply_to=('x')) 194 | def simple_test_pts(self, x, img_metas, rescale=False): 195 | """Test function of point cloud branch.""" 196 | outs = self.pts_bbox_head(x, img_metas) 197 | bbox_list = self.pts_bbox_head.get_bboxes( 198 | outs, img_metas, rescale=rescale) 199 | bbox_results = [ 200 | bbox3d2result(bboxes, scores, labels) 201 | for bboxes, scores, labels in bbox_list 202 | ] 203 | return bbox_results 204 | 205 | def simple_test(self, points, img_metas, rescale=False): 206 | """Test function without augmentaiton.""" 207 | 208 | pts_feats = self.extract_feat( 209 | points, img_metas=img_metas) 210 | bbox_list = [dict() for i in range(len(img_metas))] 211 | if self.with_pts_bbox: 212 | bbox_pts = self.simple_test_pts( 213 | pts_feats, img_metas, rescale=rescale) 214 | for result_dict, pts_bbox in zip(bbox_list, bbox_pts): 215 | result_dict['pts_bbox'] = pts_bbox 216 | return bbox_list 217 | -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/models/backbones/voxelnext.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from mmcv.runner import auto_fp16 3 | from torch import nn as nn 4 | 5 | from mmdet3d.ops import SparseBasicBlock, make_sparse_convmodule 6 | # from mmdet3d.ops import spconv as spconv 7 | from mmdet3d.models.builder import MIDDLE_ENCODERS 8 | import torch 9 | from mmcv.cnn import build_conv_layer, build_norm_layer 10 | from torch import nn 11 | 12 | # from mmdet3d.ops import spconv 13 | import spconv.pytorch as spconv 14 | from mmdet.models.backbones.resnet import BasicBlock, Bottleneck 15 | 16 | 17 | @MIDDLE_ENCODERS.register_module() 18 | class VoxelNextEncoder(nn.Module): 19 | r"""Sparse encoder for SECOND and Part-A2. 20 | 21 | Args: 22 | in_channels (int): The number of input channels. 23 | sparse_shape (list[int]): The sparse shape of input tensor. 24 | order (list[str]): Order of conv module. Defaults to ('conv', 25 | 'norm', 'act'). 26 | norm_cfg (dict): Config of normalization layer. Defaults to 27 | dict(type='BN1d', eps=1e-3, momentum=0.01). 28 | base_channels (int): Out channels for conv_input layer. 29 | Defaults to 16. 30 | output_channels (int): Out channels for conv_out layer. 31 | Defaults to 128. 32 | encoder_channels (tuple[tuple[int]]): 33 | Convolutional channels of each encode block. 34 | encoder_paddings (tuple[tuple[int]]): Paddings of each encode block. 35 | Defaults to ((16, ), (32, 32, 32), (64, 64, 64), (64, 64, 64)). 36 | block_type (str): Type of the block to use. Defaults to 'conv_module'. 
37 | """ 38 | 39 | def __init__(self, 40 | in_channels, 41 | sparse_shape, 42 | order=('conv', 'norm', 'act'), 43 | norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.01), 44 | base_channels=16, 45 | output_channels=128, 46 | encoder_channels=((16, 16), (32, 32, 32), (64, 64, 64), (128, 128, 128),(128, 128, 128),(128, 128, 128)), 47 | encoder_paddings=((1, 1 ), (1, 1, 1), (1, 1, 1), (1, 1, 1),(1, 1, 1),(1, 1, 1)), 48 | sparse_conv_kernel = (3, 3, 3, 3, 3), 49 | block_type='basicblock'): 50 | super().__init__() 51 | assert block_type in ['conv_module', 'basicblock'] 52 | self.sparse_shape = sparse_shape 53 | self.in_channels = in_channels 54 | self.order = order 55 | self.base_channels = base_channels 56 | self.output_channels = output_channels 57 | self.encoder_channels = encoder_channels 58 | self.encoder_paddings = encoder_paddings 59 | self.stage_num = len(self.encoder_channels) 60 | self.sparse_conv_kernel = sparse_conv_kernel 61 | self.fp16_enabled = False 62 | # Spconv init all weight on its own 63 | 64 | assert isinstance(order, tuple) and len(order) == 3 65 | assert set(order) == {'conv', 'norm', 'act'} 66 | 67 | if self.order[0] != 'conv': # pre activate 68 | self.conv_input = make_sparse_convmodule( 69 | in_channels, 70 | self.base_channels, 71 | 3, 72 | norm_cfg=norm_cfg, 73 | padding=1, 74 | indice_key='subm1', 75 | conv_type='SubMConv3d', 76 | order=('conv', )) 77 | else: # post activate 78 | self.conv_input = make_sparse_convmodule( 79 | in_channels, 80 | self.base_channels, 81 | 3, 82 | norm_cfg=norm_cfg, 83 | padding=1, 84 | indice_key='subm1', 85 | conv_type='SubMConv3d') 86 | 87 | encoder_out_channels = self.make_encoder_layers( 88 | make_sparse_convmodule, 89 | norm_cfg, 90 | self.base_channels, 91 | block_type=block_type) 92 | 93 | self.conv_out = make_sparse_convmodule( 94 | encoder_out_channels, 95 | self.output_channels, 96 | kernel_size=3, 97 | stride=1, 98 | norm_cfg=norm_cfg, 99 | padding=1, 100 | indice_key='spconv_down2', 101 | conv_type='SparseConv2d') 102 | 103 | self.shared_out = make_sparse_convmodule( 104 | self.output_channels, 105 | self.output_channels, 106 | kernel_size=3, 107 | stride=1, 108 | norm_cfg=norm_cfg, 109 | padding=1, 110 | indice_key='spconv_out', 111 | conv_type='SubMConv2d') 112 | 113 | @auto_fp16(apply_to=('voxel_features', )) 114 | def forward(self, voxel_features, coors, batch_size): 115 | """Forward of SparseEncoder. 116 | 117 | Args: 118 | voxel_features (torch.float32): Voxel features in shape (N, C). 119 | coors (torch.int32): Coordinates in shape (N, 4), \ 120 | the columns in the order of (batch_idx, z_idx, y_idx, x_idx). 121 | batch_size (int): Batch size. 122 | 123 | Returns: 124 | dict: Backbone features. 
125 | """ 126 | coors = coors.int() 127 | input_sp_tensor = spconv.SparseConvTensor(voxel_features, coors, 128 | self.sparse_shape, 129 | batch_size) 130 | x = self.conv_input(input_sp_tensor) 131 | 132 | encode_features = [] 133 | for encoder_layer in self.encoder_layers: 134 | x = encoder_layer(x) 135 | encode_features.append(x) 136 | 137 | encode_features[4].indices[:, 1:] *= 2 138 | encode_features[5].indices[:, 1:] *= 4 139 | encode_features[3] = encode_features[3].replace_feature(torch.cat([encode_features[3].features, encode_features[4].features, encode_features[5].features])) 140 | encode_features[3].indices = torch.cat([ encode_features[3].indices, encode_features[4].indices, encode_features[5].indices]) 141 | 142 | out = self.bev_out(encode_features[3]) 143 | out = self.conv_out(out) 144 | out = self.shared_out(out) 145 | 146 | 147 | return [out] 148 | 149 | def bev_out(self, x_conv): 150 | features_cat = x_conv.features 151 | indices_cat = x_conv.indices[:, [0, 2, 3]] 152 | spatial_shape = x_conv.spatial_shape[1:] 153 | 154 | indices_unique, _inv = torch.unique(indices_cat, dim=0, return_inverse=True) 155 | features_unique = features_cat.new_zeros((indices_unique.shape[0], features_cat.shape[1])) 156 | features_unique.index_add_(0, _inv, features_cat) 157 | 158 | x_out = spconv.SparseConvTensor( 159 | features=features_unique, 160 | indices=indices_unique, 161 | spatial_shape=spatial_shape, 162 | batch_size=x_conv.batch_size 163 | ) 164 | return x_out 165 | def make_encoder_layers(self, 166 | make_block, 167 | norm_cfg, 168 | in_channels, 169 | block_type='conv_module', 170 | conv_cfg=dict(type='SubMConv3d')): 171 | """make encoder layers using sparse convs. 172 | 173 | Args: 174 | make_block (method): A bounded function to build blocks. 175 | norm_cfg (dict[str]): Config of normalization layer. 176 | in_channels (int): The number of encoder input channels. 177 | block_type (str): Type of the block to use. Defaults to 178 | 'conv_module'. 179 | conv_cfg (dict): Config of conv layer. Defaults to 180 | dict(type='SubMConv3d'). 181 | 182 | Returns: 183 | int: The number of encoder output channels. 
184 | """ 185 | assert block_type in ['conv_module', 'basicblock'] 186 | self.encoder_layers = spconv.SparseSequential() 187 | 188 | for i, blocks in enumerate(self.encoder_channels): 189 | blocks_list = [] 190 | for j, out_channels in enumerate(tuple(blocks)): 191 | padding = tuple(self.encoder_paddings[i])[j] 192 | # each stage started with a spconv layer 193 | # except the first stage 194 | if i != 0 and j == 0 and block_type == 'conv_module': 195 | blocks_list.append( 196 | make_block( 197 | in_channels, 198 | out_channels, 199 | 3, 200 | norm_cfg=norm_cfg, 201 | stride=2, 202 | padding=padding, 203 | indice_key=f'spconv{i + 1}', 204 | conv_type='SparseConv3d')) 205 | elif block_type == 'basicblock': 206 | if j == 0 and len(blocks) > 2: 207 | blocks_list.append( 208 | make_block( 209 | in_channels, 210 | out_channels, 211 | self.sparse_conv_kernel[i - 1], 212 | norm_cfg=norm_cfg, 213 | stride=2, 214 | padding=int(self.sparse_conv_kernel[i - 1]//2), 215 | indice_key=f'spconv{i + 1}', 216 | conv_type='SparseConv3d')) 217 | else: 218 | blocks_list.append( 219 | SparseBasicBlock( 220 | out_channels, 221 | out_channels, 222 | norm_cfg=norm_cfg, 223 | conv_cfg=conv_cfg)) 224 | else: 225 | blocks_list.append( 226 | make_block( 227 | in_channels, 228 | out_channels, 229 | 3, 230 | norm_cfg=norm_cfg, 231 | padding=padding, 232 | indice_key=f'subm{i + 1}', 233 | conv_type='SubMConv3d')) 234 | in_channels = out_channels 235 | stage_name = f'encoder_layer{i + 1}' 236 | stage_layers = spconv.SparseSequential(*blocks_list) 237 | self.encoder_layers.add_module(stage_name, stage_layers) 238 | return out_channels -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | CMT 2 | 3 | Apache License 4 | Version 2.0, January 2004 5 | http://www.apache.org/licenses/ 6 | 7 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 8 | 9 | 1. Definitions. 10 | 11 | "License" shall mean the terms and conditions for use, reproduction, 12 | and distribution as defined by Sections 1 through 9 of this document. 13 | 14 | "Licensor" shall mean the copyright owner or entity authorized by 15 | the copyright owner that is granting the License. 16 | 17 | "Legal Entity" shall mean the union of the acting entity and all 18 | other entities that control, are controlled by, or are under common 19 | control with that entity. For the purposes of this definition, 20 | "control" means (i) the power, direct or indirect, to cause the 21 | direction or management of such entity, whether by contract or 22 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 23 | outstanding shares, or (iii) beneficial ownership of such entity. 24 | 25 | "You" (or "Your") shall mean an individual or Legal Entity 26 | exercising permissions granted by this License. 27 | 28 | "Source" form shall mean the preferred form for making modifications, 29 | including but not limited to software source code, documentation 30 | source, and configuration files. 31 | 32 | "Object" form shall mean any form resulting from mechanical 33 | transformation or translation of a Source form, including but 34 | not limited to compiled object code, generated documentation, 35 | and conversions to other media types. 
36 | 37 | "Work" shall mean the work of authorship, whether in Source or 38 | Object form, made available under the License, as indicated by a 39 | copyright notice that is included in or attached to the work 40 | (an example is provided in the Appendix below). 41 | 42 | "Derivative Works" shall mean any work, whether in Source or Object 43 | form, that is based on (or derived from) the Work and for which the 44 | editorial revisions, annotations, elaborations, or other modifications 45 | represent, as a whole, an original work of authorship. For the purposes 46 | of this License, Derivative Works shall not include works that remain 47 | separable from, or merely link (or bind by name) to the interfaces of, 48 | the Work and Derivative Works thereof. 49 | 50 | "Contribution" shall mean any work of authorship, including 51 | the original version of the Work and any modifications or additions 52 | to that Work or Derivative Works thereof, that is intentionally 53 | submitted to Licensor for inclusion in the Work by the copyright owner 54 | or by an individual or Legal Entity authorized to submit on behalf of 55 | the copyright owner. For the purposes of this definition, "submitted" 56 | means any form of electronic, verbal, or written communication sent 57 | to the Licensor or its representatives, including but not limited to 58 | communication on electronic mailing lists, source code control systems, 59 | and issue tracking systems that are managed by, or on behalf of, the 60 | Licensor for the purpose of discussing and improving the Work, but 61 | excluding communication that is conspicuously marked or otherwise 62 | designated in writing by the copyright owner as "Not a Contribution." 63 | 64 | "Contributor" shall mean Licensor and any individual or Legal Entity 65 | on behalf of whom a Contribution has been received by Licensor and 66 | subsequently incorporated within the Work. 67 | 68 | 2. Grant of Copyright License. Subject to the terms and conditions of 69 | this License, each Contributor hereby grants to You a perpetual, 70 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 71 | copyright license to reproduce, prepare Derivative Works of, 72 | publicly display, publicly perform, sublicense, and distribute the 73 | Work and such Derivative Works in Source or Object form. 74 | 75 | 3. Grant of Patent License. Subject to the terms and conditions of 76 | this License, each Contributor hereby grants to You a perpetual, 77 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 78 | (except as stated in this section) patent license to make, have made, 79 | use, offer to sell, sell, import, and otherwise transfer the Work, 80 | where such license applies only to those patent claims licensable 81 | by such Contributor that are necessarily infringed by their 82 | Contribution(s) alone or by combination of their Contribution(s) 83 | with the Work to which such Contribution(s) was submitted. If You 84 | institute patent litigation against any entity (including a 85 | cross-claim or counterclaim in a lawsuit) alleging that the Work 86 | or a Contribution incorporated within the Work constitutes direct 87 | or contributory patent infringement, then any patent licenses 88 | granted to You under this License for that Work shall terminate 89 | as of the date such litigation is filed. 90 | 91 | 4. Redistribution. 
You may reproduce and distribute copies of the 92 | Work or Derivative Works thereof in any medium, with or without 93 | modifications, and in Source or Object form, provided that You 94 | meet the following conditions: 95 | 96 | (a) You must give any other recipients of the Work or 97 | Derivative Works a copy of this License; and 98 | 99 | (b) You must cause any modified files to carry prominent notices 100 | stating that You changed the files; and 101 | 102 | (c) You must retain, in the Source form of any Derivative Works 103 | that You distribute, all copyright, patent, trademark, and 104 | attribution notices from the Source form of the Work, 105 | excluding those notices that do not pertain to any part of 106 | the Derivative Works; and 107 | 108 | (d) If the Work includes a "NOTICE" text file as part of its 109 | distribution, then any Derivative Works that You distribute must 110 | include a readable copy of the attribution notices contained 111 | within such NOTICE file, excluding those notices that do not 112 | pertain to any part of the Derivative Works, in at least one 113 | of the following places: within a NOTICE text file distributed 114 | as part of the Derivative Works; within the Source form or 115 | documentation, if provided along with the Derivative Works; or, 116 | within a display generated by the Derivative Works, if and 117 | wherever such third-party notices normally appear. The contents 118 | of the NOTICE file are for informational purposes only and 119 | do not modify the License. You may add Your own attribution 120 | notices within Derivative Works that You distribute, alongside 121 | or as an addendum to the NOTICE text from the Work, provided 122 | that such additional attribution notices cannot be construed 123 | as modifying the License. 124 | 125 | You may add Your own copyright statement to Your modifications and 126 | may provide additional or different license terms and conditions 127 | for use, reproduction, or distribution of Your modifications, or 128 | for any such Derivative Works as a whole, provided Your use, 129 | reproduction, and distribution of the Work otherwise complies with 130 | the conditions stated in this License. 131 | 132 | 5. Submission of Contributions. Unless You explicitly state otherwise, 133 | any Contribution intentionally submitted for inclusion in the Work 134 | by You to the Licensor shall be under the terms and conditions of 135 | this License, without any additional terms or conditions. 136 | Notwithstanding the above, nothing herein shall supersede or modify 137 | the terms of any separate license agreement you may have executed 138 | with Licensor regarding such Contributions. 139 | 140 | 6. Trademarks. This License does not grant permission to use the trade 141 | names, trademarks, service marks, or product names of the Licensor, 142 | except as required for reasonable and customary use in describing the 143 | origin of the Work and reproducing the content of the NOTICE file. 144 | 145 | 7. Disclaimer of Warranty. Unless required by applicable law or 146 | agreed to in writing, Licensor provides the Work (and each 147 | Contributor provides its Contributions) on an "AS IS" BASIS, 148 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 149 | implied, including, without limitation, any warranties or conditions 150 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 151 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 152 | appropriateness of using or redistributing the Work and assume any 153 | risks associated with Your exercise of permissions under this License. 154 | 155 | 8. Limitation of Liability. In no event and under no legal theory, 156 | whether in tort (including negligence), contract, or otherwise, 157 | unless required by applicable law (such as deliberate and grossly 158 | negligent acts) or agreed to in writing, shall any Contributor be 159 | liable to You for damages, including any direct, indirect, special, 160 | incidental, or consequential damages of any character arising as a 161 | result of this License or out of the use or inability to use the 162 | Work (including but not limited to damages for loss of goodwill, 163 | work stoppage, computer failure or malfunction, or any and all 164 | other commercial damages or losses), even if such Contributor 165 | has been advised of the possibility of such damages. 166 | 167 | 9. Accepting Warranty or Additional Liability. While redistributing 168 | the Work or Derivative Works thereof, You may choose to offer, 169 | and charge a fee for, acceptance of support, warranty, indemnity, 170 | or other liability obligations and/or rights consistent with this 171 | License. However, in accepting such obligations, You may act only 172 | on Your own behalf and on Your sole responsibility, not on behalf 173 | of any other Contributor, and only if You agree to indemnify, 174 | defend, and hold each Contributor harmless for any liability 175 | incurred by, or claims asserted against, such Contributor by reason 176 | of your accepting any such warranty or additional liability. 177 | 178 | END OF TERMS AND CONDITIONS 179 | 180 | Copyright (c) 2023 Megvii Inc. All rights reserved. 181 | 182 | Licensed under the Apache License, Version 2.0 (the "License"); 183 | you may not use this file except in compliance with the License. 184 | You may obtain a copy of the License at 185 | 186 | http://www.apache.org/licenses/LICENSE-2.0 187 | 188 | Unless required by applicable law or agreed to in writing, software 189 | distributed under the License is distributed on an "AS IS" BASIS, 190 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 191 | See the License for the specific language governing permissions and 192 | limitations under the License. 
193 | -------------------------------------------------------------------------------- /projects/configs/lidar/fstr_voxel0075_cbgs_20e.py: -------------------------------------------------------------------------------- 1 | plugin=True 2 | plugin_dir='projects/mmdet3d_plugin/' 3 | 4 | point_cloud_range = [-54.0, -54.0, -5.0, 54.0, 54.0, 3.0] 5 | class_names = [ 6 | 'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier', 7 | 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone' 8 | ] 9 | voxel_size = [0.075, 0.075, 0.2] 10 | out_size_factor = 8 11 | evaluation = dict(interval=20) 12 | dataset_type = 'CustomNuScenesDataset' 13 | data_root = 'data/nuscenes/' 14 | input_modality = dict( 15 | use_lidar=True, 16 | use_camera=False, 17 | use_radar=False, 18 | use_map=False, 19 | use_external=False) 20 | train_pipeline = [ 21 | dict( 22 | type='LoadPointsFromFile', 23 | coord_type='LIDAR', 24 | load_dim=5, 25 | use_dim=[0, 1, 2, 3, 4], 26 | ), 27 | dict( 28 | type='LoadPointsFromMultiSweeps', 29 | sweeps_num=10, 30 | use_dim=[0, 1, 2, 3, 4], 31 | ), 32 | dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True), 33 | dict( 34 | type='ObjectSample', 35 | db_sampler=dict( 36 | data_root='data/nuscenes/', 37 | info_path='data/nuscenes/' + 'nuscenes_dbinfos_train.pkl', 38 | rate=1.0, 39 | prepare=dict( 40 | filter_by_difficulty=[-1], 41 | filter_by_min_points=dict( 42 | car=5, 43 | truck=5, 44 | bus=5, 45 | trailer=5, 46 | construction_vehicle=5, 47 | traffic_cone=5, 48 | barrier=5, 49 | motorcycle=5, 50 | bicycle=5, 51 | pedestrian=5)), 52 | classes=class_names, 53 | sample_groups=dict( 54 | car=2, 55 | truck=3, 56 | construction_vehicle=7, 57 | bus=4, 58 | trailer=6, 59 | barrier=2, 60 | motorcycle=6, 61 | bicycle=6, 62 | pedestrian=2, 63 | traffic_cone=2), 64 | points_loader=dict( 65 | type='LoadPointsFromFile', 66 | coord_type='LIDAR', 67 | load_dim=5, 68 | use_dim=[0, 1, 2, 3, 4], 69 | ))), 70 | dict( 71 | type='GlobalRotScaleTrans', 72 | rot_range=[-0.3925 * 2, 0.3925 * 2], 73 | scale_ratio_range=[0.9, 1.1], 74 | translation_std=[0.5, 0.5, 0.5]), 75 | dict( 76 | type='RandomFlip3D', 77 | sync_2d=False, 78 | flip_ratio_bev_horizontal=0.5, 79 | flip_ratio_bev_vertical=0.5), 80 | dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), 81 | dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), 82 | dict(type='ObjectNameFilter', classes=class_names), 83 | dict(type='PointShuffle'), 84 | dict(type='DefaultFormatBundle3D', class_names=class_names), 85 | dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'], 86 | meta_keys=('filename', 'ori_shape', 'img_shape', 'lidar2img', 87 | 'depth2img', 'cam2img', 'pad_shape', 88 | 'scale_factor', 'flip', 'pcd_horizontal_flip', 89 | 'pcd_vertical_flip', 'box_mode_3d', 'box_type_3d', 90 | 'img_norm_cfg', 'pcd_trans', 'sample_idx', 91 | 'pcd_scale_factor', 'pcd_rotation', 'pts_filename', 92 | 'transformation_3d_flow', 'rot_degree', 93 | 'gt_bboxes_3d', 'gt_labels_3d')) 94 | ] 95 | test_pipeline = [ 96 | dict( 97 | type='LoadPointsFromFile', 98 | coord_type='LIDAR', 99 | load_dim=5, 100 | use_dim=[0, 1, 2, 3, 4], 101 | ), 102 | dict( 103 | type='LoadPointsFromMultiSweeps', 104 | sweeps_num=10, 105 | use_dim=[0, 1, 2, 3, 4], 106 | ), 107 | dict( 108 | type='MultiScaleFlipAug3D', 109 | img_scale=(1333, 800), 110 | pts_scale_ratio=1, 111 | flip=False, 112 | transforms=[ 113 | dict( 114 | type='GlobalRotScaleTrans', 115 | rot_range=[0, 0], 116 | scale_ratio_range=[1.0, 1.0], 117 | translation_std=[0, 0, 
0]), 118 | dict(type='RandomFlip3D'), 119 | dict( 120 | type='DefaultFormatBundle3D', 121 | class_names=class_names, 122 | with_label=False), 123 | dict(type='Collect3D', keys=['points']) 124 | ]) 125 | ] 126 | data = dict( 127 | samples_per_gpu=2, 128 | workers_per_gpu=4, 129 | train=dict( 130 | type='CBGSDataset', 131 | dataset=dict( 132 | type=dataset_type, 133 | data_root=data_root, 134 | ann_file=data_root + '/nuscenes_infos_train.pkl', 135 | load_interval=1, 136 | pipeline=train_pipeline, 137 | classes=class_names, 138 | modality=input_modality, 139 | test_mode=False, 140 | box_type_3d='LiDAR')), 141 | val=dict( 142 | type=dataset_type, 143 | data_root=data_root, 144 | ann_file=data_root + '/nuscenes_infos_val.pkl', 145 | load_interval=1, 146 | pipeline=test_pipeline, 147 | classes=class_names, 148 | modality=input_modality, 149 | test_mode=True, 150 | box_type_3d='LiDAR'), 151 | test=dict( 152 | type=dataset_type, 153 | data_root=data_root, 154 | ann_file=data_root + '/nuscenes_infos_val.pkl', 155 | load_interval=1, 156 | pipeline=test_pipeline, 157 | classes=class_names, 158 | modality=input_modality, 159 | test_mode=True, 160 | box_type_3d='LiDAR')) 161 | model = dict( 162 | type='FSTRDetector', 163 | pts_voxel_layer=dict( 164 | num_point_features=5, 165 | max_num_points=10, 166 | voxel_size=voxel_size, 167 | max_voxels=(120000, 160000), 168 | point_cloud_range=point_cloud_range), 169 | pts_voxel_encoder=dict( 170 | type='HardSimpleVFE', 171 | num_features=5, 172 | ), 173 | pts_middle_encoder=dict( 174 | type='VoxelNextEncoder', 175 | in_channels=5, 176 | sparse_shape=[41, 1440, 1440], 177 | base_channels=16, 178 | output_channels=128, 179 | order=('conv', 'norm', 'act'), 180 | block_type='basicblock'), 181 | 182 | pts_bbox_head=dict( 183 | type='FSTRHead', 184 | in_channels=128, 185 | hidden_dim=256, 186 | downsample_scale=8, 187 | num_query=500, 188 | num_init_query=200, 189 | init_dn_query = False, 190 | init_learnable_query = False, 191 | init_query_topk = 1, 192 | init_query_radius = 1, 193 | gauusian_dn_sampling=False, 194 | noise_mean = 0.5, 195 | noise_std = 0.125, 196 | max_sparse_token_per_sample = 10000, 197 | common_heads=dict(center=(2, 2), height=(1, 2), dim=(3, 2), rot=(2, 2), vel=(2, 2)), 198 | tasks=[ 199 | dict(num_class=10, class_names=[ 200 | 'car', 'truck', 'construction_vehicle', 201 | 'bus', 'trailer', 'barrier', 202 | 'motorcycle', 'bicycle', 203 | 'pedestrian', 'traffic_cone' 204 | ]), 205 | ], 206 | bbox_coder=dict( 207 | type='MultiTaskBBoxCoder', 208 | post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], 209 | pc_range=point_cloud_range, 210 | max_num=300, 211 | voxel_size=voxel_size, 212 | num_classes=10), 213 | separate_head=dict( 214 | type='SeparateTaskHead', init_bias=-2.19, final_kernel=3), 215 | transformer=dict( 216 | type='FSTRTransformer', 217 | decoder=dict( 218 | type='PETRTransformerDecoder', 219 | return_intermediate=True, 220 | num_layers=1, 221 | transformerlayers=dict( 222 | type='PETRTransformerDecoderLayer', 223 | attn_cfgs=[ 224 | dict( 225 | type='MultiheadAttention', 226 | embed_dims=256, 227 | num_heads=8, 228 | dropout=0.1), 229 | dict( 230 | type='PETRMultiheadFlashAttention', 231 | embed_dims=256, 232 | num_heads=8, 233 | dropout=0.1), 234 | ], 235 | ffn_cfgs=dict( 236 | type='FFN', 237 | embed_dims=256, 238 | feedforward_channels=1024, 239 | num_fcs=2, 240 | ffn_drop=0., 241 | act_cfg=dict(type='ReLU', inplace=True), 242 | ), 243 | 244 | feedforward_channels=1024, #unused 245 | operation_order=('self_attn', 'norm', 
'cross_attn', 'norm', 246 | 'ffn', 'norm')), 247 | )), 248 | loss_cls=dict(type='FocalLoss', use_sigmoid=True, gamma=2, alpha=0.25, reduction='mean', loss_weight=2.0), 249 | loss_bbox=dict(type='L1Loss', reduction='mean', loss_weight=0.25), 250 | loss_heatmap=dict(type='GaussianFocalLoss', reduction='mean', loss_weight=1.0), 251 | ), 252 | train_cfg=dict( 253 | pts=dict( 254 | dataset='nuScenes', 255 | assigner=dict( 256 | type='HungarianAssigner3D', 257 | cls_cost=dict(type='FocalLossCost', weight=2.0), 258 | reg_cost=dict(type='BBox3DL1Cost', weight=0.25), 259 | iou_cost=dict(type='IoUCost', weight=0.0), # Fake cost. This is just to make it compatible with DETR head. 260 | pc_range=point_cloud_range, 261 | code_weights=[2.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2], 262 | ), 263 | pos_weight=-1, 264 | gaussian_overlap=0.1, 265 | min_radius=2, 266 | grid_size=[1440, 1440, 40], # [x_len, y_len, 1] 267 | voxel_size=voxel_size, 268 | out_size_factor=out_size_factor, 269 | code_weights=[2.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2], 270 | point_cloud_range=point_cloud_range)), 271 | test_cfg=dict( 272 | pts=dict( 273 | dataset='nuScenes', 274 | grid_size=[1440, 1440, 40], 275 | out_size_factor=out_size_factor, 276 | pc_range=point_cloud_range[0:2], 277 | voxel_size=voxel_size[:2], 278 | nms_type=None, 279 | nms_thr=0.1, 280 | use_rotate_nms=True, 281 | max_num=300 282 | ))) 283 | optimizer = dict(type='AdamW', lr=0.0001, weight_decay=0.01) # for 8gpu * 2sample_per_gpu 284 | optimizer_config = dict( 285 | type='CustomFp16OptimizerHook', 286 | loss_scale=512., 287 | grad_clip=dict(max_norm=35, norm_type=2), 288 | custom_fp16=dict(pts_voxel_encoder=False, pts_middle_encoder=False, pts_bbox_head=False)) 289 | lr_config = dict( 290 | policy='cyclic', 291 | target_ratio=(8, 0.0001), 292 | cyclic_times=1, 293 | step_ratio_up=0.4) 294 | momentum_config = dict( 295 | policy='cyclic', 296 | target_ratio=(0.8947368421052632, 1), 297 | cyclic_times=1, 298 | step_ratio_up=0.4) 299 | total_epochs = 20 300 | checkpoint_config = dict(interval=1) 301 | evaluation = dict(interval=5, pipeline=test_pipeline) 302 | log_config = dict( 303 | interval=50, 304 | hooks=[dict(type='TextLoggerHook'), 305 | dict(type='TensorboardLoggerHook')]) 306 | dist_params = dict(backend='nccl') 307 | log_level = 'INFO' 308 | work_dir = None 309 | load_from = None 310 | resume_from = None 311 | workflow = [('train', 1)] 312 | gpu_ids = range(0, 8) 313 | 314 | -------------------------------------------------------------------------------- /tools/test.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | import argparse 3 | import os 4 | import warnings 5 | 6 | import mmcv 7 | import torch 8 | from mmcv import Config, DictAction 9 | from mmcv.cnn import fuse_conv_bn 10 | from mmcv.parallel import MMDataParallel, MMDistributedDataParallel 11 | from mmcv.runner import (get_dist_info, init_dist, load_checkpoint, 12 | wrap_fp16_model) 13 | 14 | import mmdet 15 | from mmdet3d.apis import single_gpu_test 16 | from mmdet3d.datasets import build_dataloader, build_dataset 17 | from mmdet3d.models import build_model 18 | from mmdet.apis import multi_gpu_test, set_random_seed 19 | from mmdet.datasets import replace_ImageToTensor 20 | 21 | if mmdet.__version__ > '2.23.0': 22 | # If mmdet version > 2.23.0, setup_multi_processes would be imported and 23 | # used from mmdet instead of mmdet3d. 
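    # (setup_multi_processes configures the multiprocessing start method and
    # limits OMP/MKL/OpenCV threading for dataloader workers; newer mmdet
    # releases ship it directly, hence the version switch here.)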
24 | from mmdet.utils import setup_multi_processes 25 | else: 26 | from mmdet3d.utils import setup_multi_processes 27 | 28 | try: 29 | # If mmdet version > 2.23.0, compat_cfg would be imported and 30 | # used from mmdet instead of mmdet3d. 31 | from mmdet.utils import compat_cfg 32 | except ImportError: 33 | from mmdet3d.utils import compat_cfg 34 | 35 | 36 | def parse_args(): 37 | parser = argparse.ArgumentParser( 38 | description='MMDet test (and eval) a model') 39 | parser.add_argument('config', help='test config file path') 40 | parser.add_argument('checkpoint', help='checkpoint file') 41 | parser.add_argument('--out', help='output result file in pickle format') 42 | parser.add_argument( 43 | '--fuse-conv-bn', 44 | action='store_true', 45 | help='Whether to fuse conv and bn, this will slightly increase' 46 | 'the inference speed') 47 | parser.add_argument( 48 | '--gpu-ids', 49 | type=int, 50 | nargs='+', 51 | help='(Deprecated, please use --gpu-id) ids of gpus to use ' 52 | '(only applicable to non-distributed training)') 53 | parser.add_argument( 54 | '--gpu-id', 55 | type=int, 56 | default=0, 57 | help='id of gpu to use ' 58 | '(only applicable to non-distributed testing)') 59 | parser.add_argument( 60 | '--format-only', 61 | action='store_true', 62 | help='Format the output results without perform evaluation. It is' 63 | 'useful when you want to format the result to a specific format and ' 64 | 'submit it to the test server') 65 | parser.add_argument( 66 | '--eval', 67 | type=str, 68 | nargs='+', 69 | help='evaluation metrics, which depends on the dataset, e.g., "bbox",' 70 | ' "segm", "proposal" for COCO, and "mAP", "recall" for PASCAL VOC') 71 | parser.add_argument('--show', action='store_true', help='show results') 72 | parser.add_argument( 73 | '--show-dir', help='directory where results will be saved') 74 | parser.add_argument( 75 | '--gpu-collect', 76 | action='store_true', 77 | help='whether to use gpu to collect results.') 78 | parser.add_argument( 79 | '--tmpdir', 80 | help='tmp directory used for collecting results from multiple ' 81 | 'workers, available when gpu-collect is not specified') 82 | parser.add_argument('--seed', type=int, default=0, help='random seed') 83 | parser.add_argument( 84 | '--deterministic', 85 | action='store_true', 86 | help='whether to set deterministic options for CUDNN backend.') 87 | parser.add_argument( 88 | '--cfg-options', 89 | nargs='+', 90 | action=DictAction, 91 | help='override some settings in the used config, the key-value pair ' 92 | 'in xxx=yyy format will be merged into config file. If the value to ' 93 | 'be overwritten is a list, it should be like key="[a,b]" or key=a,b ' 94 | 'It also allows nested list/tuple values, e.g. 
key="[(a,b),(c,d)]" ' 95 | 'Note that the quotation marks are necessary and that no white space ' 96 | 'is allowed.') 97 | parser.add_argument( 98 | '--options', 99 | nargs='+', 100 | action=DictAction, 101 | help='custom options for evaluation, the key-value pair in xxx=yyy ' 102 | 'format will be kwargs for dataset.evaluate() function (deprecate), ' 103 | 'change to --eval-options instead.') 104 | parser.add_argument( 105 | '--eval-options', 106 | nargs='+', 107 | action=DictAction, 108 | help='custom options for evaluation, the key-value pair in xxx=yyy ' 109 | 'format will be kwargs for dataset.evaluate() function') 110 | parser.add_argument( 111 | '--launcher', 112 | choices=['none', 'pytorch', 'slurm', 'mpi'], 113 | default='none', 114 | help='job launcher') 115 | parser.add_argument('--local_rank', type=int, default=0) 116 | args = parser.parse_args() 117 | if 'LOCAL_RANK' not in os.environ: 118 | os.environ['LOCAL_RANK'] = str(args.local_rank) 119 | 120 | if args.options and args.eval_options: 121 | raise ValueError( 122 | '--options and --eval-options cannot be both specified, ' 123 | '--options is deprecated in favor of --eval-options') 124 | if args.options: 125 | warnings.warn('--options is deprecated in favor of --eval-options') 126 | args.eval_options = args.options 127 | return args 128 | 129 | 130 | def main(): 131 | args = parse_args() 132 | 133 | assert args.out or args.eval or args.format_only or args.show \ 134 | or args.show_dir, \ 135 | ('Please specify at least one operation (save/eval/format/show the ' 136 | 'results / save the results) with the argument "--out", "--eval"' 137 | ', "--format-only", "--show" or "--show-dir"') 138 | 139 | if args.eval and args.format_only: 140 | raise ValueError('--eval and --format_only cannot be both specified') 141 | 142 | if args.out is not None and not args.out.endswith(('.pkl', '.pickle')): 143 | raise ValueError('The output file must be a pkl file.') 144 | 145 | cfg = Config.fromfile(args.config) 146 | if args.cfg_options is not None: 147 | cfg.merge_from_dict(args.cfg_options) 148 | 149 | # import modules from string list. 150 | if cfg.get('custom_imports', None): 151 | from mmcv.utils import import_modules_from_strings 152 | import_modules_from_strings(**cfg['custom_imports']) 153 | 154 | # import modules from plguin/xx, registry will be updated 155 | if hasattr(cfg, 'plugin'): 156 | if cfg.plugin: 157 | import importlib 158 | if hasattr(cfg, 'plugin_dir'): 159 | plugin_dir = cfg.plugin_dir 160 | _module_dir = os.path.dirname(plugin_dir) 161 | _module_dir = _module_dir.split('/') 162 | _module_path = _module_dir[0] 163 | 164 | for m in _module_dir[1:]: 165 | _module_path = _module_path + '.' + m 166 | print(_module_path) 167 | plg_lib = importlib.import_module(_module_path) 168 | else: 169 | # import dir is the dirpath for the config file 170 | _module_dir = os.path.dirname(args.config) 171 | _module_dir = _module_dir.split('/') 172 | _module_path = _module_dir[0] 173 | for m in _module_dir[1:]: 174 | _module_path = _module_path + '.' 
+ m 175 | print(_module_path) 176 | plg_lib = importlib.import_module(_module_path) 177 | 178 | cfg = compat_cfg(cfg) 179 | 180 | # set multi-process settings 181 | setup_multi_processes(cfg) 182 | 183 | # set cudnn_benchmark 184 | if cfg.get('cudnn_benchmark', False): 185 | torch.backends.cudnn.benchmark = True 186 | 187 | cfg.model.pretrained = None 188 | 189 | if args.gpu_ids is not None: 190 | cfg.gpu_ids = args.gpu_ids[0:1] 191 | warnings.warn('`--gpu-ids` is deprecated, please use `--gpu-id`. ' 192 | 'Because we only support single GPU mode in ' 193 | 'non-distributed testing. Use the first GPU ' 194 | 'in `gpu_ids` now.') 195 | else: 196 | cfg.gpu_ids = [args.gpu_id] 197 | 198 | # init distributed env first, since logger depends on the dist info. 199 | if args.launcher == 'none': 200 | distributed = False 201 | else: 202 | distributed = True 203 | init_dist(args.launcher, **cfg.dist_params) 204 | 205 | test_dataloader_default_args = dict( 206 | samples_per_gpu=1, workers_per_gpu=2, dist=distributed, shuffle=False) 207 | 208 | # in case the test dataset is concatenated 209 | if isinstance(cfg.data.test, dict): 210 | cfg.data.test.test_mode = True 211 | if cfg.data.test_dataloader.get('samples_per_gpu', 1) > 1: 212 | # Replace 'ImageToTensor' to 'DefaultFormatBundle' 213 | cfg.data.test.pipeline = replace_ImageToTensor( 214 | cfg.data.test.pipeline) 215 | elif isinstance(cfg.data.test, list): 216 | for ds_cfg in cfg.data.test: 217 | ds_cfg.test_mode = True 218 | if cfg.data.test_dataloader.get('samples_per_gpu', 1) > 1: 219 | for ds_cfg in cfg.data.test: 220 | ds_cfg.pipeline = replace_ImageToTensor(ds_cfg.pipeline) 221 | 222 | test_loader_cfg = { 223 | **test_dataloader_default_args, 224 | **cfg.data.get('test_dataloader', {}) 225 | } 226 | 227 | # set random seeds 228 | if args.seed is not None: 229 | set_random_seed(args.seed, deterministic=args.deterministic) 230 | 231 | # build the dataloader 232 | dataset = build_dataset(cfg.data.test) 233 | data_loader = build_dataloader(dataset, **test_loader_cfg) 234 | 235 | # build the model and load checkpoint 236 | cfg.model.train_cfg = None 237 | model = build_model(cfg.model, test_cfg=cfg.get('test_cfg')) 238 | fp16_cfg = cfg.get('fp16', None) 239 | if fp16_cfg is not None: 240 | wrap_fp16_model(model) 241 | checkpoint = load_checkpoint(model, args.checkpoint, map_location='cpu') 242 | if args.fuse_conv_bn: 243 | model = fuse_conv_bn(model) 244 | # old versions did not save class info in checkpoints, this walkaround is 245 | # for backward compatibility 246 | if 'CLASSES' in checkpoint.get('meta', {}): 247 | model.CLASSES = checkpoint['meta']['CLASSES'] 248 | else: 249 | model.CLASSES = dataset.CLASSES 250 | # palette for visualization in segmentation tasks 251 | if 'PALETTE' in checkpoint.get('meta', {}): 252 | model.PALETTE = checkpoint['meta']['PALETTE'] 253 | elif hasattr(dataset, 'PALETTE'): 254 | # segmentation dataset has `PALETTE` attribute 255 | model.PALETTE = dataset.PALETTE 256 | 257 | if not distributed: 258 | model = MMDataParallel(model, device_ids=cfg.gpu_ids) 259 | outputs = single_gpu_test(model, data_loader, args.show, args.show_dir) 260 | else: 261 | model = MMDistributedDataParallel( 262 | model.cuda(), 263 | device_ids=[torch.cuda.current_device()], 264 | broadcast_buffers=False) 265 | outputs = multi_gpu_test(model, data_loader, args.tmpdir, 266 | args.gpu_collect) 267 | 268 | rank, _ = get_dist_info() 269 | if rank == 0: 270 | if args.out: 271 | print(f'\nwriting results to {args.out}') 272 | mmcv.dump(outputs, 
args.out) 273 | kwargs = {} if args.eval_options is None else args.eval_options 274 | if args.format_only: 275 | dataset.format_results(outputs, **kwargs) 276 | if args.eval: 277 | eval_kwargs = cfg.get('evaluation', {}).copy() 278 | # hard-code way to remove EvalHook args 279 | for key in [ 280 | 'interval', 'tmpdir', 'start', 'gpu_collect', 'save_best', 281 | 'rule' 282 | ]: 283 | eval_kwargs.pop(key, None) 284 | eval_kwargs.update(dict(metric=args.eval, **kwargs)) 285 | print(dataset.evaluate(outputs, **eval_kwargs)) 286 | 287 | 288 | if __name__ == '__main__': 289 | main() 290 | -------------------------------------------------------------------------------- /projects/configs/lidar/fstr_large_voxel0075_cbgs_20e.py: -------------------------------------------------------------------------------- 1 | plugin=True 2 | plugin_dir='projects/mmdet3d_plugin/' 3 | 4 | point_cloud_range = [-54.0, -54.0, -5.0, 54.0, 54.0, 3.0] 5 | class_names = [ 6 | 'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier', 7 | 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone' 8 | ] 9 | voxel_size = [0.075, 0.075, 0.2] 10 | out_size_factor = 8 11 | evaluation = dict(interval=20) 12 | dataset_type = 'CustomNuScenesDataset' 13 | data_root = 'data/nuscenes/' 14 | input_modality = dict( 15 | use_lidar=True, 16 | use_camera=False, 17 | use_radar=False, 18 | use_map=False, 19 | use_external=False) 20 | train_pipeline = [ 21 | dict( 22 | type='LoadPointsFromFile', 23 | coord_type='LIDAR', 24 | load_dim=5, 25 | use_dim=[0, 1, 2, 3, 4], 26 | ), 27 | dict( 28 | type='LoadPointsFromMultiSweeps', 29 | sweeps_num=10, 30 | use_dim=[0, 1, 2, 3, 4], 31 | ), 32 | dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True), 33 | dict( 34 | type='ObjectSample', 35 | db_sampler=dict( 36 | data_root='data/nuscenes/', 37 | info_path='data/nuscenes/' + 'nuscenes_dbinfos_train.pkl', 38 | rate=1.0, 39 | prepare=dict( 40 | filter_by_difficulty=[-1], 41 | filter_by_min_points=dict( 42 | car=5, 43 | truck=5, 44 | bus=5, 45 | trailer=5, 46 | construction_vehicle=5, 47 | traffic_cone=5, 48 | barrier=5, 49 | motorcycle=5, 50 | bicycle=5, 51 | pedestrian=5)), 52 | classes=class_names, 53 | sample_groups=dict( 54 | car=2, 55 | truck=3, 56 | construction_vehicle=7, 57 | bus=4, 58 | trailer=6, 59 | barrier=2, 60 | motorcycle=6, 61 | bicycle=6, 62 | pedestrian=2, 63 | traffic_cone=2), 64 | points_loader=dict( 65 | type='LoadPointsFromFile', 66 | coord_type='LIDAR', 67 | load_dim=5, 68 | use_dim=[0, 1, 2, 3, 4], 69 | ))), 70 | dict( 71 | type='GlobalRotScaleTrans', 72 | rot_range=[-0.3925 * 2, 0.3925 * 2], 73 | scale_ratio_range=[0.9, 1.1], 74 | translation_std=[0.5, 0.5, 0.5]), 75 | dict( 76 | type='RandomFlip3D', 77 | sync_2d=False, 78 | flip_ratio_bev_horizontal=0.5, 79 | flip_ratio_bev_vertical=0.5), 80 | dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), 81 | dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), 82 | dict(type='ObjectNameFilter', classes=class_names), 83 | dict(type='PointShuffle'), 84 | dict(type='DefaultFormatBundle3D', class_names=class_names), 85 | dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'], 86 | meta_keys=('filename', 'ori_shape', 'img_shape', 'lidar2img', 87 | 'depth2img', 'cam2img', 'pad_shape', 88 | 'scale_factor', 'flip', 'pcd_horizontal_flip', 89 | 'pcd_vertical_flip', 'box_mode_3d', 'box_type_3d', 90 | 'img_norm_cfg', 'pcd_trans', 'sample_idx', 91 | 'pcd_scale_factor', 'pcd_rotation', 'pts_filename', 92 | 'transformation_3d_flow', 
'rot_degree', 93 | 'gt_bboxes_3d', 'gt_labels_3d')) 94 | ] 95 | test_pipeline = [ 96 | dict( 97 | type='LoadPointsFromFile', 98 | coord_type='LIDAR', 99 | load_dim=5, 100 | use_dim=[0, 1, 2, 3, 4], 101 | ), 102 | dict( 103 | type='LoadPointsFromMultiSweeps', 104 | sweeps_num=10, 105 | use_dim=[0, 1, 2, 3, 4], 106 | ), 107 | dict( 108 | type='MultiScaleFlipAug3D', 109 | img_scale=(1333, 800), 110 | pts_scale_ratio=1, 111 | flip=False, 112 | transforms=[ 113 | dict( 114 | type='GlobalRotScaleTrans', 115 | rot_range=[0, 0], 116 | scale_ratio_range=[1.0, 1.0], 117 | translation_std=[0, 0, 0]), 118 | dict(type='RandomFlip3D'), 119 | dict( 120 | type='DefaultFormatBundle3D', 121 | class_names=class_names, 122 | with_label=False), 123 | dict(type='Collect3D', keys=['points']) 124 | ]) 125 | ] 126 | data = dict( 127 | samples_per_gpu=2, 128 | workers_per_gpu=4, 129 | train=dict( 130 | type='CBGSDataset', 131 | dataset=dict( 132 | type=dataset_type, 133 | data_root=data_root, 134 | ann_file=data_root + '/nuscenes_infos_train.pkl', 135 | load_interval=1, 136 | pipeline=train_pipeline, 137 | classes=class_names, 138 | modality=input_modality, 139 | test_mode=False, 140 | box_type_3d='LiDAR')), 141 | val=dict( 142 | type=dataset_type, 143 | data_root=data_root, 144 | ann_file=data_root + '/nuscenes_infos_val.pkl', 145 | load_interval=1, 146 | pipeline=test_pipeline, 147 | classes=class_names, 148 | modality=input_modality, 149 | test_mode=True, 150 | box_type_3d='LiDAR'), 151 | test=dict( 152 | type=dataset_type, 153 | data_root=data_root, 154 | ann_file=data_root + '/nuscenes_infos_val.pkl', 155 | load_interval=1, 156 | pipeline=test_pipeline, 157 | classes=class_names, 158 | modality=input_modality, 159 | test_mode=True, 160 | box_type_3d='LiDAR')) 161 | model = dict( 162 | type='FSTRDetector', 163 | pts_voxel_layer=dict( 164 | num_point_features=5, 165 | max_num_points=10, 166 | voxel_size=voxel_size, 167 | max_voxels=(120000, 160000), 168 | point_cloud_range=point_cloud_range), 169 | pts_voxel_encoder=dict( 170 | type='HardSimpleVFE', 171 | num_features=5, 172 | ), 173 | pts_middle_encoder=dict( 174 | type='VoxelNextEncoder', 175 | in_channels=5, 176 | sparse_shape=[41, 1440, 1440], 177 | base_channels=32, 178 | output_channels=256, 179 | encoder_channels=((32, 32), (64, 64, 64), (128, 128, 128), (256, 256, 256),(256, 256, 256),(256, 256, 256)), 180 | sparse_conv_kernel = (5, 3, 3, 3, 3), 181 | order=('conv', 'norm', 'act'), 182 | block_type='basicblock'), 183 | 184 | pts_bbox_head=dict( 185 | type='FSTRHead', 186 | in_channels=256, 187 | hidden_dim=256, 188 | downsample_scale=8, 189 | num_query=500, 190 | num_init_query=200, 191 | init_dn_query = False, 192 | init_learnable_query = False, 193 | init_query_topk = 1, 194 | init_query_radius = 1, 195 | gauusian_dn_sampling=False, 196 | noise_mean = 0.5, 197 | noise_std = 0.125, 198 | max_sparse_token_per_sample = 10000, 199 | common_heads=dict(center=(2, 2), height=(1, 2), dim=(3, 2), rot=(2, 2), vel=(2, 2)), 200 | tasks=[ 201 | dict(num_class=10, class_names=[ 202 | 'car', 'truck', 'construction_vehicle', 203 | 'bus', 'trailer', 'barrier', 204 | 'motorcycle', 'bicycle', 205 | 'pedestrian', 'traffic_cone' 206 | ]), 207 | ], 208 | bbox_coder=dict( 209 | type='MultiTaskBBoxCoder', 210 | post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], 211 | pc_range=point_cloud_range, 212 | max_num=300, 213 | voxel_size=voxel_size, 214 | num_classes=10), 215 | separate_head=dict( 216 | type='SeparateTaskHead', init_bias=-2.19, final_kernel=3), 217 | 
transformer=dict( 218 | type='FSTRTransformer', 219 | decoder=dict( 220 | type='PETRTransformerDecoder', 221 | return_intermediate=True, 222 | num_layers=1, 223 | transformerlayers=dict( 224 | type='PETRTransformerDecoderLayer', 225 | attn_cfgs=[ 226 | dict( 227 | type='MultiheadAttention', 228 | embed_dims=256, 229 | num_heads=8, 230 | dropout=0.1), 231 | dict( 232 | type='PETRMultiheadFlashAttention', 233 | embed_dims=256, 234 | num_heads=8, 235 | dropout=0.1), 236 | ], 237 | ffn_cfgs=dict( 238 | type='FFN', 239 | embed_dims=256, 240 | feedforward_channels=1024, 241 | num_fcs=2, 242 | ffn_drop=0., 243 | act_cfg=dict(type='ReLU', inplace=True), 244 | ), 245 | 246 | feedforward_channels=1024, #unused 247 | operation_order=('self_attn', 'norm', 'cross_attn', 'norm', 248 | 'ffn', 'norm')), 249 | )), 250 | loss_cls=dict(type='FocalLoss', use_sigmoid=True, gamma=2, alpha=0.25, reduction='mean', loss_weight=2.0), 251 | loss_bbox=dict(type='L1Loss', reduction='mean', loss_weight=0.25), 252 | loss_heatmap=dict(type='GaussianFocalLoss', reduction='mean', loss_weight=1.0), 253 | ), 254 | train_cfg=dict( 255 | pts=dict( 256 | dataset='nuScenes', 257 | assigner=dict( 258 | type='HungarianAssigner3D', 259 | cls_cost=dict(type='FocalLossCost', weight=2.0), 260 | reg_cost=dict(type='BBox3DL1Cost', weight=0.25), 261 | iou_cost=dict(type='IoUCost', weight=0.0), # Fake cost. This is just to make it compatible with DETR head. 262 | pc_range=point_cloud_range, 263 | code_weights=[2.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2], 264 | ), 265 | pos_weight=-1, 266 | gaussian_overlap=0.1, 267 | min_radius=2, 268 | grid_size=[1440, 1440, 40], # [x_len, y_len, 1] 269 | voxel_size=voxel_size, 270 | out_size_factor=out_size_factor, 271 | code_weights=[2.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2], 272 | point_cloud_range=point_cloud_range)), 273 | test_cfg=dict( 274 | pts=dict( 275 | dataset='nuScenes', 276 | grid_size=[1440, 1440, 40], 277 | out_size_factor=out_size_factor, 278 | pc_range=point_cloud_range[0:2], 279 | voxel_size=voxel_size[:2], 280 | nms_type=None, 281 | nms_thr=0.1, 282 | use_rotate_nms=True, 283 | max_num=300 284 | ))) 285 | optimizer = dict(type='AdamW', lr=0.0001, weight_decay=0.01) # for 8gpu * 2sample_per_gpu 286 | optimizer_config = dict( 287 | type='CustomFp16OptimizerHook', 288 | loss_scale=512., 289 | grad_clip=dict(max_norm=35, norm_type=2), 290 | custom_fp16=dict(pts_voxel_encoder=False, pts_middle_encoder=False, pts_bbox_head=False)) 291 | lr_config = dict( 292 | policy='cyclic', 293 | target_ratio=(8, 0.0001), 294 | cyclic_times=1, 295 | step_ratio_up=0.4) 296 | momentum_config = dict( 297 | policy='cyclic', 298 | target_ratio=(0.8947368421052632, 1), 299 | cyclic_times=1, 300 | step_ratio_up=0.4) 301 | total_epochs = 20 302 | checkpoint_config = dict(interval=1) 303 | evaluation = dict(interval=5, pipeline=test_pipeline) 304 | log_config = dict( 305 | interval=50, 306 | hooks=[dict(type='TextLoggerHook'), 307 | dict(type='TensorboardLoggerHook')]) 308 | dist_params = dict(backend='nccl') 309 | log_level = 'INFO' 310 | work_dir = None 311 | load_from = None 312 | resume_from = None 313 | workflow = [('train', 1)] 314 | gpu_ids = range(0, 8) 315 | 316 | -------------------------------------------------------------------------------- /projects/configs/lidar/fstr_xlarge_voxel0050_cbgs_20e.py: -------------------------------------------------------------------------------- 1 | plugin=True 2 | plugin_dir='projects/mmdet3d_plugin/' 3 | 4 | point_cloud_range = [-54.0, -54.0, -5.0, 
54.0, 54.0, 3.0] 5 | class_names = [ 6 | 'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier', 7 | 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone' 8 | ] 9 | voxel_size = [0.050, 0.050, 0.2] 10 | out_size_factor = 8 11 | evaluation = dict(interval=20) 12 | dataset_type = 'CustomNuScenesDataset' 13 | data_root = 'data/nuscenes/' 14 | input_modality = dict( 15 | use_lidar=True, 16 | use_camera=False, 17 | use_radar=False, 18 | use_map=False, 19 | use_external=False) 20 | train_pipeline = [ 21 | dict( 22 | type='LoadPointsFromFile', 23 | coord_type='LIDAR', 24 | load_dim=5, 25 | use_dim=[0, 1, 2, 3, 4], 26 | ), 27 | dict( 28 | type='LoadPointsFromMultiSweeps', 29 | sweeps_num=10, 30 | use_dim=[0, 1, 2, 3, 4], 31 | ), 32 | dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True), 33 | dict( 34 | type='ObjectSample', 35 | db_sampler=dict( 36 | data_root='data/nuscenes/', 37 | info_path='data/nuscenes/' + 'nuscenes_dbinfos_train.pkl', 38 | rate=1.0, 39 | prepare=dict( 40 | filter_by_difficulty=[-1], 41 | filter_by_min_points=dict( 42 | car=5, 43 | truck=5, 44 | bus=5, 45 | trailer=5, 46 | construction_vehicle=5, 47 | traffic_cone=5, 48 | barrier=5, 49 | motorcycle=5, 50 | bicycle=5, 51 | pedestrian=5)), 52 | classes=class_names, 53 | sample_groups=dict( 54 | car=2, 55 | truck=3, 56 | construction_vehicle=7, 57 | bus=4, 58 | trailer=6, 59 | barrier=2, 60 | motorcycle=6, 61 | bicycle=6, 62 | pedestrian=2, 63 | traffic_cone=2), 64 | points_loader=dict( 65 | type='LoadPointsFromFile', 66 | coord_type='LIDAR', 67 | load_dim=5, 68 | use_dim=[0, 1, 2, 3, 4], 69 | ))), 70 | dict( 71 | type='GlobalRotScaleTrans', 72 | rot_range=[-0.3925 * 2, 0.3925 * 2], 73 | scale_ratio_range=[0.9, 1.1], 74 | translation_std=[0.5, 0.5, 0.5]), 75 | dict( 76 | type='RandomFlip3D', 77 | sync_2d=False, 78 | flip_ratio_bev_horizontal=0.5, 79 | flip_ratio_bev_vertical=0.5), 80 | dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), 81 | dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), 82 | dict(type='ObjectNameFilter', classes=class_names), 83 | dict(type='PointShuffle'), 84 | dict(type='DefaultFormatBundle3D', class_names=class_names), 85 | dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'], 86 | meta_keys=('filename', 'ori_shape', 'img_shape', 'lidar2img', 87 | 'depth2img', 'cam2img', 'pad_shape', 88 | 'scale_factor', 'flip', 'pcd_horizontal_flip', 89 | 'pcd_vertical_flip', 'box_mode_3d', 'box_type_3d', 90 | 'img_norm_cfg', 'pcd_trans', 'sample_idx', 91 | 'pcd_scale_factor', 'pcd_rotation', 'pts_filename', 92 | 'transformation_3d_flow', 'rot_degree', 93 | 'gt_bboxes_3d', 'gt_labels_3d')) 94 | ] 95 | test_pipeline = [ 96 | dict( 97 | type='LoadPointsFromFile', 98 | coord_type='LIDAR', 99 | load_dim=5, 100 | use_dim=[0, 1, 2, 3, 4], 101 | ), 102 | dict( 103 | type='LoadPointsFromMultiSweeps', 104 | sweeps_num=10, 105 | use_dim=[0, 1, 2, 3, 4], 106 | ), 107 | dict( 108 | type='MultiScaleFlipAug3D', 109 | img_scale=(1333, 800), 110 | pts_scale_ratio=1, 111 | flip=False, 112 | transforms=[ 113 | dict( 114 | type='GlobalRotScaleTrans', 115 | rot_range=[0, 0], 116 | scale_ratio_range=[1.0, 1.0], 117 | translation_std=[0, 0, 0]), 118 | dict(type='RandomFlip3D'), 119 | dict( 120 | type='DefaultFormatBundle3D', 121 | class_names=class_names, 122 | with_label=False), 123 | dict(type='Collect3D', keys=['points']) 124 | ]) 125 | ] 126 | data = dict( 127 | samples_per_gpu=2, 128 | workers_per_gpu=4, 129 | train=dict( 130 | type='CBGSDataset', 131 | 
dataset=dict( 132 | type=dataset_type, 133 | data_root=data_root, 134 | ann_file=data_root + '/nuscenes_infos_train.pkl', 135 | load_interval=1, 136 | pipeline=train_pipeline, 137 | classes=class_names, 138 | modality=input_modality, 139 | test_mode=False, 140 | box_type_3d='LiDAR')), 141 | val=dict( 142 | type=dataset_type, 143 | data_root=data_root, 144 | ann_file=data_root + '/nuscenes_infos_val.pkl', 145 | load_interval=1, 146 | pipeline=test_pipeline, 147 | classes=class_names, 148 | modality=input_modality, 149 | test_mode=True, 150 | box_type_3d='LiDAR'), 151 | test=dict( 152 | type=dataset_type, 153 | data_root=data_root, 154 | ann_file=data_root + '/nuscenes_infos_val.pkl', 155 | load_interval=1, 156 | pipeline=test_pipeline, 157 | classes=class_names, 158 | modality=input_modality, 159 | test_mode=True, 160 | box_type_3d='LiDAR')) 161 | model = dict( 162 | type='FSTRDetector', 163 | pts_voxel_layer=dict( 164 | num_point_features=5, 165 | max_num_points=10, 166 | voxel_size=voxel_size, 167 | max_voxels=(120000, 160000), 168 | point_cloud_range=point_cloud_range), 169 | pts_voxel_encoder=dict( 170 | type='HardSimpleVFE', 171 | num_features=5, 172 | ), 173 | pts_middle_encoder=dict( 174 | type='VoxelNextEncoder', 175 | in_channels=5, 176 | sparse_shape=[41, 2160, 2160], 177 | base_channels=32, 178 | output_channels=256, 179 | encoder_channels=((32, 32), (64, 64, 64, 64), (128, 128, 128, 128, 128), (256, 256, 256, 256, 256, 256, 256),(256, 256, 256, 256),(256, 256, 256, 256)), 180 | encoder_paddings=((1, 1 ), (1, 1, 1, 1), (1, 1, 1, 1, 1), (1, 1, 1, 1, 1, 1, 1),(1, 1, 1, 1),(1, 1, 1, 1)), 181 | sparse_conv_kernel = (5, 3, 3, 3, 3), 182 | order=('conv', 'norm', 'act'), 183 | block_type='basicblock'), 184 | 185 | pts_bbox_head=dict( 186 | type='FSTRHead', 187 | in_channels=256, 188 | hidden_dim=256, 189 | downsample_scale=8, 190 | num_query=500, 191 | num_init_query=200, 192 | init_dn_query = False, 193 | init_learnable_query = False, 194 | init_query_topk = 1, 195 | init_query_radius = 1, 196 | gauusian_dn_sampling=False, 197 | noise_mean = 0.5, 198 | noise_std = 0.125, 199 | max_sparse_token_per_sample = 10000, 200 | common_heads=dict(center=(2, 2), height=(1, 2), dim=(3, 2), rot=(2, 2), vel=(2, 2)), 201 | tasks=[ 202 | dict(num_class=10, class_names=[ 203 | 'car', 'truck', 'construction_vehicle', 204 | 'bus', 'trailer', 'barrier', 205 | 'motorcycle', 'bicycle', 206 | 'pedestrian', 'traffic_cone' 207 | ]), 208 | ], 209 | bbox_coder=dict( 210 | type='MultiTaskBBoxCoder', 211 | post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], 212 | pc_range=point_cloud_range, 213 | max_num=300, 214 | voxel_size=voxel_size, 215 | num_classes=10), 216 | separate_head=dict( 217 | type='SeparateTaskHead', init_bias=-2.19, final_kernel=3), 218 | transformer=dict( 219 | type='FSTRTransformer', 220 | decoder=dict( 221 | type='PETRTransformerDecoder', 222 | return_intermediate=True, 223 | num_layers=1, 224 | transformerlayers=dict( 225 | type='PETRTransformerDecoderLayer', 226 | attn_cfgs=[ 227 | dict( 228 | type='MultiheadAttention', 229 | embed_dims=256, 230 | num_heads=8, 231 | dropout=0.1), 232 | dict( 233 | type='PETRMultiheadFlashAttention', 234 | embed_dims=256, 235 | num_heads=8, 236 | dropout=0.1), 237 | ], 238 | ffn_cfgs=dict( 239 | type='FFN', 240 | embed_dims=256, 241 | feedforward_channels=1024, 242 | num_fcs=2, 243 | ffn_drop=0., 244 | act_cfg=dict(type='ReLU', inplace=True), 245 | ), 246 | 247 | feedforward_channels=1024, #unused 248 | operation_order=('self_attn', 'norm', 
'cross_attn', 'norm', 249 | 'ffn', 'norm')), 250 | )), 251 | loss_cls=dict(type='FocalLoss', use_sigmoid=True, gamma=2, alpha=0.25, reduction='mean', loss_weight=2.0), 252 | loss_bbox=dict(type='L1Loss', reduction='mean', loss_weight=0.25), 253 | loss_heatmap=dict(type='GaussianFocalLoss', reduction='mean', loss_weight=1.0), 254 | ), 255 | train_cfg=dict( 256 | pts=dict( 257 | dataset='nuScenes', 258 | assigner=dict( 259 | type='HungarianAssigner3D', 260 | cls_cost=dict(type='FocalLossCost', weight=2.0), 261 | reg_cost=dict(type='BBox3DL1Cost', weight=0.25), 262 | iou_cost=dict(type='IoUCost', weight=0.0), # Fake cost. This is just to make it compatible with DETR head. 263 | pc_range=point_cloud_range, 264 | code_weights=[2.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2], 265 | ), 266 | pos_weight=-1, 267 | gaussian_overlap=0.1, 268 | min_radius=2, 269 | grid_size=[2160, 2160, 40], # [x_len, y_len, 1] 270 | voxel_size=voxel_size, 271 | out_size_factor=out_size_factor, 272 | code_weights=[2.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2], 273 | point_cloud_range=point_cloud_range)), 274 | test_cfg=dict( 275 | pts=dict( 276 | dataset='nuScenes', 277 | grid_size=[2160, 2160, 40], 278 | out_size_factor=out_size_factor, 279 | pc_range=point_cloud_range[0:2], 280 | voxel_size=voxel_size[:2], 281 | nms_type=None, 282 | nms_thr=0.1, 283 | use_rotate_nms=True, 284 | max_num=300 285 | ))) 286 | optimizer = dict(type='AdamW', lr=0.0001, weight_decay=0.01) # for 8gpu * 2sample_per_gpu 287 | optimizer_config = dict( 288 | type='CustomFp16OptimizerHook', 289 | loss_scale=512., 290 | grad_clip=dict(max_norm=35, norm_type=2), 291 | custom_fp16=dict(pts_voxel_encoder=False, pts_middle_encoder=False, pts_bbox_head=False)) 292 | lr_config = dict( 293 | policy='cyclic', 294 | target_ratio=(8, 0.0001), 295 | cyclic_times=1, 296 | step_ratio_up=0.4) 297 | momentum_config = dict( 298 | policy='cyclic', 299 | target_ratio=(0.8947368421052632, 1), 300 | cyclic_times=1, 301 | step_ratio_up=0.4) 302 | total_epochs = 20 303 | checkpoint_config = dict(interval=1) 304 | evaluation = dict(interval=5, pipeline=test_pipeline) 305 | log_config = dict( 306 | interval=50, 307 | hooks=[dict(type='TextLoggerHook'), 308 | dict(type='TensorboardLoggerHook')]) 309 | dist_params = dict(backend='nccl') 310 | log_level = 'INFO' 311 | work_dir = None 312 | load_from = None 313 | resume_from = None 314 | workflow = [('train', 1)] 315 | gpu_ids = range(0, 8) 316 | 317 | -------------------------------------------------------------------------------- /tools/train.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
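# Example usage (illustrative; the work-dir and GPU count are placeholders):
#   single GPU: python tools/train.py projects/configs/lidar/fstr_voxel0075_cbgs_20e.py --work-dir work_dirs/fstr
#   multi GPU:  bash tools/dist_train.sh projects/configs/lidar/fstr_voxel0075_cbgs_20e.py 8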
2 | from __future__ import division 3 | import argparse 4 | import copy 5 | import os 6 | import time 7 | import warnings 8 | from os import path as osp 9 | 10 | import mmcv 11 | import torch 12 | import torch.distributed as dist 13 | from mmcv import Config, DictAction 14 | from mmcv.runner import get_dist_info, init_dist 15 | 16 | from mmdet import __version__ as mmdet_version 17 | from mmdet3d import __version__ as mmdet3d_version 18 | from mmdet3d.apis import init_random_seed, train_model 19 | from mmdet3d.datasets import build_dataset 20 | from mmdet3d.models import build_model 21 | from mmdet3d.utils import collect_env, get_root_logger 22 | from mmdet.apis import set_random_seed 23 | from mmseg import __version__ as mmseg_version 24 | 25 | try: 26 | # If mmdet version > 2.20.0, setup_multi_processes would be imported and 27 | # used from mmdet instead of mmdet3d. 28 | from mmdet.utils import setup_multi_processes 29 | except ImportError: 30 | from mmdet3d.utils import setup_multi_processes 31 | 32 | 33 | def parse_args(): 34 | parser = argparse.ArgumentParser(description='Train a detector') 35 | parser.add_argument('config', help='train config file path') 36 | parser.add_argument('--work-dir', help='the dir to save logs and models') 37 | parser.add_argument( 38 | '--resume-from', help='the checkpoint file to resume from') 39 | parser.add_argument( 40 | '--auto-resume', 41 | action='store_true', 42 | help='resume from the latest checkpoint automatically') 43 | parser.add_argument( 44 | '--no-validate', 45 | action='store_true', 46 | help='whether not to evaluate the checkpoint during training') 47 | group_gpus = parser.add_mutually_exclusive_group() 48 | group_gpus.add_argument( 49 | '--gpus', 50 | type=int, 51 | help='(Deprecated, please use --gpu-id) number of gpus to use ' 52 | '(only applicable to non-distributed training)') 53 | group_gpus.add_argument( 54 | '--gpu-ids', 55 | type=int, 56 | nargs='+', 57 | help='(Deprecated, please use --gpu-id) ids of gpus to use ' 58 | '(only applicable to non-distributed training)') 59 | group_gpus.add_argument( 60 | '--gpu-id', 61 | type=int, 62 | default=0, 63 | help='number of gpus to use ' 64 | '(only applicable to non-distributed training)') 65 | parser.add_argument('--seed', type=int, default=0, help='random seed') 66 | parser.add_argument( 67 | '--diff-seed', 68 | action='store_true', 69 | help='Whether or not set different seeds for different ranks') 70 | parser.add_argument( 71 | '--deterministic', 72 | action='store_true', 73 | help='whether to set deterministic options for CUDNN backend.') 74 | parser.add_argument( 75 | '--options', 76 | nargs='+', 77 | action=DictAction, 78 | help='override some settings in the used config, the key-value pair ' 79 | 'in xxx=yyy format will be merged into config file (deprecate), ' 80 | 'change to --cfg-options instead.') 81 | parser.add_argument( 82 | '--cfg-options', 83 | nargs='+', 84 | action=DictAction, 85 | help='override some settings in the used config, the key-value pair ' 86 | 'in xxx=yyy format will be merged into config file. If the value to ' 87 | 'be overwritten is a list, it should be like key="[a,b]" or key=a,b ' 88 | 'It also allows nested list/tuple values, e.g. 
key="[(a,b),(c,d)]" ' 89 | 'Note that the quotation marks are necessary and that no white space ' 90 | 'is allowed.') 91 | parser.add_argument( 92 | '--launcher', 93 | choices=['none', 'pytorch', 'slurm', 'mpi'], 94 | default='none', 95 | help='job launcher') 96 | parser.add_argument('--local_rank', type=int, default=0) 97 | parser.add_argument( 98 | '--autoscale-lr', 99 | action='store_true', 100 | help='automatically scale lr with the number of gpus') 101 | args = parser.parse_args() 102 | if 'LOCAL_RANK' not in os.environ: 103 | os.environ['LOCAL_RANK'] = str(args.local_rank) 104 | 105 | if args.options and args.cfg_options: 106 | raise ValueError( 107 | '--options and --cfg-options cannot be both specified, ' 108 | '--options is deprecated in favor of --cfg-options') 109 | if args.options: 110 | warnings.warn('--options is deprecated in favor of --cfg-options') 111 | args.cfg_options = args.options 112 | 113 | return args 114 | 115 | 116 | def main(): 117 | args = parse_args() 118 | 119 | cfg = Config.fromfile(args.config) 120 | if args.cfg_options is not None: 121 | cfg.merge_from_dict(args.cfg_options) 122 | 123 | # set multi-process settings 124 | setup_multi_processes(cfg) 125 | 126 | if cfg.get('custom_imports', None): 127 | from mmcv.utils import import_modules_from_strings 128 | import_modules_from_strings(**cfg['custom_imports']) 129 | 130 | # import modules from plguin/xx, registry will be updated 131 | if hasattr(cfg, 'plugin'): 132 | if cfg.plugin: 133 | import importlib 134 | if hasattr(cfg, 'plugin_dir'): 135 | plugin_dir = cfg.plugin_dir 136 | _module_dir = os.path.dirname(plugin_dir) 137 | _module_dir = _module_dir.split('/') 138 | _module_path = _module_dir[0] 139 | 140 | for m in _module_dir[1:]: 141 | _module_path = _module_path + '.' + m 142 | print(_module_path) 143 | plg_lib = importlib.import_module(_module_path) 144 | else: 145 | # import dir is the dirpath for the config file 146 | _module_dir = os.path.dirname(args.config) 147 | _module_dir = _module_dir.split('/') 148 | _module_path = _module_dir[0] 149 | for m in _module_dir[1:]: 150 | _module_path = _module_path + '.' + m 151 | print(_module_path) 152 | plg_lib = importlib.import_module(_module_path) 153 | 154 | plg_lib = importlib.import_module('mmdetection3d.mmdet3d') 155 | 156 | # set cudnn_benchmark 157 | if cfg.get('cudnn_benchmark', False): 158 | torch.backends.cudnn.benchmark = True 159 | 160 | # work_dir is determined in this priority: CLI > segment in file > filename 161 | if args.work_dir is not None: 162 | # update configs according to CLI args if args.work_dir is not None 163 | cfg.work_dir = args.work_dir 164 | elif cfg.get('work_dir', None) is None: 165 | # use config filename as default work_dir if cfg.work_dir is None 166 | cfg.work_dir = osp.join('./work_dirs', 167 | osp.splitext(osp.basename(args.config))[0]) 168 | if args.resume_from is not None: 169 | cfg.resume_from = args.resume_from 170 | 171 | if args.auto_resume: 172 | cfg.auto_resume = args.auto_resume 173 | warnings.warn('`--auto-resume` is only supported when mmdet' 174 | 'version >= 2.20.0 for 3D detection model or' 175 | 'mmsegmentation verision >= 0.21.0 for 3D' 176 | 'segmentation model') 177 | 178 | if args.gpus is not None: 179 | cfg.gpu_ids = range(1) 180 | warnings.warn('`--gpus` is deprecated because we only support ' 181 | 'single GPU mode in non-distributed training. 
' 182 | 'Use `gpus=1` now.') 183 | if args.gpu_ids is not None: 184 | cfg.gpu_ids = args.gpu_ids[0:1] 185 | warnings.warn('`--gpu-ids` is deprecated, please use `--gpu-id`. ' 186 | 'Because we only support single GPU mode in ' 187 | 'non-distributed training. Use the first GPU ' 188 | 'in `gpu_ids` now.') 189 | if args.gpus is None and args.gpu_ids is None: 190 | cfg.gpu_ids = [args.gpu_id] 191 | 192 | if args.autoscale_lr: 193 | # apply the linear scaling rule (https://arxiv.org/abs/1706.02677) 194 | cfg.optimizer['lr'] = cfg.optimizer['lr'] * len(cfg.gpu_ids) / 8 195 | 196 | # init distributed env first, since logger depends on the dist info. 197 | if args.launcher == 'none': 198 | distributed = False 199 | else: 200 | distributed = True 201 | init_dist(args.launcher, **cfg.dist_params) 202 | # re-set gpu_ids with distributed training mode 203 | _, world_size = get_dist_info() 204 | cfg.gpu_ids = range(world_size) 205 | 206 | # create work_dir 207 | mmcv.mkdir_or_exist(osp.abspath(cfg.work_dir)) 208 | # dump config 209 | cfg.dump(osp.join(cfg.work_dir, osp.basename(args.config))) 210 | # init the logger before other steps 211 | timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime()) 212 | log_file = osp.join(cfg.work_dir, f'{timestamp}.log') 213 | # specify logger name, if we still use 'mmdet', the output info will be 214 | # filtered and won't be saved in the log_file 215 | # TODO: ugly workaround to judge whether we are training det or seg model 216 | if cfg.model.type in ['EncoderDecoder3D']: 217 | logger_name = 'mmseg' 218 | else: 219 | logger_name = 'mmdet' 220 | logger = get_root_logger( 221 | log_file=log_file, log_level=cfg.log_level, name=logger_name) 222 | 223 | # init the meta dict to record some important information such as 224 | # environment info and seed, which will be logged 225 | meta = dict() 226 | # log env info 227 | env_info_dict = collect_env() 228 | env_info = '\n'.join([(f'{k}: {v}') for k, v in env_info_dict.items()]) 229 | dash_line = '-' * 60 + '\n' 230 | logger.info('Environment info:\n' + dash_line + env_info + '\n' + 231 | dash_line) 232 | meta['env_info'] = env_info 233 | meta['config'] = cfg.pretty_text 234 | 235 | # log some basic info 236 | logger.info(f'Distributed training: {distributed}') 237 | logger.info(f'Config:\n{cfg.pretty_text}') 238 | 239 | # set random seeds 240 | seed = init_random_seed(args.seed) 241 | seed = seed + dist.get_rank() if args.diff_seed else seed 242 | logger.info(f'Set random seed to {seed}, ' 243 | f'deterministic: {args.deterministic}') 244 | set_random_seed(seed, deterministic=args.deterministic) 245 | cfg.seed = seed 246 | meta['seed'] = seed 247 | meta['exp_name'] = osp.basename(args.config) 248 | 249 | model = build_model( 250 | cfg.model, 251 | train_cfg=cfg.get('train_cfg'), 252 | test_cfg=cfg.get('test_cfg')) 253 | model.init_weights() 254 | 255 | logger.info(f'Model:\n{model}') 256 | datasets = [build_dataset(cfg.data.train)] 257 | if len(cfg.workflow) == 2: 258 | val_dataset = copy.deepcopy(cfg.data.val) 259 | # in case we use a dataset wrapper 260 | if 'dataset' in cfg.data.train: 261 | val_dataset.pipeline = cfg.data.train.dataset.pipeline 262 | else: 263 | val_dataset.pipeline = cfg.data.train.pipeline 264 | # set test_mode=False here in deep copied config 265 | # which do not affect AP/AR calc ulation later 266 | # refer to https://mmdetection3d.readthedocs.io/en/latest/tutorials/customize_runtime.html#customize-workflow # noqa 267 | val_dataset.test_mode = False 268 | 
datasets.append(build_dataset(val_dataset)) 269 | if cfg.checkpoint_config is not None: 270 | # save mmdet version, config file content and class names in 271 | # checkpoints as meta data 272 | cfg.checkpoint_config.meta = dict( 273 | mmdet_version=mmdet_version, 274 | mmseg_version=mmseg_version, 275 | mmdet3d_version=mmdet3d_version, 276 | config=cfg.pretty_text, 277 | CLASSES=datasets[0].CLASSES, 278 | PALETTE=datasets[0].PALETTE # for segmentors 279 | if hasattr(datasets[0], 'PALETTE') else None) 280 | # add an attribute for visualization convenience 281 | model.CLASSES = datasets[0].CLASSES 282 | train_model( 283 | model, 284 | datasets, 285 | cfg, 286 | distributed=distributed, 287 | validate=(not args.no_validate), 288 | timestamp=timestamp, 289 | meta=meta) 290 | 291 | 292 | if __name__ == '__main__': 293 | main() -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/models/utils/cmt_transformer.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Copyright (c) 2022 megvii-model. All Rights Reserved. 3 | # ------------------------------------------------------------------------ 4 | # Modified from DETR3D (https://github.com/WangYueFt/detr3d) 5 | # Copyright (c) 2021 Wang, Yue 6 | # ------------------------------------------------------------------------ 7 | # Modified from mmdetection3d (https://github.com/open-mmlab/mmdetection3d) 8 | # Copyright (c) OpenMMLab. All rights reserved. 9 | # ------------------------------------------------------------------------ 10 | 11 | import math 12 | import copy 13 | import warnings 14 | import torch 15 | import torch.nn as nn 16 | import torch.nn.functional as F 17 | import torch.utils.checkpoint as cp 18 | 19 | from typing import Sequence 20 | from einops import rearrange 21 | from mmcv.cnn.bricks.drop import build_dropout 22 | from mmcv.runner.base_module import BaseModule 23 | from mmcv.cnn.bricks.transformer import ( 24 | BaseTransformerLayer, 25 | TransformerLayerSequence, 26 | build_transformer_layer_sequence 27 | ) 28 | from mmcv.cnn import ( 29 | build_activation_layer, 30 | build_conv_layer, 31 | build_norm_layer, 32 | xavier_init 33 | ) 34 | from mmcv.cnn.bricks.registry import ( 35 | ATTENTION, 36 | TRANSFORMER_LAYER, 37 | TRANSFORMER_LAYER_SEQUENCE 38 | ) 39 | from mmcv.utils import ( 40 | ConfigDict, 41 | build_from_cfg, 42 | deprecated_api_warning, 43 | to_2tuple 44 | ) 45 | from mmdet.models.utils.builder import TRANSFORMER 46 | 47 | 48 | @TRANSFORMER.register_module() 49 | class CmtTransformer(BaseModule): 50 | """Implements the DETR transformer. 51 | Following the official DETR implementation, this module copy-paste 52 | from torch.nn.Transformer with modifications: 53 | * positional encodings are passed in MultiheadAttention 54 | * extra LN at the end of encoder is removed 55 | * decoder returns a stack of activations from all decoding layers 56 | See `paper: End-to-End Object Detection with Transformers 57 | `_ for details. 58 | Args: 59 | encoder (`mmcv.ConfigDict` | Dict): Config of 60 | TransformerEncoder. Defaults to None. 61 | decoder ((`mmcv.ConfigDict` | Dict)): Config of 62 | TransformerDecoder. Defaults to None 63 | init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. 64 | Defaults to None. 
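        Example:
            A minimal, illustrative config for building this module through the
            ``TRANSFORMER`` registry. The decoder/attention types are the ones
            registered in this plugin's ``petr_transformer.py``; the layer count
            and dimensions below are placeholders, not values taken from the
            released configs::

                transformer = dict(
                    type='CmtTransformer',
                    decoder=dict(
                        type='PETRTransformerDecoder',
                        return_intermediate=True,
                        num_layers=6,
                        transformerlayers=dict(
                            type='PETRTransformerDecoderLayer',
                            attn_cfgs=[
                                dict(type='PETRMultiheadAttention',
                                     embed_dims=256, num_heads=8, attn_drop=0.1),
                                dict(type='PETRMultiheadAttention',
                                     embed_dims=256, num_heads=8, attn_drop=0.1),
                            ],
                            feedforward_channels=1024,
                            ffn_dropout=0.1,
                            operation_order=('self_attn', 'norm', 'cross_attn',
                                             'norm', 'ffn', 'norm'))))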
65 | """ 66 | 67 | def __init__(self, encoder=None, decoder=None, init_cfg=None, cross=False): 68 | super(CmtTransformer, self).__init__(init_cfg=init_cfg) 69 | if encoder is not None: 70 | self.encoder = build_transformer_layer_sequence(encoder) 71 | else: 72 | self.encoder = None 73 | self.decoder = build_transformer_layer_sequence(decoder) 74 | self.embed_dims = self.decoder.embed_dims 75 | self.cross = cross 76 | 77 | def init_weights(self): 78 | # follow the official DETR to init parameters 79 | for m in self.modules(): 80 | if hasattr(m, 'weight') and m.weight.dim() > 1: 81 | xavier_init(m, distribution='uniform') 82 | self._is_init = True 83 | 84 | def forward(self, x, x_img, query_embed, bev_pos_embed, rv_pos_embed, attn_masks=None, reg_branch=None): 85 | """Forward function for `Transformer`. 86 | Args: 87 | x (Tensor): Input query with shape [bs, c, h, w] where 88 | c = embed_dims. 89 | mask (Tensor): The key_padding_mask used for encoder and decoder, 90 | with shape [bs, h, w]. 91 | query_embed (Tensor): The query embedding for decoder, with shape 92 | [num_query, c]. 93 | pos_embed (Tensor): The positional encoding for encoder and 94 | decoder, with the same shape as `x`. 95 | Returns: 96 | tuple[Tensor]: results of decoder containing the following tensor. 97 | - out_dec: Output from decoder. If return_intermediate_dec \ 98 | is True output has shape [num_dec_layers, bs, 99 | num_query, embed_dims], else has shape [1, bs, \ 100 | num_query, embed_dims]. 101 | - memory: Output results from encoder, with shape \ 102 | [bs, embed_dims, h, w]. 103 | """ 104 | bs, c, h, w = x.shape 105 | bev_memory = rearrange(x, "bs c h w -> (h w) bs c") # [bs, n, c, h, w] -> [n*h*w, bs, c] 106 | rv_memory = rearrange(x_img, "(bs v) c h w -> (v h w) bs c", bs=bs) 107 | bev_pos_embed = bev_pos_embed.unsqueeze(1).repeat(1, bs, 1) # [bs, n, c, h, w] -> [n*h*w, bs, c] 108 | rv_pos_embed = rearrange(rv_pos_embed, "(bs v) h w c -> (v h w) bs c", bs=bs) 109 | 110 | memory, pos_embed = torch.cat([bev_memory, rv_memory], dim=0), torch.cat([bev_pos_embed, rv_pos_embed], dim=0) 111 | query_embed = query_embed.transpose(0, 1) # [num_query, dim] -> [num_query, bs, dim] 112 | mask = memory.new_zeros(bs, memory.shape[0]) # [bs, n, h, w] -> [bs, n*h*w] 113 | 114 | target = torch.zeros_like(query_embed) 115 | # out_dec: [num_layers, num_query, bs, dim] 116 | out_dec = self.decoder( 117 | query=target, 118 | key=memory, 119 | value=memory, 120 | key_pos=pos_embed, 121 | query_pos=query_embed, 122 | key_padding_mask=mask, 123 | attn_masks=[attn_masks, None], 124 | reg_branch=reg_branch, 125 | ) 126 | out_dec = out_dec.transpose(1, 2) 127 | return out_dec, memory 128 | 129 | 130 | @TRANSFORMER.register_module() 131 | class CmtLidarTransformer(BaseModule): 132 | """Implements the DETR transformer. 133 | Following the official DETR implementation, this module copy-paste 134 | from torch.nn.Transformer with modifications: 135 | * positional encodings are passed in MultiheadAttention 136 | * extra LN at the end of encoder is removed 137 | * decoder returns a stack of activations from all decoding layers 138 | See `paper: End-to-End Object Detection with Transformers 139 | `_ for details. 140 | Args: 141 | encoder (`mmcv.ConfigDict` | Dict): Config of 142 | TransformerEncoder. Defaults to None. 143 | decoder ((`mmcv.ConfigDict` | Dict)): Config of 144 | TransformerDecoder. Defaults to None 145 | init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. 146 | Defaults to None. 
147 | """ 148 | 149 | def __init__(self, encoder=None, decoder=None, init_cfg=None, cross=False): 150 | super(CmtLidarTransformer, self).__init__(init_cfg=init_cfg) 151 | if encoder is not None: 152 | self.encoder = build_transformer_layer_sequence(encoder) 153 | else: 154 | self.encoder = None 155 | self.decoder = build_transformer_layer_sequence(decoder) 156 | self.embed_dims = self.decoder.embed_dims 157 | self.cross = cross 158 | 159 | def init_weights(self): 160 | # follow the official DETR to init parameters 161 | for m in self.modules(): 162 | if hasattr(m, 'weight') and m.weight.dim() > 1: 163 | xavier_init(m, distribution='uniform') 164 | self._is_init = True 165 | 166 | def forward(self, x, mask, query_embed, pos_embed, attn_masks=None, reg_branch=None): 167 | """Forward function for `Transformer`. 168 | Args: 169 | x (Tensor): Input query with shape [bs, c, h, w] where 170 | c = embed_dims. 171 | mask (Tensor): The key_padding_mask used for encoder and decoder, 172 | with shape [bs, h, w]. 173 | query_embed (Tensor): The query embedding for decoder, with shape 174 | [num_query, c]. 175 | pos_embed (Tensor): The positional encoding for encoder and 176 | decoder, with the same shape as `x`. 177 | Returns: 178 | tuple[Tensor]: results of decoder containing the following tensor. 179 | - out_dec: Output from decoder. If return_intermediate_dec \ 180 | is True output has shape [num_dec_layers, bs, 181 | num_query, embed_dims], else has shape [1, bs, \ 182 | num_query, embed_dims]. 183 | - memory: Output results from encoder, with shape \ 184 | [bs, embed_dims, h, w]. 185 | """ 186 | bs, c, h, w = x.shape 187 | memory = rearrange(x, "bs c h w -> (h w) bs c") # [bs, n, c, h, w] -> [n*h*w, bs, c] 188 | pos_embed = pos_embed.unsqueeze(1).repeat(1, bs, 1) # [bs, n, c, h, w] -> [n*h*w, bs, c] 189 | query_embed = query_embed.transpose(0, 1) # [num_query, dim] -> [num_query, bs, dim] 190 | mask = mask.view(bs, -1) # [bs, n, h, w] -> [bs, n*h*w] 191 | target = torch.zeros_like(query_embed) 192 | # out_dec: [num_layers, num_query, bs, dim] 193 | out_dec = self.decoder( 194 | query=target, 195 | key=memory, 196 | value=memory, 197 | key_pos=pos_embed, 198 | query_pos=query_embed, 199 | key_padding_mask=mask, 200 | attn_masks=[attn_masks, None], 201 | reg_branch=reg_branch, 202 | ) 203 | out_dec = out_dec.transpose(1, 2) 204 | return out_dec, memory 205 | 206 | 207 | 208 | @TRANSFORMER.register_module() 209 | class FSTRTransformer(CmtLidarTransformer): 210 | """Implements the DETR transformer. 211 | Following the official DETR implementation, this module copy-paste 212 | from torch.nn.Transformer with modifications: 213 | * positional encodings are passed in MultiheadAttention 214 | * extra LN at the end of encoder is removed 215 | * decoder returns a stack of activations from all decoding layers 216 | See `paper: End-to-End Object Detection with Transformers 217 | `_ for details. 218 | Args: 219 | encoder (`mmcv.ConfigDict` | Dict): Config of 220 | TransformerEncoder. Defaults to None. 221 | decoder ((`mmcv.ConfigDict` | Dict)): Config of 222 | TransformerDecoder. Defaults to None 223 | init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. 224 | Defaults to None. 225 | """ 226 | 227 | def __init__(self, **kwargs): 228 | super(FSTRTransformer, self).__init__(**kwargs) 229 | 230 | def forward(self, x, query_embed, bev_pos_embed, attn_masks=None, bev_key_padding_mask=None, reg_branch=None, target = None): 231 | """Forward function for `Transformer`. 
232 | Args: 233 | x (Tensor): Input query with shape [bs, c, h, w] where 234 | c = embed_dims. 235 | mask (Tensor): The key_padding_mask used for encoder and decoder, 236 | with shape [bs, h, w]. 237 | query_embed (Tensor): The query embedding for decoder, with shape 238 | [num_query, c]. 239 | pos_embed (Tensor): The positional encoding for encoder and 240 | decoder, with the same shape as `x`. 241 | Returns: 242 | tuple[Tensor]: results of decoder containing the following tensor. 243 | - out_dec: Output from decoder. If return_intermediate_dec \ 244 | is True output has shape [num_dec_layers, bs, 245 | num_query, embed_dims], else has shape [1, bs, \ 246 | num_query, embed_dims]. 247 | - memory: Output results from encoder, with shape \ 248 | [bs, embed_dims, h, w]. 249 | """ 250 | bs, n, c = x.shape 251 | bev_memory = rearrange(x, "bs n c -> n bs c") # [bs, n, c, h, w] -> [n*h*w, bs, c] 252 | bev_pos_embed = rearrange(bev_pos_embed, "bs n c -> n bs c") # [bs, n, c, h, w] -> [n*h*w, bs, c] 253 | 254 | memory, pos_embed = bev_memory, bev_pos_embed 255 | query_embed = query_embed.transpose(0, 1) # [bs, num_query, dim] -> [num_query, bs, dim] 256 | 257 | if bev_key_padding_mask is None: 258 | mask = memory.new_zeros(bs, memory.shape[0]) # [bs, n, h, w] -> [bs, n*h*w] 259 | else: 260 | mask = bev_key_padding_mask 261 | 262 | assert target is not None 263 | out_dec = self.decoder( 264 | query=target, 265 | key=memory, 266 | value=memory, 267 | key_pos=pos_embed, 268 | query_pos=query_embed, 269 | key_padding_mask=mask, 270 | attn_masks=[attn_masks, None], 271 | reg_branch=reg_branch, 272 | ) 273 | out_dec = out_dec.transpose(1, 2) 274 | return out_dec, memory -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/models/utils/petr_transformer.py: -------------------------------------------------------------------------------- 1 | import math 2 | import copy 3 | import warnings 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | import torch.utils.checkpoint as cp 8 | 9 | from einops import rearrange 10 | from mmcv.cnn.bricks.drop import build_dropout 11 | from mmcv.runner.base_module import BaseModule 12 | 13 | from mmcv.cnn.bricks.transformer import ( 14 | BaseTransformerLayer, 15 | TransformerLayerSequence, 16 | build_transformer_layer_sequence 17 | ) 18 | from mmcv.cnn import ( 19 | build_activation_layer, 20 | build_conv_layer, 21 | build_norm_layer, 22 | xavier_init 23 | ) 24 | from mmcv.cnn.bricks.registry import ( 25 | ATTENTION,TRANSFORMER_LAYER, 26 | TRANSFORMER_LAYER_SEQUENCE 27 | ) 28 | from mmcv.utils import ( 29 | ConfigDict, 30 | build_from_cfg, 31 | deprecated_api_warning, 32 | to_2tuple 33 | ) 34 | from mmdet.models.utils.builder import TRANSFORMER 35 | 36 | 37 | @ATTENTION.register_module() 38 | class PETRMultiheadAttention(BaseModule): 39 | """A wrapper for ``torch.nn.MultiheadAttention``. 40 | This module implements MultiheadAttention with identity connection, 41 | and positional encoding is also passed as input. 42 | Args: 43 | embed_dims (int): The embedding dimension. 44 | num_heads (int): Parallel attention heads. 45 | attn_drop (float): A Dropout layer on attn_output_weights. 46 | Default: 0.0. 47 | proj_drop (float): A Dropout layer after `nn.MultiheadAttention`. 48 | Default: 0.0. 49 | dropout_layer (obj:`ConfigDict`): The dropout_layer used 50 | when adding the shortcut. 51 | init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. 52 | Default: None. 
53 | batch_first (bool): When it is True, Key, Query and Value are shape of 54 | (batch, n, embed_dim), otherwise (n, batch, embed_dim). 55 | Default to False. 56 | """ 57 | 58 | def __init__(self, 59 | embed_dims, 60 | num_heads, 61 | attn_drop=0., 62 | proj_drop=0., 63 | dropout_layer=dict(type='Dropout', drop_prob=0.), 64 | init_cfg=None, 65 | batch_first=False, 66 | **kwargs): 67 | super(PETRMultiheadAttention, self).__init__(init_cfg) 68 | if 'dropout' in kwargs: 69 | warnings.warn( 70 | 'The arguments `dropout` in MultiheadAttention ' 71 | 'has been deprecated, now you can separately ' 72 | 'set `attn_drop`(float), proj_drop(float), ' 73 | 'and `dropout_layer`(dict) ', DeprecationWarning) 74 | attn_drop = kwargs['dropout'] 75 | dropout_layer['drop_prob'] = kwargs.pop('dropout') 76 | 77 | self.embed_dims = embed_dims 78 | self.num_heads = num_heads 79 | self.batch_first = batch_first 80 | 81 | self.attn = nn.MultiheadAttention(embed_dims, num_heads, attn_drop, 82 | **kwargs) 83 | 84 | self.proj_drop = nn.Dropout(proj_drop) 85 | self.dropout_layer = build_dropout( 86 | dropout_layer) if dropout_layer else nn.Identity() 87 | 88 | @deprecated_api_warning({'residual': 'identity'}, 89 | cls_name='MultiheadAttention') 90 | def forward(self, 91 | query, 92 | key=None, 93 | value=None, 94 | identity=None, 95 | query_pos=None, 96 | key_pos=None, 97 | attn_mask=None, 98 | key_padding_mask=None, 99 | **kwargs): 100 | """Forward function for `MultiheadAttention`. 101 | **kwargs allow passing a more general data flow when combining 102 | with other operations in `transformerlayer`. 103 | Args: 104 | query (Tensor): The input query with shape [num_queries, bs, 105 | embed_dims] if self.batch_first is False, else 106 | [bs, num_queries embed_dims]. 107 | key (Tensor): The key tensor with shape [num_keys, bs, 108 | embed_dims] if self.batch_first is False, else 109 | [bs, num_keys, embed_dims] . 110 | If None, the ``query`` will be used. Defaults to None. 111 | value (Tensor): The value tensor with same shape as `key`. 112 | Same in `nn.MultiheadAttention.forward`. Defaults to None. 113 | If None, the `key` will be used. 114 | identity (Tensor): This tensor, with the same shape as x, 115 | will be used for the identity link. 116 | If None, `x` will be used. Defaults to None. 117 | query_pos (Tensor): The positional encoding for query, with 118 | the same shape as `x`. If not None, it will 119 | be added to `x` before forward function. Defaults to None. 120 | key_pos (Tensor): The positional encoding for `key`, with the 121 | same shape as `key`. Defaults to None. If not None, it will 122 | be added to `key` before forward function. If None, and 123 | `query_pos` has the same shape as `key`, then `query_pos` 124 | will be used for `key_pos`. Defaults to None. 125 | attn_mask (Tensor): ByteTensor mask with shape [num_queries, 126 | num_keys]. Same in `nn.MultiheadAttention.forward`. 127 | Defaults to None. 128 | key_padding_mask (Tensor): ByteTensor with shape [bs, num_keys]. 129 | Defaults to None. 130 | Returns: 131 | Tensor: forwarded results with shape 132 | [num_queries, bs, embed_dims] 133 | if self.batch_first is False, else 134 | [bs, num_queries embed_dims]. 
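        Example:
            A minimal sketch of the default behaviour (shapes are illustrative).
            When only ``query`` and ``query_pos`` are given, ``key``/``value``
            fall back to ``query`` and ``key_pos`` reuses ``query_pos``, so the
            call degenerates to self-attention with a residual connection::

                >>> import torch
                >>> self_attn = PETRMultiheadAttention(embed_dims=256, num_heads=8)
                >>> q = torch.randn(900, 2, 256)      # [num_queries, bs, embed_dims]
                >>> q_pos = torch.randn(900, 2, 256)
                >>> out = self_attn(q, query_pos=q_pos)
                >>> out.shape
                torch.Size([900, 2, 256])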
135 | """ 136 | 137 | if key is None: 138 | key = query 139 | if value is None: 140 | value = key 141 | if identity is None: 142 | identity = query 143 | if key_pos is None: 144 | if query_pos is not None: 145 | # use query_pos if key_pos is not available 146 | if query_pos.shape == key.shape: 147 | key_pos = query_pos 148 | else: 149 | warnings.warn(f'position encoding of key is' 150 | f'missing in {self.__class__.__name__}.') 151 | if query_pos is not None: 152 | query = query + query_pos 153 | if key_pos is not None: 154 | key = key + key_pos 155 | 156 | # Because the dataflow('key', 'query', 'value') of 157 | # ``torch.nn.MultiheadAttention`` is (num_query, batch, 158 | # embed_dims), We should adjust the shape of dataflow from 159 | # batch_first (batch, num_query, embed_dims) to num_query_first 160 | # (num_query ,batch, embed_dims), and recover ``attn_output`` 161 | # from num_query_first to batch_first. 162 | if self.batch_first: 163 | query = query.transpose(0, 1) 164 | key = key.transpose(0, 1) 165 | value = value.transpose(0, 1) 166 | 167 | out = self.attn( 168 | query=query, 169 | key=key, 170 | value=value, 171 | attn_mask=attn_mask, 172 | key_padding_mask=key_padding_mask)[0] 173 | 174 | if self.batch_first: 175 | out = out.transpose(0, 1) 176 | 177 | return identity + self.dropout_layer(self.proj_drop(out)) 178 | 179 | 180 | from .attention import FlashMHA 181 | 182 | @ATTENTION.register_module() 183 | class PETRMultiheadFlashAttention(BaseModule): 184 | """A wrapper for ``torch.nn.MultiheadAttention``. 185 | This module implements MultiheadAttention with identity connection, 186 | and positional encoding is also passed as input. 187 | Args: 188 | embed_dims (int): The embedding dimension. 189 | num_heads (int): Parallel attention heads. 190 | attn_drop (float): A Dropout layer on attn_output_weights. 191 | Default: 0.0. 192 | proj_drop (float): A Dropout layer after `nn.MultiheadAttention`. 193 | Default: 0.0. 194 | dropout_layer (obj:`ConfigDict`): The dropout_layer used 195 | when adding the shortcut. 196 | init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. 197 | Default: None. 198 | batch_first (bool): When it is True, Key, Query and Value are shape of 199 | (batch, n, embed_dim), otherwise (n, batch, embed_dim). 200 | Default to False. 
201 | """ 202 | 203 | def __init__(self, 204 | embed_dims, 205 | num_heads, 206 | attn_drop=0., 207 | proj_drop=0., 208 | dropout_layer=dict(type='Dropout', drop_prob=0.), 209 | init_cfg=None, 210 | batch_first=True, 211 | **kwargs): 212 | super(PETRMultiheadFlashAttention, self).__init__(init_cfg) 213 | if 'dropout' in kwargs: 214 | warnings.warn( 215 | 'The arguments `dropout` in MultiheadAttention ' 216 | 'has been deprecated, now you can separately ' 217 | 'set `attn_drop`(float), proj_drop(float), ' 218 | 'and `dropout_layer`(dict) ', DeprecationWarning) 219 | attn_drop = kwargs['dropout'] 220 | dropout_layer['drop_prob'] = kwargs.pop('dropout') 221 | 222 | self.embed_dims = embed_dims 223 | self.num_heads = num_heads 224 | self.batch_first = True 225 | 226 | self.attn = FlashMHA(embed_dims, num_heads, attn_drop, dtype=torch.float16, device='cuda', 227 | **kwargs) 228 | 229 | self.proj_drop = nn.Dropout(proj_drop) 230 | self.dropout_layer = build_dropout( 231 | dropout_layer) if dropout_layer else nn.Identity() 232 | 233 | @deprecated_api_warning({'residual': 'identity'}, 234 | cls_name='MultiheadAttention') 235 | def forward(self, 236 | query, 237 | key=None, 238 | value=None, 239 | identity=None, 240 | query_pos=None, 241 | key_pos=None, 242 | attn_mask=None, 243 | key_padding_mask=None, 244 | **kwargs): 245 | """Forward function for `MultiheadAttention`. 246 | **kwargs allow passing a more general data flow when combining 247 | with other operations in `transformerlayer`. 248 | Args: 249 | query (Tensor): The input query with shape [num_queries, bs, 250 | embed_dims] if self.batch_first is False, else 251 | [bs, num_queries embed_dims]. 252 | key (Tensor): The key tensor with shape [num_keys, bs, 253 | embed_dims] if self.batch_first is False, else 254 | [bs, num_keys, embed_dims] . 255 | If None, the ``query`` will be used. Defaults to None. 256 | value (Tensor): The value tensor with same shape as `key`. 257 | Same in `nn.MultiheadAttention.forward`. Defaults to None. 258 | If None, the `key` will be used. 259 | identity (Tensor): This tensor, with the same shape as x, 260 | will be used for the identity link. 261 | If None, `x` will be used. Defaults to None. 262 | query_pos (Tensor): The positional encoding for query, with 263 | the same shape as `x`. If not None, it will 264 | be added to `x` before forward function. Defaults to None. 265 | key_pos (Tensor): The positional encoding for `key`, with the 266 | same shape as `key`. Defaults to None. If not None, it will 267 | be added to `key` before forward function. If None, and 268 | `query_pos` has the same shape as `key`, then `query_pos` 269 | will be used for `key_pos`. Defaults to None. 270 | attn_mask (Tensor): ByteTensor mask with shape [num_queries, 271 | num_keys]. Same in `nn.MultiheadAttention.forward`. 272 | Defaults to None. 273 | key_padding_mask (Tensor): ByteTensor with shape [bs, num_keys]. 274 | Defaults to None. 275 | Returns: 276 | Tensor: forwarded results with shape 277 | [num_queries, bs, embed_dims] 278 | if self.batch_first is False, else 279 | [bs, num_queries embed_dims]. 
280 | """ 281 | 282 | if key is None: 283 | key = query 284 | if value is None: 285 | value = key 286 | if identity is None: 287 | identity = query 288 | if key_pos is None: 289 | if query_pos is not None: 290 | # use query_pos if key_pos is not available 291 | if query_pos.shape == key.shape: 292 | key_pos = query_pos 293 | else: 294 | warnings.warn(f'position encoding of key is' 295 | f'missing in {self.__class__.__name__}.') 296 | if query_pos is not None: 297 | query = query + query_pos 298 | if key_pos is not None: 299 | key = key + key_pos 300 | 301 | # Because the dataflow('key', 'query', 'value') of 302 | # ``torch.nn.MultiheadAttention`` is (num_query, batch, 303 | # embed_dims), We should adjust the shape of dataflow from 304 | # batch_first (batch, num_query, embed_dims) to num_query_first 305 | # (num_query ,batch, embed_dims), and recover ``attn_output`` 306 | # from num_query_first to batch_first. 307 | if self.batch_first: 308 | query = query.transpose(0, 1) 309 | key = key.transpose(0, 1) 310 | value = value.transpose(0, 1) 311 | 312 | out = self.attn( 313 | q=query, 314 | k=key, 315 | v=value, 316 | key_padding_mask=None)[0] 317 | 318 | if self.batch_first: 319 | out = out.transpose(0, 1) 320 | 321 | return identity + self.dropout_layer(self.proj_drop(out)) 322 | 323 | 324 | @TRANSFORMER_LAYER_SEQUENCE.register_module() 325 | class PETRTransformerDecoder(TransformerLayerSequence): 326 | """Implements the decoder in DETR transformer. 327 | Args: 328 | return_intermediate (bool): Whether to return intermediate outputs. 329 | post_norm_cfg (dict): Config of last normalization layer. Default: 330 | `LN`. 331 | """ 332 | 333 | def __init__(self, 334 | *args, 335 | post_norm_cfg=dict(type='LN'), 336 | return_intermediate=False, 337 | **kwargs): 338 | 339 | super(PETRTransformerDecoder, self).__init__(*args, **kwargs) 340 | self.return_intermediate = return_intermediate 341 | if post_norm_cfg is not None: 342 | self.post_norm = build_norm_layer(post_norm_cfg, 343 | self.embed_dims)[1] 344 | else: 345 | self.post_norm = None 346 | 347 | def forward(self, query, *args, **kwargs): 348 | """Forward function for `TransformerDecoder`. 349 | Args: 350 | query (Tensor): Input query with shape 351 | `(num_query, bs, embed_dims)`. 352 | Returns: 353 | Tensor: Results with shape [1, num_query, bs, embed_dims] when 354 | return_intermediate is `False`, otherwise it has shape 355 | [num_layers, num_query, bs, embed_dims]. 356 | """ 357 | if not self.return_intermediate: 358 | x = super().forward(query, *args, **kwargs) 359 | if self.post_norm: 360 | x = self.post_norm(x)[None] 361 | return x 362 | 363 | intermediate = [] 364 | for layer in self.layers: 365 | query = layer(query, *args, **kwargs) 366 | if self.return_intermediate: 367 | if self.post_norm is not None: 368 | intermediate.append(self.post_norm(query)) 369 | else: 370 | intermediate.append(query) 371 | return torch.stack(intermediate) 372 | 373 | 374 | @TRANSFORMER_LAYER.register_module() 375 | class PETRTransformerDecoderLayer(BaseTransformerLayer): 376 | """Implements decoder layer in DETR transformer. 377 | Args: 378 | attn_cfgs (list[`mmcv.ConfigDict`] | list[dict] | dict )): 379 | Configs for self_attention or cross_attention, the order 380 | should be consistent with it in `operation_order`. If it is 381 | a dict, it would be expand to the number of attention in 382 | `operation_order`. 383 | feedforward_channels (int): The hidden dimension for FFNs. 
384 | ffn_dropout (float): Probability of an element to be zeroed 385 | in ffn. Default 0.0. 386 | operation_order (tuple[str]): The execution order of operation 387 | in transformer. Such as ('self_attn', 'norm', 'ffn', 'norm'). 388 | Default:None 389 | act_cfg (dict): The activation config for FFNs. Default: `LN` 390 | norm_cfg (dict): Config dict for normalization layer. 391 | Default: `LN`. 392 | ffn_num_fcs (int): The number of fully-connected layers in FFNs. 393 | Default:2. 394 | """ 395 | 396 | def __init__(self, 397 | attn_cfgs, 398 | feedforward_channels, 399 | ffn_dropout=0.0, 400 | operation_order=None, 401 | act_cfg=dict(type='ReLU', inplace=True), 402 | norm_cfg=dict(type='LN'), 403 | ffn_num_fcs=2, 404 | with_cp=True, 405 | **kwargs): 406 | super(PETRTransformerDecoderLayer, self).__init__( 407 | attn_cfgs=attn_cfgs, 408 | feedforward_channels=feedforward_channels, 409 | ffn_dropout=ffn_dropout, 410 | operation_order=operation_order, 411 | act_cfg=act_cfg, 412 | norm_cfg=norm_cfg, 413 | ffn_num_fcs=ffn_num_fcs, 414 | **kwargs) 415 | assert len(operation_order) == 6 416 | assert set(operation_order) == set( 417 | ['self_attn', 'norm', 'cross_attn', 'ffn']) 418 | self.use_checkpoint = with_cp 419 | 420 | def _forward(self, 421 | query, 422 | key=None, 423 | value=None, 424 | query_pos=None, 425 | key_pos=None, 426 | attn_masks=None, 427 | query_key_padding_mask=None, 428 | key_padding_mask=None, 429 | ): 430 | """Forward function for `TransformerCoder`. 431 | Returns: 432 | Tensor: forwarded results with shape [num_query, bs, embed_dims]. 433 | """ 434 | x = super(PETRTransformerDecoderLayer, self).forward( 435 | query, 436 | key=key, 437 | value=value, 438 | query_pos=query_pos, 439 | key_pos=key_pos, 440 | attn_masks=attn_masks, 441 | query_key_padding_mask=query_key_padding_mask, 442 | key_padding_mask=key_padding_mask, 443 | ) 444 | 445 | return x 446 | 447 | def forward(self, 448 | query, 449 | key=None, 450 | value=None, 451 | query_pos=None, 452 | key_pos=None, 453 | attn_masks=None, 454 | query_key_padding_mask=None, 455 | key_padding_mask=None, 456 | **kwargs 457 | ): 458 | """Forward function for `TransformerCoder`. 459 | Returns: 460 | Tensor: forwarded results with shape [num_query, bs, embed_dims]. 461 | """ 462 | 463 | if self.use_checkpoint and self.training: 464 | x = cp.checkpoint( 465 | self._forward, 466 | query, 467 | key, 468 | value, 469 | query_pos, 470 | key_pos, 471 | attn_masks, 472 | query_key_padding_mask, 473 | key_padding_mask, 474 | ) 475 | else: 476 | x = self._forward( 477 | query, 478 | key=key, 479 | value=value, 480 | query_pos=query_pos, 481 | key_pos=key_pos, 482 | attn_masks=attn_masks, 483 | query_key_padding_mask=query_key_padding_mask, 484 | key_padding_mask=key_padding_mask 485 | ) 486 | 487 | return x 488 | -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/models/dense_heads/fstr_head.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
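# NOTE: the head defined below consumes the stacked per-layer decoder outputs
# produced by PETRTransformerDecoder with ``return_intermediate=True``. A rough
# shape sketch with illustrative sizes (6 decoder layers, batch 2, 900 queries,
# 256-dim embeddings):
#
#   decoder output        : [6, 900, 2, 256]   (num_layers, num_query, bs, C)
#   after transpose(1, 2) : [6, 2, 900, 256]   (done in the transformer wrappers)
#   SeparateTaskHead      : rearranged to [2, 6*256, 900] for grouped nn.Conv1d
#                           (one group per decoder layer), then reshaped back to
#                           [6, 2, 900, num_outputs] per regression/cls branch.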
2 | import math 3 | import copy 4 | import torch 5 | import torch.nn as nn 6 | from mmcv.cnn import build_conv_layer 7 | from mmcv.runner import BaseModule, force_fp32 8 | from mmdet.core import (build_assigner, build_sampler, multi_apply, 9 | reduce_mean, build_bbox_coder) 10 | from mmdet.models.utils import build_transformer 11 | from mmdet.models import HEADS, build_loss 12 | from mmdet.models.utils.transformer import inverse_sigmoid 13 | from mmdet3d.models.utils.clip_sigmoid import clip_sigmoid 14 | from mmdet3d.models import builder 15 | from einops import rearrange 16 | import collections 17 | 18 | from functools import reduce 19 | from projects.mmdet3d_plugin.core.bbox.util import normalize_bbox 20 | from mmdet3d.ops import make_sparse_convmodule 21 | import spconv.pytorch as spconv 22 | from mmcv.cnn import build_conv_layer 23 | import copy 24 | from spconv.core import ConvAlgo 25 | 26 | def pos2embed(pos, num_pos_feats=128, temperature=10000): 27 | scale = 2 * math.pi 28 | pos = pos * scale 29 | dim_t = torch.arange(num_pos_feats, dtype=torch.float32, device=pos.device) 30 | dim_t = temperature ** (2 * (dim_t // 2) / num_pos_feats) 31 | pos_x = pos[..., 0, None] / dim_t 32 | pos_y = pos[..., 1, None] / dim_t 33 | # pos_z = pos[..., 2, None] / dim_t 34 | pos_x = torch.stack((pos_x[..., 0::2].sin(), pos_x[..., 1::2].cos()), dim=-1).flatten(-2) 35 | pos_y = torch.stack((pos_y[..., 0::2].sin(), pos_y[..., 1::2].cos()), dim=-1).flatten(-2) 36 | # pos_z = torch.stack((pos_z[..., 0::2].sin(), pos_z[..., 1::2].cos()), dim=-1).flatten(-2) 37 | posemb = torch.cat((pos_y, pos_x), dim=-1) 38 | return posemb 39 | 40 | 41 | class LayerNormFunction(torch.autograd.Function): 42 | 43 | @staticmethod 44 | def forward(ctx, x, weight, bias, groups, eps): 45 | ctx.groups = groups 46 | ctx.eps = eps 47 | N, C, L = x.size() 48 | x = x.view(N, groups, C // groups, L) 49 | mu = x.mean(2, keepdim=True) 50 | var = (x - mu).pow(2).mean(2, keepdim=True) 51 | y = (x - mu) / (var + eps).sqrt() 52 | ctx.save_for_backward(y, var, weight) 53 | y = weight.view(1, C, 1) * y.view(N, C, L) + bias.view(1, C, 1) 54 | return y 55 | 56 | @staticmethod 57 | def backward(ctx, grad_output): 58 | groups = ctx.groups 59 | eps = ctx.eps 60 | 61 | N, C, L = grad_output.size() 62 | y, var, weight = ctx.saved_variables 63 | g = grad_output * weight.view(1, C, 1) 64 | g = g.view(N, groups, C//groups, L) 65 | mean_g = g.mean(dim=2, keepdim=True) 66 | mean_gy = (g * y).mean(dim=2, keepdim=True) 67 | gx = 1. / torch.sqrt(var + eps) * (g - y * mean_gy - mean_g) 68 | return gx.view(N, C, L), (grad_output * y.view(N, C, L)).sum(dim=2).sum(dim=0), grad_output.sum(dim=2).sum( 69 | dim=0), None, None 70 | 71 | 72 | class GroupLayerNorm1d(nn.Module): 73 | 74 | def __init__(self, channels, groups=1, eps=1e-6): 75 | super(GroupLayerNorm1d, self).__init__() 76 | self.register_parameter('weight', nn.Parameter(torch.ones(channels))) 77 | self.register_parameter('bias', nn.Parameter(torch.zeros(channels))) 78 | self.groups = groups 79 | self.eps = eps 80 | 81 | def forward(self, x): 82 | return LayerNormFunction.apply(x, self.weight, self.bias, self.groups, self.eps) 83 | 84 | 85 | @HEADS.register_module() 86 | class SeparateTaskHead(BaseModule): 87 | """SeparateHead for CenterHead. 88 | 89 | Args: 90 | in_channels (int): Input channels for conv_layer. 91 | heads (dict): Conv information. 92 | head_conv (int): Output channels. 93 | Default: 64. 94 | final_kernal (int): Kernal size for the last conv layer. 95 | Deafult: 1. 
96 | init_bias (float): Initial bias. Default: -2.19. 97 | conv_cfg (dict): Config of conv layer. 98 | Default: dict(type='Conv2d') 99 | norm_cfg (dict): Config of norm layer. 100 | Default: dict(type='BN2d'). 101 | bias (str): Type of bias. Default: 'auto'. 102 | """ 103 | 104 | def __init__(self, 105 | in_channels, 106 | heads, 107 | groups=1, 108 | head_conv=64, 109 | final_kernel=1, 110 | init_bias=-2.19, 111 | init_cfg=None, 112 | **kwargs): 113 | assert init_cfg is None, 'To prevent abnormal initialization ' \ 114 | 'behavior, init_cfg is not allowed to be set' 115 | super(SeparateTaskHead, self).__init__(init_cfg=init_cfg) 116 | self.heads = heads 117 | self.groups = groups 118 | self.init_bias = init_bias 119 | for head in self.heads: 120 | classes, num_conv = self.heads[head] 121 | 122 | conv_layers = [] 123 | c_in = in_channels 124 | for i in range(num_conv - 1): 125 | conv_layers.extend([ 126 | nn.Conv1d( 127 | c_in * groups, 128 | head_conv * groups, 129 | kernel_size=final_kernel, 130 | stride=1, 131 | padding=final_kernel // 2, 132 | groups=groups, 133 | bias=False), 134 | GroupLayerNorm1d(head_conv * groups, groups=groups), 135 | nn.ReLU(inplace=True) 136 | ]) 137 | c_in = head_conv 138 | 139 | conv_layers.append( 140 | nn.Conv1d( 141 | head_conv * groups, 142 | classes * groups, 143 | kernel_size=final_kernel, 144 | stride=1, 145 | padding=final_kernel // 2, 146 | groups=groups, 147 | bias=True)) 148 | conv_layers = nn.Sequential(*conv_layers) 149 | 150 | self.__setattr__(head, conv_layers) 151 | 152 | if init_cfg is None: 153 | self.init_cfg = dict(type='Kaiming', layer='Conv1d') 154 | 155 | def init_weights(self): 156 | """Initialize weights.""" 157 | super().init_weights() 158 | for head in self.heads: 159 | if head == 'cls_logits': 160 | self.__getattr__(head)[-1].bias.data.fill_(self.init_bias) 161 | 162 | def forward(self, x): 163 | """Forward function for SepHead. 164 | 165 | Args: 166 | x (torch.Tensor): Input feature map with the shape of 167 | [N, B, query, C]. 168 | 169 | Returns: 170 | dict[str: torch.Tensor]: contains the following keys: 171 | 172 | -reg (torch.Tensor): 2D regression value with the \ 173 | shape of [N, B, query, 2]. 174 | -height (torch.Tensor): Height value with the \ 175 | shape of [N, B, query, 1]. 176 | -dim (torch.Tensor): Size value with the shape \ 177 | of [N, B, query, 3]. 178 | -rot (torch.Tensor): Rotation value with the \ 179 | shape of [N, B, query, 2]. 180 | -vel (torch.Tensor): Velocity value with the \ 181 | shape of [N, B, query, 2]. 
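        Example:
            A minimal sketch with illustrative sizes (6 decoder layers, batch
            size 2, 900 queries, 256-dim features); the ``heads`` dict here is a
            placeholder, not the full set used by ``FSTRHead``::

                >>> import torch
                >>> head = SeparateTaskHead(
                ...     in_channels=256,
                ...     heads=dict(center=(2, 2), height=(1, 2), cls_logits=(10, 2)),
                ...     groups=6)
                >>> x = torch.randn(6, 2, 900, 256)   # [N, B, query, C]
                >>> out = head(x)
                >>> out['center'].shape
                torch.Size([6, 2, 900, 2])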
182 | """ 183 | N, B, query_num, c1 = x.shape 184 | x = rearrange(x, "n b q c -> b (n c) q") 185 | ret_dict = dict() 186 | 187 | for head in self.heads: 188 | head_output = self.__getattr__(head)(x) 189 | ret_dict[head] = rearrange(head_output, "b (n c) q -> n b q c", n=N) 190 | 191 | return ret_dict 192 | 193 | 194 | 195 | @HEADS.register_module() 196 | class FSTRHead(BaseModule): 197 | "only init lidar proposal query" 198 | def __init__(self, 199 | in_channels, 200 | num_init_query = 200, 201 | num_query=900, 202 | max_sparse_token_per_sample = 10000, 203 | proposal_head_kernel = 3, 204 | hidden_dim=128, 205 | norm_bbox=True, 206 | downsample_scale=8, 207 | scalar=10, 208 | noise_scale=1.0, 209 | noise_trans=0.0, 210 | dn_weight=1.0, 211 | split=0.75, 212 | depth_num=64, 213 | nms_kernel_size=3, 214 | init_dn_query=False, 215 | init_learnable_query = False, 216 | init_query_topk = 1, 217 | init_query_radius = 1, 218 | gauusian_dn_sampling=False, 219 | noise_mean = 0.5, 220 | noise_std = 0.125, 221 | train_cfg=None, 222 | test_cfg=None, 223 | common_heads=dict( 224 | center=(2, 2), height=(1, 2), dim=(3, 2), rot=(2, 2), vel=(2, 2) 225 | ), 226 | tasks=[ 227 | dict(num_class=1, class_names=['car']), 228 | dict(num_class=2, class_names=['truck', 'construction_vehicle']), 229 | dict(num_class=2, class_names=['bus', 'trailer']), 230 | dict(num_class=1, class_names=['barrier']), 231 | dict(num_class=2, class_names=['motorcycle', 'bicycle']), 232 | dict(num_class=2, class_names=['pedestrian', 'traffic_cone']), 233 | ], 234 | transformer=None, 235 | bbox_coder=None, 236 | loss_cls=dict( 237 | type="FocalLoss", 238 | use_sigmoid=True, 239 | reduction="mean", 240 | gamma=2, alpha=0.25, loss_weight=1.0 241 | ), 242 | loss_bbox=dict( 243 | type="L1Loss", 244 | reduction="mean", 245 | loss_weight=0.25, 246 | ), 247 | loss_heatmap=dict( 248 | type="GuassianFocalLoss", 249 | reduction="mean" 250 | ), 251 | separate_head=dict( 252 | type='SeparateMlpHead', init_bias=-2.19, final_kernel=3), 253 | init_cfg=None, 254 | **kwargs): 255 | super(FSTRHead, self).__init__(**kwargs) 256 | 257 | 258 | self.num_classes = [len(t["class_names"]) for t in tasks] 259 | self.class_names = [t["class_names"] for t in tasks] 260 | self.hidden_dim = hidden_dim 261 | self.train_cfg = train_cfg 262 | self.test_cfg = test_cfg 263 | self.num_query = num_query 264 | self.in_channels = in_channels 265 | self.norm_bbox = norm_bbox 266 | self.downsample_scale = downsample_scale 267 | self.scalar = scalar 268 | self.bbox_noise_scale = noise_scale 269 | self.bbox_noise_trans = noise_trans 270 | self.dn_weight = dn_weight 271 | self.split = split 272 | self.depth_num = depth_num 273 | self.nms_kernel_size = nms_kernel_size 274 | self.num_proposals = num_query 275 | self.loss_cls = build_loss(loss_cls) 276 | self.loss_bbox = build_loss(loss_bbox) 277 | self.loss_heatmap = build_loss(loss_heatmap) 278 | self.bbox_coder = build_bbox_coder(bbox_coder) 279 | self.pc_range = self.bbox_coder.pc_range 280 | self.fp16_enabled = False 281 | self.init_dn_query = init_dn_query 282 | self.init_learnable_query = init_learnable_query 283 | self.gauusian_dn_sampling = gauusian_dn_sampling 284 | self.noise_mean = noise_mean 285 | self.noise_std = noise_std 286 | self.init_query_topk = init_query_topk 287 | self.init_query_radius = init_query_radius 288 | 289 | # transformer 290 | self.transformer = build_transformer(transformer) 291 | # self.reference_points = nn.Embedding(num_query, 3) 292 | self.bev_embedding = nn.Sequential( 293 | 
nn.Linear(hidden_dim * 2, hidden_dim), 294 | nn.ReLU(inplace=True), 295 | nn.Linear(hidden_dim, hidden_dim) 296 | ) 297 | 298 | # task head 299 | self.task_heads = nn.ModuleList() 300 | for num_cls in self.num_classes: 301 | heads = copy.deepcopy(common_heads) 302 | heads.update(dict(cls_logits=(num_cls, 2))) 303 | separate_head.update( 304 | in_channels=hidden_dim, 305 | heads=heads, num_cls=num_cls, 306 | groups=transformer.decoder.num_layers 307 | ) 308 | self.task_heads.append(builder.build_head(separate_head)) 309 | 310 | # assigner 311 | if train_cfg: 312 | self.assigner = build_assigner(train_cfg["assigner"]) 313 | sampler_cfg = dict(type='PseudoSampler') 314 | self.sampler = build_sampler(sampler_cfg, context=self) 315 | 316 | 317 | self.num_init_query = num_init_query 318 | assert self.num_init_query < self.num_query, "number of init query must less than number of query" 319 | self.reference_points = nn.Embedding(self.num_query - self.num_init_query, 3) 320 | self.class_encoding = nn.Sequential() 321 | self.shared_conv = make_sparse_convmodule( 322 | self.in_channels, 323 | self.hidden_dim, 324 | (3,3), 325 | norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.01), 326 | padding=(1,1), 327 | indice_key='head_spconv_1', 328 | conv_type='SubMConv2d', 329 | order=('conv', 'norm', 'act')) 330 | self.sparse_maxpool_2d = spconv.SparseMaxPool2d(3, 1, 1, subm=True, algo=ConvAlgo.Native, indice_key='max_pool_head_3') 331 | self.sparse_maxpool_2d_small = spconv.SparseMaxPool2d(1, 1, 0, subm=True, algo=ConvAlgo.Native, indice_key='max_pool_head_3') 332 | self.max_sparse_token_per_sample = max_sparse_token_per_sample 333 | 334 | # for sparse heatmap 335 | self.proposal_head_kernel = proposal_head_kernel 336 | output_channels = sum(self.num_classes) 337 | num_conv = 2 338 | self.heatmap_head = nn.Sequential() 339 | fc_list = [] 340 | for k in range(num_conv - 1): 341 | fc_list.append( 342 | make_sparse_convmodule( 343 | self.hidden_dim, 344 | self.hidden_dim, 345 | self.proposal_head_kernel, 346 | norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.01), 347 | padding=int(self.proposal_head_kernel//2), 348 | indice_key='head_spconv_1', 349 | conv_type='SubMConv2d', 350 | order=('conv', 'norm', 'act')), 351 | ) 352 | fc_list.append(build_conv_layer( 353 | dict(type='SubMConv2d', indice_key='hm_out'), 354 | self.hidden_dim, 355 | sum(self.num_classes), 356 | 1, 357 | stride=1, 358 | padding=0, 359 | bias=True)) 360 | 361 | 362 | self.sparse_hm_layer = nn.Sequential(*fc_list) 363 | self.sparse_hm_layer[-1].bias.data.fill_(-2.19) 364 | 365 | @property 366 | def coords_bev(self): 367 | cfg = self.train_cfg if self.train_cfg else self.test_cfg 368 | x_size, y_size = ( 369 | cfg['grid_size'][1] // self.downsample_scale, 370 | cfg['grid_size'][0] // self.downsample_scale 371 | ) 372 | meshgrid = [[0, y_size - 1, y_size], [0, x_size - 1, x_size]] 373 | batch_y, batch_x = torch.meshgrid(*[torch.linspace(it[0], it[1], it[2]) for it in meshgrid]) 374 | batch_x = (batch_x + 0.5) / x_size 375 | batch_y = (batch_y + 0.5) / y_size 376 | coord_base = torch.cat([batch_x[None], batch_y[None]], dim=0) 377 | coord_base = coord_base.view(2, -1).transpose(1, 0) # (H*W, 2) 378 | return coord_base 379 | def init_weights(self): 380 | super(FSTRHead, self).init_weights() 381 | nn.init.uniform_(self.reference_points.weight.data, 0, 1) 382 | 383 | def _bev_query_embed(self, ref_points, img_metas): 384 | bev_embeds = self.bev_embedding(pos2embed(ref_points, num_pos_feats=self.hidden_dim)) 385 | return bev_embeds 386 | def 
forward(self, points_feats, img_metas=None): 387 | """ 388 | list([bs, c, h, w]) 389 | """ 390 | img_metas = [img_metas] 391 | return multi_apply(self.forward_single, points_feats, img_metas) 392 | 393 | def forward_single(self, x, img_metas): 394 | """ 395 | x: [bs c h w] 396 | return List(dict(head_name: [num_dec x bs x num_query * head_dim]) ) x task_num 397 | """ 398 | ret_dicts = [] 399 | batch_size = len(img_metas) 400 | x = self.shared_conv(x) 401 | x_feature = torch.zeros(*(x.features.shape),device = x.features.device) 402 | x_feature[:,:] = x.features 403 | x_batch_indices = torch.zeros(x.indices.shape[0],1,device = x.features.device) 404 | x_ind = torch.zeros(x.indices.shape[0],2,device = x.features.device) 405 | x_2dpos = torch.zeros(x.indices.shape[0],2,device = x.features.device) 406 | x_batch_indices[:,:] = x.indices[:,:1] 407 | x_ind[:,:] = x.indices[:,-2:] 408 | x_ind = x_ind.to(torch.float32) 409 | cfg = self.train_cfg if self.train_cfg else self.test_cfg 410 | y_size, x_size = x.spatial_shape 411 | x_2dpos[:,0] = (x_ind[:,1] + 0.5) / x_size 412 | x_2dpos[:,1] = (x_ind[:,0] + 0.5) / y_size 413 | batch_size = int(x.batch_size) 414 | 415 | sparse_hm = self.sparse_hm_layer(x) 416 | sparse_hm_clone = spconv.SparseConvTensor( 417 | features=sparse_hm.features.clone().detach().sigmoid(), 418 | indices=sparse_hm.indices.clone(), 419 | spatial_shape=sparse_hm.spatial_shape, 420 | batch_size=sparse_hm.batch_size 421 | ) 422 | x_hm_max = self.sparse_maxpool_2d(sparse_hm_clone, True) 423 | x_hm_max_small = self.sparse_maxpool_2d_small(sparse_hm_clone, True) 424 | 425 | 426 | selected = (x_hm_max.features == sparse_hm_clone.features) 427 | selected_small = (x_hm_max_small.features == sparse_hm_clone.features) 428 | selected[:,8] = selected_small[:,8] 429 | selected[:,9] = selected_small[:,9] 430 | 431 | score = sparse_hm_clone.features * selected 432 | score, _ = score.topk(1,dim=1) 433 | proposal_list = [] 434 | proposal_feature = [] 435 | # topk for each sample in batch 436 | for i in range(batch_size): 437 | mask = (x_batch_indices == i).squeeze(-1) 438 | sample_voxel_pos = x_2dpos[mask] 439 | sample_voxel_hm = score[mask] 440 | sample_voxel_feature = x_feature[mask] 441 | _, proposal_ind = sample_voxel_hm.topk(self.num_init_query,dim=0) 442 | proposal_list.append(sample_voxel_pos.gather(0, proposal_ind.repeat(1,2))[None,...]) 443 | proposal_feature.append(sample_voxel_feature.gather(0, proposal_ind.repeat(1,sample_voxel_feature.shape[1]))[None,...]) 444 | query_pos = torch.cat(proposal_list,dim=0) 445 | query_init_feature = torch.cat(proposal_feature,dim=0) 446 | 447 | reference_points = self.reference_points.weight 448 | reference_points = reference_points.unsqueeze(0).repeat(batch_size,1,1) 449 | 450 | init_reference_points = torch.cat([query_pos,0.5*torch.ones([*query_pos.shape[:-1],1]).to(query_pos.device)],dim=-1) 451 | 452 | reference_points = torch.cat([init_reference_points, reference_points],dim=1) 453 | 454 | reference_points, attn_mask, mask_dict = self.prepare_for_dn(batch_size, reference_points, img_metas) 455 | 456 | pad_size = mask_dict['pad_size'] if mask_dict is not None else 0 457 | 458 | target = self.get_sparse_init_query(reference_points, x_feature , x_2dpos, x_batch_indices, pad_size) 459 | 460 | bev_pos_embeds = self.bev_embedding(pos2embed(x_2dpos, num_pos_feats=self.hidden_dim)) 461 | 462 | bev_query_embeds = self.query_embed(reference_points, img_metas) 463 | query_embeds = bev_query_embeds 464 | 465 | 466 | # pad or drop 467 | 468 | batch_feature = 
torch.zeros(batch_size,self.max_sparse_token_per_sample,self.hidden_dim,device = x.features.device) 469 | batch_bevemb = torch.zeros(batch_size,self.max_sparse_token_per_sample,self.hidden_dim,device = x.features.device) 470 | 471 | for i in range(batch_size): 472 | sample_token_num = (x_batch_indices==i).sum() 473 | batch_token_num = min(sample_token_num,self.max_sparse_token_per_sample) 474 | mask = (x_batch_indices == i).squeeze(-1) 475 | sample_voxel_hm = score[mask] 476 | sample_voxel_feature = x_feature[mask] 477 | sample_voxel_bev_emb = bev_pos_embeds[mask] 478 | _, voxel_ind = sample_voxel_hm.topk(batch_token_num,dim=0) 479 | # a = sample_voxel_feature.gather(0, voxel_ind.repeat(1,sample_voxel_feature.shape[1]))[None,...] 480 | batch_feature[i][:batch_token_num] = sample_voxel_feature.gather(0, voxel_ind.repeat(1,sample_voxel_feature.shape[1])) 481 | batch_bevemb[i][:batch_token_num] = sample_voxel_bev_emb.gather(0, voxel_ind.repeat(1,sample_voxel_bev_emb.shape[1])) 482 | 483 | outs_dec, _ = self.transformer( 484 | batch_feature, query_embeds, 485 | batch_bevemb, 486 | attn_masks=attn_mask, 487 | target = target 488 | ) 489 | outs_dec = torch.nan_to_num(outs_dec) 490 | 491 | reference = inverse_sigmoid(reference_points.clone()) 492 | 493 | flag = 0 494 | for task_id, task in enumerate(self.task_heads, 0): 495 | outs = task(outs_dec) 496 | center = (outs['center'] + reference[None, :, :, :2]).sigmoid() 497 | height = (outs['height'] + reference[None, :, :, 2:3]).sigmoid() 498 | _center, _height = center.new_zeros(center.shape), height.new_zeros(height.shape) 499 | _center[..., 0:1] = center[..., 0:1] * (self.pc_range[3] - self.pc_range[0]) + self.pc_range[0] 500 | _center[..., 1:2] = center[..., 1:2] * (self.pc_range[4] - self.pc_range[1]) + self.pc_range[1] 501 | _height[..., 0:1] = height[..., 0:1] * (self.pc_range[5] - self.pc_range[2]) + self.pc_range[2] 502 | outs['center'] = _center 503 | outs['height'] = _height 504 | 505 | if mask_dict and mask_dict['pad_size'] > 0: 506 | task_mask_dict = copy.deepcopy(mask_dict) 507 | class_name = self.class_names[task_id] 508 | 509 | known_lbs_bboxes_label = task_mask_dict['known_lbs_bboxes'][0] 510 | known_labels_raw = task_mask_dict['known_labels_raw'] 511 | new_lbs_bboxes_label = known_lbs_bboxes_label.new_zeros(known_lbs_bboxes_label.shape) 512 | new_lbs_bboxes_label[:] = len(class_name) 513 | new_labels_raw = known_labels_raw.new_zeros(known_labels_raw.shape) 514 | new_labels_raw[:] = len(class_name) 515 | task_masks = [ 516 | torch.where(known_lbs_bboxes_label == class_name.index(i) + flag) 517 | for i in class_name 518 | ] 519 | task_masks_raw = [ 520 | torch.where(known_labels_raw == class_name.index(i) + flag) 521 | for i in class_name 522 | ] 523 | for cname, task_mask, task_mask_raw in zip(class_name, task_masks, task_masks_raw): 524 | new_lbs_bboxes_label[task_mask] = class_name.index(cname) 525 | new_labels_raw[task_mask_raw] = class_name.index(cname) 526 | task_mask_dict['known_lbs_bboxes'] = (new_lbs_bboxes_label, task_mask_dict['known_lbs_bboxes'][1]) 527 | task_mask_dict['known_labels_raw'] = new_labels_raw 528 | flag += len(class_name) 529 | 530 | for key in list(outs.keys()): 531 | outs['dn_' + key] = outs[key][:, :, :mask_dict['pad_size'], :] 532 | outs[key] = outs[key][:, :, mask_dict['pad_size']:, :] 533 | outs['dn_mask_dict'] = task_mask_dict 534 | 535 | ret_dicts.append(outs) 536 | ret_dicts[0]['sparse_heatmap'] = sparse_hm 537 | return ret_dicts 538 | 539 | 540 | def get_sparse_init_query(self, ref_points, 
x_feature, x_2dpos , x_batch_indices, pad_size): 541 | 542 | total_range = self.pc_range[3]-self.pc_range[0] 543 | radius = self.init_query_radius 544 | diameter = (2 * radius + 1)/total_range 545 | sigma = diameter / 6 546 | # masked_gaussian = torch.exp(- distances / (2 * sigma * sigma)) 547 | query_feature_list = [] 548 | batch_size = ref_points.shape[0] 549 | 550 | for bs in range(batch_size): 551 | sample_q = ref_points[bs][:,:2] 552 | sample_mask = x_batch_indices[:,0] == bs 553 | sample_token = x_feature[sample_mask] 554 | sample_pos = x_2dpos[sample_mask] 555 | with torch.no_grad(): 556 | dis_mat = sample_q.unsqueeze(1) - sample_pos.unsqueeze(0) 557 | dis_mat = -(dis_mat ** 2).sum(-1) 558 | nearest_dis_topk,nearest_order_topk = dis_mat.topk(self.init_query_topk ,dim=1,sorted= True) 559 | gaussian_weight = torch.exp( nearest_dis_topk / (2 * sigma * sigma)) 560 | gaussian_weight_sum = torch.clip(gaussian_weight.sum(-1),1) 561 | 562 | flatten_order = nearest_order_topk.view(-1,self.init_query_topk) 563 | flatten_weight = (gaussian_weight/gaussian_weight_sum.unsqueeze(1)).view(-1,self.init_query_topk) 564 | feature = (sample_token.gather(0, flatten_order.repeat(1,sample_token.shape[1]))*flatten_weight).view(-1,self.init_query_topk,sample_token.shape[1]).sum(1).unsqueeze(0) 565 | query_feature_list.append(feature) 566 | 567 | query_feature = torch.cat(query_feature_list,dim=0) 568 | if not self.init_dn_query: 569 | query_feature[:,:pad_size,:] *=0 570 | if not self.init_learnable_query: 571 | query_feature[:,pad_size+self.num_init_query:,:] *=0 572 | query_feature = query_feature.permute(1,0,2) 573 | 574 | 575 | return query_feature 576 | 577 | 578 | def prepare_for_dn(self, batch_size, reference_points, img_metas): 579 | if self.training: 580 | targets = [torch.cat((img_meta['gt_bboxes_3d']._data.gravity_center, img_meta['gt_bboxes_3d']._data.tensor[:, 3:]),dim=1) for img_meta in img_metas ] 581 | labels = [img_meta['gt_labels_3d']._data for img_meta in img_metas ] 582 | 583 | known = [(torch.ones_like(t)).cuda() for t in labels] 584 | know_idx = known 585 | unmask_bbox = unmask_label = torch.cat(known) 586 | known_num = [t.size(0) for t in targets] 587 | labels = torch.cat([t for t in labels]) 588 | boxes = torch.cat([t for t in targets]) 589 | batch_idx = torch.cat([torch.full((t.size(0),), i) for i, t in enumerate(targets)]) 590 | 591 | known_indice = torch.nonzero(unmask_label + unmask_bbox) 592 | known_indice = known_indice.view(-1) 593 | # add noise 594 | groups = min(self.scalar, self.num_query // max(known_num)) 595 | known_indice = known_indice.repeat(groups, 1).view(-1) 596 | known_labels = labels.repeat(groups, 1).view(-1).long().to(reference_points.device) 597 | known_labels_raw = labels.repeat(groups, 1).view(-1).long().to(reference_points.device) 598 | known_bid = batch_idx.repeat(groups, 1).view(-1) 599 | known_bboxs = boxes.repeat(groups, 1).to(reference_points.device) 600 | known_bbox_center = known_bboxs[:, :3].clone() 601 | known_bbox_scale = known_bboxs[:, 3:6].clone() 602 | 603 | # known_one_hot = F.one_hot(known_labels, self.num_classes[0]).permute(1,0) 604 | # known_query_cat_encoding = self.class_encoding(known_one_hot.float().unsqueeze(0)) 605 | if self.bbox_noise_scale > 0: 606 | diff = known_bbox_scale / 2 + self.bbox_noise_trans 607 | if self.gauusian_dn_sampling: 608 | rand_prob = torch.randn_like(known_bbox_center)*self.noise_std + self.noise_mean 609 | rand_pn = torch.rand_like(known_bbox_center) 610 | p_mask = rand_pn>0.5 611 | n_mask = rand_pn<=0.5 612 
| rand_prob[n_mask] *= -1 613 | else: 614 | rand_prob = torch.rand_like(known_bbox_center) * 2 - 1.0 615 | known_bbox_center += torch.mul(rand_prob, diff) * self.bbox_noise_scale 616 | known_bbox_center[..., 0:1] = (known_bbox_center[..., 0:1] - self.pc_range[0]) / ( 617 | self.pc_range[3] - self.pc_range[0] 618 | ) 619 | known_bbox_center[..., 1:2] = (known_bbox_center[..., 1:2] - self.pc_range[1]) / ( 620 | self.pc_range[4] - self.pc_range[1] 621 | ) 622 | known_bbox_center[..., 2:3] = (known_bbox_center[..., 2:3] - self.pc_range[2]) / ( 623 | self.pc_range[5] - self.pc_range[2] 624 | ) 625 | known_bbox_center = known_bbox_center.clamp(min=0.0, max=1.0) 626 | mask = torch.norm(rand_prob, 2, 1) > self.split 627 | known_labels[mask] = sum(self.num_classes) 628 | 629 | single_pad = int(max(known_num)) 630 | pad_size = int(single_pad * groups) 631 | padding_bbox = torch.zeros(batch_size,pad_size, 3).to(reference_points.device) 632 | # padding_cls_encoding = torch.zeros(batch_size,query_cat_encoding.shape[1],pad_size).to(reference_points.device) 633 | padded_reference_points = torch.cat([padding_bbox, reference_points], dim=1) 634 | # padding_query_cat_encoding = torch.cat([padding_cls_encoding, query_cat_encoding], dim=-1) 635 | # padding_query_cat_encoding = padding_query_cat_encoding.permute(0,2,1) 636 | # known_query_cat_encoding = known_query_cat_encoding.permute(0,2,1) 637 | 638 | if len(known_num): 639 | map_known_indice = torch.cat([torch.tensor(range(num)) for num in known_num]) # [1,2, 1,2,3] 640 | map_known_indice = torch.cat([map_known_indice + single_pad * i for i in range(groups)]).long() 641 | if len(known_bid): 642 | padded_reference_points[(known_bid.long(), map_known_indice)] = known_bbox_center.to(reference_points.device) 643 | # padding_query_cat_encoding[(known_bid.long(), map_known_indice)] = known_query_cat_encoding 644 | 645 | # padding_query_cat_encoding = padding_query_cat_encoding.permute(0,2,1) 646 | tgt_size = pad_size + self.num_query 647 | attn_mask = torch.ones(tgt_size, tgt_size).to(reference_points.device) < 0 648 | # match query cannot see the reconstruct 649 | attn_mask[pad_size:, :pad_size] = True 650 | # reconstruct cannot see each other 651 | for i in range(groups): 652 | if i == 0: 653 | attn_mask[single_pad * i : single_pad * (i + 1), single_pad * (i + 1) : pad_size] = True 654 | if i == groups - 1: 655 | attn_mask[single_pad * i : single_pad * (i + 1), : single_pad * i] = True 656 | else: 657 | attn_mask[single_pad * i : single_pad * (i + 1), single_pad * (i + 1) : pad_size] = True 658 | attn_mask[single_pad * i : single_pad * (i + 1), : single_pad * i] = True 659 | 660 | mask_dict = { 661 | "known_indice": torch.as_tensor(known_indice).long(), 662 | "batch_idx": torch.as_tensor(batch_idx).long(), 663 | "map_known_indice": torch.as_tensor(map_known_indice).long(), 664 | "known_lbs_bboxes": (known_labels, known_bboxs), 665 | "known_labels_raw": known_labels_raw, 666 | "know_idx": know_idx, 667 | "pad_size": pad_size, 668 | } 669 | 670 | else: 671 | padded_reference_points = reference_points 672 | attn_mask = None 673 | mask_dict = None 674 | # padding_query_cat_encoding = query_cat_encoding 675 | 676 | return padded_reference_points, attn_mask, mask_dict 677 | 678 | @force_fp32(apply_to=('preds_dicts')) 679 | def loss(self, gt_bboxes_3d, gt_labels_3d, preds_dicts, **kwargs): 680 | """"Loss function. 681 | Args: 682 | gt_bboxes_3d (list[LiDARInstance3DBoxes]): batch_size * (num_gts, 9) 683 | gt_labels_3d (list[Tensor]): Ground truth class indices. 
    @force_fp32(apply_to=('preds_dicts'))
    def loss(self, gt_bboxes_3d, gt_labels_3d, preds_dicts, **kwargs):
        """Loss function.
        Args:
            gt_bboxes_3d (list[LiDARInstance3DBoxes]): batch_size * (num_gts, 9)
            gt_labels_3d (list[Tensor]): Ground truth class indices. batch_size * (num_gts, )
            preds_dicts (tuple[list[dict]]): nb_tasks x num_lvl
                center: (num_dec, batch_size, num_query, 2)
                height: (num_dec, batch_size, num_query, 1)
                dim: (num_dec, batch_size, num_query, 3)
                rot: (num_dec, batch_size, num_query, 2)
                vel: (num_dec, batch_size, num_query, 2)
                cls_logits: (num_dec, batch_size, num_query, task_classes)
        Returns:
            dict[str, Tensor]: A dictionary of loss components.
        """
        num_decoder = preds_dicts[0][0]['center'].shape[0]
        all_pred_bboxes, all_pred_logits = collections.defaultdict(list), collections.defaultdict(list)

        for task_id, preds_dict in enumerate(preds_dicts, 0):
            for dec_id in range(num_decoder):
                pred_bbox = torch.cat(
                    (preds_dict[0]['center'][dec_id], preds_dict[0]['height'][dec_id],
                     preds_dict[0]['dim'][dec_id], preds_dict[0]['rot'][dec_id],
                     preds_dict[0]['vel'][dec_id]),
                    dim=-1
                )
                all_pred_bboxes[dec_id].append(pred_bbox)
                all_pred_logits[dec_id].append(preds_dict[0]['cls_logits'][dec_id])
        all_pred_bboxes = [all_pred_bboxes[idx] for idx in range(num_decoder)]
        all_pred_logits = [all_pred_logits[idx] for idx in range(num_decoder)]

        loss_cls, loss_bbox = multi_apply(
            self.loss_single, all_pred_bboxes, all_pred_logits,
            [gt_bboxes_3d for _ in range(num_decoder)],
            [gt_labels_3d for _ in range(num_decoder)],
        )

        loss_dict = dict()
        loss_dict['loss_cls'] = loss_cls[-1]
        loss_dict['loss_bbox'] = loss_bbox[-1]

        num_dec_layer = 0
        for loss_cls_i, loss_bbox_i in zip(loss_cls[:-1],
                                           loss_bbox[:-1]):
            loss_dict[f'd{num_dec_layer}.loss_cls'] = loss_cls_i
            loss_dict[f'd{num_dec_layer}.loss_bbox'] = loss_bbox_i
            num_dec_layer += 1

        dn_pred_bboxes, dn_pred_logits = collections.defaultdict(list), collections.defaultdict(list)
        dn_mask_dicts = collections.defaultdict(list)
        for task_id, preds_dict in enumerate(preds_dicts, 0):
            for dec_id in range(num_decoder):
                pred_bbox = torch.cat(
                    (preds_dict[0]['dn_center'][dec_id], preds_dict[0]['dn_height'][dec_id],
                     preds_dict[0]['dn_dim'][dec_id], preds_dict[0]['dn_rot'][dec_id],
                     preds_dict[0]['dn_vel'][dec_id]),
                    dim=-1
                )
                dn_pred_bboxes[dec_id].append(pred_bbox)
                dn_pred_logits[dec_id].append(preds_dict[0]['dn_cls_logits'][dec_id])
                dn_mask_dicts[dec_id].append(preds_dict[0]['dn_mask_dict'])
        dn_pred_bboxes = [dn_pred_bboxes[idx] for idx in range(num_decoder)]
        dn_pred_logits = [dn_pred_logits[idx] for idx in range(num_decoder)]
        dn_mask_dicts = [dn_mask_dicts[idx] for idx in range(num_decoder)]
        dn_loss_cls, dn_loss_bbox = multi_apply(
            self.dn_loss_single, dn_pred_bboxes, dn_pred_logits, dn_mask_dicts
        )

        loss_dict['dn_loss_cls'] = dn_loss_cls[-1]
        loss_dict['dn_loss_bbox'] = dn_loss_bbox[-1]
        num_dec_layer = 0
        for loss_cls_i, loss_bbox_i in zip(dn_loss_cls[:-1],
                                           dn_loss_bbox[:-1]):
            loss_dict[f'd{num_dec_layer}.dn_loss_cls'] = loss_cls_i
            loss_dict[f'd{num_dec_layer}.dn_loss_bbox'] = loss_bbox_i
            num_dec_layer += 1

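        # Auxiliary heatmap supervision: the sparse heatmap predicted on the voxel
        # features (taken from `preds_dict`, i.e. the last task's prediction dict of
        # the loop above) is matched against per-sample gaussian targets built by
        # sparse_hp_target_single() below.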
        sparse_hm_voxel = preds_dict[0]['sparse_heatmap']
        spatial_shape, batch_index, voxel_indices, spatial_indices, num_voxels = self._get_voxel_infos(sparse_hm_voxel)
        voxel_hp_target = multi_apply(
            self.sparse_hp_target_single,
            gt_bboxes_3d,
            gt_labels_3d,
            num_voxels,
            spatial_indices,
        )
        # voxel_hp_target = self.sparse_hp_target_single(sparse_hm_voxel, gt_bboxes_3d,gt_labels_3d)
        # TODO: Fix bugs for hp target (incorrect when batch_size != 1)
        hp_target = [t.permute(1, 0) for t in voxel_hp_target[0]]
        hp_target = torch.cat(hp_target, dim=0)
        pred_hm = sparse_hm_voxel.features.clone()
        loss_heatmap = self.loss_heatmap(clip_sigmoid(pred_hm), hp_target, avg_factor=max(hp_target.eq(1).float().sum().item(), 1))
        # heatmap_target = torch.cat(hp_target, dim=0)
        # loss_heatmap = self.loss_heatmap(clip_sigmoid(preds_dict[0]['dense_heatmap']), heatmap_target, avg_factor=max(heatmap_target.eq(1).float().sum().item(), 1))
        loss_dict['loss_heatmap'] = loss_heatmap
        return loss_dict


    def sparse_hp_target_single(self, gt_bboxes_3d, gt_labels_3d, num_voxels, spatial_indices):
        num_max_objs = 500
        gaussian_overlap = 0.1
        min_radius = 2
        device = gt_labels_3d.device
        gt_bboxes_3d = torch.cat([gt_bboxes_3d.gravity_center, gt_bboxes_3d.tensor[:, 3:]], dim=1).to(device)
        grid_size = torch.tensor(self.train_cfg['grid_size'])
        pc_range = torch.tensor(self.train_cfg['point_cloud_range'])
        voxel_size = torch.tensor(self.train_cfg['voxel_size'])
        feature_map_size = grid_size[:2] // self.train_cfg['out_size_factor']  # [x_len, y_len]
        # heatmap = gt_bboxes_3d.new_zeros((self.num_classes[0], feature_map_size[1], feature_map_size[0]))


        inds = gt_bboxes_3d.new_zeros(num_max_objs).long()
        mask = gt_bboxes_3d.new_zeros(num_max_objs).long()
        heatmap = gt_bboxes_3d.new_zeros(sum(self.num_classes), num_voxels)
        x, y, z = gt_bboxes_3d[:, 0], gt_bboxes_3d[:, 1], gt_bboxes_3d[:, 2]

        coord_x = (x - self.pc_range[0]) / voxel_size[0] / self.downsample_scale
        coord_y = (y - self.pc_range[1]) / voxel_size[1] / self.downsample_scale

        spatial_shape = [self.test_cfg['grid_size'][0] / self.downsample_scale, self.test_cfg['grid_size'][1] / self.downsample_scale]
        coord_x = torch.clamp(coord_x, min=0, max=spatial_shape[1] - 0.5)  # bugfixed: 1e-6 does not work for center.int()
        coord_y = torch.clamp(coord_y, min=0, max=spatial_shape[0] - 0.5)

        center = torch.cat((coord_y[:, None], coord_x[:, None]), dim=-1)
        center_int = center.int()
        center_int_float = center_int.float()

        dx, dy, dz = gt_bboxes_3d[:, 3], gt_bboxes_3d[:, 4], gt_bboxes_3d[:, 5]
        dx = dx / voxel_size[0] / self.downsample_scale
        dy = dy / voxel_size[1] / self.downsample_scale

        radius = self.gaussian_radius(dx, dy, min_overlap=gaussian_overlap)
        radius = torch.clamp_min(radius.int(), min=min_radius)

        for k in range(min(num_max_objs, gt_bboxes_3d.shape[0])):
            if dx[k] <= 0 or dy[k] <= 0:
                continue

            if not (0 <= center_int[k][0] <= spatial_shape[1] and 0 <= center_int[k][1] <= spatial_shape[0]):
                continue

            cur_class_id = (gt_labels_3d[k]).long()
            distance = self.distance(spatial_indices, center[k])
            inds[k] = distance.argmin()
            mask[k] = 1

            # gt_center
            self.draw_gaussian_to_heatmap_voxels(heatmap[cur_class_id], distance, radius[k].item() * 1)

            # nearest
            self.draw_gaussian_to_heatmap_voxels(heatmap[cur_class_id], self.distance(spatial_indices, spatial_indices[inds[k]]), radius[k].item() * 1)

        return [heatmap]

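    # The helpers below splat class-wise gaussians onto the sparse voxel set. `distances`
    # already holds squared euclidean distances in BEV voxel units, so each voxel gets a
    # weight of exp(-d^2 / (2 * sigma^2)) with sigma = (2 * radius + 1) / 6. Illustrative
    # numbers only: radius = 2 gives sigma = 5/6, and a voxel with squared distance 4
    # receives exp(-4 / (2 * (5/6)**2)) ≈ 0.056.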
    def draw_gaussian_to_heatmap_voxels(self, heatmap, distances, radius, k=1):

        diameter = 2 * radius + 1
        sigma = diameter / 6
        masked_gaussian = torch.exp(- distances / (2 * sigma * sigma))

        torch.max(heatmap, masked_gaussian, out=heatmap)

        return heatmap

    def distance(self, voxel_indices, center):
        distances = ((voxel_indices - center.unsqueeze(0)) ** 2).sum(-1)
        return distances


    def _get_voxel_infos(self, x):
        spatial_shape = x.spatial_shape
        voxel_indices = x.indices
        spatial_indices = []
        num_voxels = []
        batch_size = x.batch_size
        batch_index = voxel_indices[:, 0]

        for bs_idx in range(batch_size):
            batch_inds = batch_index == bs_idx
            spatial_indices.append(voxel_indices[batch_inds][:, [1, 2]])  # y, x
            num_voxels.append(batch_inds.sum())

        return spatial_shape, batch_index, voxel_indices, spatial_indices, num_voxels


    def gaussian_radius(self, height, width, min_overlap=0.5):
        """
        Args:
            height: (N)
            width: (N)
            min_overlap: (float) minimum required IoU overlap.
        Returns:
            radius: (N)
        """
        a1 = 1
        b1 = (height + width)
        c1 = width * height * (1 - min_overlap) / (1 + min_overlap)
        sq1 = (b1 ** 2 - 4 * a1 * c1).sqrt()
        r1 = (b1 + sq1) / 2

        a2 = 4
        b2 = 2 * (height + width)
        c2 = (1 - min_overlap) * width * height
        sq2 = (b2 ** 2 - 4 * a2 * c2).sqrt()
        r2 = (b2 + sq2) / 2

        a3 = 4 * min_overlap
        b3 = -2 * min_overlap * (height + width)
        c3 = (min_overlap - 1) * width * height
        sq3 = (b3 ** 2 - 4 * a3 * c3).sqrt()
        r3 = (b3 + sq3) / 2

        ret = torch.min(torch.min(r1, r2), r3)
        return ret

    def query_embed(self, ref_points, img_metas):
        ref_points = inverse_sigmoid(ref_points.clone()).sigmoid()
        bev_embeds = self._bev_query_embed(ref_points, img_metas)
        return bev_embeds


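    # Matching targets are built per task group: ground-truth boxes are first split
    # according to self.class_names into task-local label ranges, and each task then
    # runs its own assignment via self.assigner on that task's predictions.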
    def _get_targets_single(self, gt_bboxes_3d, gt_labels_3d, pred_bboxes, pred_logits):
        """Compute regression and classification targets for one image.
        Outputs from a single decoder layer of a single feature level are used.
        Args:
            gt_bboxes_3d (LiDARInstance3DBoxes): Ground truth boxes (num_gts, 9)
            gt_labels_3d (Tensor): Ground truth class indices (num_gts, )
            pred_bboxes (list[Tensor]): num_tasks x (num_query, 10)
            pred_logits (list[Tensor]): num_tasks x (num_query, task_classes)
        Returns:
            tuple[Tensor]: a tuple containing the following.
                - labels_tasks (list[Tensor]): num_tasks x (num_query, ).
                - label_weights_tasks (list[Tensor]): num_tasks x (num_query, ).
                - bbox_targets_tasks (list[Tensor]): num_tasks x (num_query, 9).
                - bbox_weights_tasks (list[Tensor]): num_tasks x (num_query, 10).
                - pos_inds (list[Tensor]): num_tasks x Sampled positive indices.
                - neg_inds (Tensor): num_tasks x Sampled negative indices.
        """
        device = gt_labels_3d.device
        gt_bboxes_3d = torch.cat(
            (gt_bboxes_3d.gravity_center, gt_bboxes_3d.tensor[:, 3:]), dim=1
        ).to(device)

        task_masks = []
        flag = 0
        for class_name in self.class_names:
            task_masks.append([
                torch.where(gt_labels_3d == class_name.index(i) + flag)
                for i in class_name
            ])
            flag += len(class_name)

        task_boxes = []
        task_classes = []
        flag2 = 0
        for idx, mask in enumerate(task_masks):
            task_box = []
            task_class = []
            for m in mask:
                task_box.append(gt_bboxes_3d[m])
                task_class.append(gt_labels_3d[m] - flag2)
            task_boxes.append(torch.cat(task_box, dim=0).to(device))
            task_classes.append(torch.cat(task_class).long().to(device))
            flag2 += len(mask)

        def task_assign(bbox_pred, logits_pred, gt_bboxes, gt_labels, num_classes):
            num_bboxes = bbox_pred.shape[0]
            assign_results = self.assigner.assign(bbox_pred, logits_pred, gt_bboxes, gt_labels)
            sampling_result = self.sampler.sample(assign_results, bbox_pred, gt_bboxes)
            pos_inds, neg_inds = sampling_result.pos_inds, sampling_result.neg_inds
            # label targets
            labels = gt_bboxes.new_full((num_bboxes, ),
                                        num_classes,
                                        dtype=torch.long)
            labels[pos_inds] = gt_labels[sampling_result.pos_assigned_gt_inds]
            label_weights = gt_bboxes.new_ones(num_bboxes)
            # bbox_targets
            code_size = gt_bboxes.shape[1]
            bbox_targets = torch.zeros_like(bbox_pred)[..., :code_size]
            bbox_weights = torch.zeros_like(bbox_pred)
            bbox_weights[pos_inds] = 1.0

            if len(sampling_result.pos_gt_bboxes) > 0:
                bbox_targets[pos_inds] = sampling_result.pos_gt_bboxes
            return labels, label_weights, bbox_targets, bbox_weights, pos_inds, neg_inds

        labels_tasks, labels_weights_tasks, bbox_targets_tasks, bbox_weights_tasks, pos_inds_tasks, neg_inds_tasks \
            = multi_apply(task_assign, pred_bboxes, pred_logits, task_boxes, task_classes, self.num_classes)

        return labels_tasks, labels_weights_tasks, bbox_targets_tasks, bbox_weights_tasks, pos_inds_tasks, neg_inds_tasks

    def get_targets(self, gt_bboxes_3d, gt_labels_3d, preds_bboxes, preds_logits):
        """Compute regression and classification targets for a batch of images.
        Outputs from a single decoder layer of a single feature level are used.
        Args:
            gt_bboxes_3d (list[LiDARInstance3DBoxes]): batch_size * (num_gts, 9)
            gt_labels_3d (list[Tensor]): Ground truth class indices. batch_size * (num_gts, )
            preds_bboxes (list[list[Tensor]]): batch_size x num_task x [num_query, 10].
            preds_logits (list[list[Tensor]]): batch_size x num_task x [num_query, task_classes]
        Returns:
            tuple: a tuple containing the following targets.
                - task_labels_list (list[list[Tensor]]): num_tasks x batch_size x (num_query, ).
                - task_labels_weight_list (list[Tensor]): num_tasks x batch_size x (num_query, )
                - task_bbox_targets_list (list[Tensor]): num_tasks x batch_size x (num_query, 9)
                - task_bbox_weights_list (list[Tensor]): num_tasks x batch_size x (num_query, 10)
                - num_total_pos_tasks (list[int]): num_tasks x Number of positive samples
                - num_total_neg_tasks (list[int]): num_tasks x Number of negative samples.
        """
        (labels_list, labels_weight_list, bbox_targets_list,
         bbox_weights_list, pos_inds_list, neg_inds_list) = multi_apply(
            self._get_targets_single, gt_bboxes_3d, gt_labels_3d, preds_bboxes, preds_logits
        )
        task_num = len(labels_list[0])
        num_total_pos_tasks, num_total_neg_tasks = [], []
        task_labels_list, task_labels_weight_list, task_bbox_targets_list, \
            task_bbox_weights_list = [], [], [], []

        for task_id in range(task_num):
            num_total_pos_task = sum((inds[task_id].numel() for inds in pos_inds_list))
            num_total_neg_task = sum((inds[task_id].numel() for inds in neg_inds_list))
            num_total_pos_tasks.append(num_total_pos_task)
            num_total_neg_tasks.append(num_total_neg_task)
            task_labels_list.append([labels_list[batch_idx][task_id] for batch_idx in range(len(gt_bboxes_3d))])
            task_labels_weight_list.append([labels_weight_list[batch_idx][task_id] for batch_idx in range(len(gt_bboxes_3d))])
            task_bbox_targets_list.append([bbox_targets_list[batch_idx][task_id] for batch_idx in range(len(gt_bboxes_3d))])
            task_bbox_weights_list.append([bbox_weights_list[batch_idx][task_id] for batch_idx in range(len(gt_bboxes_3d))])

        return (task_labels_list, task_labels_weight_list, task_bbox_targets_list,
                task_bbox_weights_list, num_total_pos_tasks, num_total_neg_tasks)

    def _loss_single_task(self,
                          pred_bboxes,
                          pred_logits,
                          labels_list,
                          labels_weights_list,
                          bbox_targets_list,
                          bbox_weights_list,
                          num_total_pos,
                          num_total_neg):
        """Compute loss for a single task.
        Outputs from a single decoder layer of a single feature level are used.
        Args:
            pred_bboxes (Tensor): (batch_size, num_query, 10)
            pred_logits (Tensor): (batch_size, num_query, task_classes)
            labels_list (list[Tensor]): batch_size x (num_query, )
            labels_weights_list (list[Tensor]): batch_size x (num_query, )
            bbox_targets_list (list[Tensor]): batch_size x (num_query, 9)
            bbox_weights_list (list[Tensor]): batch_size x (num_query, 10)
            num_total_pos: int
            num_total_neg: int
        Returns:
            loss_cls
            loss_bbox
        """
        labels = torch.cat(labels_list, dim=0)
        labels_weights = torch.cat(labels_weights_list, dim=0)
        bbox_targets = torch.cat(bbox_targets_list, dim=0)
        bbox_weights = torch.cat(bbox_weights_list, dim=0)

        pred_bboxes_flatten = pred_bboxes.flatten(0, 1)
        pred_logits_flatten = pred_logits.flatten(0, 1)

        cls_avg_factor = num_total_pos * 1.0 + num_total_neg * 0.1
        cls_avg_factor = max(cls_avg_factor, 1)
        loss_cls = self.loss_cls(
            pred_logits_flatten, labels, labels_weights, avg_factor=cls_avg_factor
        )

        normalized_bbox_targets = normalize_bbox(bbox_targets, self.pc_range)
        isnotnan = torch.isfinite(normalized_bbox_targets).all(dim=-1)
        bbox_weights = bbox_weights * bbox_weights.new_tensor(self.train_cfg.code_weights)[None, :]

        loss_bbox = self.loss_bbox(
            pred_bboxes_flatten[isnotnan, :10],
            normalized_bbox_targets[isnotnan, :10],
            bbox_weights[isnotnan, :10],
            avg_factor=num_total_pos
        )

        loss_cls = torch.nan_to_num(loss_cls)
        loss_bbox = torch.nan_to_num(loss_bbox)
        return loss_cls, loss_bbox

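    # loss_single() below regroups the task-wise predictions sample by sample, builds
    # targets with get_targets(), and sums the per-task losses from _loss_single_task().
    # The classification avg_factor above is num_total_pos + 0.1 * num_total_neg
    # (e.g. 20 positives and 1000 negatives give an avg_factor of 120), mirroring
    # DETR-style heads.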
    def loss_single(self,
                    pred_bboxes,
                    pred_logits,
                    gt_bboxes_3d,
                    gt_labels_3d):
        """Loss function for outputs from a single decoder layer of a single
        feature level.
        Args:
            pred_bboxes (list[Tensor]): num_tasks x [bs, num_query, 10].
            pred_logits (list[Tensor]): num_tasks x [bs, num_query, task_classes]
            gt_bboxes_3d (list[LiDARInstance3DBoxes]): batch_size * (num_gts, 9)
            gt_labels_3d (list[Tensor]): Ground truth class indices. batch_size * (num_gts, )
        Returns:
            dict[str, Tensor]: A dictionary of loss components for outputs from
                a single decoder layer.
        """
        batch_size = pred_bboxes[0].shape[0]
        pred_bboxes_list, pred_logits_list = [], []
        for idx in range(batch_size):
            pred_bboxes_list.append([task_pred_bbox[idx] for task_pred_bbox in pred_bboxes])
            pred_logits_list.append([task_pred_logits[idx] for task_pred_logits in pred_logits])
        cls_reg_targets = self.get_targets(
            gt_bboxes_3d, gt_labels_3d, pred_bboxes_list, pred_logits_list
        )
        (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list,
         num_total_pos, num_total_neg) = cls_reg_targets
        loss_cls_tasks, loss_bbox_tasks = multi_apply(
            self._loss_single_task,
            pred_bboxes,
            pred_logits,
            labels_list,
            label_weights_list,
            bbox_targets_list,
            bbox_weights_list,
            num_total_pos,
            num_total_neg
        )

        return sum(loss_cls_tasks), sum(loss_bbox_tasks)

    def _dn_loss_single_task(self,
                             pred_bboxes,
                             pred_logits,
                             mask_dict):
        known_labels, known_bboxs = mask_dict['known_lbs_bboxes']
        map_known_indice = mask_dict['map_known_indice'].long()
        known_indice = mask_dict['known_indice'].long()
        batch_idx = mask_dict['batch_idx'].long()
        bid = batch_idx[known_indice]
        known_labels_raw = mask_dict['known_labels_raw']

        pred_logits = pred_logits[(bid, map_known_indice)]
        pred_bboxes = pred_bboxes[(bid, map_known_indice)]
        num_tgt = known_indice.numel()

        # filter task bbox
        task_mask = known_labels_raw != pred_logits.shape[-1]
        task_mask_sum = task_mask.sum()

        if task_mask_sum > 0:
            # pred_logits = pred_logits[task_mask]
            # known_labels = known_labels[task_mask]
            pred_bboxes = pred_bboxes[task_mask]
            known_bboxs = known_bboxs[task_mask]

        # classification loss
        # construct weighted avg_factor to match with the official DETR repo
        cls_avg_factor = num_tgt * 3.14159 / 6 * self.split * self.split * self.split
        # if self.sync_cls_avg_factor:
        #     cls_avg_factor = reduce_mean(
        #         pred_logits.new_tensor([cls_avg_factor]))

        label_weights = torch.ones_like(known_labels)
        cls_avg_factor = max(cls_avg_factor, 1)
        loss_cls = self.loss_cls(
            pred_logits, known_labels.long(), label_weights, avg_factor=cls_avg_factor)

        # Compute the average number of gt boxes across all gpus, for
        # normalization purposes
        num_tgt = loss_cls.new_tensor([num_tgt])
        num_tgt = torch.clamp(reduce_mean(num_tgt), min=1).item()

        # regression L1 loss
        normalized_bbox_targets = normalize_bbox(known_bboxs, self.pc_range)
        isnotnan = torch.isfinite(normalized_bbox_targets).all(dim=-1)
        bbox_weights = torch.ones_like(pred_bboxes)
        bbox_weights = bbox_weights * bbox_weights.new_tensor(self.train_cfg.code_weights)[None, :]
        # bbox_weights[:, 6:8] = 0
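        # As in _loss_single_task, the regression term is weighted by
        # train_cfg.code_weights and normalized by the GPU-averaged number of
        # denoising targets (num_tgt).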
        loss_bbox = self.loss_bbox(
            pred_bboxes[isnotnan, :10], normalized_bbox_targets[isnotnan, :10], bbox_weights[isnotnan, :10], avg_factor=num_tgt)

        loss_cls = torch.nan_to_num(loss_cls)
        loss_bbox = torch.nan_to_num(loss_bbox)

        if task_mask_sum == 0:
            # loss_cls = loss_cls * 0.0
            loss_bbox = loss_bbox * 0.0

        return self.dn_weight * loss_cls, self.dn_weight * loss_bbox

    def dn_loss_single(self,
                       pred_bboxes,
                       pred_logits,
                       dn_mask_dict):
        loss_cls_tasks, loss_bbox_tasks = multi_apply(
            self._dn_loss_single_task, pred_bboxes, pred_logits, dn_mask_dict
        )
        return sum(loss_cls_tasks), sum(loss_bbox_tasks)

    @force_fp32(apply_to=('preds_dicts'))
    def get_bboxes(self, preds_dicts, img_metas, img=None, rescale=False):
        preds_dicts = self.bbox_coder.decode(preds_dicts)
        num_samples = len(preds_dicts)

        ret_list = []
        for i in range(num_samples):
            preds = preds_dicts[i]
            bboxes = preds['bboxes']
            bboxes[:, 2] = bboxes[:, 2] - bboxes[:, 5] * 0.5
            bboxes = img_metas[i]['box_type_3d'](bboxes, bboxes.size(-1))
            scores = preds['scores']
            labels = preds['labels']
            ret_list.append([bboxes, scores, labels])
        return ret_list
--------------------------------------------------------------------------------