├── projects
│   ├── mmdet3d_plugin
│   │   ├── core
│   │   │   ├── __init__.py
│   │   │   └── bbox
│   │   │       ├── coders
│   │   │       │   ├── __init__.py
│   │   │       │   └── multi_task_bbox_coder.py
│   │   │       ├── assigners
│   │   │       │   ├── __init__.py
│   │   │       │   └── hungarian_assigner_3d.py
│   │   │       ├── match_costs
│   │   │       │   ├── __init__.py
│   │   │       │   └── match_cost.py
│   │   │       └── util.py
│   │   ├── mmcv_custom
│   │   │   ├── ops
│   │   │   │   ├── __init__.py
│   │   │   │   └── voxel
│   │   │   │       ├── __init__.py
│   │   │   │       └── spconv_voxelize.py
│   │   │   ├── runner
│   │   │   │   ├── __init__.py
│   │   │   │   └── hooks
│   │   │   │       ├── __init__.py
│   │   │   │       └── optimizer.py
│   │   │   └── __init__.py
│   │   ├── datasets
│   │   │   ├── __init__.py
│   │   │   └── custom_nuscenes_dataset.py
│   │   ├── models
│   │   │   ├── detectors
│   │   │   │   ├── __init__.py
│   │   │   │   └── fstr.py
│   │   │   ├── utils
│   │   │   │   ├── __init__.py
│   │   │   │   ├── attention.py
│   │   │   │   ├── cmt_transformer.py
│   │   │   │   └── petr_transformer.py
│   │   │   ├── backbones
│   │   │   │   ├── __init__.py
│   │   │   │   └── voxelnext.py
│   │   │   ├── __init__.py
│   │   │   └── dense_heads
│   │   │       ├── __init__.py
│   │   │       └── fstr_head.py
│   │   └── __init__.py
│   └── configs
│       └── lidar
│           ├── fstr_voxel0075_cbgs_20e.py
│           ├── fstr_large_voxel0075_cbgs_20e.py
│           └── fstr_xlarge_voxel0050_cbgs_20e.py
├── tools
│   ├── dist_train.sh
│   ├── dist_test.sh
│   ├── test.py
│   └── train.py
├── .gitignore
├── README.md
└── LICENSE
/projects/mmdet3d_plugin/core/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/projects/mmdet3d_plugin/mmcv_custom/ops/__init__.py:
--------------------------------------------------------------------------------
1 | from .voxel import *
--------------------------------------------------------------------------------
/projects/mmdet3d_plugin/mmcv_custom/runner/__init__.py:
--------------------------------------------------------------------------------
1 | from .hooks import *
--------------------------------------------------------------------------------
/projects/mmdet3d_plugin/mmcv_custom/__init__.py:
--------------------------------------------------------------------------------
1 | from .runner import *
2 | from .ops import *
--------------------------------------------------------------------------------
/projects/mmdet3d_plugin/datasets/__init__.py:
--------------------------------------------------------------------------------
1 | from .custom_nuscenes_dataset import CustomNuScenesDataset
--------------------------------------------------------------------------------
/projects/mmdet3d_plugin/mmcv_custom/ops/voxel/__init__.py:
--------------------------------------------------------------------------------
1 | from .spconv_voxelize import SPConvVoxelization
--------------------------------------------------------------------------------
/projects/mmdet3d_plugin/mmcv_custom/runner/hooks/__init__.py:
--------------------------------------------------------------------------------
1 | from .optimizer import CustomFp16OptimizerHook
--------------------------------------------------------------------------------
/projects/mmdet3d_plugin/models/detectors/__init__.py:
--------------------------------------------------------------------------------
1 | from .fstr import FSTRDetector
2 | __all__ = ['FSTRDetector']
--------------------------------------------------------------------------------
/projects/mmdet3d_plugin/models/utils/__init__.py:
--------------------------------------------------------------------------------
1 | from .cmt_transformer import *
2 | from .petr_transformer import *
3 |
--------------------------------------------------------------------------------
/projects/mmdet3d_plugin/models/backbones/__init__.py:
--------------------------------------------------------------------------------
1 | from .voxelnext import VoxelNextEncoder
2 | __all__ = ['VoxelNextEncoder']
--------------------------------------------------------------------------------
/projects/mmdet3d_plugin/core/bbox/coders/__init__.py:
--------------------------------------------------------------------------------
1 | from .multi_task_bbox_coder import MultiTaskBBoxCoder
2 |
3 | __all__ = ['MultiTaskBBoxCoder']
--------------------------------------------------------------------------------
/projects/mmdet3d_plugin/core/bbox/assigners/__init__.py:
--------------------------------------------------------------------------------
1 | from .hungarian_assigner_3d import HungarianAssigner3D
2 |
3 | __all__ = ['HungarianAssigner3D']
--------------------------------------------------------------------------------
/projects/mmdet3d_plugin/models/__init__.py:
--------------------------------------------------------------------------------
1 | from .backbones import *
2 | from .detectors import *
3 | from .dense_heads import *
4 | from .utils import *
5 |
6 |
--------------------------------------------------------------------------------
/projects/mmdet3d_plugin/models/dense_heads/__init__.py:
--------------------------------------------------------------------------------
1 | from .fstr_head import (
2 | FSTRHead,
3 | SeparateTaskHead,
4 | )
5 |
6 | __all__ = ['SeparateTaskHead', 'FSTRHead']
--------------------------------------------------------------------------------
/projects/mmdet3d_plugin/__init__.py:
--------------------------------------------------------------------------------
1 | from .core.bbox.assigners import *
2 | from .core.bbox.coders import *
3 | from .core.bbox.match_costs import BBox3DL1Cost
4 | from .datasets import *
5 | from .mmcv_custom import *
6 | from .models import *
--------------------------------------------------------------------------------
/projects/mmdet3d_plugin/core/bbox/match_costs/__init__.py:
--------------------------------------------------------------------------------
1 | from mmdet.core.bbox.match_costs import build_match_cost
2 | from .match_cost import BBox3DL1Cost, BBoxBEVL1Cost, IoU3DCost
3 |
4 | __all__ = ['build_match_cost', 'BBox3DL1Cost', 'BBoxBEVL1Cost', 'IoU3DCost']
5 |
--------------------------------------------------------------------------------
/tools/dist_train.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | CONFIG=$1
4 | GPUS=$2
5 | NNODES=${NNODES:-1}
6 | NODE_RANK=${NODE_RANK:-0}
7 | PORT=${PORT:-29500}
8 | MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"}
9 |
10 | PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \
11 | python -m torch.distributed.launch \
12 | --nnodes=$NNODES \
13 | --node_rank=$NODE_RANK \
14 | --master_addr=$MASTER_ADDR \
15 | --nproc_per_node=$GPUS \
16 | --master_port=$PORT \
17 | $(dirname "$0")/train.py \
18 | $CONFIG \
19 | --seed 0 \
20 | --launcher pytorch ${@:3}
21 |
--------------------------------------------------------------------------------
/tools/dist_test.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | CONFIG=$1
4 | CHECKPOINT=$2
5 | GPUS=$3
6 | NNODES=${NNODES:-1}
7 | NODE_RANK=${NODE_RANK:-0}
8 | PORT=${PORT:-29500}
9 | MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"}
10 |
11 | PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \
12 | python -m torch.distributed.launch \
13 | --nnodes=$NNODES \
14 | --node_rank=$NODE_RANK \
15 | --master_addr=$MASTER_ADDR \
16 | --nproc_per_node=$GPUS \
17 | --master_port=$PORT \
18 | $(dirname "$0")/test.py \
19 | $CONFIG \
20 | $CHECKPOINT \
21 | --launcher pytorch \
22 | ${@:4}
23 |
--------------------------------------------------------------------------------
/projects/mmdet3d_plugin/mmcv_custom/runner/hooks/optimizer.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 |
5 | from mmcv.runner.hooks.optimizer import Fp16OptimizerHook
6 | from mmcv.runner.hooks import HOOKS
7 |
8 |
9 | @HOOKS.register_module()
10 | class CustomFp16OptimizerHook(Fp16OptimizerHook):
11 |
12 | def __init__(self,
13 | custom_fp16={},
14 | *args,
15 | **kwargs):
16 | super(CustomFp16OptimizerHook, self).__init__(*args, **kwargs)
17 | self.custom_fp16 = custom_fp16
18 |
19 | def before_run(self, runner) -> None:
20 | super().before_run(runner)
21 | for module_name, v in self.custom_fp16.items():
22 | runner.model.module._modules[module_name].fp16_enabled = v
23 |
--------------------------------------------------------------------------------
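
For context, `CustomFp16OptimizerHook` above is selected through the `optimizer_config` entry of a config file; the `custom_fp16` dict maps top-level submodule names of the (DDP-wrapped) detector to the `fp16_enabled` flag they should receive. A minimal sketch (the module name, loss scale and clipping values below are illustrative, not taken from the released configs):

```python
# Illustrative optimizer_config fragment for an mmdet3d-style config file.
# Keys of `custom_fp16` are attribute names on the detector; their boolean
# values are written into `<module>.fp16_enabled` before training starts.
optimizer_config = dict(
    type='CustomFp16OptimizerHook',
    loss_scale='dynamic',                        # any Fp16OptimizerHook loss_scale works
    grad_clip=dict(max_norm=35, norm_type=2),    # illustrative clipping settings
    custom_fp16=dict(pts_middle_encoder=False),  # hypothetical: keep the sparse encoder in FP32
)
```
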
/projects/mmdet3d_plugin/core/bbox/match_costs/match_cost.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from mmdet.core.bbox.match_costs.builder import MATCH_COST
3 |
4 |
5 | @MATCH_COST.register_module()
6 | class BBox3DL1Cost(object):
7 | """BBox3DL1Cost.
8 | Args:
9 | weight (int | float, optional): loss_weight
10 | """
11 |
12 | def __init__(self, weight=1.):
13 | self.weight = weight
14 |
15 | def __call__(self, bbox_pred, gt_bboxes):
16 | """
17 | Args:
18 | bbox_pred (Tensor): Predicted boxes with normalized coordinates
19 | (cx, cy, w, h), which are all in range [0, 1]. Shape
20 | [num_query, 4].
21 | gt_bboxes (Tensor): Ground truth boxes with normalized
22 | coordinates (x1, y1, x2, y2). Shape [num_gt, 4].
23 | Returns:
24 | torch.Tensor: bbox_cost value with weight
25 | """
26 | bbox_cost = torch.cdist(bbox_pred, gt_bboxes, p=1)
27 | return bbox_cost * self.weight
28 |
29 |
30 | @MATCH_COST.register_module()
31 | class BBoxBEVL1Cost(object):
32 | def __init__(self, weight):
33 | self.weight = weight
34 |
35 | def __call__(self, bboxes, gt_bboxes, pc_range):
36 | pc_start = bboxes.new(pc_range[0:2])
37 | pc_range = bboxes.new(pc_range[3:5]) - bboxes.new(pc_range[0:2])
38 | # normalize the box center to [0, 1]
39 | normalized_bboxes_xy = (bboxes[:, :2] - pc_start) / pc_range
40 | normalized_gt_bboxes_xy = (gt_bboxes[:, :2] - pc_start) / pc_range
41 | reg_cost = torch.cdist(normalized_bboxes_xy, normalized_gt_bboxes_xy, p=1)
42 | return reg_cost * self.weight
43 |
44 |
45 | @MATCH_COST.register_module()
46 | class IoU3DCost(object):
47 | def __init__(self, weight):
48 | self.weight = weight
49 |
50 | def __call__(self, iou):
51 | iou_cost = - iou
52 | return iou_cost * self.weight
--------------------------------------------------------------------------------
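
To make the intent of these cost classes concrete, here is a toy sketch (assuming the environment from the README so the plugin imports cleanly; the weights and tensor sizes are made up) that combines a BEV L1 cost with a 3D IoU cost into one matrix and runs Hungarian matching on it:

```python
import torch
from scipy.optimize import linear_sum_assignment

from projects.mmdet3d_plugin.core.bbox.match_costs.match_cost import (
    BBoxBEVL1Cost, IoU3DCost)

pc_range = [-54.0, -54.0, -5.0, 54.0, 54.0, 3.0]   # illustrative nuScenes-style range
pred_boxes = torch.rand(6, 7) * 20.0               # 6 predictions; only (x, y) is used below
gt_boxes = torch.rand(4, 7) * 20.0                 # 4 ground-truth boxes
ious = torch.rand(6, 4)                            # pairwise 3D IoUs from some IoU calculator

# weighted sum of the two costs, shape [num_pred, num_gt]
cost = BBoxBEVL1Cost(weight=0.25)(pred_boxes, gt_boxes, pc_range) \
     + IoU3DCost(weight=0.25)(ious)
pred_idx, gt_idx = linear_sum_assignment(cost.numpy())
```
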
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 | *.ipynb
6 |
7 | # C extensions
8 | *.so
9 |
10 | # Distribution / packaging
11 | .Python
12 | tmp/
13 | build/
14 | develop-eggs/
15 | dist/
16 | downloads/
17 | eggs/
18 | .eggs/
19 | lib/
20 | lib64/
21 | parts/
22 | sdist/
23 | var/
24 | wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 | .figs
30 |
31 | mmdetection3d/
32 | mmdetection3d
33 | mmdet3d
34 |
35 | # PyInstaller
36 | # Usually these files are written by a python script from a template
37 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
38 | *.manifest
39 | *.spec
40 |
41 | # Installer logs
42 | pip-log.txt
43 | pip-delete-this-directory.txt
44 |
45 | # Unit test / coverage reports
46 | htmlcov/
47 | .tox/
48 | .coverage
49 | .coverage.*
50 | .cache
51 | hostfile.txt
52 | nosetests.xml
53 | coverage.xml
54 | *.cover
55 | .hypothesis/
56 | .pytest_cache/
57 |
58 | # Translations
59 | *.mo
60 | *.pot
61 |
62 | # Django stuff:
63 | *.log
64 | local_settings.py
65 | db.sqlite3
66 |
67 | # Flask stuff:
68 | instance/
69 | .webassets-cache
70 |
71 | # Scrapy stuff:
72 | .scrapy
73 |
74 | # Sphinx documentation
75 | docs/_build/
76 |
77 | # PyBuilder
78 | target/
79 |
80 | # Jupyter Notebook
81 | .ipynb_checkpoints
82 |
83 | # pyenv
84 | .python-version
85 |
86 | # celery beat schedule file
87 | celerybeat-schedule
88 |
89 | # SageMath parsed files
90 | *.sage.py
91 |
92 | # Environments
93 | .env
94 | .venv
95 | env/
96 | venv/
97 | ENV/
98 | env.bak/
99 | venv.bak/
100 |
101 | # Spyder project settings
102 | .spyderproject
103 | .spyproject
104 |
105 | # Rope project settings
106 | .ropeproject
107 |
108 | # mkdocs documentation
109 | /site
110 |
111 | # mypy
112 | .mypy_cache/
113 |
114 | # cython generated cpp
115 | data
116 | ckpts
117 | .vscode
118 | .idea
119 |
120 | # custom
121 | nuscenes_gt_database
122 | nuscenes_unified_gt_database
123 | work_dirs
124 | *.pkl
125 | *.pkl.json
126 | *.log.json
127 | work_dirs/
128 | exps/
129 | *~
130 | mmdet3d/.mim
131 |
132 | # Pytorch
133 | *.pth
134 |
135 |
136 | # demo
137 | figs
138 |
139 |
--------------------------------------------------------------------------------
/projects/mmdet3d_plugin/core/bbox/util.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 | import numpy as np
4 | import mmdet3d
5 | from mmdet3d.core import limit_period
6 |
7 |
8 | def normalize_bbox(bboxes, pc_range=None):
9 |
10 | cx = bboxes[..., 0:1]
11 | cy = bboxes[..., 1:2]
12 | cz = bboxes[..., 2:3]
13 | w = bboxes[..., 3:4].log()
14 | l = bboxes[..., 4:5].log()
15 | h = bboxes[..., 5:6].log()
16 |
17 | rot = bboxes[..., 6:7]
18 | if bboxes.size(-1) > 7:
19 | vx = bboxes[..., 7:8]
20 | vy = bboxes[..., 8:9]
21 | normalized_bboxes = torch.cat(
22 | (cx, cy, w, l, cz, h, rot.sin(), rot.cos(), vx, vy), dim=-1
23 | )
24 | else:
25 | normalized_bboxes = torch.cat(
26 | (cx, cy, w, l, cz, h, rot.sin(), rot.cos()), dim=-1
27 | )
28 | return normalized_bboxes
29 |
30 |
31 | def denormalize_bbox(normalized_bboxes, pc_range=None):
32 | # rotation
33 | rot_sine = normalized_bboxes[..., 6:7]
34 |
35 | rot_cosine = normalized_bboxes[..., 7:8]
36 | rot = torch.atan2(rot_sine, rot_cosine)
37 |
38 | # center in the bev
39 | cx = normalized_bboxes[..., 0:1]
40 | cy = normalized_bboxes[..., 1:2]
41 | cz = normalized_bboxes[..., 4:5]
42 |
43 | # size
44 | w = normalized_bboxes[..., 2:3]
45 | l = normalized_bboxes[..., 3:4]
46 | h = normalized_bboxes[..., 5:6]
47 |
48 | w = w.exp()
49 | l = l.exp()
50 | h = h.exp()
51 |
52 | if normalized_bboxes.size(-1) > 8:
53 | # velocity
54 | vx = normalized_bboxes[..., 8:9]
55 | vy = normalized_bboxes[..., 9:10]
56 | denormalized_bboxes = torch.cat([cx, cy, cz, w, l, h, rot, vx, vy], dim=-1)
57 | else:
58 | denormalized_bboxes = torch.cat([cx, cy, cz, w, l, h, rot], dim=-1)
59 | return denormalized_bboxes
60 |
61 |
62 | def bbox3d_mapping_back(bboxes, rot_degree, scale_factor, flip_horizontal, flip_vertical):
63 | """Map bboxes from testing scale to original image scale.
64 |
65 | Args:
66 | bboxes (:obj:`BaseInstance3DBoxes`): Boxes to be mapped back.
67 |         rot_degree (float): Rotation angle applied during test-time augmentation.
68 |         scale_factor (float): Scale factor.
69 |         flip_horizontal, flip_vertical (bool): Whether to flip horizontally / vertically.
70 |
71 | Returns:
72 | :obj:`BaseInstance3DBoxes`: Boxes mapped back.
73 | """
74 | new_bboxes = bboxes.clone()
75 | if flip_horizontal:
76 | new_bboxes.flip('horizontal')
77 | if flip_vertical:
78 | new_bboxes.flip('vertical')
79 | new_bboxes.scale(1 / scale_factor)
80 | new_bboxes.rotate(-rot_degree)
81 |
82 | return new_bboxes
--------------------------------------------------------------------------------
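
As a quick sanity check of the two helpers above (a minimal sketch, assuming the plugin imports): `normalize_bbox` maps a 9-dim box (cx, cy, cz, w, l, h, yaw, vx, vy) to the 10-dim regression target (cx, cy, log w, log l, cz, log h, sin yaw, cos yaw, vx, vy), and `denormalize_bbox` inverts it as long as the yaw lies in (-pi, pi]:

```python
import torch
from projects.mmdet3d_plugin.core.bbox.util import normalize_bbox, denormalize_bbox

box = torch.tensor([[1.0, 2.0, -1.0, 4.0, 2.0, 1.5, 0.3, 0.5, -0.2]])  # one 9-dim box
recovered = denormalize_bbox(normalize_bbox(box))
assert torch.allclose(box, recovered, atol=1e-5)  # round-trip up to float precision
```
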
/projects/mmdet3d_plugin/mmcv_custom/ops/voxel/spconv_voxelize.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2023 megvii-model. All Rights Reserved.
2 |
3 | import numpy as np
4 | from torch import nn
5 | from spconv.pytorch.utils import PointToVoxel # spconv-cu111 2.1.21
6 | import torch
7 | import torch.nn.functional as F
8 | from torch.nn.modules.utils import _pair
9 |
10 |
11 | class SPConvVoxelization(nn.Module):
12 | def __init__(self, voxel_size, point_cloud_range, max_num_points, max_voxels, num_point_features, device=torch.device("cuda")):
13 | super().__init__()
14 | assert len(voxel_size) == 3
15 | assert len(point_cloud_range) == 6
16 | self.voxel_size = np.array(voxel_size)
17 | self.point_cloud_range = np.array(point_cloud_range)
18 | self.max_num_points = max_num_points
19 | self.num_point_features = num_point_features
20 | self.device = device
21 | if isinstance(max_voxels, tuple):
22 | self.max_voxels = max_voxels
23 | else:
24 | self.max_voxels = _pair(max_voxels)
25 | self.voxel_generator = PointToVoxel(
26 | vsize_xyz=voxel_size,
27 | coors_range_xyz=point_cloud_range,
28 | max_num_points_per_voxel=max_num_points,
29 | max_num_voxels=self.max_voxels[0],
30 | num_point_features=num_point_features,
31 | device=device,
32 | )
33 | grid_size = (self.point_cloud_range[3:6] - self.point_cloud_range[0:3]) / np.array(voxel_size)
34 | self.grid_size = np.round(grid_size).astype(np.int64)
35 |
36 | def train(self, mode: bool = True):
37 | if mode:
38 | self.voxel_generator = PointToVoxel(
39 | vsize_xyz=self.voxel_size.tolist(),
40 | coors_range_xyz=self.point_cloud_range.tolist(),
41 | max_num_points_per_voxel=self.max_num_points,
42 | max_num_voxels=self.max_voxels[0],
43 | num_point_features=self.num_point_features,
44 | device=self.device,
45 | )
46 | else:
47 | self.voxel_generator = PointToVoxel(
48 | vsize_xyz=self.voxel_size.tolist(),
49 | coors_range_xyz=self.point_cloud_range.tolist(),
50 | max_num_points_per_voxel=self.max_num_points,
51 | max_num_voxels=self.max_voxels[1],
52 | num_point_features=self.num_point_features,
53 | device=self.device,
54 | )
55 |
56 | return super().train(mode)
57 |
58 | def forward(self, points):
59 | voxel_output = self.voxel_generator(points)
60 | voxels, coordinates, num_points = voxel_output
61 | return torch.clone(voxels), torch.clone(coordinates), torch.clone(num_points)
62 |
63 | def __repr__(self):
64 | tmpstr = self.__class__.__name__ + '('
65 | tmpstr += 'voxel_size=' + str(self.voxel_size)
66 | tmpstr += ', point_cloud_range=' + str(self.point_cloud_range)
67 | tmpstr += ', max_num_points=' + str(self.max_num_points)
68 | tmpstr += ', max_voxels=' + str(self.max_voxels)
69 | tmpstr += ', num_point_features=' + str(self.num_point_features)
70 | tmpstr += ')'
71 | return tmpstr
72 |
--------------------------------------------------------------------------------
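
A minimal usage sketch for the module above (requires spconv-plus and a CUDA device; the voxel size, range and limits are illustrative, the real values live in the configs under `projects/configs/lidar/`):

```python
import torch
from projects.mmdet3d_plugin.mmcv_custom.ops.voxel.spconv_voxelize import SPConvVoxelization

voxelizer = SPConvVoxelization(
    voxel_size=[0.075, 0.075, 0.2],
    point_cloud_range=[-54.0, -54.0, -5.0, 54.0, 54.0, 3.0],
    max_num_points=10,
    max_voxels=(120000, 160000),   # (training, testing) voxel budgets
    num_point_features=5,          # e.g. x, y, z, intensity, timestamp
)
points = torch.rand(20000, 5, device='cuda') * 30.0 - 15.0  # random toy point cloud
voxels, coords, num_points = voxelizer(points)              # points outside the range are dropped
```
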
/README.md:
--------------------------------------------------------------------------------
1 |
2 | # Fully Sparse Transformer 3D Detector for LiDAR Point Cloud
3 |
4 | [Paper](https://ieeexplore.ieee.org/document/10302363), [nuScenes LeaderBoard](https://www.nuscenes.org/object-detection?externalData=all&mapData=all&modalities=Lidar)
5 |
6 |
7 |
8 | All statistics are measured on a single Tesla A100 GPU using the best model of each official repository. Some sparse modules in the models are supported.
9 |
10 |
11 | FSTR is a fully sparse LiDAR-based detector that achieves a better accuracy-efficiency trade-off compared with other popular LiDAR-based detectors. A lightweight DETR-like framework with a single decoder layer is designed for LiDAR-only detection, which obtains **73.6%** NDS (**FSTR-XLarge with TTA**) on the nuScenes benchmark and **31.5%** CDS (**FSTR-Large**) on the Argoverse2 validation dataset.
12 |
13 | ## Currently Supported Features
14 | - [x] Support nuScenes dataset
15 | - [ ] Support Argoverse2 dataset
16 | ## Preparation
17 |
18 | * Environments
19 | Python == 3.8 \
20 | CUDA == 11.1 \
21 | pytorch == 1.9.0 \
22 | mmcv-full == 1.6.0 \
23 | mmdet == 2.24.0 \
24 | mmsegmentation == 0.29.1 \
25 | mmdet3d == 1.0.0rc5 \
26 | [flash-attn](https://github.com/HazyResearch/flash-attention) == 0.2.2 \
27 | [Spconv-plus](https://github.com/dvlab-research/spconv-plus) == 2.1.21
28 |
29 | * Data
30 | Follow the [mmdet3d](https://github.com/open-mmlab/mmdetection3d/blob/master/docs/en/data_preparation.md) to process the nuScenes dataset.
31 |
32 | ## Train & inference
33 | ```bash
34 | # train
35 | bash tools/dist_train.sh /path_to_your_config 8
36 | # inference
37 | bash tools/dist_test.sh /path_to_your_config /path_to_your_pth 8 --eval bbox
38 | ```
39 | ## Main Results
40 | Results on nuScenes **val set**. The default batch size is 2 on each GPU. The FPS is evaluated with a single Tesla A100 GPU. (15e + 5e means the last 5 epochs are trained without [GT sampling](https://github.com/Poley97/FSTR/blob/master/projects/configs/lidar/fstr_voxel0075_cbgs_20e.py#L33-L69).)
41 |
42 | | Config | mAP | NDS | Schedule|Inference FPS|
43 | |:--------:|:----------:|:---------:|:--------:|:--------:|
44 | | [FSTR](./projects/configs/lidar/fstr_voxel0075_cbgs_20e.py) | 64.2% | 69.1% | 15e+5e | 15.4 |
45 | | [FSTR-Large](./projects/configs/lidar/fstr_large_voxel0075_cbgs_20e.py) | 65.5% | 70.3% | 15e+5e | 9.5 |
46 |
47 |
48 | Results on nuScenes **test set**. To reproduce our results, replace `ann_file=data_root + '/nuscenes_infos_train.pkl'` in the [training config](./projects/configs/lidar/fstr_large_voxel0075_cbgs_20e.py) with `ann_file=[data_root + '/nuscenes_infos_train.pkl', data_root + '/nuscenes_infos_val.pkl']`.
49 |
50 | | Config | mAP | NDS | Schedule|Inference FPS|
51 | |:--------:|:----------:|:---------:|:--------:|:--------:|
52 | | [FSTR](./projects/configs/lidar/fstr_voxel0075_cbgs_20e.py) | 66.2% | 70.4% | 15e+5e | 15.4 |
53 | | [FSTR](./projects/configs/lidar/fstr_voxel0075_cbgs_20e.py) +TTA | 67.6% | 71.5% | 15e+5e | - |
54 | | [FSTR-Large](./projects/configs/lidar/fstr_large_voxel0075_cbgs_20e.py) + TTA | 69.5% | 73.0% | 15e+5e | - |
55 | | [FSTR-XLarge](./projects/configs/lidar/fstr_xlarge_voxel0050_cbgs_20e.py) + TTA | 70.2% | 73.5% | 15e+5e | - |
56 |
57 | ## Citation
58 | If you find our FSTR helpful in your research, please consider citing:
59 | ```bibtex
60 | @article{zhang2023fully,
61 | title={Fully Sparse Transformer 3D Detector for LiDAR Point Cloud},
62 | author={Zhang, Diankun and Zheng, Zhijie and Niu, Haoyu and Wang, Xueqing and Liu, Xiaojun},
63 | journal={IEEE Transactions on Geoscience and Remote Sensing},
64 | year={2023},
65 | publisher={IEEE}
66 | }
67 | ```
68 |
69 | ## Contact
70 | If you have any questions, feel free to open an issue or contact us at zhangdiankun19@mails.ucas.edu.cn, or tanfeiyang@megvii.com.
71 |
72 | ## Acknowledgement
73 | Parts of our code refer to the recent work [CMT](https://github.com/junjie18/CMT).
74 |
--------------------------------------------------------------------------------
/projects/mmdet3d_plugin/datasets/custom_nuscenes_dataset.py:
--------------------------------------------------------------------------------
1 | # ------------------------------------------------------------------------
2 | # Copyright (c) 2023 megvii-model. All Rights Reserved.
3 | # ------------------------------------------------------------------------
4 | # Modified from DETR3D (https://github.com/WangYueFt/detr3d)
5 | # Copyright (c) 2021 Wang, Yue
6 | # ------------------------------------------------------------------------
7 | # Modified from mmdetection3d (https://github.com/open-mmlab/mmdetection3d)
8 | # Copyright (c) OpenMMLab. All rights reserved.
9 | # ------------------------------------------------------------------------
10 |
11 | import numpy as np
12 | from mmdet.datasets import DATASETS
13 | from mmdet3d.datasets import NuScenesDataset
14 |
15 |
16 | @DATASETS.register_module()
17 | class CustomNuScenesDataset(NuScenesDataset):
18 | r"""NuScenes Dataset.
19 |
20 | This dataset only adds camera intrinsics and extrinsics to the results.
21 | """
22 |
23 | def __init__(self, *args, return_gt_info=False, **kwargs):
24 | super(CustomNuScenesDataset, self).__init__(*args, **kwargs)
25 | self.return_gt_info = return_gt_info
26 |
27 | def get_data_info(self, index):
28 | """Get data info according to the given index.
29 |
30 | Args:
31 | index (int): Index of the sample data to get.
32 |
33 | Returns:
34 | dict: Data information that will be passed to the data \
35 | preprocessing pipelines. It includes the following keys:
36 |
37 | - sample_idx (str): Sample index.
38 | - pts_filename (str): Filename of point clouds.
39 | - sweeps (list[dict]): Infos of sweeps.
40 | - timestamp (float): Sample timestamp.
41 | - img_filename (str, optional): Image filename.
42 | - lidar2img (list[np.ndarray], optional): Transformations \
43 | from lidar to different cameras.
44 | - ann_info (dict): Annotation info.
45 | """
46 | info = self.data_infos[index]
47 | # standard protocol modified from SECOND.Pytorch
48 | input_dict = dict(
49 | sample_idx=info['token'],
50 | pts_filename=info['lidar_path'],
51 | sweeps=info['sweeps'],
52 | timestamp=info['timestamp'] / 1e6,
53 | img_sweeps=None if 'img_sweeps' not in info else info['img_sweeps'],
54 | radar_info=None if 'radars' not in info else info['radars']
55 | )
56 |
57 | if self.return_gt_info:
58 | input_dict['info'] = info
59 |
60 | if self.modality['use_camera']:
61 | image_paths = []
62 | lidar2img_rts = []
63 | lidar2cam_rts = []
64 | cam_intrinsics = []
65 | img_timestamp = []
66 | for cam_type, cam_info in info['cams'].items():
67 | img_timestamp.append(cam_info['timestamp'] / 1e6)
68 | image_paths.append(cam_info['data_path'])
69 | # obtain lidar to image transformation matrix
70 | lidar2cam_r = np.linalg.inv(cam_info['sensor2lidar_rotation'])
71 | lidar2cam_t = cam_info[
72 | 'sensor2lidar_translation'] @ lidar2cam_r.T
73 | lidar2cam_rt = np.eye(4)
74 | lidar2cam_rt[:3, :3] = lidar2cam_r.T
75 | lidar2cam_rt[3, :3] = -lidar2cam_t
76 | intrinsic = cam_info['cam_intrinsic']
77 | viewpad = np.eye(4)
78 | viewpad[:intrinsic.shape[0], :intrinsic.shape[1]] = intrinsic
79 | lidar2img_rt = (viewpad @ lidar2cam_rt.T)
80 | lidar2img_rts.append(lidar2img_rt)
81 |
82 | cam_intrinsics.append(viewpad)
83 | lidar2cam_rts.append(lidar2cam_rt.T)
84 |
85 | input_dict.update(
86 | dict(
87 | img_timestamp=img_timestamp,
88 | img_filename=image_paths,
89 | lidar2img=lidar2img_rts,
90 | cam_intrinsic=cam_intrinsics,
91 | lidar2cam=lidar2cam_rts,
92 | ))
93 |
94 | if not self.test_mode:
95 | annos = self.get_ann_info(index)
96 | input_dict['ann_info'] = annos
97 |
98 | return input_dict
99 |
--------------------------------------------------------------------------------
/projects/mmdet3d_plugin/core/bbox/coders/multi_task_bbox_coder.py:
--------------------------------------------------------------------------------
1 | # ------------------------------------------------------------------------
2 | # Copyright (c) 2023 megvii-model. All Rights Reserved.
3 | # ------------------------------------------------------------------------
4 | # Modified from mmdetection (https://github.com/open-mmlab/mmdetection)
5 | # Copyright (c) OpenMMLab. All rights reserved.
6 | # ------------------------------------------------------------------------
7 |
8 | import torch
9 |
10 | from mmdet.core.bbox import BaseBBoxCoder
11 | from mmdet.core.bbox.builder import BBOX_CODERS
12 | from projects.mmdet3d_plugin.core.bbox.util import denormalize_bbox
13 |
14 |
15 | @BBOX_CODERS.register_module()
16 | class MultiTaskBBoxCoder(BaseBBoxCoder):
17 | """Bbox coder for NMS-free detector.
18 | Args:
19 | pc_range (list[float]): Range of point cloud.
20 | post_center_range (list[float]): Limit of the center.
21 | Default: None.
22 | max_num (int): Max number to be kept. Default: 100.
23 | score_threshold (float): Threshold to filter boxes based on score.
24 | Default: None.
25 | code_size (int): Code size of bboxes. Default: 9
26 | """
27 |
28 | def __init__(self,
29 | pc_range,
30 | voxel_size=None,
31 | post_center_range=None,
32 | max_num=100,
33 | score_threshold=None,
34 | num_classes=10):
35 |
36 | self.pc_range = pc_range
37 | self.voxel_size = voxel_size
38 | self.post_center_range = post_center_range
39 | self.max_num = max_num
40 | self.score_threshold = score_threshold
41 | self.num_classes = num_classes
42 |
43 | def encode(self):
44 | pass
45 |
46 | def decode_single(self, cls_scores, bbox_preds, task_ids):
47 | """Decode bboxes.
48 | Args:
49 | cls_scores (Tensor): Outputs from the classification head, \
50 | shape [num_query, cls_out_channels]. Note \
51 | cls_out_channels should include background.
52 | bbox_preds (Tensor): Outputs from the regression \
53 | head with normalized coordinate format (cx, cy, w, l, cz, h, rot_sine, rot_cosine, vx, vy). \
54 | Shape [num_query, 9].
55 | Returns:
56 | list[dict]: Decoded boxes.
57 | """
58 | max_num = self.max_num
59 | num_query = cls_scores.shape[0]
60 |
61 | cls_scores = cls_scores.sigmoid()
62 | scores, indexs = cls_scores.view(-1).topk(max_num)
63 | labels = indexs % self.num_classes
64 | bbox_index = indexs // self.num_classes
65 | task_index = torch.gather(task_ids, 1, labels.unsqueeze(1)).squeeze()
66 |
67 | bbox_preds = bbox_preds[task_index * num_query + bbox_index]
68 |
69 | final_box_preds = denormalize_bbox(bbox_preds, self.pc_range)
70 | final_scores = scores
71 | final_preds = labels
72 |
73 | # use score threshold
74 | if self.score_threshold is not None:
75 | thresh_mask = final_scores > self.score_threshold
76 | if self.post_center_range is not None:
77 | self.post_center_range = torch.tensor(
78 | self.post_center_range, device=scores.device)
79 | mask = (final_box_preds[..., :3] >=
80 | self.post_center_range[:3]).all(1)
81 | mask &= (final_box_preds[..., :3] <=
82 | self.post_center_range[3:]).all(1)
83 |
84 | if self.score_threshold:
85 | mask &= thresh_mask
86 |
87 | boxes3d = final_box_preds[mask]
88 | scores = final_scores[mask]
89 | labels = final_preds[mask]
90 | predictions_dict = {
91 | 'bboxes': boxes3d,
92 | 'scores': scores,
93 | 'labels': labels
94 | }
95 |
96 | else:
97 | raise NotImplementedError(
98 | 'Need to reorganize output as a batch, only '
99 | 'support post_center_range is not None for now!')
100 | return predictions_dict
101 |
102 | def decode(self, preds_dicts):
103 | """Decode bboxes.
104 | Args:
105 | all_cls_scores (Tensor): Outputs from the classification head, \
106 | shape [nb_dec, bs, num_query, cls_out_channels]. Note \
107 | cls_out_channels should includes background.
108 | all_bbox_preds (Tensor): Sigmoid outputs from the regression \
109 | head with normalized coordinate format (cx, cy, w, l, cz, h, rot_sine, rot_cosine, vx, vy). \
110 | Shape [nb_dec, bs, num_query, 9].
111 | Returns:
112 | list[dict]: Decoded boxes.
113 | """
114 | task_num = len(preds_dicts)
115 |
116 | pred_bbox_list, pred_logits_list, task_ids_list = [], [], []
117 | for task_id in range(task_num):
118 | task_pred_dict = preds_dicts[task_id][0]
119 | task_pred_bbox = torch.cat(
120 | (task_pred_dict['center'][-1], task_pred_dict['height'][-1],
121 | task_pred_dict['dim'][-1], task_pred_dict['rot'][-1],
122 | task_pred_dict['vel'][-1]),
123 | dim=-1
124 | )
125 | task_pred_logits = task_pred_dict['cls_logits'][-1]
126 | pred_bbox_list.append(task_pred_bbox)
127 | pred_logits_list.append(task_pred_logits)
128 |
129 | task_ids = task_pred_logits.new_ones(task_pred_logits.shape).int() * task_id
130 | task_ids_list.append(task_ids)
131 |
132 |
133 | all_pred_logits = torch.cat(pred_logits_list, dim=-1) # bs * nq * 10
134 | all_pred_bbox = torch.cat(pred_bbox_list, dim=1) # bs * (task nq) * 10
135 | all_task_ids = torch.cat(task_ids_list, dim=-1) # bs * nq * 10
136 |
137 | batch_size = all_pred_logits.shape[0]
138 | predictions_list = []
139 | for i in range(batch_size):
140 | predictions_list.append(
141 | self.decode_single(all_pred_logits[i], all_pred_bbox[i], all_task_ids[i]))
142 | return predictions_list
--------------------------------------------------------------------------------
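
The `preds_dicts` structure that `decode` expects is easiest to see with dummy tensors. A toy sketch (a single task head and one decoder layer; all sizes and ranges are illustrative, and the plugin is assumed importable):

```python
import torch
from projects.mmdet3d_plugin.core.bbox.coders.multi_task_bbox_coder import MultiTaskBBoxCoder

nb_dec, bs, nq, num_cls = 1, 2, 200, 2
task_pred = dict(                       # every value: [num_decoder_layers, batch, num_query, C]
    center=torch.randn(nb_dec, bs, nq, 2),
    height=torch.randn(nb_dec, bs, nq, 1),
    dim=torch.randn(nb_dec, bs, nq, 3),
    rot=torch.randn(nb_dec, bs, nq, 2),
    vel=torch.randn(nb_dec, bs, nq, 2),
    cls_logits=torch.randn(nb_dec, bs, nq, num_cls),
)
preds_dicts = [[task_pred]]             # list over task heads; one task in this toy example

coder = MultiTaskBBoxCoder(
    pc_range=[-54.0, -54.0, -5.0, 54.0, 54.0, 3.0],
    post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
    max_num=100,
    num_classes=num_cls,
)
results = coder.decode(preds_dicts)     # one dict per sample: 'bboxes', 'scores', 'labels'
```
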
/projects/mmdet3d_plugin/models/utils/attention.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2023 megvii-model. All Rights Reserved.
2 |
3 | import math
4 | import torch
5 | import torch.nn as nn
6 | from torch.nn.init import (
7 | xavier_uniform_,
8 | constant_,
9 | xavier_normal_
10 | )
11 | from torch.nn.functional import linear
12 |
13 | from einops import rearrange
14 | from mmcv.runner import auto_fp16
15 | from mmcv.runner.base_module import BaseModule
16 |
17 | from flash_attn.flash_attn_interface import flash_attn_unpadded_kvpacked_func
18 | from flash_attn.bert_padding import unpad_input, pad_input, index_first_axis
19 |
20 |
21 | def _in_projection_packed(q, k, v, w, b = None):
22 | w_q, w_k, w_v = w.chunk(3)
23 | if b is None:
24 | b_q = b_k = b_v = None
25 | else:
26 | b_q, b_k, b_v = b.chunk(3)
27 | return linear(q, w_q, b_q), linear(k, w_k, b_k), linear(v, w_v, b_v)
28 |
29 |
30 | class FlashAttention(nn.Module):
31 | """Implement the scaled dot product attention with softmax.
32 | Arguments
33 | ---------
34 | softmax_scale: The temperature to use for the softmax attention.
35 | (default: 1/sqrt(d_keys) where d_keys is computed at
36 | runtime)
37 | attention_dropout: The dropout rate to apply to the attention
38 | (default: 0.1)
39 | """
40 | def __init__(self, softmax_scale=None, attention_dropout=0.0, device=None, dtype=None):
41 | super().__init__()
42 | self.softmax_scale = softmax_scale
43 | self.dropout_p = attention_dropout
44 | self.fp16_enabled = True
45 |
46 | @auto_fp16(apply_to=('q', 'kv'), out_fp32=True)
47 | def forward(self, q, kv,
48 | causal=False,
49 | key_padding_mask=None):
50 | """Implements the multihead softmax attention.
51 | Arguments
52 | ---------
53 | q: The tensor containing the query. (B, T, H, D)
54 | kv: The tensor containing the key, and value. (B, S, 2, H, D)
55 | key_padding_mask: a bool tensor of shape (B, S)
56 | """
57 | assert q.dtype in [torch.float16, torch.bfloat16] and kv.dtype in [torch.float16, torch.bfloat16]
58 | assert q.is_cuda and kv.is_cuda
59 | assert q.shape[0] == kv.shape[0] and q.shape[-2] == kv.shape[-2] and q.shape[-1] == kv.shape[-1]
60 |
61 | batch_size = q.shape[0]
62 | seqlen_q, seqlen_k = q.shape[1], kv.shape[1]
63 | if key_padding_mask is None:
64 | q, kv = rearrange(q, 'b s ... -> (b s) ...'), rearrange(kv, 'b s ... -> (b s) ...')
65 | max_sq, max_sk = seqlen_q, seqlen_k
66 | cu_seqlens_q = torch.arange(0, (batch_size + 1) * seqlen_q, step=seqlen_q, dtype=torch.int32,
67 | device=q.device)
68 | cu_seqlens_k = torch.arange(0, (batch_size + 1) * seqlen_k, step=seqlen_k, dtype=torch.int32,
69 | device=kv.device)
70 | output = flash_attn_unpadded_kvpacked_func(
71 | q, kv, cu_seqlens_q, cu_seqlens_k, max_sq, max_sk,
72 | self.dropout_p if self.training else 0.0,
73 | softmax_scale=self.softmax_scale, causal=causal
74 | )
75 | output = rearrange(output, '(b s) ... -> b s ...', b=batch_size)
76 | else:
77 | nheads = kv.shape[-2]
78 | q = rearrange(q, 'b s ... -> (b s) ...')
79 | max_sq = seqlen_q
80 | cu_seqlens_q = torch.arange(0, (batch_size + 1) * seqlen_q, step=seqlen_q, dtype=torch.int32,
81 | device=q.device)
82 | x = rearrange(kv, 'b s two h d -> b s (two h d)')
83 | x_unpad, indices, cu_seqlens_k, max_sk = unpad_input(x, key_padding_mask)
84 | x_unpad = rearrange(x_unpad, 'nnz (two h d) -> nnz two h d', two=2, h=nheads)
85 | output_unpad = flash_attn_unpadded_kvpacked_func(
86 | q, x_unpad, cu_seqlens_q, cu_seqlens_k, max_sq, max_sk,
87 | self.dropout_p if self.training else 0.0,
88 | softmax_scale=self.softmax_scale, causal=causal
89 | )
90 | output = rearrange(output_unpad, '(b s) ... -> b s ...', b=batch_size)
91 |
92 | return output, None
93 |
94 |
95 | class FlashMHA(nn.Module):
96 |
97 | def __init__(self, embed_dim, num_heads, bias=True, batch_first=True, attention_dropout=0.0,
98 | causal=False, device=None, dtype=None, **kwargs) -> None:
99 | assert batch_first
100 | factory_kwargs = {'device': device, 'dtype': dtype}
101 | super().__init__()
102 | self.embed_dim = embed_dim
103 | self.causal = causal
104 | self.bias = bias
105 |
106 | self.num_heads = num_heads
107 | assert self.embed_dim % num_heads == 0, "embed_dim must be divisible by num_heads"
108 | self.head_dim = self.embed_dim // num_heads
109 | assert self.head_dim % 8 == 0 and self.head_dim <= 128, "Only support head_dim <= 128 and divisible by 8"
110 |
111 | self.in_proj_weight = nn.Parameter(torch.empty((3 * embed_dim, embed_dim)))
112 | if bias:
113 | self.in_proj_bias = nn.Parameter(torch.empty(3 * embed_dim))
114 | else:
115 | self.register_parameter('in_proj_bias', None)
116 | self.inner_attn = FlashAttention(attention_dropout=attention_dropout, **factory_kwargs)
117 | self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
118 | self._reset_parameters()
119 |
120 | def _reset_parameters(self) -> None:
121 | xavier_uniform_(self.in_proj_weight)
122 | if self.in_proj_bias is not None:
123 | constant_(self.in_proj_bias, 0.)
124 | constant_(self.out_proj.bias, 0.)
125 |
126 | def forward(self, q, k, v, key_padding_mask=None):
127 | """x: (batch, seqlen, hidden_dim) (where hidden_dim = num heads * head dim)
128 | key_padding_mask: bool tensor of shape (batch, seqlen)
129 | """
130 | # q, k, v = self.Wq(q), self.Wk(k), self.Wv(v)
131 | q, k, v = _in_projection_packed(q, k, v, self.in_proj_weight, self.in_proj_bias)
132 | q = rearrange(q, 'b s (h d) -> b s h d', h=self.num_heads)
133 | k = rearrange(k, 'b s (h d) -> b s h d', h=self.num_heads)
134 | v = rearrange(v, 'b s (h d) -> b s h d', h=self.num_heads)
135 | kv = torch.stack([k, v], dim=2)
136 |
137 | context, attn_weights = self.inner_attn(q, kv, key_padding_mask=key_padding_mask, causal=self.causal)
138 | return self.out_proj(rearrange(context, 'b s h d -> b s (h d)')), attn_weights
139 |
--------------------------------------------------------------------------------
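
A minimal sketch of calling `FlashMHA` directly (needs a CUDA GPU and the flash-attn version pinned in the README; the shapes are illustrative). Inputs can be FP32: `FlashAttention.forward` is wrapped with `auto_fp16`, so the query and key/value tensors are cast to half precision before the flash-attention kernel and the output is cast back to FP32:

```python
import torch
from projects.mmdet3d_plugin.models.utils.attention import FlashMHA

mha = FlashMHA(embed_dim=256, num_heads=8).cuda()  # head_dim = 32 (divisible by 8, <= 128)
query = torch.randn(2, 900, 256, device='cuda')    # (batch, num_query, embed_dim)
memory = torch.randn(2, 1500, 256, device='cuda')  # (batch, num_key, embed_dim)
out, _ = mha(query, memory, memory)                # out: (batch, num_query, embed_dim)
```
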
/projects/mmdet3d_plugin/core/bbox/assigners/hungarian_assigner_3d.py:
--------------------------------------------------------------------------------
1 | # ------------------------------------------------------------------------
2 | # Copyright (c) 2023 megvii-model. All Rights Reserved.
3 | # ------------------------------------------------------------------------
4 | # Modified from DETR3D (https://github.com/WangYueFt/detr3d)
5 | # Copyright (c) 2021 Wang, Yue
6 | # ------------------------------------------------------------------------
7 | # Modified from mmdetection (https://github.com/open-mmlab/mmdetection)
8 | # Copyright (c) OpenMMLab. All rights reserved.
9 | # ------------------------------------------------------------------------
10 |
11 | import torch
12 |
13 | from mmdet.core.bbox.builder import BBOX_ASSIGNERS
14 | from mmdet.core.bbox.assigners import AssignResult
15 | from mmdet.core.bbox.assigners import BaseAssigner
16 | from mmdet.core.bbox.match_costs import build_match_cost
17 | from mmdet.core.bbox.match_costs.builder import MATCH_COST
18 | from mmdet.core.bbox.iou_calculators import build_iou_calculator
19 | from mmdet.models.utils.transformer import inverse_sigmoid
20 | from scipy.optimize import linear_sum_assignment
21 |
22 | from projects.mmdet3d_plugin.core.bbox.util import (
23 | normalize_bbox,
24 | denormalize_bbox
25 | )
26 |
27 |
28 | @BBOX_ASSIGNERS.register_module()
29 | class HungarianAssigner3D(BaseAssigner):
30 | """Computes one-to-one matching between predictions and ground truth.
31 | This class computes an assignment between the targets and the predictions
32 | based on the costs. The costs are weighted sum of three components:
33 | classification cost, regression L1 cost and regression iou cost. The
34 | targets don't include the no_object, so generally there are more
35 | predictions than targets. After the one-to-one matching, the un-matched
36 | are treated as backgrounds. Thus each query prediction will be assigned
37 | with `0` or a positive integer indicating the ground truth index:
38 | - 0: negative sample, no assigned gt
39 | - positive integer: positive sample, index (1-based) of assigned gt
40 | Args:
41 | cls_cost (dict, optional): Config of the classification matching cost.
42 | Default dict(type='ClassificationCost', weight=1.).
43 | reg_cost (dict, optional): Config of the regression L1 matching cost.
44 | Default dict(type='BBoxL1Cost', weight=1.0).
45 | iou_cost (dict, optional): Config of the IoU matching cost.
46 | Default dict(type='IoUCost', weight=0.0), i.e. disabled.
47 | pc_range (list[float], optional): Point cloud range used to normalize
48 | ground truth boxes before computing the regression cost.
49 | Default None.
50 | code_weights (list[float], optional): Per-dimension weights applied to
51 | the predicted and ground truth box codes. Default None.
52 | """
53 |
54 | def __init__(self,
55 | cls_cost=dict(type='ClassificationCost', weight=1.),
56 | reg_cost=dict(type='BBoxL1Cost', weight=1.0),
57 | iou_cost=dict(type='IoUCost', weight=0.0),
58 | pc_range=None,
59 | code_weights=None):
60 | self.cls_cost = build_match_cost(cls_cost)
61 | self.reg_cost = build_match_cost(reg_cost)
62 | self.iou_cost = build_match_cost(iou_cost)
63 | self.pc_range = pc_range
64 | self.code_weights = code_weights
65 | if self.code_weights:
66 | self.code_weights = torch.tensor(self.code_weights)[None, :].cuda()
67 |
68 | def assign(self,
69 | bbox_pred,
70 | cls_pred,
71 | gt_bboxes,
72 | gt_labels,
73 | gt_bboxes_ignore=None,
74 | eps=1e-7,
75 | code_weights=None):
76 | """Computes one-to-one matching based on the weighted costs.
77 | This method assign each query prediction to a ground truth or
78 | background. The `assigned_gt_inds` with -1 means don't care,
79 | 0 means negative sample, and positive number is the index (1-based)
80 | of assigned gt.
81 | The assignment is done in the following steps, the order matters.
82 | 1. assign every prediction to -1
83 | 2. compute the weighted costs
84 | 3. do Hungarian matching on CPU based on the costs
85 | 4. assign all to 0 (background) first, then for each matched pair
86 | between predictions and gts, treat this prediction as foreground
87 | and assign the corresponding gt index (plus 1) to it.
88 | Args:
89 | bbox_pred (Tensor): Predicted boxes with normalized coordinates
90 | (cx, cy, w, h), which are all in range [0, 1]. Shape
91 | [num_query, 4].
92 | cls_pred (Tensor): Predicted classification logits, shape
93 | [num_query, num_class].
94 | gt_bboxes (Tensor): Ground truth boxes with unnormalized
95 | coordinates (x1, y1, x2, y2). Shape [num_gt, 4].
96 | gt_labels (Tensor): Label of `gt_bboxes`, shape (num_gt,).
97 | gt_bboxes_ignore (Tensor, optional): Ground truth bboxes that are
98 | labelled as `ignored`. Default None.
99 | eps (int | float, optional): A value added to the denominator for
100 | numerical stability. Default 1e-7.
101 | Returns:
102 | :obj:`AssignResult`: The assigned result.
103 | """
104 | assert gt_bboxes_ignore is None, \
105 | 'Only case when gt_bboxes_ignore is None is supported.'
106 | num_gts, num_bboxes = gt_bboxes.size(0), bbox_pred.size(0)
107 |
108 | # 1. assign -1 by default
109 | assigned_gt_inds = bbox_pred.new_full((num_bboxes, ),
110 | -1,
111 | dtype=torch.long)
112 | assigned_labels = bbox_pred.new_full((num_bboxes, ),
113 | -1,
114 | dtype=torch.long)
115 | if num_gts == 0 or num_bboxes == 0:
116 | # No ground truth or boxes, return empty assignment
117 | if num_gts == 0:
118 | # No ground truth, assign all to background
119 | assigned_gt_inds[:] = 0
120 | return AssignResult(
121 | num_gts, assigned_gt_inds, None, labels=assigned_labels)
122 |
123 | # 2. compute the weighted costs
124 | # classification and bboxcost.
125 | cls_cost = self.cls_cost(cls_pred, gt_labels)
126 | # regression L1 cost
127 | normalized_gt_bboxes = normalize_bbox(gt_bboxes, self.pc_range)
128 |
129 | if self.code_weights is not None:
130 | bbox_pred = bbox_pred * self.code_weights
131 | normalized_gt_bboxes = normalized_gt_bboxes * self.code_weights
132 |
133 | reg_cost = self.reg_cost(bbox_pred[:, :8], normalized_gt_bboxes[:, :8])
134 |
135 | # weighted sum of above two costs
136 | cost = cls_cost + reg_cost
137 |
138 | # 3. do Hungarian matching on CPU using linear_sum_assignment
139 | cost = cost.detach().cpu()
140 | if linear_sum_assignment is None:
141 | raise ImportError('Please run "pip install scipy" '
142 | 'to install scipy first.')
143 | matched_row_inds, matched_col_inds = linear_sum_assignment(cost)
144 | matched_row_inds = torch.from_numpy(matched_row_inds).to(
145 | bbox_pred.device)
146 | matched_col_inds = torch.from_numpy(matched_col_inds).to(
147 | bbox_pred.device)
148 |
149 | # 4. assign backgrounds and foregrounds
150 | # assign all indices to backgrounds first
151 | assigned_gt_inds[:] = 0
152 | # assign foregrounds based on matching results
153 | assigned_gt_inds[matched_row_inds] = matched_col_inds + 1
154 | assigned_labels[matched_row_inds] = gt_labels[matched_col_inds]
155 | return AssignResult(
156 | num_gts, assigned_gt_inds, None, labels=assigned_labels)
157 |
158 |
--------------------------------------------------------------------------------
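
A toy matching example for the assigner above (a sketch only; it assumes the plugin is importable so that `BBox3DL1Cost` is registered, and uses made-up boxes, weights and class counts). Note that `bbox_pred` is expected in the normalized 10-dim format, while `gt_bboxes` are raw 9-dim boxes that the assigner normalizes internally:

```python
import torch
from projects.mmdet3d_plugin.core.bbox.assigners.hungarian_assigner_3d import HungarianAssigner3D
from projects.mmdet3d_plugin.core.bbox.util import normalize_bbox

assigner = HungarianAssigner3D(
    cls_cost=dict(type='FocalLossCost', weight=2.0),   # illustrative weights
    reg_cost=dict(type='BBox3DL1Cost', weight=0.25),
    pc_range=[-54.0, -54.0, -5.0, 54.0, 54.0, 3.0],
)
gt_bboxes = torch.tensor([[ 5.0,  3.0, -1.0, 4.0, 1.8, 1.6, 0.1, 0.0, 0.0],
                          [-8.0, 12.0, -0.5, 0.6, 0.6, 1.7, 0.0, 0.0, 0.0]])
gt_labels = torch.tensor([0, 8])                             # class ids out of 10 classes
bbox_pred = normalize_bbox(gt_bboxes + 0.05).repeat(2, 1)    # 4 fake predictions, 10-dim
cls_pred = torch.randn(4, 10)                                # raw classification logits
result = assigner.assign(bbox_pred, cls_pred, gt_bboxes, gt_labels)
print(result.gt_inds, result.labels)                         # 1-based gt index per query (0 = background)
```
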
/projects/mmdet3d_plugin/models/detectors/fstr.py:
--------------------------------------------------------------------------------
1 | # ------------------------------------------------------------------------
2 | # Copyright (c) 2022 megvii-model. All Rights Reserved.
3 | # ------------------------------------------------------------------------
4 | # Modified from mmdetection3d (https://github.com/open-mmlab/mmdetection3d)
5 | # Copyright (c) OpenMMLab. All rights reserved.
6 | # ------------------------------------------------------------------------
7 |
8 | import torch
9 | import torch.nn.functional as F
10 |
11 | from mmcv.runner import force_fp32
12 | from mmdet.models import DETECTORS
13 | from mmdet3d.core import bbox3d2result
14 | from mmdet3d.models.detectors.mvx_two_stage import MVXTwoStageDetector
15 |
16 | from projects.mmdet3d_plugin import SPConvVoxelization
17 |
18 | @DETECTORS.register_module()
19 | class FSTRDetector(MVXTwoStageDetector):
20 |
21 | def __init__(self,
22 | **kwargs):
23 | pts_voxel_cfg = kwargs.get('pts_voxel_layer', None)
24 | kwargs['pts_voxel_layer'] = None
25 | super(FSTRDetector, self).__init__(**kwargs)
26 | if pts_voxel_cfg:
27 | self.pts_voxel_layer = SPConvVoxelization(**pts_voxel_cfg)
28 |
29 | def init_weights(self):
30 | """Initialize model weights."""
31 | super(FSTRDetector, self).init_weights()
32 |
33 | def extract_feat(self, points, img_metas):
34 | """Extract features from images and points."""
35 | pts_feats = self.extract_pts_feat(points, img_metas)
36 | return pts_feats
37 |
38 | @force_fp32(apply_to=('pts', ))
39 | def extract_pts_feat(self, pts, img_metas):
40 | """Extract features of points."""
41 | if not self.with_pts_bbox:
42 | return None
43 | if pts is None:
44 | return None
45 | voxels, num_points, coors = self.voxelize(pts)
46 | voxel_features = self.pts_voxel_encoder(voxels, num_points, coors,
47 | )
48 | batch_size = coors[-1, 0] + 1
49 | x = self.pts_middle_encoder(voxel_features, coors, batch_size)
50 | return x
51 |
52 | @torch.no_grad()
53 | @force_fp32()
54 | def voxelize(self, points):
55 | """Apply dynamic voxelization to points.
56 |
57 | Args:
58 | points (list[torch.Tensor]): Points of each sample.
59 |
60 | Returns:
61 | tuple[torch.Tensor]: Concatenated points, number of points
62 | per voxel, and coordinates.
63 | """
64 | voxels, coors, num_points = [], [], []
65 | for res in points:
66 | res_voxels, res_coors, res_num_points = self.pts_voxel_layer(res)
67 | voxels.append(res_voxels)
68 | coors.append(res_coors)
69 | num_points.append(res_num_points)
70 | voxels = torch.cat(voxels, dim=0)
71 | num_points = torch.cat(num_points, dim=0)
72 | coors_batch = []
73 | for i, coor in enumerate(coors):
74 | coor_pad = F.pad(coor, (1, 0), mode='constant', value=i)
75 | coors_batch.append(coor_pad)
76 | coors_batch = torch.cat(coors_batch, dim=0)
77 | return voxels, num_points, coors_batch
78 |
79 | def forward_train(self,
80 | points=None,
81 | img_metas=None,
82 | gt_bboxes_3d=None,
83 | gt_labels_3d=None,
84 | gt_labels=None,
85 | gt_bboxes=None,
86 | proposals=None,
87 | gt_bboxes_ignore=None,
88 | **kwargs):
89 | """Forward training function.
90 |
91 | Args:
92 | points (list[torch.Tensor], optional): Points of each sample.
93 | Defaults to None.
94 | img_metas (list[dict], optional): Meta information of each sample.
95 | Defaults to None.
96 | gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`], optional):
97 | Ground truth 3D boxes. Defaults to None.
98 | gt_labels_3d (list[torch.Tensor], optional): Ground truth labels
99 | of 3D boxes. Defaults to None.
100 | gt_labels (list[torch.Tensor], optional): Ground truth labels
101 | of 2D boxes in images. Defaults to None.
102 | gt_bboxes (list[torch.Tensor], optional): Ground truth 2D boxes in
103 | images. Defaults to None.
104 | img (torch.Tensor optional): Images of each sample with shape
105 | (N, C, H, W). Defaults to None.
106 | proposals ([list[torch.Tensor], optional): Predicted proposals
107 | used for training Fast RCNN. Defaults to None.
108 | gt_bboxes_ignore (list[torch.Tensor], optional): Ground truth
109 | 2D boxes in images to be ignored. Defaults to None.
110 |
111 | Returns:
112 | dict: Losses of different branches.
113 | """
114 | # nvtx.range_push('forward')
115 | # nvtx.range_push('voxel_backbone')
116 | pts_feats = self.extract_feat(
117 | points=points, img_metas=img_metas)
118 | # nvtx.range_pop()
119 | # nvtx.range_push('fstr_head')
120 | losses = dict()
121 | if pts_feats :
122 | losses_pts = self.forward_pts_train(pts_feats, gt_bboxes_3d,
123 | gt_labels_3d, img_metas,
124 | gt_bboxes_ignore)
125 | losses.update(losses_pts)
126 | # nvtx.range_pop()
127 | # nvtx.range_pop()
128 | return losses
129 |
130 | @force_fp32(apply_to=('pts_feats', ))
131 | def forward_pts_train(self,
132 | pts_feats,
133 | gt_bboxes_3d,
134 | gt_labels_3d,
135 | img_metas,
136 | gt_bboxes_ignore=None,
137 | ):
138 | """Forward function for point cloud branch.
139 |
140 | Args:
141 | pts_feats (list[torch.Tensor]): Features of point cloud branch
142 | gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth
143 | boxes for each sample.
144 | gt_labels_3d (list[torch.Tensor]): Ground truth labels for
145 | boxes of each sample.
146 | img_metas (list[dict]): Meta information of samples.
147 | gt_bboxes_ignore (list[torch.Tensor], optional): Ground truth
148 | boxes to be ignored. Defaults to None.
149 |
150 | Returns:
151 | dict: Losses of each branch.
152 | """
153 | outs = self.pts_bbox_head(pts_feats, img_metas)
154 | loss_inputs = [gt_bboxes_3d, gt_labels_3d, outs]
155 | losses = self.pts_bbox_head.loss(*loss_inputs)
156 | return losses
157 |
158 | def forward_test(self,
159 | points=None,
160 | img_metas=None,
161 | **kwargs):
162 | """
163 | Args:
164 | points (list[torch.Tensor]): the outer list indicates test-time
165 | augmentations and inner torch.Tensor should have a shape NxC,
166 | which contains all points in the batch.
167 | img_metas (list[list[dict]]): the outer list indicates test-time
168 | augs (multiscale, flip, etc.) and the inner list indicates
169 | images in a batch
170 | img (list[torch.Tensor], optional): the outer
171 | list indicates test-time augmentations and inner
172 | torch.Tensor should have a shape NxCxHxW, which contains
173 | all images in the batch. Defaults to None.
174 | """
175 | if points is None:
176 | points = [None]
177 | for var, name in [(points, 'points'), (img_metas, 'img_metas')]:
178 | if not isinstance(var, list):
179 | raise TypeError('{} must be a list, but got {}'.format(
180 | name, type(var)))
181 |
182 | num_augs = len(points)
183 | if num_augs != len(img_metas):
184 | raise ValueError(
185 | 'num of augmentations ({}) != num of image meta ({})'.format(
186 | len(points), len(img_metas)))
187 |
188 | if num_augs == 1:
189 | return self.simple_test(points[0], img_metas[0],**kwargs)
190 | else:
191 | return self.aug_test(points, img_metas, **kwargs)
192 |
193 | @force_fp32(apply_to=('x', ))
194 | def simple_test_pts(self, x, img_metas, rescale=False):
195 | """Test function of point cloud branch."""
196 | outs = self.pts_bbox_head(x, img_metas)
197 | bbox_list = self.pts_bbox_head.get_bboxes(
198 | outs, img_metas, rescale=rescale)
199 | bbox_results = [
200 | bbox3d2result(bboxes, scores, labels)
201 | for bboxes, scores, labels in bbox_list
202 | ]
203 | return bbox_results
204 |
205 | def simple_test(self, points, img_metas, rescale=False):
206 | """Test function without augmentaiton."""
207 |
208 | pts_feats = self.extract_feat(
209 | points, img_metas=img_metas)
210 | bbox_list = [dict() for i in range(len(img_metas))]
211 | if self.with_pts_bbox:
212 | bbox_pts = self.simple_test_pts(
213 | pts_feats, img_metas, rescale=rescale)
214 | for result_dict, pts_bbox in zip(bbox_list, bbox_pts):
215 | result_dict['pts_bbox'] = pts_bbox
216 | return bbox_list
217 |
--------------------------------------------------------------------------------
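
For orientation, a heavily abridged sketch of how the detector above is wired up in a config (all values are placeholders; the complete definitions, including the head, are in `projects/configs/lidar/`). `FSTRDetector` pops `pts_voxel_layer` out of the kwargs and rebuilds it as an `SPConvVoxelization` module:

```python
# Illustrative config fragment only; the encoder type and shapes are assumptions,
# not copied from the released configs.
model = dict(
    type='FSTRDetector',
    pts_voxel_layer=dict(                # consumed by SPConvVoxelization, not by the base class
        voxel_size=[0.075, 0.075, 0.2],
        point_cloud_range=[-54.0, -54.0, -5.0, 54.0, 54.0, 3.0],
        max_num_points=10,
        max_voxels=(120000, 160000),
        num_point_features=5),
    pts_voxel_encoder=dict(type='HardSimpleVFE', num_features=5),  # assumed encoder type
    pts_middle_encoder=dict(
        type='VoxelNextEncoder',
        in_channels=5,
        sparse_shape=[41, 1440, 1440]),  # illustrative; derived from range / voxel size
    pts_bbox_head=dict(type='FSTRHead'),  # see the released configs for the full head definition
)
```
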
/projects/mmdet3d_plugin/models/backbones/voxelnext.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from mmcv.runner import auto_fp16
3 | from torch import nn as nn
4 |
5 | from mmdet3d.ops import SparseBasicBlock, make_sparse_convmodule
6 | # from mmdet3d.ops import spconv as spconv
7 | from mmdet3d.models.builder import MIDDLE_ENCODERS
8 | import torch
9 | from mmcv.cnn import build_conv_layer, build_norm_layer
10 | from torch import nn
11 |
12 | # from mmdet3d.ops import spconv
13 | import spconv.pytorch as spconv
14 | from mmdet.models.backbones.resnet import BasicBlock, Bottleneck
15 |
16 |
17 | @MIDDLE_ENCODERS.register_module()
18 | class VoxelNextEncoder(nn.Module):
19 | r"""Sparse encoder for SECOND and Part-A2.
20 |
21 | Args:
22 | in_channels (int): The number of input channels.
23 | sparse_shape (list[int]): The sparse shape of input tensor.
24 | order (list[str]): Order of conv module. Defaults to ('conv',
25 | 'norm', 'act').
26 | norm_cfg (dict): Config of normalization layer. Defaults to
27 | dict(type='BN1d', eps=1e-3, momentum=0.01).
28 | base_channels (int): Out channels for conv_input layer.
29 | Defaults to 16.
30 | output_channels (int): Out channels for conv_out layer.
31 | Defaults to 128.
32 | encoder_channels (tuple[tuple[int]]):
33 | Convolutional channels of each encode block.
34 | encoder_paddings (tuple[tuple[int]]): Paddings of each encode block.
35 | Defaults to ((16, ), (32, 32, 32), (64, 64, 64), (64, 64, 64)).
36 | block_type (str): Type of the block to use. Defaults to 'conv_module'.
37 | """
38 |
39 | def __init__(self,
40 | in_channels,
41 | sparse_shape,
42 | order=('conv', 'norm', 'act'),
43 | norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.01),
44 | base_channels=16,
45 | output_channels=128,
46 | encoder_channels=((16, 16), (32, 32, 32), (64, 64, 64), (128, 128, 128),(128, 128, 128),(128, 128, 128)),
47 | encoder_paddings=((1, 1 ), (1, 1, 1), (1, 1, 1), (1, 1, 1),(1, 1, 1),(1, 1, 1)),
48 | sparse_conv_kernel = (3, 3, 3, 3, 3),
49 | block_type='basicblock'):
50 | super().__init__()
51 | assert block_type in ['conv_module', 'basicblock']
52 | self.sparse_shape = sparse_shape
53 | self.in_channels = in_channels
54 | self.order = order
55 | self.base_channels = base_channels
56 | self.output_channels = output_channels
57 | self.encoder_channels = encoder_channels
58 | self.encoder_paddings = encoder_paddings
59 | self.stage_num = len(self.encoder_channels)
60 | self.sparse_conv_kernel = sparse_conv_kernel
61 | self.fp16_enabled = False
62 | # Spconv init all weight on its own
63 |
64 | assert isinstance(order, tuple) and len(order) == 3
65 | assert set(order) == {'conv', 'norm', 'act'}
66 |
67 | if self.order[0] != 'conv': # pre activate
68 | self.conv_input = make_sparse_convmodule(
69 | in_channels,
70 | self.base_channels,
71 | 3,
72 | norm_cfg=norm_cfg,
73 | padding=1,
74 | indice_key='subm1',
75 | conv_type='SubMConv3d',
76 | order=('conv', ))
77 | else: # post activate
78 | self.conv_input = make_sparse_convmodule(
79 | in_channels,
80 | self.base_channels,
81 | 3,
82 | norm_cfg=norm_cfg,
83 | padding=1,
84 | indice_key='subm1',
85 | conv_type='SubMConv3d')
86 |
87 | encoder_out_channels = self.make_encoder_layers(
88 | make_sparse_convmodule,
89 | norm_cfg,
90 | self.base_channels,
91 | block_type=block_type)
92 |
93 | self.conv_out = make_sparse_convmodule(
94 | encoder_out_channels,
95 | self.output_channels,
96 | kernel_size=3,
97 | stride=1,
98 | norm_cfg=norm_cfg,
99 | padding=1,
100 | indice_key='spconv_down2',
101 | conv_type='SparseConv2d')
102 |
103 | self.shared_out = make_sparse_convmodule(
104 | self.output_channels,
105 | self.output_channels,
106 | kernel_size=3,
107 | stride=1,
108 | norm_cfg=norm_cfg,
109 | padding=1,
110 | indice_key='spconv_out',
111 | conv_type='SubMConv2d')
112 |
113 | @auto_fp16(apply_to=('voxel_features', ))
114 | def forward(self, voxel_features, coors, batch_size):
115 | """Forward of SparseEncoder.
116 |
117 | Args:
118 | voxel_features (torch.float32): Voxel features in shape (N, C).
119 | coors (torch.int32): Coordinates in shape (N, 4), \
120 | the columns in the order of (batch_idx, z_idx, y_idx, x_idx).
121 | batch_size (int): Batch size.
122 |
123 | Returns:
124 | dict: Backbone features.
125 | """
126 | coors = coors.int()
127 | input_sp_tensor = spconv.SparseConvTensor(voxel_features, coors,
128 | self.sparse_shape,
129 | batch_size)
130 | x = self.conv_input(input_sp_tensor)
131 |
132 | encode_features = []
133 | for encoder_layer in self.encoder_layers:
134 | x = encoder_layer(x)
135 | encode_features.append(x)
136 |
137 |         encode_features[4].indices[:, 1:] *= 2  # rescale stage-5 voxel coords to the stage-4 grid
138 |         encode_features[5].indices[:, 1:] *= 4  # rescale stage-6 voxel coords to the stage-4 grid
139 |         encode_features[3] = encode_features[3].replace_feature(torch.cat([encode_features[3].features, encode_features[4].features, encode_features[5].features]))  # fuse the three deepest stages
140 |         encode_features[3].indices = torch.cat([encode_features[3].indices, encode_features[4].indices, encode_features[5].indices])
141 |
142 | out = self.bev_out(encode_features[3])
143 | out = self.conv_out(out)
144 | out = self.shared_out(out)
145 |
146 |
147 | return [out]
148 |
149 | def bev_out(self, x_conv):
150 | features_cat = x_conv.features
151 | indices_cat = x_conv.indices[:, [0, 2, 3]]
152 | spatial_shape = x_conv.spatial_shape[1:]
153 |
154 | indices_unique, _inv = torch.unique(indices_cat, dim=0, return_inverse=True)
155 | features_unique = features_cat.new_zeros((indices_unique.shape[0], features_cat.shape[1]))
156 | features_unique.index_add_(0, _inv, features_cat)
157 |
158 | x_out = spconv.SparseConvTensor(
159 | features=features_unique,
160 | indices=indices_unique,
161 | spatial_shape=spatial_shape,
162 | batch_size=x_conv.batch_size
163 | )
164 | return x_out
165 | def make_encoder_layers(self,
166 | make_block,
167 | norm_cfg,
168 | in_channels,
169 | block_type='conv_module',
170 | conv_cfg=dict(type='SubMConv3d')):
171 | """make encoder layers using sparse convs.
172 |
173 | Args:
174 | make_block (method): A bounded function to build blocks.
175 | norm_cfg (dict[str]): Config of normalization layer.
176 | in_channels (int): The number of encoder input channels.
177 | block_type (str): Type of the block to use. Defaults to
178 | 'conv_module'.
179 | conv_cfg (dict): Config of conv layer. Defaults to
180 | dict(type='SubMConv3d').
181 |
182 | Returns:
183 | int: The number of encoder output channels.
184 | """
185 | assert block_type in ['conv_module', 'basicblock']
186 | self.encoder_layers = spconv.SparseSequential()
187 |
188 | for i, blocks in enumerate(self.encoder_channels):
189 | blocks_list = []
190 | for j, out_channels in enumerate(tuple(blocks)):
191 | padding = tuple(self.encoder_paddings[i])[j]
192 | # each stage started with a spconv layer
193 | # except the first stage
194 | if i != 0 and j == 0 and block_type == 'conv_module':
195 | blocks_list.append(
196 | make_block(
197 | in_channels,
198 | out_channels,
199 | 3,
200 | norm_cfg=norm_cfg,
201 | stride=2,
202 | padding=padding,
203 | indice_key=f'spconv{i + 1}',
204 | conv_type='SparseConv3d'))
205 | elif block_type == 'basicblock':
206 | if j == 0 and len(blocks) > 2:
207 | blocks_list.append(
208 | make_block(
209 | in_channels,
210 | out_channels,
211 | self.sparse_conv_kernel[i - 1],
212 | norm_cfg=norm_cfg,
213 | stride=2,
214 | padding=int(self.sparse_conv_kernel[i - 1]//2),
215 | indice_key=f'spconv{i + 1}',
216 | conv_type='SparseConv3d'))
217 | else:
218 | blocks_list.append(
219 | SparseBasicBlock(
220 | out_channels,
221 | out_channels,
222 | norm_cfg=norm_cfg,
223 | conv_cfg=conv_cfg))
224 | else:
225 | blocks_list.append(
226 | make_block(
227 | in_channels,
228 | out_channels,
229 | 3,
230 | norm_cfg=norm_cfg,
231 | padding=padding,
232 | indice_key=f'subm{i + 1}',
233 | conv_type='SubMConv3d'))
234 | in_channels = out_channels
235 | stage_name = f'encoder_layer{i + 1}'
236 | stage_layers = spconv.SparseSequential(*blocks_list)
237 | self.encoder_layers.add_module(stage_name, stage_layers)
238 | return out_channels
--------------------------------------------------------------------------------
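The forward pass of VoxelNextEncoder above fuses the three deepest sparse stages onto one grid (the 2x- and 4x-coarser indices are rescaled to the stage-4 resolution) and bev_out then collapses the height axis by summing all voxels that share a (batch, y, x) cell. A minimal sketch of that height-collapse step, using plain PyTorch only (the spconv.SparseConvTensor wrapper is omitted):

import torch

# Toy sparse voxels: index columns are (batch_idx, z_idx, y_idx, x_idx).
indices = torch.tensor([[0, 0, 3, 7],
                        [0, 5, 3, 7],   # same BEV cell as the first voxel
                        [0, 2, 1, 4]], dtype=torch.int32)
features = torch.tensor([[1., 2.], [10., 20.], [100., 200.]])

# Drop the z column, then sum features of voxels that land in the same
# (batch, y, x) pillar -- exactly what bev_out() does with index_add_.
bev_indices = indices[:, [0, 2, 3]]
unique_indices, inv = torch.unique(bev_indices, dim=0, return_inverse=True)
bev_features = features.new_zeros((unique_indices.shape[0], features.shape[1]))
bev_features.index_add_(0, inv, features)

print(unique_indices)  # two BEV cells remain
print(bev_features)    # [[100., 200.], [11., 22.]] (row order follows torch.unique)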
/LICENSE:
--------------------------------------------------------------------------------
1 | CMT
2 |
3 | Apache License
4 | Version 2.0, January 2004
5 | http://www.apache.org/licenses/
6 |
7 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
8 |
9 | 1. Definitions.
10 |
11 | "License" shall mean the terms and conditions for use, reproduction,
12 | and distribution as defined by Sections 1 through 9 of this document.
13 |
14 | "Licensor" shall mean the copyright owner or entity authorized by
15 | the copyright owner that is granting the License.
16 |
17 | "Legal Entity" shall mean the union of the acting entity and all
18 | other entities that control, are controlled by, or are under common
19 | control with that entity. For the purposes of this definition,
20 | "control" means (i) the power, direct or indirect, to cause the
21 | direction or management of such entity, whether by contract or
22 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
23 | outstanding shares, or (iii) beneficial ownership of such entity.
24 |
25 | "You" (or "Your") shall mean an individual or Legal Entity
26 | exercising permissions granted by this License.
27 |
28 | "Source" form shall mean the preferred form for making modifications,
29 | including but not limited to software source code, documentation
30 | source, and configuration files.
31 |
32 | "Object" form shall mean any form resulting from mechanical
33 | transformation or translation of a Source form, including but
34 | not limited to compiled object code, generated documentation,
35 | and conversions to other media types.
36 |
37 | "Work" shall mean the work of authorship, whether in Source or
38 | Object form, made available under the License, as indicated by a
39 | copyright notice that is included in or attached to the work
40 | (an example is provided in the Appendix below).
41 |
42 | "Derivative Works" shall mean any work, whether in Source or Object
43 | form, that is based on (or derived from) the Work and for which the
44 | editorial revisions, annotations, elaborations, or other modifications
45 | represent, as a whole, an original work of authorship. For the purposes
46 | of this License, Derivative Works shall not include works that remain
47 | separable from, or merely link (or bind by name) to the interfaces of,
48 | the Work and Derivative Works thereof.
49 |
50 | "Contribution" shall mean any work of authorship, including
51 | the original version of the Work and any modifications or additions
52 | to that Work or Derivative Works thereof, that is intentionally
53 | submitted to Licensor for inclusion in the Work by the copyright owner
54 | or by an individual or Legal Entity authorized to submit on behalf of
55 | the copyright owner. For the purposes of this definition, "submitted"
56 | means any form of electronic, verbal, or written communication sent
57 | to the Licensor or its representatives, including but not limited to
58 | communication on electronic mailing lists, source code control systems,
59 | and issue tracking systems that are managed by, or on behalf of, the
60 | Licensor for the purpose of discussing and improving the Work, but
61 | excluding communication that is conspicuously marked or otherwise
62 | designated in writing by the copyright owner as "Not a Contribution."
63 |
64 | "Contributor" shall mean Licensor and any individual or Legal Entity
65 | on behalf of whom a Contribution has been received by Licensor and
66 | subsequently incorporated within the Work.
67 |
68 | 2. Grant of Copyright License. Subject to the terms and conditions of
69 | this License, each Contributor hereby grants to You a perpetual,
70 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
71 | copyright license to reproduce, prepare Derivative Works of,
72 | publicly display, publicly perform, sublicense, and distribute the
73 | Work and such Derivative Works in Source or Object form.
74 |
75 | 3. Grant of Patent License. Subject to the terms and conditions of
76 | this License, each Contributor hereby grants to You a perpetual,
77 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
78 | (except as stated in this section) patent license to make, have made,
79 | use, offer to sell, sell, import, and otherwise transfer the Work,
80 | where such license applies only to those patent claims licensable
81 | by such Contributor that are necessarily infringed by their
82 | Contribution(s) alone or by combination of their Contribution(s)
83 | with the Work to which such Contribution(s) was submitted. If You
84 | institute patent litigation against any entity (including a
85 | cross-claim or counterclaim in a lawsuit) alleging that the Work
86 | or a Contribution incorporated within the Work constitutes direct
87 | or contributory patent infringement, then any patent licenses
88 | granted to You under this License for that Work shall terminate
89 | as of the date such litigation is filed.
90 |
91 | 4. Redistribution. You may reproduce and distribute copies of the
92 | Work or Derivative Works thereof in any medium, with or without
93 | modifications, and in Source or Object form, provided that You
94 | meet the following conditions:
95 |
96 | (a) You must give any other recipients of the Work or
97 | Derivative Works a copy of this License; and
98 |
99 | (b) You must cause any modified files to carry prominent notices
100 | stating that You changed the files; and
101 |
102 | (c) You must retain, in the Source form of any Derivative Works
103 | that You distribute, all copyright, patent, trademark, and
104 | attribution notices from the Source form of the Work,
105 | excluding those notices that do not pertain to any part of
106 | the Derivative Works; and
107 |
108 | (d) If the Work includes a "NOTICE" text file as part of its
109 | distribution, then any Derivative Works that You distribute must
110 | include a readable copy of the attribution notices contained
111 | within such NOTICE file, excluding those notices that do not
112 | pertain to any part of the Derivative Works, in at least one
113 | of the following places: within a NOTICE text file distributed
114 | as part of the Derivative Works; within the Source form or
115 | documentation, if provided along with the Derivative Works; or,
116 | within a display generated by the Derivative Works, if and
117 | wherever such third-party notices normally appear. The contents
118 | of the NOTICE file are for informational purposes only and
119 | do not modify the License. You may add Your own attribution
120 | notices within Derivative Works that You distribute, alongside
121 | or as an addendum to the NOTICE text from the Work, provided
122 | that such additional attribution notices cannot be construed
123 | as modifying the License.
124 |
125 | You may add Your own copyright statement to Your modifications and
126 | may provide additional or different license terms and conditions
127 | for use, reproduction, or distribution of Your modifications, or
128 | for any such Derivative Works as a whole, provided Your use,
129 | reproduction, and distribution of the Work otherwise complies with
130 | the conditions stated in this License.
131 |
132 | 5. Submission of Contributions. Unless You explicitly state otherwise,
133 | any Contribution intentionally submitted for inclusion in the Work
134 | by You to the Licensor shall be under the terms and conditions of
135 | this License, without any additional terms or conditions.
136 | Notwithstanding the above, nothing herein shall supersede or modify
137 | the terms of any separate license agreement you may have executed
138 | with Licensor regarding such Contributions.
139 |
140 | 6. Trademarks. This License does not grant permission to use the trade
141 | names, trademarks, service marks, or product names of the Licensor,
142 | except as required for reasonable and customary use in describing the
143 | origin of the Work and reproducing the content of the NOTICE file.
144 |
145 | 7. Disclaimer of Warranty. Unless required by applicable law or
146 | agreed to in writing, Licensor provides the Work (and each
147 | Contributor provides its Contributions) on an "AS IS" BASIS,
148 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
149 | implied, including, without limitation, any warranties or conditions
150 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
151 | PARTICULAR PURPOSE. You are solely responsible for determining the
152 | appropriateness of using or redistributing the Work and assume any
153 | risks associated with Your exercise of permissions under this License.
154 |
155 | 8. Limitation of Liability. In no event and under no legal theory,
156 | whether in tort (including negligence), contract, or otherwise,
157 | unless required by applicable law (such as deliberate and grossly
158 | negligent acts) or agreed to in writing, shall any Contributor be
159 | liable to You for damages, including any direct, indirect, special,
160 | incidental, or consequential damages of any character arising as a
161 | result of this License or out of the use or inability to use the
162 | Work (including but not limited to damages for loss of goodwill,
163 | work stoppage, computer failure or malfunction, or any and all
164 | other commercial damages or losses), even if such Contributor
165 | has been advised of the possibility of such damages.
166 |
167 | 9. Accepting Warranty or Additional Liability. While redistributing
168 | the Work or Derivative Works thereof, You may choose to offer,
169 | and charge a fee for, acceptance of support, warranty, indemnity,
170 | or other liability obligations and/or rights consistent with this
171 | License. However, in accepting such obligations, You may act only
172 | on Your own behalf and on Your sole responsibility, not on behalf
173 | of any other Contributor, and only if You agree to indemnify,
174 | defend, and hold each Contributor harmless for any liability
175 | incurred by, or claims asserted against, such Contributor by reason
176 | of your accepting any such warranty or additional liability.
177 |
178 | END OF TERMS AND CONDITIONS
179 |
180 | Copyright (c) 2023 Megvii Inc. All rights reserved.
181 |
182 | Licensed under the Apache License, Version 2.0 (the "License");
183 | you may not use this file except in compliance with the License.
184 | You may obtain a copy of the License at
185 |
186 | http://www.apache.org/licenses/LICENSE-2.0
187 |
188 | Unless required by applicable law or agreed to in writing, software
189 | distributed under the License is distributed on an "AS IS" BASIS,
190 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
191 | See the License for the specific language governing permissions and
192 | limitations under the License.
193 |
--------------------------------------------------------------------------------
/projects/configs/lidar/fstr_voxel0075_cbgs_20e.py:
--------------------------------------------------------------------------------
1 | plugin = True
2 | plugin_dir = 'projects/mmdet3d_plugin/'
3 |
4 | point_cloud_range = [-54.0, -54.0, -5.0, 54.0, 54.0, 3.0]
5 | class_names = [
6 | 'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
7 | 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
8 | ]
9 | voxel_size = [0.075, 0.075, 0.2]
10 | out_size_factor = 8
11 | evaluation = dict(interval=20)
12 | dataset_type = 'CustomNuScenesDataset'
13 | data_root = 'data/nuscenes/'
14 | input_modality = dict(
15 | use_lidar=True,
16 | use_camera=False,
17 | use_radar=False,
18 | use_map=False,
19 | use_external=False)
20 | train_pipeline = [
21 | dict(
22 | type='LoadPointsFromFile',
23 | coord_type='LIDAR',
24 | load_dim=5,
25 | use_dim=[0, 1, 2, 3, 4],
26 | ),
27 | dict(
28 | type='LoadPointsFromMultiSweeps',
29 | sweeps_num=10,
30 | use_dim=[0, 1, 2, 3, 4],
31 | ),
32 | dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
33 | dict(
34 | type='ObjectSample',
35 | db_sampler=dict(
36 | data_root='data/nuscenes/',
37 | info_path='data/nuscenes/' + 'nuscenes_dbinfos_train.pkl',
38 | rate=1.0,
39 | prepare=dict(
40 | filter_by_difficulty=[-1],
41 | filter_by_min_points=dict(
42 | car=5,
43 | truck=5,
44 | bus=5,
45 | trailer=5,
46 | construction_vehicle=5,
47 | traffic_cone=5,
48 | barrier=5,
49 | motorcycle=5,
50 | bicycle=5,
51 | pedestrian=5)),
52 | classes=class_names,
53 | sample_groups=dict(
54 | car=2,
55 | truck=3,
56 | construction_vehicle=7,
57 | bus=4,
58 | trailer=6,
59 | barrier=2,
60 | motorcycle=6,
61 | bicycle=6,
62 | pedestrian=2,
63 | traffic_cone=2),
64 | points_loader=dict(
65 | type='LoadPointsFromFile',
66 | coord_type='LIDAR',
67 | load_dim=5,
68 | use_dim=[0, 1, 2, 3, 4],
69 | ))),
70 | dict(
71 | type='GlobalRotScaleTrans',
72 | rot_range=[-0.3925 * 2, 0.3925 * 2],
73 | scale_ratio_range=[0.9, 1.1],
74 | translation_std=[0.5, 0.5, 0.5]),
75 | dict(
76 | type='RandomFlip3D',
77 | sync_2d=False,
78 | flip_ratio_bev_horizontal=0.5,
79 | flip_ratio_bev_vertical=0.5),
80 | dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
81 | dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
82 | dict(type='ObjectNameFilter', classes=class_names),
83 | dict(type='PointShuffle'),
84 | dict(type='DefaultFormatBundle3D', class_names=class_names),
85 | dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'],
86 | meta_keys=('filename', 'ori_shape', 'img_shape', 'lidar2img',
87 | 'depth2img', 'cam2img', 'pad_shape',
88 | 'scale_factor', 'flip', 'pcd_horizontal_flip',
89 | 'pcd_vertical_flip', 'box_mode_3d', 'box_type_3d',
90 | 'img_norm_cfg', 'pcd_trans', 'sample_idx',
91 | 'pcd_scale_factor', 'pcd_rotation', 'pts_filename',
92 | 'transformation_3d_flow', 'rot_degree',
93 | 'gt_bboxes_3d', 'gt_labels_3d'))
94 | ]
95 | test_pipeline = [
96 | dict(
97 | type='LoadPointsFromFile',
98 | coord_type='LIDAR',
99 | load_dim=5,
100 | use_dim=[0, 1, 2, 3, 4],
101 | ),
102 | dict(
103 | type='LoadPointsFromMultiSweeps',
104 | sweeps_num=10,
105 | use_dim=[0, 1, 2, 3, 4],
106 | ),
107 | dict(
108 | type='MultiScaleFlipAug3D',
109 | img_scale=(1333, 800),
110 | pts_scale_ratio=1,
111 | flip=False,
112 | transforms=[
113 | dict(
114 | type='GlobalRotScaleTrans',
115 | rot_range=[0, 0],
116 | scale_ratio_range=[1.0, 1.0],
117 | translation_std=[0, 0, 0]),
118 | dict(type='RandomFlip3D'),
119 | dict(
120 | type='DefaultFormatBundle3D',
121 | class_names=class_names,
122 | with_label=False),
123 | dict(type='Collect3D', keys=['points'])
124 | ])
125 | ]
126 | data = dict(
127 | samples_per_gpu=2,
128 | workers_per_gpu=4,
129 | train=dict(
130 | type='CBGSDataset',
131 | dataset=dict(
132 | type=dataset_type,
133 | data_root=data_root,
134 | ann_file=data_root + '/nuscenes_infos_train.pkl',
135 | load_interval=1,
136 | pipeline=train_pipeline,
137 | classes=class_names,
138 | modality=input_modality,
139 | test_mode=False,
140 | box_type_3d='LiDAR')),
141 | val=dict(
142 | type=dataset_type,
143 | data_root=data_root,
144 | ann_file=data_root + '/nuscenes_infos_val.pkl',
145 | load_interval=1,
146 | pipeline=test_pipeline,
147 | classes=class_names,
148 | modality=input_modality,
149 | test_mode=True,
150 | box_type_3d='LiDAR'),
151 | test=dict(
152 | type=dataset_type,
153 | data_root=data_root,
154 | ann_file=data_root + '/nuscenes_infos_val.pkl',
155 | load_interval=1,
156 | pipeline=test_pipeline,
157 | classes=class_names,
158 | modality=input_modality,
159 | test_mode=True,
160 | box_type_3d='LiDAR'))
161 | model = dict(
162 | type='FSTRDetector',
163 | pts_voxel_layer=dict(
164 | num_point_features=5,
165 | max_num_points=10,
166 | voxel_size=voxel_size,
167 | max_voxels=(120000, 160000),
168 | point_cloud_range=point_cloud_range),
169 | pts_voxel_encoder=dict(
170 | type='HardSimpleVFE',
171 | num_features=5,
172 | ),
173 | pts_middle_encoder=dict(
174 | type='VoxelNextEncoder',
175 | in_channels=5,
176 | sparse_shape=[41, 1440, 1440],
177 | base_channels=16,
178 | output_channels=128,
179 | order=('conv', 'norm', 'act'),
180 | block_type='basicblock'),
181 |
182 | pts_bbox_head=dict(
183 | type='FSTRHead',
184 | in_channels=128,
185 | hidden_dim=256,
186 | downsample_scale=8,
187 | num_query=500,
188 | num_init_query=200,
189 |         init_dn_query=False,
190 |         init_learnable_query=False,
191 |         init_query_topk=1,
192 |         init_query_radius=1,
193 |         gauusian_dn_sampling=False,
194 |         noise_mean=0.5,
195 |         noise_std=0.125,
196 |         max_sparse_token_per_sample=10000,
197 | common_heads=dict(center=(2, 2), height=(1, 2), dim=(3, 2), rot=(2, 2), vel=(2, 2)),
198 | tasks=[
199 | dict(num_class=10, class_names=[
200 | 'car', 'truck', 'construction_vehicle',
201 | 'bus', 'trailer', 'barrier',
202 | 'motorcycle', 'bicycle',
203 | 'pedestrian', 'traffic_cone'
204 | ]),
205 | ],
206 | bbox_coder=dict(
207 | type='MultiTaskBBoxCoder',
208 | post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
209 | pc_range=point_cloud_range,
210 | max_num=300,
211 | voxel_size=voxel_size,
212 | num_classes=10),
213 | separate_head=dict(
214 | type='SeparateTaskHead', init_bias=-2.19, final_kernel=3),
215 | transformer=dict(
216 | type='FSTRTransformer',
217 | decoder=dict(
218 | type='PETRTransformerDecoder',
219 | return_intermediate=True,
220 | num_layers=1,
221 | transformerlayers=dict(
222 | type='PETRTransformerDecoderLayer',
223 | attn_cfgs=[
224 | dict(
225 | type='MultiheadAttention',
226 | embed_dims=256,
227 | num_heads=8,
228 | dropout=0.1),
229 | dict(
230 | type='PETRMultiheadFlashAttention',
231 | embed_dims=256,
232 | num_heads=8,
233 | dropout=0.1),
234 | ],
235 | ffn_cfgs=dict(
236 | type='FFN',
237 | embed_dims=256,
238 | feedforward_channels=1024,
239 | num_fcs=2,
240 | ffn_drop=0.,
241 | act_cfg=dict(type='ReLU', inplace=True),
242 | ),
243 |
244 | feedforward_channels=1024, #unused
245 | operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
246 | 'ffn', 'norm')),
247 | )),
248 | loss_cls=dict(type='FocalLoss', use_sigmoid=True, gamma=2, alpha=0.25, reduction='mean', loss_weight=2.0),
249 | loss_bbox=dict(type='L1Loss', reduction='mean', loss_weight=0.25),
250 | loss_heatmap=dict(type='GaussianFocalLoss', reduction='mean', loss_weight=1.0),
251 | ),
252 | train_cfg=dict(
253 | pts=dict(
254 | dataset='nuScenes',
255 | assigner=dict(
256 | type='HungarianAssigner3D',
257 | cls_cost=dict(type='FocalLossCost', weight=2.0),
258 | reg_cost=dict(type='BBox3DL1Cost', weight=0.25),
259 | iou_cost=dict(type='IoUCost', weight=0.0), # Fake cost. This is just to make it compatible with DETR head.
260 | pc_range=point_cloud_range,
261 | code_weights=[2.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2],
262 | ),
263 | pos_weight=-1,
264 | gaussian_overlap=0.1,
265 | min_radius=2,
266 |             grid_size=[1440, 1440, 40], # [x_len, y_len, z_len]
267 | voxel_size=voxel_size,
268 | out_size_factor=out_size_factor,
269 | code_weights=[2.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2],
270 | point_cloud_range=point_cloud_range)),
271 | test_cfg=dict(
272 | pts=dict(
273 | dataset='nuScenes',
274 | grid_size=[1440, 1440, 40],
275 | out_size_factor=out_size_factor,
276 | pc_range=point_cloud_range[0:2],
277 | voxel_size=voxel_size[:2],
278 | nms_type=None,
279 | nms_thr=0.1,
280 | use_rotate_nms=True,
281 | max_num=300
282 | )))
283 | optimizer = dict(type='AdamW', lr=0.0001, weight_decay=0.01)  # for 8 GPUs x 2 samples per GPU
284 | optimizer_config = dict(
285 | type='CustomFp16OptimizerHook',
286 | loss_scale=512.,
287 | grad_clip=dict(max_norm=35, norm_type=2),
288 | custom_fp16=dict(pts_voxel_encoder=False, pts_middle_encoder=False, pts_bbox_head=False))
289 | lr_config = dict(
290 | policy='cyclic',
291 | target_ratio=(8, 0.0001),
292 | cyclic_times=1,
293 | step_ratio_up=0.4)
294 | momentum_config = dict(
295 | policy='cyclic',
296 | target_ratio=(0.8947368421052632, 1),
297 | cyclic_times=1,
298 | step_ratio_up=0.4)
299 | total_epochs = 20
300 | checkpoint_config = dict(interval=1)
301 | evaluation = dict(interval=5, pipeline=test_pipeline)
302 | log_config = dict(
303 | interval=50,
304 | hooks=[dict(type='TextLoggerHook'),
305 | dict(type='TensorboardLoggerHook')])
306 | dist_params = dict(backend='nccl')
307 | log_level = 'INFO'
308 | work_dir = None
309 | load_from = None
310 | resume_from = None
311 | workflow = [('train', 1)]
312 | gpu_ids = range(0, 8)
313 |
314 |
--------------------------------------------------------------------------------
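All spatial sizes in this config follow from point_cloud_range, voxel_size and out_size_factor. The quick sanity check below (standalone Python, not part of the config) shows how the 1440-cell grid, the 40 z-bins and the 180x180 BEV feature map fall out of those numbers; the extra z slot in sparse_shape=[41, 1440, 1440] follows the usual mmdet3d sparse-encoder convention of reserving one additional plane.

point_cloud_range = [-54.0, -54.0, -5.0, 54.0, 54.0, 3.0]
voxel_size = [0.075, 0.075, 0.2]
out_size_factor = 8

grid_x = round((point_cloud_range[3] - point_cloud_range[0]) / voxel_size[0])  # 1440
grid_y = round((point_cloud_range[4] - point_cloud_range[1]) / voxel_size[1])  # 1440
grid_z = round((point_cloud_range[5] - point_cloud_range[2]) / voxel_size[2])  # 40

print([grid_x, grid_y, grid_z])    # grid_size used in train_cfg/test_cfg: [1440, 1440, 40]
print(grid_x // out_size_factor)   # 180: BEV resolution after 8x downsampling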
/tools/test.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | import argparse
3 | import os
4 | import warnings
5 |
6 | import mmcv
7 | import torch
8 | from mmcv import Config, DictAction
9 | from mmcv.cnn import fuse_conv_bn
10 | from mmcv.parallel import MMDataParallel, MMDistributedDataParallel
11 | from mmcv.runner import (get_dist_info, init_dist, load_checkpoint,
12 | wrap_fp16_model)
13 |
14 | import mmdet
15 | from mmdet3d.apis import single_gpu_test
16 | from mmdet3d.datasets import build_dataloader, build_dataset
17 | from mmdet3d.models import build_model
18 | from mmdet.apis import multi_gpu_test, set_random_seed
19 | from mmdet.datasets import replace_ImageToTensor
20 |
21 | if mmdet.__version__ > '2.23.0':
22 | # If mmdet version > 2.23.0, setup_multi_processes would be imported and
23 | # used from mmdet instead of mmdet3d.
24 | from mmdet.utils import setup_multi_processes
25 | else:
26 | from mmdet3d.utils import setup_multi_processes
27 |
28 | try:
29 | # If mmdet version > 2.23.0, compat_cfg would be imported and
30 | # used from mmdet instead of mmdet3d.
31 | from mmdet.utils import compat_cfg
32 | except ImportError:
33 | from mmdet3d.utils import compat_cfg
34 |
35 |
36 | def parse_args():
37 | parser = argparse.ArgumentParser(
38 | description='MMDet test (and eval) a model')
39 | parser.add_argument('config', help='test config file path')
40 | parser.add_argument('checkpoint', help='checkpoint file')
41 | parser.add_argument('--out', help='output result file in pickle format')
42 | parser.add_argument(
43 | '--fuse-conv-bn',
44 | action='store_true',
45 |         help='Whether to fuse conv and bn, this will slightly increase '
46 |         'the inference speed')
47 | parser.add_argument(
48 | '--gpu-ids',
49 | type=int,
50 | nargs='+',
51 | help='(Deprecated, please use --gpu-id) ids of gpus to use '
52 |         '(only applicable to non-distributed testing)')
53 | parser.add_argument(
54 | '--gpu-id',
55 | type=int,
56 | default=0,
57 | help='id of gpu to use '
58 | '(only applicable to non-distributed testing)')
59 | parser.add_argument(
60 | '--format-only',
61 | action='store_true',
62 |         help='Format the output results without performing evaluation. It is '
63 |         'useful when you want to format the result to a specific format and '
64 | 'submit it to the test server')
65 | parser.add_argument(
66 | '--eval',
67 | type=str,
68 | nargs='+',
69 | help='evaluation metrics, which depends on the dataset, e.g., "bbox",'
70 | ' "segm", "proposal" for COCO, and "mAP", "recall" for PASCAL VOC')
71 | parser.add_argument('--show', action='store_true', help='show results')
72 | parser.add_argument(
73 | '--show-dir', help='directory where results will be saved')
74 | parser.add_argument(
75 | '--gpu-collect',
76 | action='store_true',
77 | help='whether to use gpu to collect results.')
78 | parser.add_argument(
79 | '--tmpdir',
80 | help='tmp directory used for collecting results from multiple '
81 | 'workers, available when gpu-collect is not specified')
82 | parser.add_argument('--seed', type=int, default=0, help='random seed')
83 | parser.add_argument(
84 | '--deterministic',
85 | action='store_true',
86 | help='whether to set deterministic options for CUDNN backend.')
87 | parser.add_argument(
88 | '--cfg-options',
89 | nargs='+',
90 | action=DictAction,
91 | help='override some settings in the used config, the key-value pair '
92 | 'in xxx=yyy format will be merged into config file. If the value to '
93 | 'be overwritten is a list, it should be like key="[a,b]" or key=a,b '
94 | 'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" '
95 | 'Note that the quotation marks are necessary and that no white space '
96 | 'is allowed.')
97 | parser.add_argument(
98 | '--options',
99 | nargs='+',
100 | action=DictAction,
101 | help='custom options for evaluation, the key-value pair in xxx=yyy '
102 | 'format will be kwargs for dataset.evaluate() function (deprecate), '
103 | 'change to --eval-options instead.')
104 | parser.add_argument(
105 | '--eval-options',
106 | nargs='+',
107 | action=DictAction,
108 | help='custom options for evaluation, the key-value pair in xxx=yyy '
109 | 'format will be kwargs for dataset.evaluate() function')
110 | parser.add_argument(
111 | '--launcher',
112 | choices=['none', 'pytorch', 'slurm', 'mpi'],
113 | default='none',
114 | help='job launcher')
115 | parser.add_argument('--local_rank', type=int, default=0)
116 | args = parser.parse_args()
117 | if 'LOCAL_RANK' not in os.environ:
118 | os.environ['LOCAL_RANK'] = str(args.local_rank)
119 |
120 | if args.options and args.eval_options:
121 | raise ValueError(
122 | '--options and --eval-options cannot be both specified, '
123 | '--options is deprecated in favor of --eval-options')
124 | if args.options:
125 | warnings.warn('--options is deprecated in favor of --eval-options')
126 | args.eval_options = args.options
127 | return args
128 |
129 |
130 | def main():
131 | args = parse_args()
132 |
133 | assert args.out or args.eval or args.format_only or args.show \
134 | or args.show_dir, \
135 | ('Please specify at least one operation (save/eval/format/show the '
136 | 'results / save the results) with the argument "--out", "--eval"'
137 | ', "--format-only", "--show" or "--show-dir"')
138 |
139 | if args.eval and args.format_only:
140 | raise ValueError('--eval and --format_only cannot be both specified')
141 |
142 | if args.out is not None and not args.out.endswith(('.pkl', '.pickle')):
143 | raise ValueError('The output file must be a pkl file.')
144 |
145 | cfg = Config.fromfile(args.config)
146 | if args.cfg_options is not None:
147 | cfg.merge_from_dict(args.cfg_options)
148 |
149 | # import modules from string list.
150 | if cfg.get('custom_imports', None):
151 | from mmcv.utils import import_modules_from_strings
152 | import_modules_from_strings(**cfg['custom_imports'])
153 |
154 |     # import modules from plugin/xx, the registry will be updated
155 | if hasattr(cfg, 'plugin'):
156 | if cfg.plugin:
157 | import importlib
158 | if hasattr(cfg, 'plugin_dir'):
159 | plugin_dir = cfg.plugin_dir
160 | _module_dir = os.path.dirname(plugin_dir)
161 | _module_dir = _module_dir.split('/')
162 | _module_path = _module_dir[0]
163 |
164 | for m in _module_dir[1:]:
165 | _module_path = _module_path + '.' + m
166 | print(_module_path)
167 | plg_lib = importlib.import_module(_module_path)
168 | else:
169 | # import dir is the dirpath for the config file
170 | _module_dir = os.path.dirname(args.config)
171 | _module_dir = _module_dir.split('/')
172 | _module_path = _module_dir[0]
173 | for m in _module_dir[1:]:
174 | _module_path = _module_path + '.' + m
175 | print(_module_path)
176 | plg_lib = importlib.import_module(_module_path)
177 |
178 | cfg = compat_cfg(cfg)
179 |
180 | # set multi-process settings
181 | setup_multi_processes(cfg)
182 |
183 | # set cudnn_benchmark
184 | if cfg.get('cudnn_benchmark', False):
185 | torch.backends.cudnn.benchmark = True
186 |
187 | cfg.model.pretrained = None
188 |
189 | if args.gpu_ids is not None:
190 | cfg.gpu_ids = args.gpu_ids[0:1]
191 | warnings.warn('`--gpu-ids` is deprecated, please use `--gpu-id`. '
192 | 'Because we only support single GPU mode in '
193 | 'non-distributed testing. Use the first GPU '
194 | 'in `gpu_ids` now.')
195 | else:
196 | cfg.gpu_ids = [args.gpu_id]
197 |
198 | # init distributed env first, since logger depends on the dist info.
199 | if args.launcher == 'none':
200 | distributed = False
201 | else:
202 | distributed = True
203 | init_dist(args.launcher, **cfg.dist_params)
204 |
205 | test_dataloader_default_args = dict(
206 | samples_per_gpu=1, workers_per_gpu=2, dist=distributed, shuffle=False)
207 |
208 | # in case the test dataset is concatenated
209 | if isinstance(cfg.data.test, dict):
210 | cfg.data.test.test_mode = True
211 | if cfg.data.test_dataloader.get('samples_per_gpu', 1) > 1:
212 | # Replace 'ImageToTensor' to 'DefaultFormatBundle'
213 | cfg.data.test.pipeline = replace_ImageToTensor(
214 | cfg.data.test.pipeline)
215 | elif isinstance(cfg.data.test, list):
216 | for ds_cfg in cfg.data.test:
217 | ds_cfg.test_mode = True
218 | if cfg.data.test_dataloader.get('samples_per_gpu', 1) > 1:
219 | for ds_cfg in cfg.data.test:
220 | ds_cfg.pipeline = replace_ImageToTensor(ds_cfg.pipeline)
221 |
222 | test_loader_cfg = {
223 | **test_dataloader_default_args,
224 | **cfg.data.get('test_dataloader', {})
225 | }
226 |
227 | # set random seeds
228 | if args.seed is not None:
229 | set_random_seed(args.seed, deterministic=args.deterministic)
230 |
231 | # build the dataloader
232 | dataset = build_dataset(cfg.data.test)
233 | data_loader = build_dataloader(dataset, **test_loader_cfg)
234 |
235 | # build the model and load checkpoint
236 | cfg.model.train_cfg = None
237 | model = build_model(cfg.model, test_cfg=cfg.get('test_cfg'))
238 | fp16_cfg = cfg.get('fp16', None)
239 | if fp16_cfg is not None:
240 | wrap_fp16_model(model)
241 | checkpoint = load_checkpoint(model, args.checkpoint, map_location='cpu')
242 | if args.fuse_conv_bn:
243 | model = fuse_conv_bn(model)
244 |     # old versions did not save class info in checkpoints, this workaround is
245 | # for backward compatibility
246 | if 'CLASSES' in checkpoint.get('meta', {}):
247 | model.CLASSES = checkpoint['meta']['CLASSES']
248 | else:
249 | model.CLASSES = dataset.CLASSES
250 | # palette for visualization in segmentation tasks
251 | if 'PALETTE' in checkpoint.get('meta', {}):
252 | model.PALETTE = checkpoint['meta']['PALETTE']
253 | elif hasattr(dataset, 'PALETTE'):
254 | # segmentation dataset has `PALETTE` attribute
255 | model.PALETTE = dataset.PALETTE
256 |
257 | if not distributed:
258 | model = MMDataParallel(model, device_ids=cfg.gpu_ids)
259 | outputs = single_gpu_test(model, data_loader, args.show, args.show_dir)
260 | else:
261 | model = MMDistributedDataParallel(
262 | model.cuda(),
263 | device_ids=[torch.cuda.current_device()],
264 | broadcast_buffers=False)
265 | outputs = multi_gpu_test(model, data_loader, args.tmpdir,
266 | args.gpu_collect)
267 |
268 | rank, _ = get_dist_info()
269 | if rank == 0:
270 | if args.out:
271 | print(f'\nwriting results to {args.out}')
272 | mmcv.dump(outputs, args.out)
273 | kwargs = {} if args.eval_options is None else args.eval_options
274 | if args.format_only:
275 | dataset.format_results(outputs, **kwargs)
276 | if args.eval:
277 | eval_kwargs = cfg.get('evaluation', {}).copy()
278 | # hard-code way to remove EvalHook args
279 | for key in [
280 | 'interval', 'tmpdir', 'start', 'gpu_collect', 'save_best',
281 | 'rule'
282 | ]:
283 | eval_kwargs.pop(key, None)
284 | eval_kwargs.update(dict(metric=args.eval, **kwargs))
285 | print(dataset.evaluate(outputs, **eval_kwargs))
286 |
287 |
288 | if __name__ == '__main__':
289 | main()
290 |
--------------------------------------------------------------------------------
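tools/test.py (and tools/train.py below) locate the FSTR modules purely from the plugin_dir string in the config: the directory path is turned into a dotted module path and imported, which runs projects/mmdet3d_plugin/__init__.py and registers FSTRDetector, FSTRHead, CustomNuScenesDataset, etc. with the mmdet3d registries. A minimal sketch of that path-to-module conversion, isolated from the script:

import os

plugin_dir = 'projects/mmdet3d_plugin/'          # value set in the configs above

_module_dir = os.path.dirname(plugin_dir)        # 'projects/mmdet3d_plugin' (trailing slash removed)
_module_path = '.'.join(_module_dir.split('/'))
print(_module_path)                              # 'projects.mmdet3d_plugin'
# importlib.import_module(_module_path) then executes the plugin's __init__.py,
# triggering all the registry decorators before the model/dataset are built.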
/projects/configs/lidar/fstr_large_voxel0075_cbgs_20e.py:
--------------------------------------------------------------------------------
1 | plugin = True
2 | plugin_dir = 'projects/mmdet3d_plugin/'
3 |
4 | point_cloud_range = [-54.0, -54.0, -5.0, 54.0, 54.0, 3.0]
5 | class_names = [
6 | 'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
7 | 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
8 | ]
9 | voxel_size = [0.075, 0.075, 0.2]
10 | out_size_factor = 8
11 | evaluation = dict(interval=20)
12 | dataset_type = 'CustomNuScenesDataset'
13 | data_root = 'data/nuscenes/'
14 | input_modality = dict(
15 | use_lidar=True,
16 | use_camera=False,
17 | use_radar=False,
18 | use_map=False,
19 | use_external=False)
20 | train_pipeline = [
21 | dict(
22 | type='LoadPointsFromFile',
23 | coord_type='LIDAR',
24 | load_dim=5,
25 | use_dim=[0, 1, 2, 3, 4],
26 | ),
27 | dict(
28 | type='LoadPointsFromMultiSweeps',
29 | sweeps_num=10,
30 | use_dim=[0, 1, 2, 3, 4],
31 | ),
32 | dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
33 | dict(
34 | type='ObjectSample',
35 | db_sampler=dict(
36 | data_root='data/nuscenes/',
37 | info_path='data/nuscenes/' + 'nuscenes_dbinfos_train.pkl',
38 | rate=1.0,
39 | prepare=dict(
40 | filter_by_difficulty=[-1],
41 | filter_by_min_points=dict(
42 | car=5,
43 | truck=5,
44 | bus=5,
45 | trailer=5,
46 | construction_vehicle=5,
47 | traffic_cone=5,
48 | barrier=5,
49 | motorcycle=5,
50 | bicycle=5,
51 | pedestrian=5)),
52 | classes=class_names,
53 | sample_groups=dict(
54 | car=2,
55 | truck=3,
56 | construction_vehicle=7,
57 | bus=4,
58 | trailer=6,
59 | barrier=2,
60 | motorcycle=6,
61 | bicycle=6,
62 | pedestrian=2,
63 | traffic_cone=2),
64 | points_loader=dict(
65 | type='LoadPointsFromFile',
66 | coord_type='LIDAR',
67 | load_dim=5,
68 | use_dim=[0, 1, 2, 3, 4],
69 | ))),
70 | dict(
71 | type='GlobalRotScaleTrans',
72 | rot_range=[-0.3925 * 2, 0.3925 * 2],
73 | scale_ratio_range=[0.9, 1.1],
74 | translation_std=[0.5, 0.5, 0.5]),
75 | dict(
76 | type='RandomFlip3D',
77 | sync_2d=False,
78 | flip_ratio_bev_horizontal=0.5,
79 | flip_ratio_bev_vertical=0.5),
80 | dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
81 | dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
82 | dict(type='ObjectNameFilter', classes=class_names),
83 | dict(type='PointShuffle'),
84 | dict(type='DefaultFormatBundle3D', class_names=class_names),
85 | dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'],
86 | meta_keys=('filename', 'ori_shape', 'img_shape', 'lidar2img',
87 | 'depth2img', 'cam2img', 'pad_shape',
88 | 'scale_factor', 'flip', 'pcd_horizontal_flip',
89 | 'pcd_vertical_flip', 'box_mode_3d', 'box_type_3d',
90 | 'img_norm_cfg', 'pcd_trans', 'sample_idx',
91 | 'pcd_scale_factor', 'pcd_rotation', 'pts_filename',
92 | 'transformation_3d_flow', 'rot_degree',
93 | 'gt_bboxes_3d', 'gt_labels_3d'))
94 | ]
95 | test_pipeline = [
96 | dict(
97 | type='LoadPointsFromFile',
98 | coord_type='LIDAR',
99 | load_dim=5,
100 | use_dim=[0, 1, 2, 3, 4],
101 | ),
102 | dict(
103 | type='LoadPointsFromMultiSweeps',
104 | sweeps_num=10,
105 | use_dim=[0, 1, 2, 3, 4],
106 | ),
107 | dict(
108 | type='MultiScaleFlipAug3D',
109 | img_scale=(1333, 800),
110 | pts_scale_ratio=1,
111 | flip=False,
112 | transforms=[
113 | dict(
114 | type='GlobalRotScaleTrans',
115 | rot_range=[0, 0],
116 | scale_ratio_range=[1.0, 1.0],
117 | translation_std=[0, 0, 0]),
118 | dict(type='RandomFlip3D'),
119 | dict(
120 | type='DefaultFormatBundle3D',
121 | class_names=class_names,
122 | with_label=False),
123 | dict(type='Collect3D', keys=['points'])
124 | ])
125 | ]
126 | data = dict(
127 | samples_per_gpu=2,
128 | workers_per_gpu=4,
129 | train=dict(
130 | type='CBGSDataset',
131 | dataset=dict(
132 | type=dataset_type,
133 | data_root=data_root,
134 | ann_file=data_root + '/nuscenes_infos_train.pkl',
135 | load_interval=1,
136 | pipeline=train_pipeline,
137 | classes=class_names,
138 | modality=input_modality,
139 | test_mode=False,
140 | box_type_3d='LiDAR')),
141 | val=dict(
142 | type=dataset_type,
143 | data_root=data_root,
144 | ann_file=data_root + '/nuscenes_infos_val.pkl',
145 | load_interval=1,
146 | pipeline=test_pipeline,
147 | classes=class_names,
148 | modality=input_modality,
149 | test_mode=True,
150 | box_type_3d='LiDAR'),
151 | test=dict(
152 | type=dataset_type,
153 | data_root=data_root,
154 | ann_file=data_root + '/nuscenes_infos_val.pkl',
155 | load_interval=1,
156 | pipeline=test_pipeline,
157 | classes=class_names,
158 | modality=input_modality,
159 | test_mode=True,
160 | box_type_3d='LiDAR'))
161 | model = dict(
162 | type='FSTRDetector',
163 | pts_voxel_layer=dict(
164 | num_point_features=5,
165 | max_num_points=10,
166 | voxel_size=voxel_size,
167 | max_voxels=(120000, 160000),
168 | point_cloud_range=point_cloud_range),
169 | pts_voxel_encoder=dict(
170 | type='HardSimpleVFE',
171 | num_features=5,
172 | ),
173 | pts_middle_encoder=dict(
174 | type='VoxelNextEncoder',
175 | in_channels=5,
176 | sparse_shape=[41, 1440, 1440],
177 | base_channels=32,
178 | output_channels=256,
179 |         encoder_channels=((32, 32), (64, 64, 64), (128, 128, 128), (256, 256, 256), (256, 256, 256), (256, 256, 256)),
180 |         sparse_conv_kernel=(5, 3, 3, 3, 3),
181 | order=('conv', 'norm', 'act'),
182 | block_type='basicblock'),
183 |
184 | pts_bbox_head=dict(
185 | type='FSTRHead',
186 | in_channels=256,
187 | hidden_dim=256,
188 | downsample_scale=8,
189 | num_query=500,
190 | num_init_query=200,
191 |         init_dn_query=False,
192 |         init_learnable_query=False,
193 |         init_query_topk=1,
194 |         init_query_radius=1,
195 |         gauusian_dn_sampling=False,
196 |         noise_mean=0.5,
197 |         noise_std=0.125,
198 |         max_sparse_token_per_sample=10000,
199 | common_heads=dict(center=(2, 2), height=(1, 2), dim=(3, 2), rot=(2, 2), vel=(2, 2)),
200 | tasks=[
201 | dict(num_class=10, class_names=[
202 | 'car', 'truck', 'construction_vehicle',
203 | 'bus', 'trailer', 'barrier',
204 | 'motorcycle', 'bicycle',
205 | 'pedestrian', 'traffic_cone'
206 | ]),
207 | ],
208 | bbox_coder=dict(
209 | type='MultiTaskBBoxCoder',
210 | post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
211 | pc_range=point_cloud_range,
212 | max_num=300,
213 | voxel_size=voxel_size,
214 | num_classes=10),
215 | separate_head=dict(
216 | type='SeparateTaskHead', init_bias=-2.19, final_kernel=3),
217 | transformer=dict(
218 | type='FSTRTransformer',
219 | decoder=dict(
220 | type='PETRTransformerDecoder',
221 | return_intermediate=True,
222 | num_layers=1,
223 | transformerlayers=dict(
224 | type='PETRTransformerDecoderLayer',
225 | attn_cfgs=[
226 | dict(
227 | type='MultiheadAttention',
228 | embed_dims=256,
229 | num_heads=8,
230 | dropout=0.1),
231 | dict(
232 | type='PETRMultiheadFlashAttention',
233 | embed_dims=256,
234 | num_heads=8,
235 | dropout=0.1),
236 | ],
237 | ffn_cfgs=dict(
238 | type='FFN',
239 | embed_dims=256,
240 | feedforward_channels=1024,
241 | num_fcs=2,
242 | ffn_drop=0.,
243 | act_cfg=dict(type='ReLU', inplace=True),
244 | ),
245 |
246 | feedforward_channels=1024, #unused
247 | operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
248 | 'ffn', 'norm')),
249 | )),
250 | loss_cls=dict(type='FocalLoss', use_sigmoid=True, gamma=2, alpha=0.25, reduction='mean', loss_weight=2.0),
251 | loss_bbox=dict(type='L1Loss', reduction='mean', loss_weight=0.25),
252 | loss_heatmap=dict(type='GaussianFocalLoss', reduction='mean', loss_weight=1.0),
253 | ),
254 | train_cfg=dict(
255 | pts=dict(
256 | dataset='nuScenes',
257 | assigner=dict(
258 | type='HungarianAssigner3D',
259 | cls_cost=dict(type='FocalLossCost', weight=2.0),
260 | reg_cost=dict(type='BBox3DL1Cost', weight=0.25),
261 | iou_cost=dict(type='IoUCost', weight=0.0), # Fake cost. This is just to make it compatible with DETR head.
262 | pc_range=point_cloud_range,
263 | code_weights=[2.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2],
264 | ),
265 | pos_weight=-1,
266 | gaussian_overlap=0.1,
267 | min_radius=2,
268 |             grid_size=[1440, 1440, 40], # [x_len, y_len, z_len]
269 | voxel_size=voxel_size,
270 | out_size_factor=out_size_factor,
271 | code_weights=[2.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2],
272 | point_cloud_range=point_cloud_range)),
273 | test_cfg=dict(
274 | pts=dict(
275 | dataset='nuScenes',
276 | grid_size=[1440, 1440, 40],
277 | out_size_factor=out_size_factor,
278 | pc_range=point_cloud_range[0:2],
279 | voxel_size=voxel_size[:2],
280 | nms_type=None,
281 | nms_thr=0.1,
282 | use_rotate_nms=True,
283 | max_num=300
284 | )))
285 | optimizer = dict(type='AdamW', lr=0.0001, weight_decay=0.01)  # for 8 GPUs x 2 samples per GPU
286 | optimizer_config = dict(
287 | type='CustomFp16OptimizerHook',
288 | loss_scale=512.,
289 | grad_clip=dict(max_norm=35, norm_type=2),
290 | custom_fp16=dict(pts_voxel_encoder=False, pts_middle_encoder=False, pts_bbox_head=False))
291 | lr_config = dict(
292 | policy='cyclic',
293 | target_ratio=(8, 0.0001),
294 | cyclic_times=1,
295 | step_ratio_up=0.4)
296 | momentum_config = dict(
297 | policy='cyclic',
298 | target_ratio=(0.8947368421052632, 1),
299 | cyclic_times=1,
300 | step_ratio_up=0.4)
301 | total_epochs = 20
302 | checkpoint_config = dict(interval=1)
303 | evaluation = dict(interval=5, pipeline=test_pipeline)
304 | log_config = dict(
305 | interval=50,
306 | hooks=[dict(type='TextLoggerHook'),
307 | dict(type='TensorboardLoggerHook')])
308 | dist_params = dict(backend='nccl')
309 | log_level = 'INFO'
310 | work_dir = None
311 | load_from = None
312 | resume_from = None
313 | workflow = [('train', 1)]
314 | gpu_ids = range(0, 8)
315 |
316 |
--------------------------------------------------------------------------------
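Relative to the base config, this large variant doubles every encoder channel width and enlarges the first downsampling kernel via sparse_conv_kernel=(5, 3, 3, 3, 3). In VoxelNextEncoder.make_encoder_layers the stride-2 SparseConv3d at the start of each stage after the first takes its kernel size from sparse_conv_kernel and its padding from kernel // 2, so the per-stage settings work out as in this small illustration (not extra configuration):

sparse_conv_kernel = (5, 3, 3, 3, 3)

# Stage 1 has no strided conv; stages 2..6 each start with one, as in
# VoxelNextEncoder.make_encoder_layers with block_type='basicblock'.
for stage, kernel in enumerate(sparse_conv_kernel, start=2):
    print(f'stage {stage}: SparseConv3d(kernel={kernel}, stride=2, padding={kernel // 2})')
# stage 2 uses kernel 5 / padding 2; stages 3..6 use kernel 3 / padding 1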
/projects/configs/lidar/fstr_xlarge_voxel0050_cbgs_20e.py:
--------------------------------------------------------------------------------
1 | plugin = True
2 | plugin_dir = 'projects/mmdet3d_plugin/'
3 |
4 | point_cloud_range = [-54.0, -54.0, -5.0, 54.0, 54.0, 3.0]
5 | class_names = [
6 | 'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
7 | 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
8 | ]
9 | voxel_size = [0.050, 0.050, 0.2]
10 | out_size_factor = 8
11 | evaluation = dict(interval=20)
12 | dataset_type = 'CustomNuScenesDataset'
13 | data_root = 'data/nuscenes/'
14 | input_modality = dict(
15 | use_lidar=True,
16 | use_camera=False,
17 | use_radar=False,
18 | use_map=False,
19 | use_external=False)
20 | train_pipeline = [
21 | dict(
22 | type='LoadPointsFromFile',
23 | coord_type='LIDAR',
24 | load_dim=5,
25 | use_dim=[0, 1, 2, 3, 4],
26 | ),
27 | dict(
28 | type='LoadPointsFromMultiSweeps',
29 | sweeps_num=10,
30 | use_dim=[0, 1, 2, 3, 4],
31 | ),
32 | dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
33 | dict(
34 | type='ObjectSample',
35 | db_sampler=dict(
36 | data_root='data/nuscenes/',
37 | info_path='data/nuscenes/' + 'nuscenes_dbinfos_train.pkl',
38 | rate=1.0,
39 | prepare=dict(
40 | filter_by_difficulty=[-1],
41 | filter_by_min_points=dict(
42 | car=5,
43 | truck=5,
44 | bus=5,
45 | trailer=5,
46 | construction_vehicle=5,
47 | traffic_cone=5,
48 | barrier=5,
49 | motorcycle=5,
50 | bicycle=5,
51 | pedestrian=5)),
52 | classes=class_names,
53 | sample_groups=dict(
54 | car=2,
55 | truck=3,
56 | construction_vehicle=7,
57 | bus=4,
58 | trailer=6,
59 | barrier=2,
60 | motorcycle=6,
61 | bicycle=6,
62 | pedestrian=2,
63 | traffic_cone=2),
64 | points_loader=dict(
65 | type='LoadPointsFromFile',
66 | coord_type='LIDAR',
67 | load_dim=5,
68 | use_dim=[0, 1, 2, 3, 4],
69 | ))),
70 | dict(
71 | type='GlobalRotScaleTrans',
72 | rot_range=[-0.3925 * 2, 0.3925 * 2],
73 | scale_ratio_range=[0.9, 1.1],
74 | translation_std=[0.5, 0.5, 0.5]),
75 | dict(
76 | type='RandomFlip3D',
77 | sync_2d=False,
78 | flip_ratio_bev_horizontal=0.5,
79 | flip_ratio_bev_vertical=0.5),
80 | dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
81 | dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
82 | dict(type='ObjectNameFilter', classes=class_names),
83 | dict(type='PointShuffle'),
84 | dict(type='DefaultFormatBundle3D', class_names=class_names),
85 | dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'],
86 | meta_keys=('filename', 'ori_shape', 'img_shape', 'lidar2img',
87 | 'depth2img', 'cam2img', 'pad_shape',
88 | 'scale_factor', 'flip', 'pcd_horizontal_flip',
89 | 'pcd_vertical_flip', 'box_mode_3d', 'box_type_3d',
90 | 'img_norm_cfg', 'pcd_trans', 'sample_idx',
91 | 'pcd_scale_factor', 'pcd_rotation', 'pts_filename',
92 | 'transformation_3d_flow', 'rot_degree',
93 | 'gt_bboxes_3d', 'gt_labels_3d'))
94 | ]
95 | test_pipeline = [
96 | dict(
97 | type='LoadPointsFromFile',
98 | coord_type='LIDAR',
99 | load_dim=5,
100 | use_dim=[0, 1, 2, 3, 4],
101 | ),
102 | dict(
103 | type='LoadPointsFromMultiSweeps',
104 | sweeps_num=10,
105 | use_dim=[0, 1, 2, 3, 4],
106 | ),
107 | dict(
108 | type='MultiScaleFlipAug3D',
109 | img_scale=(1333, 800),
110 | pts_scale_ratio=1,
111 | flip=False,
112 | transforms=[
113 | dict(
114 | type='GlobalRotScaleTrans',
115 | rot_range=[0, 0],
116 | scale_ratio_range=[1.0, 1.0],
117 | translation_std=[0, 0, 0]),
118 | dict(type='RandomFlip3D'),
119 | dict(
120 | type='DefaultFormatBundle3D',
121 | class_names=class_names,
122 | with_label=False),
123 | dict(type='Collect3D', keys=['points'])
124 | ])
125 | ]
126 | data = dict(
127 | samples_per_gpu=2,
128 | workers_per_gpu=4,
129 | train=dict(
130 | type='CBGSDataset',
131 | dataset=dict(
132 | type=dataset_type,
133 | data_root=data_root,
134 | ann_file=data_root + '/nuscenes_infos_train.pkl',
135 | load_interval=1,
136 | pipeline=train_pipeline,
137 | classes=class_names,
138 | modality=input_modality,
139 | test_mode=False,
140 | box_type_3d='LiDAR')),
141 | val=dict(
142 | type=dataset_type,
143 | data_root=data_root,
144 | ann_file=data_root + '/nuscenes_infos_val.pkl',
145 | load_interval=1,
146 | pipeline=test_pipeline,
147 | classes=class_names,
148 | modality=input_modality,
149 | test_mode=True,
150 | box_type_3d='LiDAR'),
151 | test=dict(
152 | type=dataset_type,
153 | data_root=data_root,
154 | ann_file=data_root + '/nuscenes_infos_val.pkl',
155 | load_interval=1,
156 | pipeline=test_pipeline,
157 | classes=class_names,
158 | modality=input_modality,
159 | test_mode=True,
160 | box_type_3d='LiDAR'))
161 | model = dict(
162 | type='FSTRDetector',
163 | pts_voxel_layer=dict(
164 | num_point_features=5,
165 | max_num_points=10,
166 | voxel_size=voxel_size,
167 | max_voxels=(120000, 160000),
168 | point_cloud_range=point_cloud_range),
169 | pts_voxel_encoder=dict(
170 | type='HardSimpleVFE',
171 | num_features=5,
172 | ),
173 | pts_middle_encoder=dict(
174 | type='VoxelNextEncoder',
175 | in_channels=5,
176 | sparse_shape=[41, 2160, 2160],
177 | base_channels=32,
178 | output_channels=256,
179 |         encoder_channels=((32, 32), (64, 64, 64, 64), (128, 128, 128, 128, 128), (256, 256, 256, 256, 256, 256, 256), (256, 256, 256, 256), (256, 256, 256, 256)),
180 |         encoder_paddings=((1, 1), (1, 1, 1, 1), (1, 1, 1, 1, 1), (1, 1, 1, 1, 1, 1, 1), (1, 1, 1, 1), (1, 1, 1, 1)),
181 |         sparse_conv_kernel=(5, 3, 3, 3, 3),
182 | order=('conv', 'norm', 'act'),
183 | block_type='basicblock'),
184 |
185 | pts_bbox_head=dict(
186 | type='FSTRHead',
187 | in_channels=256,
188 | hidden_dim=256,
189 | downsample_scale=8,
190 | num_query=500,
191 | num_init_query=200,
192 |         init_dn_query=False,
193 |         init_learnable_query=False,
194 |         init_query_topk=1,
195 |         init_query_radius=1,
196 |         gauusian_dn_sampling=False,
197 |         noise_mean=0.5,
198 |         noise_std=0.125,
199 |         max_sparse_token_per_sample=10000,
200 | common_heads=dict(center=(2, 2), height=(1, 2), dim=(3, 2), rot=(2, 2), vel=(2, 2)),
201 | tasks=[
202 | dict(num_class=10, class_names=[
203 | 'car', 'truck', 'construction_vehicle',
204 | 'bus', 'trailer', 'barrier',
205 | 'motorcycle', 'bicycle',
206 | 'pedestrian', 'traffic_cone'
207 | ]),
208 | ],
209 | bbox_coder=dict(
210 | type='MultiTaskBBoxCoder',
211 | post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
212 | pc_range=point_cloud_range,
213 | max_num=300,
214 | voxel_size=voxel_size,
215 | num_classes=10),
216 | separate_head=dict(
217 | type='SeparateTaskHead', init_bias=-2.19, final_kernel=3),
218 | transformer=dict(
219 | type='FSTRTransformer',
220 | decoder=dict(
221 | type='PETRTransformerDecoder',
222 | return_intermediate=True,
223 | num_layers=1,
224 | transformerlayers=dict(
225 | type='PETRTransformerDecoderLayer',
226 | attn_cfgs=[
227 | dict(
228 | type='MultiheadAttention',
229 | embed_dims=256,
230 | num_heads=8,
231 | dropout=0.1),
232 | dict(
233 | type='PETRMultiheadFlashAttention',
234 | embed_dims=256,
235 | num_heads=8,
236 | dropout=0.1),
237 | ],
238 | ffn_cfgs=dict(
239 | type='FFN',
240 | embed_dims=256,
241 | feedforward_channels=1024,
242 | num_fcs=2,
243 | ffn_drop=0.,
244 | act_cfg=dict(type='ReLU', inplace=True),
245 | ),
246 |
247 | feedforward_channels=1024, #unused
248 | operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
249 | 'ffn', 'norm')),
250 | )),
251 | loss_cls=dict(type='FocalLoss', use_sigmoid=True, gamma=2, alpha=0.25, reduction='mean', loss_weight=2.0),
252 | loss_bbox=dict(type='L1Loss', reduction='mean', loss_weight=0.25),
253 | loss_heatmap=dict(type='GaussianFocalLoss', reduction='mean', loss_weight=1.0),
254 | ),
255 | train_cfg=dict(
256 | pts=dict(
257 | dataset='nuScenes',
258 | assigner=dict(
259 | type='HungarianAssigner3D',
260 | cls_cost=dict(type='FocalLossCost', weight=2.0),
261 | reg_cost=dict(type='BBox3DL1Cost', weight=0.25),
262 | iou_cost=dict(type='IoUCost', weight=0.0), # Fake cost. This is just to make it compatible with DETR head.
263 | pc_range=point_cloud_range,
264 | code_weights=[2.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2],
265 | ),
266 | pos_weight=-1,
267 | gaussian_overlap=0.1,
268 | min_radius=2,
269 |             grid_size=[2160, 2160, 40], # [x_len, y_len, z_len]
270 | voxel_size=voxel_size,
271 | out_size_factor=out_size_factor,
272 | code_weights=[2.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2],
273 | point_cloud_range=point_cloud_range)),
274 | test_cfg=dict(
275 | pts=dict(
276 | dataset='nuScenes',
277 | grid_size=[2160, 2160, 40],
278 | out_size_factor=out_size_factor,
279 | pc_range=point_cloud_range[0:2],
280 | voxel_size=voxel_size[:2],
281 | nms_type=None,
282 | nms_thr=0.1,
283 | use_rotate_nms=True,
284 | max_num=300
285 | )))
286 | optimizer = dict(type='AdamW', lr=0.0001, weight_decay=0.01)  # for 8 GPUs x 2 samples per GPU
287 | optimizer_config = dict(
288 | type='CustomFp16OptimizerHook',
289 | loss_scale=512.,
290 | grad_clip=dict(max_norm=35, norm_type=2),
291 | custom_fp16=dict(pts_voxel_encoder=False, pts_middle_encoder=False, pts_bbox_head=False))
292 | lr_config = dict(
293 | policy='cyclic',
294 | target_ratio=(8, 0.0001),
295 | cyclic_times=1,
296 | step_ratio_up=0.4)
297 | momentum_config = dict(
298 | policy='cyclic',
299 | target_ratio=(0.8947368421052632, 1),
300 | cyclic_times=1,
301 | step_ratio_up=0.4)
302 | total_epochs = 20
303 | checkpoint_config = dict(interval=1)
304 | evaluation = dict(interval=5, pipeline=test_pipeline)
305 | log_config = dict(
306 | interval=50,
307 | hooks=[dict(type='TextLoggerHook'),
308 | dict(type='TensorboardLoggerHook')])
309 | dist_params = dict(backend='nccl')
310 | log_level = 'INFO'
311 | work_dir = None
312 | load_from = None
313 | resume_from = None
314 | workflow = [('train', 1)]
315 | gpu_ids = range(0, 8)
316 |
317 |
--------------------------------------------------------------------------------
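Besides shrinking the voxel edge from 0.075 m to 0.05 m (so the BEV grid grows from 1440 to 2160 cells per side and sparse_shape becomes [41, 2160, 2160]), the xlarge config also deepens the encoder: every extra entry in encoder_channels becomes an extra SparseBasicBlock in make_encoder_layers. A small illustrative helper (count_blocks is hypothetical; its counting rule mirrors the basicblock branch of that method) comparing this config with the class defaults used by the base config:

def count_blocks(encoder_channels):
    # Mirror make_encoder_layers with block_type='basicblock': stages after the
    # first start with one stride-2 SparseConv3d, every other entry becomes a
    # SparseBasicBlock.
    basic, strided = 0, 0
    for i, blocks in enumerate(encoder_channels):
        if i != 0 and len(blocks) > 2:
            strided += 1
            basic += len(blocks) - 1
        else:
            basic += len(blocks)
    return basic, strided

base   = ((16, 16), (32, 32, 32), (64, 64, 64), (128, 128, 128), (128, 128, 128), (128, 128, 128))
xlarge = ((32, 32), (64, 64, 64, 64), (128, 128, 128, 128, 128),
          (256, 256, 256, 256, 256, 256, 256), (256, 256, 256, 256), (256, 256, 256, 256))

print(count_blocks(base))    # (12, 5): residual blocks / strided convs
print(count_blocks(xlarge))  # (21, 5)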
/tools/train.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from __future__ import division
3 | import argparse
4 | import copy
5 | import os
6 | import time
7 | import warnings
8 | from os import path as osp
9 |
10 | import mmcv
11 | import torch
12 | import torch.distributed as dist
13 | from mmcv import Config, DictAction
14 | from mmcv.runner import get_dist_info, init_dist
15 |
16 | from mmdet import __version__ as mmdet_version
17 | from mmdet3d import __version__ as mmdet3d_version
18 | from mmdet3d.apis import init_random_seed, train_model
19 | from mmdet3d.datasets import build_dataset
20 | from mmdet3d.models import build_model
21 | from mmdet3d.utils import collect_env, get_root_logger
22 | from mmdet.apis import set_random_seed
23 | from mmseg import __version__ as mmseg_version
24 |
25 | try:
26 | # If mmdet version > 2.20.0, setup_multi_processes would be imported and
27 | # used from mmdet instead of mmdet3d.
28 | from mmdet.utils import setup_multi_processes
29 | except ImportError:
30 | from mmdet3d.utils import setup_multi_processes
31 |
32 |
33 | def parse_args():
34 | parser = argparse.ArgumentParser(description='Train a detector')
35 | parser.add_argument('config', help='train config file path')
36 | parser.add_argument('--work-dir', help='the dir to save logs and models')
37 | parser.add_argument(
38 | '--resume-from', help='the checkpoint file to resume from')
39 | parser.add_argument(
40 | '--auto-resume',
41 | action='store_true',
42 | help='resume from the latest checkpoint automatically')
43 | parser.add_argument(
44 | '--no-validate',
45 | action='store_true',
46 | help='whether not to evaluate the checkpoint during training')
47 | group_gpus = parser.add_mutually_exclusive_group()
48 | group_gpus.add_argument(
49 | '--gpus',
50 | type=int,
51 | help='(Deprecated, please use --gpu-id) number of gpus to use '
52 | '(only applicable to non-distributed training)')
53 | group_gpus.add_argument(
54 | '--gpu-ids',
55 | type=int,
56 | nargs='+',
57 | help='(Deprecated, please use --gpu-id) ids of gpus to use '
58 | '(only applicable to non-distributed training)')
59 | group_gpus.add_argument(
60 | '--gpu-id',
61 | type=int,
62 | default=0,
63 |         help='id of gpu to use '
64 | '(only applicable to non-distributed training)')
65 | parser.add_argument('--seed', type=int, default=0, help='random seed')
66 | parser.add_argument(
67 | '--diff-seed',
68 | action='store_true',
69 | help='Whether or not set different seeds for different ranks')
70 | parser.add_argument(
71 | '--deterministic',
72 | action='store_true',
73 | help='whether to set deterministic options for CUDNN backend.')
74 | parser.add_argument(
75 | '--options',
76 | nargs='+',
77 | action=DictAction,
78 | help='override some settings in the used config, the key-value pair '
79 |         'in xxx=yyy format will be merged into config file (deprecated), '
80 | 'change to --cfg-options instead.')
81 | parser.add_argument(
82 | '--cfg-options',
83 | nargs='+',
84 | action=DictAction,
85 | help='override some settings in the used config, the key-value pair '
86 | 'in xxx=yyy format will be merged into config file. If the value to '
87 | 'be overwritten is a list, it should be like key="[a,b]" or key=a,b '
88 | 'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" '
89 | 'Note that the quotation marks are necessary and that no white space '
90 | 'is allowed.')
91 | parser.add_argument(
92 | '--launcher',
93 | choices=['none', 'pytorch', 'slurm', 'mpi'],
94 | default='none',
95 | help='job launcher')
96 | parser.add_argument('--local_rank', type=int, default=0)
97 | parser.add_argument(
98 | '--autoscale-lr',
99 | action='store_true',
100 | help='automatically scale lr with the number of gpus')
101 | args = parser.parse_args()
102 | if 'LOCAL_RANK' not in os.environ:
103 | os.environ['LOCAL_RANK'] = str(args.local_rank)
104 |
105 | if args.options and args.cfg_options:
106 | raise ValueError(
107 | '--options and --cfg-options cannot be both specified, '
108 | '--options is deprecated in favor of --cfg-options')
109 | if args.options:
110 | warnings.warn('--options is deprecated in favor of --cfg-options')
111 | args.cfg_options = args.options
112 |
113 | return args
114 |
115 |
116 | def main():
117 | args = parse_args()
118 |
119 | cfg = Config.fromfile(args.config)
120 | if args.cfg_options is not None:
121 | cfg.merge_from_dict(args.cfg_options)
122 |
123 | # set multi-process settings
124 | setup_multi_processes(cfg)
125 |
126 | if cfg.get('custom_imports', None):
127 | from mmcv.utils import import_modules_from_strings
128 | import_modules_from_strings(**cfg['custom_imports'])
129 |
130 |     # import modules from plugin/xx, registry will be updated
131 | if hasattr(cfg, 'plugin'):
132 | if cfg.plugin:
133 | import importlib
134 | if hasattr(cfg, 'plugin_dir'):
135 | plugin_dir = cfg.plugin_dir
136 | _module_dir = os.path.dirname(plugin_dir)
137 | _module_dir = _module_dir.split('/')
138 | _module_path = _module_dir[0]
139 |
140 | for m in _module_dir[1:]:
141 | _module_path = _module_path + '.' + m
142 | print(_module_path)
143 | plg_lib = importlib.import_module(_module_path)
144 | else:
145 | # import dir is the dirpath for the config file
146 | _module_dir = os.path.dirname(args.config)
147 | _module_dir = _module_dir.split('/')
148 | _module_path = _module_dir[0]
149 | for m in _module_dir[1:]:
150 | _module_path = _module_path + '.' + m
151 | print(_module_path)
152 | plg_lib = importlib.import_module(_module_path)
153 |
154 | plg_lib = importlib.import_module('mmdetection3d.mmdet3d')
155 |
156 | # set cudnn_benchmark
157 | if cfg.get('cudnn_benchmark', False):
158 | torch.backends.cudnn.benchmark = True
159 |
160 | # work_dir is determined in this priority: CLI > segment in file > filename
161 | if args.work_dir is not None:
162 | # update configs according to CLI args if args.work_dir is not None
163 | cfg.work_dir = args.work_dir
164 | elif cfg.get('work_dir', None) is None:
165 | # use config filename as default work_dir if cfg.work_dir is None
166 | cfg.work_dir = osp.join('./work_dirs',
167 | osp.splitext(osp.basename(args.config))[0])
168 | if args.resume_from is not None:
169 | cfg.resume_from = args.resume_from
170 |
171 | if args.auto_resume:
172 | cfg.auto_resume = args.auto_resume
173 |         warnings.warn('`--auto-resume` is only supported when mmdet '
174 |                       'version >= 2.20.0 for 3D detection model or '
175 |                       'mmsegmentation version >= 0.21.0 for 3D '
176 |                       'segmentation model')
177 |
178 | if args.gpus is not None:
179 | cfg.gpu_ids = range(1)
180 | warnings.warn('`--gpus` is deprecated because we only support '
181 | 'single GPU mode in non-distributed training. '
182 | 'Use `gpus=1` now.')
183 | if args.gpu_ids is not None:
184 | cfg.gpu_ids = args.gpu_ids[0:1]
185 | warnings.warn('`--gpu-ids` is deprecated, please use `--gpu-id`. '
186 | 'Because we only support single GPU mode in '
187 | 'non-distributed training. Use the first GPU '
188 | 'in `gpu_ids` now.')
189 | if args.gpus is None and args.gpu_ids is None:
190 | cfg.gpu_ids = [args.gpu_id]
191 |
192 | if args.autoscale_lr:
193 | # apply the linear scaling rule (https://arxiv.org/abs/1706.02677)
194 | cfg.optimizer['lr'] = cfg.optimizer['lr'] * len(cfg.gpu_ids) / 8
195 |
196 | # init distributed env first, since logger depends on the dist info.
197 | if args.launcher == 'none':
198 | distributed = False
199 | else:
200 | distributed = True
201 | init_dist(args.launcher, **cfg.dist_params)
202 | # re-set gpu_ids with distributed training mode
203 | _, world_size = get_dist_info()
204 | cfg.gpu_ids = range(world_size)
205 |
206 | # create work_dir
207 | mmcv.mkdir_or_exist(osp.abspath(cfg.work_dir))
208 | # dump config
209 | cfg.dump(osp.join(cfg.work_dir, osp.basename(args.config)))
210 | # init the logger before other steps
211 | timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime())
212 | log_file = osp.join(cfg.work_dir, f'{timestamp}.log')
213 | # specify logger name, if we still use 'mmdet', the output info will be
214 | # filtered and won't be saved in the log_file
215 | # TODO: ugly workaround to judge whether we are training det or seg model
216 | if cfg.model.type in ['EncoderDecoder3D']:
217 | logger_name = 'mmseg'
218 | else:
219 | logger_name = 'mmdet'
220 | logger = get_root_logger(
221 | log_file=log_file, log_level=cfg.log_level, name=logger_name)
222 |
223 | # init the meta dict to record some important information such as
224 | # environment info and seed, which will be logged
225 | meta = dict()
226 | # log env info
227 | env_info_dict = collect_env()
228 | env_info = '\n'.join([(f'{k}: {v}') for k, v in env_info_dict.items()])
229 | dash_line = '-' * 60 + '\n'
230 | logger.info('Environment info:\n' + dash_line + env_info + '\n' +
231 | dash_line)
232 | meta['env_info'] = env_info
233 | meta['config'] = cfg.pretty_text
234 |
235 | # log some basic info
236 | logger.info(f'Distributed training: {distributed}')
237 | logger.info(f'Config:\n{cfg.pretty_text}')
238 |
239 | # set random seeds
240 | seed = init_random_seed(args.seed)
241 | seed = seed + dist.get_rank() if args.diff_seed else seed
242 | logger.info(f'Set random seed to {seed}, '
243 | f'deterministic: {args.deterministic}')
244 | set_random_seed(seed, deterministic=args.deterministic)
245 | cfg.seed = seed
246 | meta['seed'] = seed
247 | meta['exp_name'] = osp.basename(args.config)
248 |
249 | model = build_model(
250 | cfg.model,
251 | train_cfg=cfg.get('train_cfg'),
252 | test_cfg=cfg.get('test_cfg'))
253 | model.init_weights()
254 |
255 | logger.info(f'Model:\n{model}')
256 | datasets = [build_dataset(cfg.data.train)]
257 | if len(cfg.workflow) == 2:
258 | val_dataset = copy.deepcopy(cfg.data.val)
259 | # in case we use a dataset wrapper
260 | if 'dataset' in cfg.data.train:
261 | val_dataset.pipeline = cfg.data.train.dataset.pipeline
262 | else:
263 | val_dataset.pipeline = cfg.data.train.pipeline
264 | # set test_mode=False here in deep copied config
265 |         # which does not affect AP/AR calculation later
266 | # refer to https://mmdetection3d.readthedocs.io/en/latest/tutorials/customize_runtime.html#customize-workflow # noqa
267 | val_dataset.test_mode = False
268 | datasets.append(build_dataset(val_dataset))
269 | if cfg.checkpoint_config is not None:
270 | # save mmdet version, config file content and class names in
271 | # checkpoints as meta data
272 | cfg.checkpoint_config.meta = dict(
273 | mmdet_version=mmdet_version,
274 | mmseg_version=mmseg_version,
275 | mmdet3d_version=mmdet3d_version,
276 | config=cfg.pretty_text,
277 | CLASSES=datasets[0].CLASSES,
278 | PALETTE=datasets[0].PALETTE # for segmentors
279 | if hasattr(datasets[0], 'PALETTE') else None)
280 | # add an attribute for visualization convenience
281 | model.CLASSES = datasets[0].CLASSES
282 | train_model(
283 | model,
284 | datasets,
285 | cfg,
286 | distributed=distributed,
287 | validate=(not args.no_validate),
288 | timestamp=timestamp,
289 | meta=meta)
290 |
291 |
292 | if __name__ == '__main__':
293 | main()
--------------------------------------------------------------------------------
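The plugin-import block in train.py just turns the configured plugin_dir into a dotted module path and imports it, which triggers the register_module() calls in the plugin files so that FSTRDetector, FSTRHead and the other custom classes become visible to the mmdet3d builders. A minimal sketch of that conversion, assuming a config with plugin=True and plugin_dir='projects/mmdet3d_plugin/':

import importlib
import os

def import_plugin(plugin_dir):
    # 'projects/mmdet3d_plugin/' -> 'projects.mmdet3d_plugin'
    module_path = '.'.join(os.path.dirname(plugin_dir).split('/'))
    return importlib.import_module(module_path)

# plg_lib = import_plugin('projects/mmdet3d_plugin/')  # populates the registries
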
/projects/mmdet3d_plugin/models/utils/cmt_transformer.py:
--------------------------------------------------------------------------------
1 | # ------------------------------------------------------------------------
2 | # Copyright (c) 2022 megvii-model. All Rights Reserved.
3 | # ------------------------------------------------------------------------
4 | # Modified from DETR3D (https://github.com/WangYueFt/detr3d)
5 | # Copyright (c) 2021 Wang, Yue
6 | # ------------------------------------------------------------------------
7 | # Modified from mmdetection3d (https://github.com/open-mmlab/mmdetection3d)
8 | # Copyright (c) OpenMMLab. All rights reserved.
9 | # ------------------------------------------------------------------------
10 |
11 | import math
12 | import copy
13 | import warnings
14 | import torch
15 | import torch.nn as nn
16 | import torch.nn.functional as F
17 | import torch.utils.checkpoint as cp
18 |
19 | from typing import Sequence
20 | from einops import rearrange
21 | from mmcv.cnn.bricks.drop import build_dropout
22 | from mmcv.runner.base_module import BaseModule
23 | from mmcv.cnn.bricks.transformer import (
24 | BaseTransformerLayer,
25 | TransformerLayerSequence,
26 | build_transformer_layer_sequence
27 | )
28 | from mmcv.cnn import (
29 | build_activation_layer,
30 | build_conv_layer,
31 | build_norm_layer,
32 | xavier_init
33 | )
34 | from mmcv.cnn.bricks.registry import (
35 | ATTENTION,
36 | TRANSFORMER_LAYER,
37 | TRANSFORMER_LAYER_SEQUENCE
38 | )
39 | from mmcv.utils import (
40 | ConfigDict,
41 | build_from_cfg,
42 | deprecated_api_warning,
43 | to_2tuple
44 | )
45 | from mmdet.models.utils.builder import TRANSFORMER
46 |
47 |
48 | @TRANSFORMER.register_module()
49 | class CmtTransformer(BaseModule):
50 | """Implements the DETR transformer.
51 | Following the official DETR implementation, this module copy-paste
52 | from torch.nn.Transformer with modifications:
53 | * positional encodings are passed in MultiheadAttention
54 | * extra LN at the end of encoder is removed
55 | * decoder returns a stack of activations from all decoding layers
56 | See `paper: End-to-End Object Detection with Transformers
57 |     <https://arxiv.org/pdf/2005.12872>`_ for details.
58 | Args:
59 | encoder (`mmcv.ConfigDict` | Dict): Config of
60 | TransformerEncoder. Defaults to None.
61 | decoder ((`mmcv.ConfigDict` | Dict)): Config of
62 | TransformerDecoder. Defaults to None
63 | init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization.
64 | Defaults to None.
65 | """
66 |
67 | def __init__(self, encoder=None, decoder=None, init_cfg=None, cross=False):
68 | super(CmtTransformer, self).__init__(init_cfg=init_cfg)
69 | if encoder is not None:
70 | self.encoder = build_transformer_layer_sequence(encoder)
71 | else:
72 | self.encoder = None
73 | self.decoder = build_transformer_layer_sequence(decoder)
74 | self.embed_dims = self.decoder.embed_dims
75 | self.cross = cross
76 |
77 | def init_weights(self):
78 | # follow the official DETR to init parameters
79 | for m in self.modules():
80 | if hasattr(m, 'weight') and m.weight.dim() > 1:
81 | xavier_init(m, distribution='uniform')
82 | self._is_init = True
83 |
84 | def forward(self, x, x_img, query_embed, bev_pos_embed, rv_pos_embed, attn_masks=None, reg_branch=None):
85 | """Forward function for `Transformer`.
86 | Args:
87 | x (Tensor): Input query with shape [bs, c, h, w] where
88 | c = embed_dims.
89 | mask (Tensor): The key_padding_mask used for encoder and decoder,
90 | with shape [bs, h, w].
91 | query_embed (Tensor): The query embedding for decoder, with shape
92 | [num_query, c].
93 | pos_embed (Tensor): The positional encoding for encoder and
94 | decoder, with the same shape as `x`.
95 | Returns:
96 | tuple[Tensor]: results of decoder containing the following tensor.
97 | - out_dec: Output from decoder. If return_intermediate_dec \
98 | is True output has shape [num_dec_layers, bs,
99 | num_query, embed_dims], else has shape [1, bs, \
100 | num_query, embed_dims].
101 | - memory: Output results from encoder, with shape \
102 | [bs, embed_dims, h, w].
103 | """
104 | bs, c, h, w = x.shape
105 |         bev_memory = rearrange(x, "bs c h w -> (h w) bs c") # [bs, c, h, w] -> [h*w, bs, c]
106 |         rv_memory = rearrange(x_img, "(bs v) c h w -> (v h w) bs c", bs=bs)
107 |         bev_pos_embed = bev_pos_embed.unsqueeze(1).repeat(1, bs, 1) # [h*w, c] -> [h*w, bs, c]
108 | rv_pos_embed = rearrange(rv_pos_embed, "(bs v) h w c -> (v h w) bs c", bs=bs)
109 |
110 | memory, pos_embed = torch.cat([bev_memory, rv_memory], dim=0), torch.cat([bev_pos_embed, rv_pos_embed], dim=0)
111 | query_embed = query_embed.transpose(0, 1) # [num_query, dim] -> [num_query, bs, dim]
112 |         mask = memory.new_zeros(bs, memory.shape[0]) # all-zero key padding mask: [bs, num_tokens]
113 |
114 | target = torch.zeros_like(query_embed)
115 | # out_dec: [num_layers, num_query, bs, dim]
116 | out_dec = self.decoder(
117 | query=target,
118 | key=memory,
119 | value=memory,
120 | key_pos=pos_embed,
121 | query_pos=query_embed,
122 | key_padding_mask=mask,
123 | attn_masks=[attn_masks, None],
124 | reg_branch=reg_branch,
125 | )
126 | out_dec = out_dec.transpose(1, 2)
127 | return out_dec, memory
128 |
129 |
130 | @TRANSFORMER.register_module()
131 | class CmtLidarTransformer(BaseModule):
132 | """Implements the DETR transformer.
133 | Following the official DETR implementation, this module copy-paste
134 | from torch.nn.Transformer with modifications:
135 | * positional encodings are passed in MultiheadAttention
136 | * extra LN at the end of encoder is removed
137 | * decoder returns a stack of activations from all decoding layers
138 | See `paper: End-to-End Object Detection with Transformers
139 |     <https://arxiv.org/pdf/2005.12872>`_ for details.
140 | Args:
141 | encoder (`mmcv.ConfigDict` | Dict): Config of
142 | TransformerEncoder. Defaults to None.
143 | decoder ((`mmcv.ConfigDict` | Dict)): Config of
144 | TransformerDecoder. Defaults to None
145 | init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization.
146 | Defaults to None.
147 | """
148 |
149 | def __init__(self, encoder=None, decoder=None, init_cfg=None, cross=False):
150 | super(CmtLidarTransformer, self).__init__(init_cfg=init_cfg)
151 | if encoder is not None:
152 | self.encoder = build_transformer_layer_sequence(encoder)
153 | else:
154 | self.encoder = None
155 | self.decoder = build_transformer_layer_sequence(decoder)
156 | self.embed_dims = self.decoder.embed_dims
157 | self.cross = cross
158 |
159 | def init_weights(self):
160 | # follow the official DETR to init parameters
161 | for m in self.modules():
162 | if hasattr(m, 'weight') and m.weight.dim() > 1:
163 | xavier_init(m, distribution='uniform')
164 | self._is_init = True
165 |
166 | def forward(self, x, mask, query_embed, pos_embed, attn_masks=None, reg_branch=None):
167 | """Forward function for `Transformer`.
168 | Args:
169 | x (Tensor): Input query with shape [bs, c, h, w] where
170 | c = embed_dims.
171 | mask (Tensor): The key_padding_mask used for encoder and decoder,
172 | with shape [bs, h, w].
173 | query_embed (Tensor): The query embedding for decoder, with shape
174 | [num_query, c].
175 | pos_embed (Tensor): The positional encoding for encoder and
176 | decoder, with the same shape as `x`.
177 | Returns:
178 | tuple[Tensor]: results of decoder containing the following tensor.
179 | - out_dec: Output from decoder. If return_intermediate_dec \
180 | is True output has shape [num_dec_layers, bs,
181 | num_query, embed_dims], else has shape [1, bs, \
182 | num_query, embed_dims].
183 | - memory: Output results from encoder, with shape \
184 | [bs, embed_dims, h, w].
185 | """
186 | bs, c, h, w = x.shape
187 |         memory = rearrange(x, "bs c h w -> (h w) bs c") # [bs, c, h, w] -> [h*w, bs, c]
188 |         pos_embed = pos_embed.unsqueeze(1).repeat(1, bs, 1) # [h*w, c] -> [h*w, bs, c]
189 |         query_embed = query_embed.transpose(0, 1) # [num_query, dim] -> [num_query, bs, dim]
190 |         mask = mask.view(bs, -1) # [bs, h, w] -> [bs, h*w]
191 | target = torch.zeros_like(query_embed)
192 | # out_dec: [num_layers, num_query, bs, dim]
193 | out_dec = self.decoder(
194 | query=target,
195 | key=memory,
196 | value=memory,
197 | key_pos=pos_embed,
198 | query_pos=query_embed,
199 | key_padding_mask=mask,
200 | attn_masks=[attn_masks, None],
201 | reg_branch=reg_branch,
202 | )
203 | out_dec = out_dec.transpose(1, 2)
204 | return out_dec, memory
205 |
206 |
207 |
208 | @TRANSFORMER.register_module()
209 | class FSTRTransformer(CmtLidarTransformer):
210 | """Implements the DETR transformer.
211 | Following the official DETR implementation, this module copy-paste
212 | from torch.nn.Transformer with modifications:
213 | * positional encodings are passed in MultiheadAttention
214 | * extra LN at the end of encoder is removed
215 | * decoder returns a stack of activations from all decoding layers
216 | See `paper: End-to-End Object Detection with Transformers
217 |     <https://arxiv.org/pdf/2005.12872>`_ for details.
218 | Args:
219 | encoder (`mmcv.ConfigDict` | Dict): Config of
220 | TransformerEncoder. Defaults to None.
221 | decoder ((`mmcv.ConfigDict` | Dict)): Config of
222 | TransformerDecoder. Defaults to None
223 | init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization.
224 | Defaults to None.
225 | """
226 |
227 | def __init__(self, **kwargs):
228 | super(FSTRTransformer, self).__init__(**kwargs)
229 |
230 | def forward(self, x, query_embed, bev_pos_embed, attn_masks=None, bev_key_padding_mask=None, reg_branch=None, target = None):
231 |         """Forward function for `FSTRTransformer`.
232 |         Args:
233 |             x (Tensor): Flattened BEV token features with shape
234 |                 [bs, n, c], where c = embed_dims.
235 |             query_embed (Tensor): The query embedding for the decoder,
236 |                 with shape [bs, num_query, c].
237 |             bev_pos_embed (Tensor): Positional encoding of the BEV
238 |                 tokens, with shape [bs, n, c].
239 |             attn_masks (Tensor, optional): Attention mask for the decoder
240 |                 self-attention. Defaults to None.
241 |             bev_key_padding_mask (Tensor, optional): Key padding mask with
242 |                 shape [bs, n]; an all-zero mask is used if None.
243 |             reg_branch (nn.Module, optional): Regression branch. Defaults to None.
244 |             target (Tensor): Initial decoder queries with shape
245 |                 [num_query, bs, c]; must not be None.
246 |         Returns:
247 |             tuple[Tensor]: out_dec of shape [num_dec_layers, bs, num_query, c]
248 |                 and the token memory of shape [n, bs, c].
249 |         """
250 | bs, n, c = x.shape
251 |         bev_memory = rearrange(x, "bs n c -> n bs c") # [bs, n, c] -> [n, bs, c]
252 |         bev_pos_embed = rearrange(bev_pos_embed, "bs n c -> n bs c") # [bs, n, c] -> [n, bs, c]
253 |
254 | memory, pos_embed = bev_memory, bev_pos_embed
255 | query_embed = query_embed.transpose(0, 1) # [bs, num_query, dim] -> [num_query, bs, dim]
256 |
257 | if bev_key_padding_mask is None:
258 |             mask = memory.new_zeros(bs, memory.shape[0]) # all-zero key padding mask: [bs, n]
259 | else:
260 | mask = bev_key_padding_mask
261 |
262 | assert target is not None
263 | out_dec = self.decoder(
264 | query=target,
265 | key=memory,
266 | value=memory,
267 | key_pos=pos_embed,
268 | query_pos=query_embed,
269 | key_padding_mask=mask,
270 | attn_masks=[attn_masks, None],
271 | reg_branch=reg_branch,
272 | )
273 | out_dec = out_dec.transpose(1, 2)
274 | return out_dec, memory
--------------------------------------------------------------------------------
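FSTRTransformer.forward above is mostly shape bookkeeping: batch-first tensors are flipped to sequence-first before they reach the decoder. A standalone sketch of those rearranges with toy sizes (the sizes are illustrative, not taken from the configs):

import torch
from einops import rearrange

bs, n_tokens, n_query, c = 2, 5000, 900, 128
x = torch.randn(bs, n_tokens, c)              # sparse BEV token features
bev_pos_embed = torch.randn(bs, n_tokens, c)  # their positional embeddings
query_embed = torch.randn(bs, n_query, c)

memory = rearrange(x, "bs n c -> n bs c")                   # (n_tokens, bs, c)
pos_embed = rearrange(bev_pos_embed, "bs n c -> n bs c")    # (n_tokens, bs, c)
query_embed = query_embed.transpose(0, 1)                   # (n_query, bs, c)
mask = memory.new_zeros(bs, memory.shape[0])                # (bs, n_tokens), no padding

print(memory.shape, pos_embed.shape, query_embed.shape, mask.shape)
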
/projects/mmdet3d_plugin/models/utils/petr_transformer.py:
--------------------------------------------------------------------------------
1 | import math
2 | import copy
3 | import warnings
4 | import torch
5 | import torch.nn as nn
6 | import torch.nn.functional as F
7 | import torch.utils.checkpoint as cp
8 |
9 | from einops import rearrange
10 | from mmcv.cnn.bricks.drop import build_dropout
11 | from mmcv.runner.base_module import BaseModule
12 |
13 | from mmcv.cnn.bricks.transformer import (
14 | BaseTransformerLayer,
15 | TransformerLayerSequence,
16 | build_transformer_layer_sequence
17 | )
18 | from mmcv.cnn import (
19 | build_activation_layer,
20 | build_conv_layer,
21 | build_norm_layer,
22 | xavier_init
23 | )
24 | from mmcv.cnn.bricks.registry import (
25 | ATTENTION,TRANSFORMER_LAYER,
26 | TRANSFORMER_LAYER_SEQUENCE
27 | )
28 | from mmcv.utils import (
29 | ConfigDict,
30 | build_from_cfg,
31 | deprecated_api_warning,
32 | to_2tuple
33 | )
34 | from mmdet.models.utils.builder import TRANSFORMER
35 |
36 |
37 | @ATTENTION.register_module()
38 | class PETRMultiheadAttention(BaseModule):
39 | """A wrapper for ``torch.nn.MultiheadAttention``.
40 | This module implements MultiheadAttention with identity connection,
41 | and positional encoding is also passed as input.
42 | Args:
43 | embed_dims (int): The embedding dimension.
44 | num_heads (int): Parallel attention heads.
45 | attn_drop (float): A Dropout layer on attn_output_weights.
46 | Default: 0.0.
47 | proj_drop (float): A Dropout layer after `nn.MultiheadAttention`.
48 | Default: 0.0.
49 | dropout_layer (obj:`ConfigDict`): The dropout_layer used
50 | when adding the shortcut.
51 | init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization.
52 | Default: None.
53 | batch_first (bool): When it is True, Key, Query and Value are shape of
54 | (batch, n, embed_dim), otherwise (n, batch, embed_dim).
55 | Default to False.
56 | """
57 |
58 | def __init__(self,
59 | embed_dims,
60 | num_heads,
61 | attn_drop=0.,
62 | proj_drop=0.,
63 | dropout_layer=dict(type='Dropout', drop_prob=0.),
64 | init_cfg=None,
65 | batch_first=False,
66 | **kwargs):
67 | super(PETRMultiheadAttention, self).__init__(init_cfg)
68 | if 'dropout' in kwargs:
69 | warnings.warn(
70 |                 'The argument `dropout` in MultiheadAttention '
71 | 'has been deprecated, now you can separately '
72 | 'set `attn_drop`(float), proj_drop(float), '
73 | 'and `dropout_layer`(dict) ', DeprecationWarning)
74 | attn_drop = kwargs['dropout']
75 | dropout_layer['drop_prob'] = kwargs.pop('dropout')
76 |
77 | self.embed_dims = embed_dims
78 | self.num_heads = num_heads
79 | self.batch_first = batch_first
80 |
81 | self.attn = nn.MultiheadAttention(embed_dims, num_heads, attn_drop,
82 | **kwargs)
83 |
84 | self.proj_drop = nn.Dropout(proj_drop)
85 | self.dropout_layer = build_dropout(
86 | dropout_layer) if dropout_layer else nn.Identity()
87 |
88 | @deprecated_api_warning({'residual': 'identity'},
89 | cls_name='MultiheadAttention')
90 | def forward(self,
91 | query,
92 | key=None,
93 | value=None,
94 | identity=None,
95 | query_pos=None,
96 | key_pos=None,
97 | attn_mask=None,
98 | key_padding_mask=None,
99 | **kwargs):
100 | """Forward function for `MultiheadAttention`.
101 | **kwargs allow passing a more general data flow when combining
102 | with other operations in `transformerlayer`.
103 | Args:
104 | query (Tensor): The input query with shape [num_queries, bs,
105 | embed_dims] if self.batch_first is False, else
106 |                 [bs, num_queries, embed_dims].
107 | key (Tensor): The key tensor with shape [num_keys, bs,
108 | embed_dims] if self.batch_first is False, else
109 | [bs, num_keys, embed_dims] .
110 | If None, the ``query`` will be used. Defaults to None.
111 | value (Tensor): The value tensor with same shape as `key`.
112 | Same in `nn.MultiheadAttention.forward`. Defaults to None.
113 | If None, the `key` will be used.
114 | identity (Tensor): This tensor, with the same shape as x,
115 | will be used for the identity link.
116 | If None, `x` will be used. Defaults to None.
117 | query_pos (Tensor): The positional encoding for query, with
118 | the same shape as `x`. If not None, it will
119 | be added to `x` before forward function. Defaults to None.
120 | key_pos (Tensor): The positional encoding for `key`, with the
121 | same shape as `key`. Defaults to None. If not None, it will
122 | be added to `key` before forward function. If None, and
123 | `query_pos` has the same shape as `key`, then `query_pos`
124 | will be used for `key_pos`. Defaults to None.
125 | attn_mask (Tensor): ByteTensor mask with shape [num_queries,
126 | num_keys]. Same in `nn.MultiheadAttention.forward`.
127 | Defaults to None.
128 | key_padding_mask (Tensor): ByteTensor with shape [bs, num_keys].
129 | Defaults to None.
130 | Returns:
131 | Tensor: forwarded results with shape
132 | [num_queries, bs, embed_dims]
133 | if self.batch_first is False, else
134 |                 [bs, num_queries, embed_dims].
135 | """
136 |
137 | if key is None:
138 | key = query
139 | if value is None:
140 | value = key
141 | if identity is None:
142 | identity = query
143 | if key_pos is None:
144 | if query_pos is not None:
145 | # use query_pos if key_pos is not available
146 | if query_pos.shape == key.shape:
147 | key_pos = query_pos
148 | else:
149 |                     warnings.warn(f'position encoding of key is '
150 | f'missing in {self.__class__.__name__}.')
151 | if query_pos is not None:
152 | query = query + query_pos
153 | if key_pos is not None:
154 | key = key + key_pos
155 |
156 | # Because the dataflow('key', 'query', 'value') of
157 | # ``torch.nn.MultiheadAttention`` is (num_query, batch,
158 | # embed_dims), We should adjust the shape of dataflow from
159 | # batch_first (batch, num_query, embed_dims) to num_query_first
160 | # (num_query ,batch, embed_dims), and recover ``attn_output``
161 | # from num_query_first to batch_first.
162 | if self.batch_first:
163 | query = query.transpose(0, 1)
164 | key = key.transpose(0, 1)
165 | value = value.transpose(0, 1)
166 |
167 | out = self.attn(
168 | query=query,
169 | key=key,
170 | value=value,
171 | attn_mask=attn_mask,
172 | key_padding_mask=key_padding_mask)[0]
173 |
174 | if self.batch_first:
175 | out = out.transpose(0, 1)
176 |
177 | return identity + self.dropout_layer(self.proj_drop(out))
178 |
179 |
180 | from .attention import FlashMHA
181 |
182 | @ATTENTION.register_module()
183 | class PETRMultiheadFlashAttention(BaseModule):
184 | """A wrapper for ``torch.nn.MultiheadAttention``.
185 | This module implements MultiheadAttention with identity connection,
186 | and positional encoding is also passed as input.
187 | Args:
188 | embed_dims (int): The embedding dimension.
189 | num_heads (int): Parallel attention heads.
190 | attn_drop (float): A Dropout layer on attn_output_weights.
191 | Default: 0.0.
192 | proj_drop (float): A Dropout layer after `nn.MultiheadAttention`.
193 | Default: 0.0.
194 | dropout_layer (obj:`ConfigDict`): The dropout_layer used
195 | when adding the shortcut.
196 | init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization.
197 | Default: None.
198 | batch_first (bool): When it is True, Key, Query and Value are shape of
199 | (batch, n, embed_dim), otherwise (n, batch, embed_dim).
200 | Default to False.
201 | """
202 |
203 | def __init__(self,
204 | embed_dims,
205 | num_heads,
206 | attn_drop=0.,
207 | proj_drop=0.,
208 | dropout_layer=dict(type='Dropout', drop_prob=0.),
209 | init_cfg=None,
210 | batch_first=True,
211 | **kwargs):
212 | super(PETRMultiheadFlashAttention, self).__init__(init_cfg)
213 | if 'dropout' in kwargs:
214 | warnings.warn(
215 |                 'The argument `dropout` in MultiheadAttention '
216 | 'has been deprecated, now you can separately '
217 | 'set `attn_drop`(float), proj_drop(float), '
218 | 'and `dropout_layer`(dict) ', DeprecationWarning)
219 | attn_drop = kwargs['dropout']
220 | dropout_layer['drop_prob'] = kwargs.pop('dropout')
221 |
222 | self.embed_dims = embed_dims
223 | self.num_heads = num_heads
224 | self.batch_first = True
225 |
226 | self.attn = FlashMHA(embed_dims, num_heads, attn_drop, dtype=torch.float16, device='cuda',
227 | **kwargs)
228 |
229 | self.proj_drop = nn.Dropout(proj_drop)
230 | self.dropout_layer = build_dropout(
231 | dropout_layer) if dropout_layer else nn.Identity()
232 |
233 | @deprecated_api_warning({'residual': 'identity'},
234 | cls_name='MultiheadAttention')
235 | def forward(self,
236 | query,
237 | key=None,
238 | value=None,
239 | identity=None,
240 | query_pos=None,
241 | key_pos=None,
242 | attn_mask=None,
243 | key_padding_mask=None,
244 | **kwargs):
245 | """Forward function for `MultiheadAttention`.
246 | **kwargs allow passing a more general data flow when combining
247 | with other operations in `transformerlayer`.
248 | Args:
249 | query (Tensor): The input query with shape [num_queries, bs,
250 | embed_dims] if self.batch_first is False, else
251 |                 [bs, num_queries, embed_dims].
252 | key (Tensor): The key tensor with shape [num_keys, bs,
253 | embed_dims] if self.batch_first is False, else
254 | [bs, num_keys, embed_dims] .
255 | If None, the ``query`` will be used. Defaults to None.
256 | value (Tensor): The value tensor with same shape as `key`.
257 | Same in `nn.MultiheadAttention.forward`. Defaults to None.
258 | If None, the `key` will be used.
259 | identity (Tensor): This tensor, with the same shape as x,
260 | will be used for the identity link.
261 | If None, `x` will be used. Defaults to None.
262 | query_pos (Tensor): The positional encoding for query, with
263 | the same shape as `x`. If not None, it will
264 | be added to `x` before forward function. Defaults to None.
265 | key_pos (Tensor): The positional encoding for `key`, with the
266 | same shape as `key`. Defaults to None. If not None, it will
267 | be added to `key` before forward function. If None, and
268 | `query_pos` has the same shape as `key`, then `query_pos`
269 | will be used for `key_pos`. Defaults to None.
270 | attn_mask (Tensor): ByteTensor mask with shape [num_queries,
271 | num_keys]. Same in `nn.MultiheadAttention.forward`.
272 | Defaults to None.
273 | key_padding_mask (Tensor): ByteTensor with shape [bs, num_keys].
274 | Defaults to None.
275 | Returns:
276 | Tensor: forwarded results with shape
277 | [num_queries, bs, embed_dims]
278 | if self.batch_first is False, else
279 |                 [bs, num_queries, embed_dims].
280 | """
281 |
282 | if key is None:
283 | key = query
284 | if value is None:
285 | value = key
286 | if identity is None:
287 | identity = query
288 | if key_pos is None:
289 | if query_pos is not None:
290 | # use query_pos if key_pos is not available
291 | if query_pos.shape == key.shape:
292 | key_pos = query_pos
293 | else:
294 |                     warnings.warn(f'position encoding of key is '
295 | f'missing in {self.__class__.__name__}.')
296 | if query_pos is not None:
297 | query = query + query_pos
298 | if key_pos is not None:
299 | key = key + key_pos
300 |
301 | # Because the dataflow('key', 'query', 'value') of
302 | # ``torch.nn.MultiheadAttention`` is (num_query, batch,
303 | # embed_dims), We should adjust the shape of dataflow from
304 | # batch_first (batch, num_query, embed_dims) to num_query_first
305 | # (num_query ,batch, embed_dims), and recover ``attn_output``
306 | # from num_query_first to batch_first.
307 | if self.batch_first:
308 | query = query.transpose(0, 1)
309 | key = key.transpose(0, 1)
310 | value = value.transpose(0, 1)
311 |
312 | out = self.attn(
313 | q=query,
314 | k=key,
315 | v=value,
316 | key_padding_mask=None)[0]
317 |
318 | if self.batch_first:
319 | out = out.transpose(0, 1)
320 |
321 | return identity + self.dropout_layer(self.proj_drop(out))
322 |
323 |
324 | @TRANSFORMER_LAYER_SEQUENCE.register_module()
325 | class PETRTransformerDecoder(TransformerLayerSequence):
326 | """Implements the decoder in DETR transformer.
327 | Args:
328 | return_intermediate (bool): Whether to return intermediate outputs.
329 | post_norm_cfg (dict): Config of last normalization layer. Default:
330 | `LN`.
331 | """
332 |
333 | def __init__(self,
334 | *args,
335 | post_norm_cfg=dict(type='LN'),
336 | return_intermediate=False,
337 | **kwargs):
338 |
339 | super(PETRTransformerDecoder, self).__init__(*args, **kwargs)
340 | self.return_intermediate = return_intermediate
341 | if post_norm_cfg is not None:
342 | self.post_norm = build_norm_layer(post_norm_cfg,
343 | self.embed_dims)[1]
344 | else:
345 | self.post_norm = None
346 |
347 | def forward(self, query, *args, **kwargs):
348 | """Forward function for `TransformerDecoder`.
349 | Args:
350 | query (Tensor): Input query with shape
351 | `(num_query, bs, embed_dims)`.
352 | Returns:
353 | Tensor: Results with shape [1, num_query, bs, embed_dims] when
354 | return_intermediate is `False`, otherwise it has shape
355 | [num_layers, num_query, bs, embed_dims].
356 | """
357 | if not self.return_intermediate:
358 | x = super().forward(query, *args, **kwargs)
359 | if self.post_norm:
360 | x = self.post_norm(x)[None]
361 | return x
362 |
363 | intermediate = []
364 | for layer in self.layers:
365 | query = layer(query, *args, **kwargs)
366 | if self.return_intermediate:
367 | if self.post_norm is not None:
368 | intermediate.append(self.post_norm(query))
369 | else:
370 | intermediate.append(query)
371 | return torch.stack(intermediate)
372 |
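# With return_intermediate=True the decoder stacks the (post-normed) queries after
# every layer, which is what lets the head supervise all decoder layers jointly.
# Illustrative shapes, assuming 6 layers, 900 queries, batch 2 and 128 dims:
#
#   out = decoder(query, key=memory, value=memory, ...)  # [6, 900, 2, 128]
#   out = out.transpose(1, 2)                            # [6, 2, 900, 128], as done in FSTRTransformer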
373 |
374 | @TRANSFORMER_LAYER.register_module()
375 | class PETRTransformerDecoderLayer(BaseTransformerLayer):
376 | """Implements decoder layer in DETR transformer.
377 | Args:
378 | attn_cfgs (list[`mmcv.ConfigDict`] | list[dict] | dict )):
379 | Configs for self_attention or cross_attention, the order
380 | should be consistent with it in `operation_order`. If it is
381 | a dict, it would be expand to the number of attention in
382 | `operation_order`.
383 | feedforward_channels (int): The hidden dimension for FFNs.
384 | ffn_dropout (float): Probability of an element to be zeroed
385 | in ffn. Default 0.0.
386 | operation_order (tuple[str]): The execution order of operation
387 | in transformer. Such as ('self_attn', 'norm', 'ffn', 'norm').
388 | Default:None
389 | act_cfg (dict): The activation config for FFNs. Default: `LN`
390 | norm_cfg (dict): Config dict for normalization layer.
391 | Default: `LN`.
392 | ffn_num_fcs (int): The number of fully-connected layers in FFNs.
393 | Default:2.
394 | """
395 |
396 | def __init__(self,
397 | attn_cfgs,
398 | feedforward_channels,
399 | ffn_dropout=0.0,
400 | operation_order=None,
401 | act_cfg=dict(type='ReLU', inplace=True),
402 | norm_cfg=dict(type='LN'),
403 | ffn_num_fcs=2,
404 | with_cp=True,
405 | **kwargs):
406 | super(PETRTransformerDecoderLayer, self).__init__(
407 | attn_cfgs=attn_cfgs,
408 | feedforward_channels=feedforward_channels,
409 | ffn_dropout=ffn_dropout,
410 | operation_order=operation_order,
411 | act_cfg=act_cfg,
412 | norm_cfg=norm_cfg,
413 | ffn_num_fcs=ffn_num_fcs,
414 | **kwargs)
415 | assert len(operation_order) == 6
416 | assert set(operation_order) == set(
417 | ['self_attn', 'norm', 'cross_attn', 'ffn'])
418 | self.use_checkpoint = with_cp
419 |
420 | def _forward(self,
421 | query,
422 | key=None,
423 | value=None,
424 | query_pos=None,
425 | key_pos=None,
426 | attn_masks=None,
427 | query_key_padding_mask=None,
428 | key_padding_mask=None,
429 | ):
430 | """Forward function for `TransformerCoder`.
431 | Returns:
432 | Tensor: forwarded results with shape [num_query, bs, embed_dims].
433 | """
434 | x = super(PETRTransformerDecoderLayer, self).forward(
435 | query,
436 | key=key,
437 | value=value,
438 | query_pos=query_pos,
439 | key_pos=key_pos,
440 | attn_masks=attn_masks,
441 | query_key_padding_mask=query_key_padding_mask,
442 | key_padding_mask=key_padding_mask,
443 | )
444 |
445 | return x
446 |
447 | def forward(self,
448 | query,
449 | key=None,
450 | value=None,
451 | query_pos=None,
452 | key_pos=None,
453 | attn_masks=None,
454 | query_key_padding_mask=None,
455 | key_padding_mask=None,
456 | **kwargs
457 | ):
458 | """Forward function for `TransformerCoder`.
459 | Returns:
460 | Tensor: forwarded results with shape [num_query, bs, embed_dims].
461 | """
462 |
463 | if self.use_checkpoint and self.training:
464 | x = cp.checkpoint(
465 | self._forward,
466 | query,
467 | key,
468 | value,
469 | query_pos,
470 | key_pos,
471 | attn_masks,
472 | query_key_padding_mask,
473 | key_padding_mask,
474 | )
475 | else:
476 | x = self._forward(
477 | query,
478 | key=key,
479 | value=value,
480 | query_pos=query_pos,
481 | key_pos=key_pos,
482 | attn_masks=attn_masks,
483 | query_key_padding_mask=query_key_padding_mask,
484 | key_padding_mask=key_padding_mask
485 | )
486 |
487 | return x
488 |
--------------------------------------------------------------------------------
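PETRTransformerDecoderLayer wraps its forward in torch.utils.checkpoint while training (with_cp=True by default), so intermediate activations are recomputed during the backward pass instead of being stored. A minimal sketch of the same pattern on a generic residual block (the block itself is illustrative, not from the repo):

import torch
import torch.nn as nn
import torch.utils.checkpoint as cp

class CheckpointedBlock(nn.Module):
    def __init__(self, dim=128, with_cp=True):
        super().__init__()
        self.with_cp = with_cp
        self.mlp = nn.Sequential(nn.Linear(dim, dim), nn.ReLU(inplace=True), nn.Linear(dim, dim))

    def _forward(self, x):
        return x + self.mlp(x)

    def forward(self, x):
        if self.with_cp and self.training:
            # activations inside _forward are recomputed in backward to save memory
            return cp.checkpoint(self._forward, x)
        return self._forward(x)

block = CheckpointedBlock().train()
y = block(torch.randn(2, 900, 128, requires_grad=True))
y.sum().backward()
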
/projects/mmdet3d_plugin/models/dense_heads/fstr_head.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | import math
3 | import copy
4 | import torch
5 | import torch.nn as nn
6 | from mmcv.cnn import build_conv_layer
7 | from mmcv.runner import BaseModule, force_fp32
8 | from mmdet.core import (build_assigner, build_sampler, multi_apply,
9 | reduce_mean, build_bbox_coder)
10 | from mmdet.models.utils import build_transformer
11 | from mmdet.models import HEADS, build_loss
12 | from mmdet.models.utils.transformer import inverse_sigmoid
13 | from mmdet3d.models.utils.clip_sigmoid import clip_sigmoid
14 | from mmdet3d.models import builder
15 | from einops import rearrange
16 | import collections
17 |
18 | from functools import reduce
19 | from projects.mmdet3d_plugin.core.bbox.util import normalize_bbox
20 | from mmdet3d.ops import make_sparse_convmodule
21 | import spconv.pytorch as spconv
22 | from mmcv.cnn import build_conv_layer
23 | import copy
24 | from spconv.core import ConvAlgo
25 |
26 | def pos2embed(pos, num_pos_feats=128, temperature=10000):
27 | scale = 2 * math.pi
28 | pos = pos * scale
29 | dim_t = torch.arange(num_pos_feats, dtype=torch.float32, device=pos.device)
30 | dim_t = temperature ** (2 * (dim_t // 2) / num_pos_feats)
31 | pos_x = pos[..., 0, None] / dim_t
32 | pos_y = pos[..., 1, None] / dim_t
33 | # pos_z = pos[..., 2, None] / dim_t
34 | pos_x = torch.stack((pos_x[..., 0::2].sin(), pos_x[..., 1::2].cos()), dim=-1).flatten(-2)
35 | pos_y = torch.stack((pos_y[..., 0::2].sin(), pos_y[..., 1::2].cos()), dim=-1).flatten(-2)
36 | # pos_z = torch.stack((pos_z[..., 0::2].sin(), pos_z[..., 1::2].cos()), dim=-1).flatten(-2)
37 | posemb = torch.cat((pos_y, pos_x), dim=-1)
38 | return posemb
39 |
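# pos2embed maps normalized (x, y) positions to a sinusoidal embedding of size
# 2 * num_pos_feats (y-half concatenated with x-half), the BEV analogue of the
# DETR positional encoding. Quick shape check (illustrative):
#
#   pos = torch.rand(2, 900, 2)              # (bs, num_query, 2), values in [0, 1]
#   emb = pos2embed(pos, num_pos_feats=128)  # -> (2, 900, 256)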
40 |
41 | class LayerNormFunction(torch.autograd.Function):
42 |
43 | @staticmethod
44 | def forward(ctx, x, weight, bias, groups, eps):
45 | ctx.groups = groups
46 | ctx.eps = eps
47 | N, C, L = x.size()
48 | x = x.view(N, groups, C // groups, L)
49 | mu = x.mean(2, keepdim=True)
50 | var = (x - mu).pow(2).mean(2, keepdim=True)
51 | y = (x - mu) / (var + eps).sqrt()
52 | ctx.save_for_backward(y, var, weight)
53 | y = weight.view(1, C, 1) * y.view(N, C, L) + bias.view(1, C, 1)
54 | return y
55 |
56 | @staticmethod
57 | def backward(ctx, grad_output):
58 | groups = ctx.groups
59 | eps = ctx.eps
60 |
61 | N, C, L = grad_output.size()
62 | y, var, weight = ctx.saved_variables
63 | g = grad_output * weight.view(1, C, 1)
64 | g = g.view(N, groups, C//groups, L)
65 | mean_g = g.mean(dim=2, keepdim=True)
66 | mean_gy = (g * y).mean(dim=2, keepdim=True)
67 | gx = 1. / torch.sqrt(var + eps) * (g - y * mean_gy - mean_g)
68 | return gx.view(N, C, L), (grad_output * y.view(N, C, L)).sum(dim=2).sum(dim=0), grad_output.sum(dim=2).sum(
69 | dim=0), None, None
70 |
71 |
72 | class GroupLayerNorm1d(nn.Module):
73 |
74 | def __init__(self, channels, groups=1, eps=1e-6):
75 | super(GroupLayerNorm1d, self).__init__()
76 | self.register_parameter('weight', nn.Parameter(torch.ones(channels)))
77 | self.register_parameter('bias', nn.Parameter(torch.zeros(channels)))
78 | self.groups = groups
79 | self.eps = eps
80 |
81 | def forward(self, x):
82 | return LayerNormFunction.apply(x, self.weight, self.bias, self.groups, self.eps)
83 |
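# GroupLayerNorm1d normalizes an (N, C, L) tensor within each of `groups` channel
# chunks independently (C // groups channels per group), using the hand-written
# backward of LayerNormFunction above. Sanity check (illustrative):
#
#   x = torch.randn(4, 6 * 64, 900)
#   g = GroupLayerNorm1d(6 * 64, groups=6)(x).view(4, 6, 64, 900)
#   # per group and position: zero mean, unit variance
#   assert g.mean(2).abs().max() < 1e-4
#   assert (g.var(2, unbiased=False) - 1).abs().max() < 1e-3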
84 |
85 | @HEADS.register_module()
86 | class SeparateTaskHead(BaseModule):
87 | """SeparateHead for CenterHead.
88 |
89 | Args:
90 | in_channels (int): Input channels for conv_layer.
91 | heads (dict): Conv information.
92 | head_conv (int): Output channels.
93 | Default: 64.
94 |         final_kernel (int): Kernel size for the last conv layer.
95 |             Default: 1.
96 | init_bias (float): Initial bias. Default: -2.19.
97 | conv_cfg (dict): Config of conv layer.
98 | Default: dict(type='Conv2d')
99 | norm_cfg (dict): Config of norm layer.
100 | Default: dict(type='BN2d').
101 | bias (str): Type of bias. Default: 'auto'.
102 | """
103 |
104 | def __init__(self,
105 | in_channels,
106 | heads,
107 | groups=1,
108 | head_conv=64,
109 | final_kernel=1,
110 | init_bias=-2.19,
111 | init_cfg=None,
112 | **kwargs):
113 | assert init_cfg is None, 'To prevent abnormal initialization ' \
114 | 'behavior, init_cfg is not allowed to be set'
115 | super(SeparateTaskHead, self).__init__(init_cfg=init_cfg)
116 | self.heads = heads
117 | self.groups = groups
118 | self.init_bias = init_bias
119 | for head in self.heads:
120 | classes, num_conv = self.heads[head]
121 |
122 | conv_layers = []
123 | c_in = in_channels
124 | for i in range(num_conv - 1):
125 | conv_layers.extend([
126 | nn.Conv1d(
127 | c_in * groups,
128 | head_conv * groups,
129 | kernel_size=final_kernel,
130 | stride=1,
131 | padding=final_kernel // 2,
132 | groups=groups,
133 | bias=False),
134 | GroupLayerNorm1d(head_conv * groups, groups=groups),
135 | nn.ReLU(inplace=True)
136 | ])
137 | c_in = head_conv
138 |
139 | conv_layers.append(
140 | nn.Conv1d(
141 | head_conv * groups,
142 | classes * groups,
143 | kernel_size=final_kernel,
144 | stride=1,
145 | padding=final_kernel // 2,
146 | groups=groups,
147 | bias=True))
148 | conv_layers = nn.Sequential(*conv_layers)
149 |
150 | self.__setattr__(head, conv_layers)
151 |
152 | if init_cfg is None:
153 | self.init_cfg = dict(type='Kaiming', layer='Conv1d')
154 |
155 | def init_weights(self):
156 | """Initialize weights."""
157 | super().init_weights()
158 | for head in self.heads:
159 | if head == 'cls_logits':
160 | self.__getattr__(head)[-1].bias.data.fill_(self.init_bias)
161 |
162 | def forward(self, x):
163 | """Forward function for SepHead.
164 |
165 | Args:
166 | x (torch.Tensor): Input feature map with the shape of
167 | [N, B, query, C].
168 |
169 | Returns:
170 | dict[str: torch.Tensor]: contains the following keys:
171 |
172 | -reg (torch.Tensor): 2D regression value with the \
173 | shape of [N, B, query, 2].
174 | -height (torch.Tensor): Height value with the \
175 | shape of [N, B, query, 1].
176 | -dim (torch.Tensor): Size value with the shape \
177 | of [N, B, query, 3].
178 | -rot (torch.Tensor): Rotation value with the \
179 | shape of [N, B, query, 2].
180 | -vel (torch.Tensor): Velocity value with the \
181 | shape of [N, B, query, 2].
182 | """
183 | N, B, query_num, c1 = x.shape
184 | x = rearrange(x, "n b q c -> b (n c) q")
185 | ret_dict = dict()
186 |
187 | for head in self.heads:
188 | head_output = self.__getattr__(head)(x)
189 | ret_dict[head] = rearrange(head_output, "b (n c) q -> n b q c", n=N)
190 |
191 | return ret_dict
192 |
193 |
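# Each entry of `heads` is (output_channels, num_conv); because the 1-D convs use
# groups equal to the number of decoder layers, one call serves every layer at
# once. Illustrative shapes, assuming 6 decoder layers, batch 2 and 900 queries:
#
#   head = SeparateTaskHead(in_channels=128, groups=6, head_conv=64,
#                           heads=dict(center=(2, 2), height=(1, 2), cls_logits=(10, 2)))
#   out = head(torch.randn(6, 2, 900, 128))  # x: [num_layers, bs, num_query, C]
#   # out['center']: [6, 2, 900, 2], out['height']: [6, 2, 900, 1], out['cls_logits']: [6, 2, 900, 10]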
194 |
195 | @HEADS.register_module()
196 | class FSTRHead(BaseModule):
197 |     """FSTR head that initializes proposal queries from LiDAR features only."""
198 | def __init__(self,
199 | in_channels,
200 | num_init_query = 200,
201 | num_query=900,
202 | max_sparse_token_per_sample = 10000,
203 | proposal_head_kernel = 3,
204 | hidden_dim=128,
205 | norm_bbox=True,
206 | downsample_scale=8,
207 | scalar=10,
208 | noise_scale=1.0,
209 | noise_trans=0.0,
210 | dn_weight=1.0,
211 | split=0.75,
212 | depth_num=64,
213 | nms_kernel_size=3,
214 | init_dn_query=False,
215 | init_learnable_query = False,
216 | init_query_topk = 1,
217 | init_query_radius = 1,
218 | gauusian_dn_sampling=False,
219 | noise_mean = 0.5,
220 | noise_std = 0.125,
221 | train_cfg=None,
222 | test_cfg=None,
223 | common_heads=dict(
224 | center=(2, 2), height=(1, 2), dim=(3, 2), rot=(2, 2), vel=(2, 2)
225 | ),
226 | tasks=[
227 | dict(num_class=1, class_names=['car']),
228 | dict(num_class=2, class_names=['truck', 'construction_vehicle']),
229 | dict(num_class=2, class_names=['bus', 'trailer']),
230 | dict(num_class=1, class_names=['barrier']),
231 | dict(num_class=2, class_names=['motorcycle', 'bicycle']),
232 | dict(num_class=2, class_names=['pedestrian', 'traffic_cone']),
233 | ],
234 | transformer=None,
235 | bbox_coder=None,
236 | loss_cls=dict(
237 | type="FocalLoss",
238 | use_sigmoid=True,
239 | reduction="mean",
240 | gamma=2, alpha=0.25, loss_weight=1.0
241 | ),
242 | loss_bbox=dict(
243 | type="L1Loss",
244 | reduction="mean",
245 | loss_weight=0.25,
246 | ),
247 | loss_heatmap=dict(
248 |                      type="GaussianFocalLoss",
249 | reduction="mean"
250 | ),
251 | separate_head=dict(
252 | type='SeparateMlpHead', init_bias=-2.19, final_kernel=3),
253 | init_cfg=None,
254 | **kwargs):
255 | super(FSTRHead, self).__init__(**kwargs)
256 |
257 |
258 | self.num_classes = [len(t["class_names"]) for t in tasks]
259 | self.class_names = [t["class_names"] for t in tasks]
260 | self.hidden_dim = hidden_dim
261 | self.train_cfg = train_cfg
262 | self.test_cfg = test_cfg
263 | self.num_query = num_query
264 | self.in_channels = in_channels
265 | self.norm_bbox = norm_bbox
266 | self.downsample_scale = downsample_scale
267 | self.scalar = scalar
268 | self.bbox_noise_scale = noise_scale
269 | self.bbox_noise_trans = noise_trans
270 | self.dn_weight = dn_weight
271 | self.split = split
272 | self.depth_num = depth_num
273 | self.nms_kernel_size = nms_kernel_size
274 | self.num_proposals = num_query
275 | self.loss_cls = build_loss(loss_cls)
276 | self.loss_bbox = build_loss(loss_bbox)
277 | self.loss_heatmap = build_loss(loss_heatmap)
278 | self.bbox_coder = build_bbox_coder(bbox_coder)
279 | self.pc_range = self.bbox_coder.pc_range
280 | self.fp16_enabled = False
281 | self.init_dn_query = init_dn_query
282 | self.init_learnable_query = init_learnable_query
283 | self.gauusian_dn_sampling = gauusian_dn_sampling
284 | self.noise_mean = noise_mean
285 | self.noise_std = noise_std
286 | self.init_query_topk = init_query_topk
287 | self.init_query_radius = init_query_radius
288 |
289 | # transformer
290 | self.transformer = build_transformer(transformer)
291 | # self.reference_points = nn.Embedding(num_query, 3)
292 | self.bev_embedding = nn.Sequential(
293 | nn.Linear(hidden_dim * 2, hidden_dim),
294 | nn.ReLU(inplace=True),
295 | nn.Linear(hidden_dim, hidden_dim)
296 | )
297 |
298 | # task head
299 | self.task_heads = nn.ModuleList()
300 | for num_cls in self.num_classes:
301 | heads = copy.deepcopy(common_heads)
302 | heads.update(dict(cls_logits=(num_cls, 2)))
303 | separate_head.update(
304 | in_channels=hidden_dim,
305 | heads=heads, num_cls=num_cls,
306 | groups=transformer.decoder.num_layers
307 | )
308 | self.task_heads.append(builder.build_head(separate_head))
309 |
310 | # assigner
311 | if train_cfg:
312 | self.assigner = build_assigner(train_cfg["assigner"])
313 | sampler_cfg = dict(type='PseudoSampler')
314 | self.sampler = build_sampler(sampler_cfg, context=self)
315 |
316 |
317 | self.num_init_query = num_init_query
318 |         assert self.num_init_query < self.num_query, "number of init queries must be less than number of queries"
319 | self.reference_points = nn.Embedding(self.num_query - self.num_init_query, 3)
320 | self.class_encoding = nn.Sequential()
321 | self.shared_conv = make_sparse_convmodule(
322 | self.in_channels,
323 | self.hidden_dim,
324 | (3,3),
325 | norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.01),
326 | padding=(1,1),
327 | indice_key='head_spconv_1',
328 | conv_type='SubMConv2d',
329 | order=('conv', 'norm', 'act'))
330 | self.sparse_maxpool_2d = spconv.SparseMaxPool2d(3, 1, 1, subm=True, algo=ConvAlgo.Native, indice_key='max_pool_head_3')
331 | self.sparse_maxpool_2d_small = spconv.SparseMaxPool2d(1, 1, 0, subm=True, algo=ConvAlgo.Native, indice_key='max_pool_head_3')
332 | self.max_sparse_token_per_sample = max_sparse_token_per_sample
333 |
334 | # for sparse heatmap
335 | self.proposal_head_kernel = proposal_head_kernel
336 | output_channels = sum(self.num_classes)
337 | num_conv = 2
338 | self.heatmap_head = nn.Sequential()
339 | fc_list = []
340 | for k in range(num_conv - 1):
341 | fc_list.append(
342 | make_sparse_convmodule(
343 | self.hidden_dim,
344 | self.hidden_dim,
345 | self.proposal_head_kernel,
346 | norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.01),
347 | padding=int(self.proposal_head_kernel//2),
348 | indice_key='head_spconv_1',
349 | conv_type='SubMConv2d',
350 | order=('conv', 'norm', 'act')),
351 | )
352 | fc_list.append(build_conv_layer(
353 | dict(type='SubMConv2d', indice_key='hm_out'),
354 | self.hidden_dim,
355 | sum(self.num_classes),
356 | 1,
357 | stride=1,
358 | padding=0,
359 | bias=True))
360 |
361 |
362 | self.sparse_hm_layer = nn.Sequential(*fc_list)
363 | self.sparse_hm_layer[-1].bias.data.fill_(-2.19)
364 |
365 | @property
366 | def coords_bev(self):
367 | cfg = self.train_cfg if self.train_cfg else self.test_cfg
368 | x_size, y_size = (
369 | cfg['grid_size'][1] // self.downsample_scale,
370 | cfg['grid_size'][0] // self.downsample_scale
371 | )
372 | meshgrid = [[0, y_size - 1, y_size], [0, x_size - 1, x_size]]
373 | batch_y, batch_x = torch.meshgrid(*[torch.linspace(it[0], it[1], it[2]) for it in meshgrid])
374 | batch_x = (batch_x + 0.5) / x_size
375 | batch_y = (batch_y + 0.5) / y_size
376 | coord_base = torch.cat([batch_x[None], batch_y[None]], dim=0)
377 | coord_base = coord_base.view(2, -1).transpose(1, 0) # (H*W, 2)
378 | return coord_base
379 | def init_weights(self):
380 | super(FSTRHead, self).init_weights()
381 | nn.init.uniform_(self.reference_points.weight.data, 0, 1)
382 |
383 | def _bev_query_embed(self, ref_points, img_metas):
384 | bev_embeds = self.bev_embedding(pos2embed(ref_points, num_pos_feats=self.hidden_dim))
385 | return bev_embeds
386 | def forward(self, points_feats, img_metas=None):
387 | """
388 | list([bs, c, h, w])
389 | """
390 | img_metas = [img_metas]
391 | return multi_apply(self.forward_single, points_feats, img_metas)
392 |
393 | def forward_single(self, x, img_metas):
394 | """
395 | x: [bs c h w]
396 | return List(dict(head_name: [num_dec x bs x num_query * head_dim]) ) x task_num
397 | """
398 | ret_dicts = []
399 | batch_size = len(img_metas)
400 | x = self.shared_conv(x)
401 | x_feature = torch.zeros(*(x.features.shape),device = x.features.device)
402 | x_feature[:,:] = x.features
403 | x_batch_indices = torch.zeros(x.indices.shape[0],1,device = x.features.device)
404 | x_ind = torch.zeros(x.indices.shape[0],2,device = x.features.device)
405 | x_2dpos = torch.zeros(x.indices.shape[0],2,device = x.features.device)
406 | x_batch_indices[:,:] = x.indices[:,:1]
407 | x_ind[:,:] = x.indices[:,-2:]
408 | x_ind = x_ind.to(torch.float32)
409 | cfg = self.train_cfg if self.train_cfg else self.test_cfg
410 | y_size, x_size = x.spatial_shape
411 | x_2dpos[:,0] = (x_ind[:,1] + 0.5) / x_size
412 | x_2dpos[:,1] = (x_ind[:,0] + 0.5) / y_size
413 | batch_size = int(x.batch_size)
414 |
415 | sparse_hm = self.sparse_hm_layer(x)
416 | sparse_hm_clone = spconv.SparseConvTensor(
417 | features=sparse_hm.features.clone().detach().sigmoid(),
418 | indices=sparse_hm.indices.clone(),
419 | spatial_shape=sparse_hm.spatial_shape,
420 | batch_size=sparse_hm.batch_size
421 | )
422 | x_hm_max = self.sparse_maxpool_2d(sparse_hm_clone, True)
423 | x_hm_max_small = self.sparse_maxpool_2d_small(sparse_hm_clone, True)
424 |
425 |
426 | selected = (x_hm_max.features == sparse_hm_clone.features)
427 | selected_small = (x_hm_max_small.features == sparse_hm_clone.features)
428 | selected[:,8] = selected_small[:,8]
429 | selected[:,9] = selected_small[:,9]
430 |
431 | score = sparse_hm_clone.features * selected
432 | score, _ = score.topk(1,dim=1)
433 | proposal_list = []
434 | proposal_feature = []
435 | # topk for each sample in batch
436 | for i in range(batch_size):
437 | mask = (x_batch_indices == i).squeeze(-1)
438 | sample_voxel_pos = x_2dpos[mask]
439 | sample_voxel_hm = score[mask]
440 | sample_voxel_feature = x_feature[mask]
441 | _, proposal_ind = sample_voxel_hm.topk(self.num_init_query,dim=0)
442 | proposal_list.append(sample_voxel_pos.gather(0, proposal_ind.repeat(1,2))[None,...])
443 | proposal_feature.append(sample_voxel_feature.gather(0, proposal_ind.repeat(1,sample_voxel_feature.shape[1]))[None,...])
444 | query_pos = torch.cat(proposal_list,dim=0)
445 | query_init_feature = torch.cat(proposal_feature,dim=0)
446 |
447 | reference_points = self.reference_points.weight
448 | reference_points = reference_points.unsqueeze(0).repeat(batch_size,1,1)
449 |
450 | init_reference_points = torch.cat([query_pos,0.5*torch.ones([*query_pos.shape[:-1],1]).to(query_pos.device)],dim=-1)
451 |
452 | reference_points = torch.cat([init_reference_points, reference_points],dim=1)
453 |
454 | reference_points, attn_mask, mask_dict = self.prepare_for_dn(batch_size, reference_points, img_metas)
455 |
456 | pad_size = mask_dict['pad_size'] if mask_dict is not None else 0
457 |
458 | target = self.get_sparse_init_query(reference_points, x_feature , x_2dpos, x_batch_indices, pad_size)
459 |
460 | bev_pos_embeds = self.bev_embedding(pos2embed(x_2dpos, num_pos_feats=self.hidden_dim))
461 |
462 | bev_query_embeds = self.query_embed(reference_points, img_metas)
463 | query_embeds = bev_query_embeds
464 |
465 |
466 | # pad or drop
467 |
468 | batch_feature = torch.zeros(batch_size,self.max_sparse_token_per_sample,self.hidden_dim,device = x.features.device)
469 | batch_bevemb = torch.zeros(batch_size,self.max_sparse_token_per_sample,self.hidden_dim,device = x.features.device)
470 |
471 | for i in range(batch_size):
472 | sample_token_num = (x_batch_indices==i).sum()
473 | batch_token_num = min(sample_token_num,self.max_sparse_token_per_sample)
474 | mask = (x_batch_indices == i).squeeze(-1)
475 | sample_voxel_hm = score[mask]
476 | sample_voxel_feature = x_feature[mask]
477 | sample_voxel_bev_emb = bev_pos_embeds[mask]
478 | _, voxel_ind = sample_voxel_hm.topk(batch_token_num,dim=0)
479 | # a = sample_voxel_feature.gather(0, voxel_ind.repeat(1,sample_voxel_feature.shape[1]))[None,...]
480 | batch_feature[i][:batch_token_num] = sample_voxel_feature.gather(0, voxel_ind.repeat(1,sample_voxel_feature.shape[1]))
481 | batch_bevemb[i][:batch_token_num] = sample_voxel_bev_emb.gather(0, voxel_ind.repeat(1,sample_voxel_bev_emb.shape[1]))
482 |
483 | outs_dec, _ = self.transformer(
484 | batch_feature, query_embeds,
485 | batch_bevemb,
486 | attn_masks=attn_mask,
487 | target = target
488 | )
489 | outs_dec = torch.nan_to_num(outs_dec)
490 |
491 | reference = inverse_sigmoid(reference_points.clone())
492 |
493 | flag = 0
494 | for task_id, task in enumerate(self.task_heads, 0):
495 | outs = task(outs_dec)
496 | center = (outs['center'] + reference[None, :, :, :2]).sigmoid()
497 | height = (outs['height'] + reference[None, :, :, 2:3]).sigmoid()
498 | _center, _height = center.new_zeros(center.shape), height.new_zeros(height.shape)
499 | _center[..., 0:1] = center[..., 0:1] * (self.pc_range[3] - self.pc_range[0]) + self.pc_range[0]
500 | _center[..., 1:2] = center[..., 1:2] * (self.pc_range[4] - self.pc_range[1]) + self.pc_range[1]
501 | _height[..., 0:1] = height[..., 0:1] * (self.pc_range[5] - self.pc_range[2]) + self.pc_range[2]
502 | outs['center'] = _center
503 | outs['height'] = _height
504 |
505 | if mask_dict and mask_dict['pad_size'] > 0:
506 | task_mask_dict = copy.deepcopy(mask_dict)
507 | class_name = self.class_names[task_id]
508 |
509 | known_lbs_bboxes_label = task_mask_dict['known_lbs_bboxes'][0]
510 | known_labels_raw = task_mask_dict['known_labels_raw']
511 | new_lbs_bboxes_label = known_lbs_bboxes_label.new_zeros(known_lbs_bboxes_label.shape)
512 | new_lbs_bboxes_label[:] = len(class_name)
513 | new_labels_raw = known_labels_raw.new_zeros(known_labels_raw.shape)
514 | new_labels_raw[:] = len(class_name)
515 | task_masks = [
516 | torch.where(known_lbs_bboxes_label == class_name.index(i) + flag)
517 | for i in class_name
518 | ]
519 | task_masks_raw = [
520 | torch.where(known_labels_raw == class_name.index(i) + flag)
521 | for i in class_name
522 | ]
523 | for cname, task_mask, task_mask_raw in zip(class_name, task_masks, task_masks_raw):
524 | new_lbs_bboxes_label[task_mask] = class_name.index(cname)
525 | new_labels_raw[task_mask_raw] = class_name.index(cname)
526 | task_mask_dict['known_lbs_bboxes'] = (new_lbs_bboxes_label, task_mask_dict['known_lbs_bboxes'][1])
527 | task_mask_dict['known_labels_raw'] = new_labels_raw
528 | flag += len(class_name)
529 |
530 | for key in list(outs.keys()):
531 | outs['dn_' + key] = outs[key][:, :, :mask_dict['pad_size'], :]
532 | outs[key] = outs[key][:, :, mask_dict['pad_size']:, :]
533 | outs['dn_mask_dict'] = task_mask_dict
534 |
535 | ret_dicts.append(outs)
536 | ret_dicts[0]['sparse_heatmap'] = sparse_hm
537 | return ret_dicts
538 |
539 |
540 | def get_sparse_init_query(self, ref_points, x_feature, x_2dpos , x_batch_indices, pad_size):
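    |         """Initialize query features by Gaussian-weighted aggregation of nearby voxel tokens.
    |
    |         For each reference point, the init_query_topk nearest voxel tokens (in BEV) are
    |         gathered and combined with Gaussian weights whose bandwidth is derived from
    |         init_query_radius. Denoising-query and learnable-query slots are zeroed unless
    |         init_dn_query / init_learnable_query are enabled.
    |         """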
541 |
542 | total_range = self.pc_range[3]-self.pc_range[0]
543 | radius = self.init_query_radius
544 | diameter = (2 * radius + 1)/total_range
545 | sigma = diameter / 6
546 | # masked_gaussian = torch.exp(- distances / (2 * sigma * sigma))
547 | query_feature_list = []
548 | batch_size = ref_points.shape[0]
549 |
550 | for bs in range(batch_size):
551 | sample_q = ref_points[bs][:,:2]
552 | sample_mask = x_batch_indices[:,0] == bs
553 | sample_token = x_feature[sample_mask]
554 | sample_pos = x_2dpos[sample_mask]
555 | with torch.no_grad():
556 | dis_mat = sample_q.unsqueeze(1) - sample_pos.unsqueeze(0)
557 | dis_mat = -(dis_mat ** 2).sum(-1)
558 | nearest_dis_topk,nearest_order_topk = dis_mat.topk(self.init_query_topk ,dim=1,sorted= True)
559 | gaussian_weight = torch.exp( nearest_dis_topk / (2 * sigma * sigma))
560 | gaussian_weight_sum = torch.clip(gaussian_weight.sum(-1),1)
561 |
562 | flatten_order = nearest_order_topk.view(-1,self.init_query_topk)
563 | flatten_weight = (gaussian_weight/gaussian_weight_sum.unsqueeze(1)).view(-1,self.init_query_topk)
564 | feature = (sample_token.gather(0, flatten_order.repeat(1,sample_token.shape[1]))*flatten_weight).view(-1,self.init_query_topk,sample_token.shape[1]).sum(1).unsqueeze(0)
565 | query_feature_list.append(feature)
566 |
567 | query_feature = torch.cat(query_feature_list,dim=0)
568 | if not self.init_dn_query:
569 | query_feature[:,:pad_size,:] *=0
570 | if not self.init_learnable_query:
571 | query_feature[:,pad_size+self.num_init_query:,:] *=0
572 | query_feature = query_feature.permute(1,0,2)
573 |
574 |
575 | return query_feature
576 |
577 |
578 | def prepare_for_dn(self, batch_size, reference_points, img_metas):
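    |         """Prepare DN-DETR-style denoising queries (training only).
    |
    |         Ground-truth centers are repeated for several noise groups, jittered, normalized
    |         by pc_range and prepended to the reference points. The returned attention mask
    |         blocks matching queries from attending to denoising queries and keeps the noise
    |         groups from seeing each other; at test time the inputs are returned unchanged.
    |         """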
579 | if self.training:
580 | targets = [torch.cat((img_meta['gt_bboxes_3d']._data.gravity_center, img_meta['gt_bboxes_3d']._data.tensor[:, 3:]),dim=1) for img_meta in img_metas ]
581 | labels = [img_meta['gt_labels_3d']._data for img_meta in img_metas ]
582 |
583 | known = [(torch.ones_like(t)).cuda() for t in labels]
584 | know_idx = known
585 | unmask_bbox = unmask_label = torch.cat(known)
586 | known_num = [t.size(0) for t in targets]
587 | labels = torch.cat([t for t in labels])
588 | boxes = torch.cat([t for t in targets])
589 | batch_idx = torch.cat([torch.full((t.size(0),), i) for i, t in enumerate(targets)])
590 |
591 | known_indice = torch.nonzero(unmask_label + unmask_bbox)
592 | known_indice = known_indice.view(-1)
593 | # add noise
594 | groups = min(self.scalar, self.num_query // max(known_num))
595 | known_indice = known_indice.repeat(groups, 1).view(-1)
596 | known_labels = labels.repeat(groups, 1).view(-1).long().to(reference_points.device)
597 | known_labels_raw = labels.repeat(groups, 1).view(-1).long().to(reference_points.device)
598 | known_bid = batch_idx.repeat(groups, 1).view(-1)
599 | known_bboxs = boxes.repeat(groups, 1).to(reference_points.device)
600 | known_bbox_center = known_bboxs[:, :3].clone()
601 | known_bbox_scale = known_bboxs[:, 3:6].clone()
602 |
603 | # known_one_hot = F.one_hot(known_labels, self.num_classes[0]).permute(1,0)
604 | # known_query_cat_encoding = self.class_encoding(known_one_hot.float().unsqueeze(0))
605 | if self.bbox_noise_scale > 0:
606 | diff = known_bbox_scale / 2 + self.bbox_noise_trans
607 | if self.gauusian_dn_sampling:
608 | rand_prob = torch.randn_like(known_bbox_center)*self.noise_std + self.noise_mean
609 | rand_pn = torch.rand_like(known_bbox_center)
610 | p_mask = rand_pn>0.5
611 | n_mask = rand_pn<=0.5
612 | rand_prob[n_mask] *= -1
613 | else:
614 | rand_prob = torch.rand_like(known_bbox_center) * 2 - 1.0
615 | known_bbox_center += torch.mul(rand_prob, diff) * self.bbox_noise_scale
616 | known_bbox_center[..., 0:1] = (known_bbox_center[..., 0:1] - self.pc_range[0]) / (
617 | self.pc_range[3] - self.pc_range[0]
618 | )
619 | known_bbox_center[..., 1:2] = (known_bbox_center[..., 1:2] - self.pc_range[1]) / (
620 | self.pc_range[4] - self.pc_range[1]
621 | )
622 | known_bbox_center[..., 2:3] = (known_bbox_center[..., 2:3] - self.pc_range[2]) / (
623 | self.pc_range[5] - self.pc_range[2]
624 | )
625 | known_bbox_center = known_bbox_center.clamp(min=0.0, max=1.0)
626 | mask = torch.norm(rand_prob, 2, 1) > self.split
627 | known_labels[mask] = sum(self.num_classes)
628 |
629 | single_pad = int(max(known_num))
630 | pad_size = int(single_pad * groups)
631 | padding_bbox = torch.zeros(batch_size,pad_size, 3).to(reference_points.device)
632 | # padding_cls_encoding = torch.zeros(batch_size,query_cat_encoding.shape[1],pad_size).to(reference_points.device)
633 | padded_reference_points = torch.cat([padding_bbox, reference_points], dim=1)
634 | # padding_query_cat_encoding = torch.cat([padding_cls_encoding, query_cat_encoding], dim=-1)
635 | # padding_query_cat_encoding = padding_query_cat_encoding.permute(0,2,1)
636 | # known_query_cat_encoding = known_query_cat_encoding.permute(0,2,1)
637 |
638 | if len(known_num):
639 | map_known_indice = torch.cat([torch.tensor(range(num)) for num in known_num]) # [1,2, 1,2,3]
640 | map_known_indice = torch.cat([map_known_indice + single_pad * i for i in range(groups)]).long()
641 | if len(known_bid):
642 | padded_reference_points[(known_bid.long(), map_known_indice)] = known_bbox_center.to(reference_points.device)
643 | # padding_query_cat_encoding[(known_bid.long(), map_known_indice)] = known_query_cat_encoding
644 |
645 | # padding_query_cat_encoding = padding_query_cat_encoding.permute(0,2,1)
646 | tgt_size = pad_size + self.num_query
647 | attn_mask = torch.ones(tgt_size, tgt_size).to(reference_points.device) < 0
648 | # match query cannot see the reconstruct
649 | attn_mask[pad_size:, :pad_size] = True
650 | # reconstruct cannot see each other
651 | for i in range(groups):
652 | if i == 0:
653 | attn_mask[single_pad * i : single_pad * (i + 1), single_pad * (i + 1) : pad_size] = True
654 | if i == groups - 1:
655 | attn_mask[single_pad * i : single_pad * (i + 1), : single_pad * i] = True
656 | else:
657 | attn_mask[single_pad * i : single_pad * (i + 1), single_pad * (i + 1) : pad_size] = True
658 | attn_mask[single_pad * i : single_pad * (i + 1), : single_pad * i] = True
659 |
660 | mask_dict = {
661 | "known_indice": torch.as_tensor(known_indice).long(),
662 | "batch_idx": torch.as_tensor(batch_idx).long(),
663 | "map_known_indice": torch.as_tensor(map_known_indice).long(),
664 | "known_lbs_bboxes": (known_labels, known_bboxs),
665 | "known_labels_raw": known_labels_raw,
666 | "know_idx": know_idx,
667 | "pad_size": pad_size,
668 | }
669 |
670 | else:
671 | padded_reference_points = reference_points
672 | attn_mask = None
673 | mask_dict = None
674 | # padding_query_cat_encoding = query_cat_encoding
675 |
676 | return padded_reference_points, attn_mask, mask_dict
677 |
678 | @force_fp32(apply_to=('preds_dicts'))
679 | def loss(self, gt_bboxes_3d, gt_labels_3d, preds_dicts, **kwargs):
680 |         """Loss function.
681 | Args:
682 | gt_bboxes_3d (list[LiDARInstance3DBoxes]): batch_size * (num_gts, 9)
683 | gt_labels_3d (list[Tensor]): Ground truth class indices. batch_size * (num_gts, )
684 | preds_dicts(tuple[list[dict]]): nb_tasks x num_lvl
685 | center: (num_dec, batch_size, num_query, 2)
686 | height: (num_dec, batch_size, num_query, 1)
687 | dim: (num_dec, batch_size, num_query, 3)
688 | rot: (num_dec, batch_size, num_query, 2)
689 | vel: (num_dec, batch_size, num_query, 2)
690 | cls_logits: (num_dec, batch_size, num_query, task_classes)
691 | Returns:
692 | dict[str, Tensor]: A dictionary of loss components.
693 | """
694 | num_decoder = preds_dicts[0][0]['center'].shape[0]
695 | all_pred_bboxes, all_pred_logits = collections.defaultdict(list), collections.defaultdict(list)
696 |
697 | for task_id, preds_dict in enumerate(preds_dicts, 0):
698 | for dec_id in range(num_decoder):
699 | pred_bbox = torch.cat(
700 | (preds_dict[0]['center'][dec_id], preds_dict[0]['height'][dec_id],
701 | preds_dict[0]['dim'][dec_id], preds_dict[0]['rot'][dec_id],
702 | preds_dict[0]['vel'][dec_id]),
703 | dim=-1
704 | )
705 | all_pred_bboxes[dec_id].append(pred_bbox)
706 | all_pred_logits[dec_id].append(preds_dict[0]['cls_logits'][dec_id])
707 | all_pred_bboxes = [all_pred_bboxes[idx] for idx in range(num_decoder)]
708 | all_pred_logits = [all_pred_logits[idx] for idx in range(num_decoder)]
709 |
710 | loss_cls, loss_bbox = multi_apply(
711 | self.loss_single, all_pred_bboxes, all_pred_logits,
712 | [gt_bboxes_3d for _ in range(num_decoder)],
713 | [gt_labels_3d for _ in range(num_decoder)],
714 | )
715 |
716 | loss_dict = dict()
717 | loss_dict['loss_cls'] = loss_cls[-1]
718 | loss_dict['loss_bbox'] = loss_bbox[-1]
719 |
720 | num_dec_layer = 0
721 | for loss_cls_i, loss_bbox_i in zip(loss_cls[:-1],
722 | loss_bbox[:-1]):
723 | loss_dict[f'd{num_dec_layer}.loss_cls'] = loss_cls_i
724 | loss_dict[f'd{num_dec_layer}.loss_bbox'] = loss_bbox_i
725 | num_dec_layer += 1
726 |
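    |         # Denoising branch: gather per-layer DN predictions for every task and apply the
    |         # same per-layer classification / box losses.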
727 | dn_pred_bboxes, dn_pred_logits = collections.defaultdict(list), collections.defaultdict(list)
728 | dn_mask_dicts = collections.defaultdict(list)
729 | for task_id, preds_dict in enumerate(preds_dicts, 0):
730 | for dec_id in range(num_decoder):
731 | pred_bbox = torch.cat(
732 | (preds_dict[0]['dn_center'][dec_id], preds_dict[0]['dn_height'][dec_id],
733 | preds_dict[0]['dn_dim'][dec_id], preds_dict[0]['dn_rot'][dec_id],
734 | preds_dict[0]['dn_vel'][dec_id]),
735 | dim=-1
736 | )
737 | dn_pred_bboxes[dec_id].append(pred_bbox)
738 | dn_pred_logits[dec_id].append(preds_dict[0]['dn_cls_logits'][dec_id])
739 | dn_mask_dicts[dec_id].append(preds_dict[0]['dn_mask_dict'])
740 | dn_pred_bboxes = [dn_pred_bboxes[idx] for idx in range(num_decoder)]
741 | dn_pred_logits = [dn_pred_logits[idx] for idx in range(num_decoder)]
742 | dn_mask_dicts = [dn_mask_dicts[idx] for idx in range(num_decoder)]
743 | dn_loss_cls, dn_loss_bbox = multi_apply(
744 | self.dn_loss_single, dn_pred_bboxes, dn_pred_logits, dn_mask_dicts
745 | )
746 |
747 | loss_dict['dn_loss_cls'] = dn_loss_cls[-1]
748 | loss_dict['dn_loss_bbox'] = dn_loss_bbox[-1]
749 | num_dec_layer = 0
750 | for loss_cls_i, loss_bbox_i in zip(dn_loss_cls[:-1],
751 | dn_loss_bbox[:-1]):
752 | loss_dict[f'd{num_dec_layer}.dn_loss_cls'] = loss_cls_i
753 | loss_dict[f'd{num_dec_layer}.dn_loss_bbox'] = loss_bbox_i
754 | num_dec_layer += 1
755 |
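    |         # Auxiliary sparse-heatmap loss: Gaussian targets are drawn directly on the occupied
    |         # voxel coordinates of each sample (see sparse_hp_target_single).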
756 | sparse_hm_voxel = preds_dict[0]['sparse_heatmap']
757 | spatial_shape, batch_index, voxel_indices, spatial_indices, num_voxels = self._get_voxel_infos(sparse_hm_voxel)
758 | voxel_hp_target = multi_apply(
759 | self.sparse_hp_target_single,
760 | gt_bboxes_3d,
761 | gt_labels_3d,
762 | num_voxels,
763 | spatial_indices,
764 | )
765 | # voxel_hp_target = self.sparse_hp_target_single(sparse_hm_voxel, gt_bboxes_3d,gt_labels_3d)
766 |         # TODO: Fix bugs for hp target (incorrect when batch size != 1)
767 | hp_target = [ t.permute(1,0) for t in voxel_hp_target[0]]
768 | hp_target = torch.cat(hp_target,dim=0)
769 | pred_hm = sparse_hm_voxel.features.clone()
770 | loss_heatmap = self.loss_heatmap(clip_sigmoid(pred_hm), hp_target, avg_factor=max(hp_target.eq(1).float().sum().item(), 1))
771 | # heatmap_target = torch.cat(hp_target, dim=0)
772 | # loss_heatmap = self.loss_heatmap(clip_sigmoid(preds_dict[0]['dense_heatmap']), heatmap_target, avg_factor=max(heatmap_target.eq(1).float().sum().item(), 1))
773 | loss_dict['loss_heatmap'] = loss_heatmap
774 | return loss_dict
775 |
776 |
777 | def sparse_hp_target_single(self,gt_bboxes_3d, gt_labels_3d, num_voxels, spatial_indices):
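    |         """Build the class-wise sparse heatmap target for one sample.
    |
    |         Every ground-truth box is splatted as a Gaussian over the occupied voxel
    |         coordinates, once around its true BEV center and once around the nearest
    |         occupied voxel, accumulated with an element-wise maximum.
    |         """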
778 | num_max_objs = 500
779 | gaussian_overlap = 0.1
780 | min_radius = 2
781 | device = gt_labels_3d.device
782 | gt_bboxes_3d = torch.cat([gt_bboxes_3d.gravity_center, gt_bboxes_3d.tensor[:, 3:]], dim=1).to(device)
783 | grid_size = torch.tensor(self.train_cfg['grid_size'])
784 | pc_range = torch.tensor(self.train_cfg['point_cloud_range'])
785 | voxel_size = torch.tensor(self.train_cfg['voxel_size'])
786 | feature_map_size = grid_size[:2] // self.train_cfg['out_size_factor'] # [x_len, y_len]
787 | # heatmap = gt_bboxes_3d.new_zeros((self.num_classes[0], feature_map_size[1], feature_map_size[0]))
788 |
789 |
790 | inds = gt_bboxes_3d.new_zeros(num_max_objs).long()
791 | mask = gt_bboxes_3d.new_zeros(num_max_objs).long()
792 | heatmap = gt_bboxes_3d.new_zeros(sum(self.num_classes), num_voxels)
793 | x, y, z = gt_bboxes_3d[:, 0], gt_bboxes_3d[:, 1], gt_bboxes_3d[:, 2]
794 |
795 | coord_x = (x - self.pc_range[0]) / voxel_size[0] / self.downsample_scale
796 | coord_y = (y - self.pc_range[1]) / voxel_size[1] / self.downsample_scale
797 |
798 | spatial_shape = [self.test_cfg['grid_size'][0] / self.downsample_scale, self.test_cfg['grid_size'][1] / self.downsample_scale]
799 | coord_x = torch.clamp(coord_x, min=0, max=spatial_shape[1] - 0.5) # bugfixed: 1e-6 does not work for center.int()
800 | coord_y = torch.clamp(coord_y, min=0, max=spatial_shape[0] - 0.5) #
801 |
802 | center = torch.cat((coord_y[:, None], coord_x[:, None]), dim=-1)
803 | center_int = center.int()
804 | center_int_float = center_int.float()
805 |
806 | dx, dy, dz = gt_bboxes_3d[:, 3], gt_bboxes_3d[:, 4], gt_bboxes_3d[:, 5]
807 | dx = dx / voxel_size[0] / self.downsample_scale
808 | dy = dy / voxel_size[1] / self.downsample_scale
809 |
810 | radius = self.gaussian_radius(dx, dy, min_overlap=gaussian_overlap)
811 | radius = torch.clamp_min(radius.int(), min=min_radius)
812 |
813 | for k in range(min(num_max_objs, gt_bboxes_3d.shape[0])):
814 | if dx[k] <= 0 or dy[k] <= 0:
815 | continue
816 |
817 | if not (0 <= center_int[k][0] <= spatial_shape[1] and 0 <= center_int[k][1] <= spatial_shape[0]):
818 | continue
819 |
820 | cur_class_id = (gt_labels_3d[k]).long()
821 | distance = self.distance(spatial_indices, center[k])
822 | inds[k] = distance.argmin()
823 | mask[k] = 1
824 |
825 | # gt_center
826 | self.draw_gaussian_to_heatmap_voxels(heatmap[cur_class_id], distance, radius[k].item() * 1)
827 |
828 |             # nearest occupied voxel
829 | self.draw_gaussian_to_heatmap_voxels(heatmap[cur_class_id], self.distance(spatial_indices, spatial_indices[inds[k]]), radius[k].item() * 1)
830 |
831 | return [heatmap]
832 |
833 | def draw_gaussian_to_heatmap_voxels(self, heatmap, distances, radius, k=1):
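    |         """Splat a Gaussian (sigma = (2 * radius + 1) / 6) over precomputed squared
    |         distances and merge it into the heatmap with an element-wise maximum."""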
834 |
835 | diameter = 2 * radius + 1
836 | sigma = diameter / 6
837 | masked_gaussian = torch.exp(- distances / (2 * sigma * sigma))
838 |
839 | torch.max(heatmap, masked_gaussian, out=heatmap)
840 |
841 | return heatmap
842 |
843 | def distance(self, voxel_indices, center):
844 | distances = ((voxel_indices - center.unsqueeze(0))**2).sum(-1)
845 | return distances
846 |
847 |
848 | def _get_voxel_infos(self, x):
849 | spatial_shape = x.spatial_shape
850 | voxel_indices = x.indices
851 | spatial_indices = []
852 | num_voxels = []
853 | batch_size = x.batch_size
854 | batch_index = voxel_indices[:, 0]
855 |
856 | for bs_idx in range(batch_size):
857 | batch_inds = batch_index==bs_idx
858 | spatial_indices.append(voxel_indices[batch_inds][:, [1, 2]]) # y, x
859 | num_voxels.append(batch_inds.sum())
860 |
861 | return spatial_shape, batch_index, voxel_indices, spatial_indices, num_voxels
862 |
863 |
864 | def gaussian_radius(self, height, width, min_overlap=0.5):
865 |         """Compute a Gaussian radius per box (CornerNet-style derivation).
866 |         Args:
867 |             height: (N) box extent in feature-map cells.
868 |             width: (N) box extent in feature-map cells.
869 |             min_overlap: minimum IoU a center-shifted box should still achieve.
870 |         Returns: (N) element-wise minimum of the three candidate radii.
871 |         """
872 | a1 = 1
873 | b1 = (height + width)
874 | c1 = width * height * (1 - min_overlap) / (1 + min_overlap)
875 | sq1 = (b1 ** 2 - 4 * a1 * c1).sqrt()
876 | r1 = (b1 + sq1) / 2
877 |
878 | a2 = 4
879 | b2 = 2 * (height + width)
880 | c2 = (1 - min_overlap) * width * height
881 | sq2 = (b2 ** 2 - 4 * a2 * c2).sqrt()
882 | r2 = (b2 + sq2) / 2
883 |
884 | a3 = 4 * min_overlap
885 | b3 = -2 * min_overlap * (height + width)
886 | c3 = (min_overlap - 1) * width * height
887 | sq3 = (b3 ** 2 - 4 * a3 * c3).sqrt()
888 | r3 = (b3 + sq3) / 2
889 |
890 | ret = torch.min(torch.min(r1, r2), r3)
891 | return ret
892 |
893 | def query_embed(self, ref_points, img_metas):
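    |         # The inverse_sigmoid/sigmoid round trip keeps reference points strictly away from
    |         # 0 and 1 before they are converted to positional embeddings.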
894 | ref_points = inverse_sigmoid(ref_points.clone()).sigmoid()
895 | bev_embeds = self._bev_query_embed(ref_points, img_metas)
896 | return bev_embeds
897 |
898 |
899 | def _get_targets_single(self, gt_bboxes_3d, gt_labels_3d, pred_bboxes, pred_logits):
900 |         """Compute regression and classification targets for one sample.
901 | Outputs from a single decoder layer of a single feature level are used.
902 | Args:
903 |
904 | gt_bboxes_3d (Tensor): LiDARInstance3DBoxes(num_gts, 9)
905 | gt_labels_3d (Tensor): Ground truth class indices (num_gts, )
906 | pred_bboxes (list[Tensor]): num_tasks x (num_query, 10)
907 | pred_logits (list[Tensor]): num_tasks x (num_query, task_classes)
908 | Returns:
909 | tuple[Tensor]: a tuple containing the following.
910 | - labels_tasks (list[Tensor]): num_tasks x (num_query, ).
911 | - label_weights_tasks (list[Tensor]): num_tasks x (num_query, ).
912 | - bbox_targets_tasks (list[Tensor]): num_tasks x (num_query, 9).
913 | - bbox_weights_tasks (list[Tensor]): num_tasks x (num_query, 10).
914 | - pos_inds (list[Tensor]): num_tasks x Sampled positive indices.
915 | - neg_inds (Tensor): num_tasks x Sampled negative indices.
916 | """
917 | device = gt_labels_3d.device
918 | gt_bboxes_3d = torch.cat(
919 | (gt_bboxes_3d.gravity_center, gt_bboxes_3d.tensor[:, 3:]), dim=1
920 | ).to(device)
921 |
922 | task_masks = []
923 | flag = 0
924 | for class_name in self.class_names:
925 | task_masks.append([
926 | torch.where(gt_labels_3d == class_name.index(i) + flag)
927 | for i in class_name
928 | ])
929 | flag += len(class_name)
930 |
931 | task_boxes = []
932 | task_classes = []
933 | flag2 = 0
934 | for idx, mask in enumerate(task_masks):
935 | task_box = []
936 | task_class = []
937 | for m in mask:
938 | task_box.append(gt_bboxes_3d[m])
939 | task_class.append(gt_labels_3d[m] - flag2)
940 | task_boxes.append(torch.cat(task_box, dim=0).to(device))
941 | task_classes.append(torch.cat(task_class).long().to(device))
942 | flag2 += len(mask)
943 |
944 | def task_assign(bbox_pred, logits_pred, gt_bboxes, gt_labels, num_classes):
945 | num_bboxes = bbox_pred.shape[0]
946 | assign_results = self.assigner.assign(bbox_pred, logits_pred, gt_bboxes, gt_labels)
947 | sampling_result = self.sampler.sample(assign_results, bbox_pred, gt_bboxes)
948 | pos_inds, neg_inds = sampling_result.pos_inds, sampling_result.neg_inds
949 | # label targets
950 | labels = gt_bboxes.new_full((num_bboxes, ),
951 | num_classes,
952 | dtype=torch.long)
953 | labels[pos_inds] = gt_labels[sampling_result.pos_assigned_gt_inds]
954 | label_weights = gt_bboxes.new_ones(num_bboxes)
955 | # bbox_targets
956 | code_size = gt_bboxes.shape[1]
957 | bbox_targets = torch.zeros_like(bbox_pred)[..., :code_size]
958 | bbox_weights = torch.zeros_like(bbox_pred)
959 | bbox_weights[pos_inds] = 1.0
960 |
961 | if len(sampling_result.pos_gt_bboxes) > 0:
962 | bbox_targets[pos_inds] = sampling_result.pos_gt_bboxes
963 | return labels, label_weights, bbox_targets, bbox_weights, pos_inds, neg_inds
964 |
965 | labels_tasks, labels_weights_tasks, bbox_targets_tasks, bbox_weights_tasks, pos_inds_tasks, neg_inds_tasks\
966 | = multi_apply(task_assign, pred_bboxes, pred_logits, task_boxes, task_classes, self.num_classes)
967 |
968 | return labels_tasks, labels_weights_tasks, bbox_targets_tasks, bbox_weights_tasks, pos_inds_tasks, neg_inds_tasks
969 |
970 | def get_targets(self, gt_bboxes_3d, gt_labels_3d, preds_bboxes, preds_logits):
971 |         """Compute regression and classification targets for a batch of samples.
972 | Outputs from a single decoder layer of a single feature level are used.
973 | Args:
974 | gt_bboxes_3d (list[LiDARInstance3DBoxes]): batch_size * (num_gts, 9)
975 | gt_labels_3d (list[Tensor]): Ground truth class indices. batch_size * (num_gts, )
976 | pred_bboxes (list[list[Tensor]]): batch_size x num_task x [num_query, 10].
977 | pred_logits (list[list[Tensor]]): batch_size x num_task x [num_query, task_classes]
978 | Returns:
979 | tuple: a tuple containing the following targets.
980 |             - task_labels_list (list[list[Tensor]]): num_tasks x batch_size x (num_query, ).
981 | - task_labels_weight_list (list[Tensor]): num_tasks x batch_size x (num_query, )
982 | - task_bbox_targets_list (list[Tensor]): num_tasks x batch_size x (num_query, 9)
983 | - task_bbox_weights_list (list[Tensor]): num_tasks x batch_size x (num_query, 10)
984 | - num_total_pos_tasks (list[int]): num_tasks x Number of positive samples
985 | - num_total_neg_tasks (list[int]): num_tasks x Number of negative samples.
986 | """
987 | (labels_list, labels_weight_list, bbox_targets_list,
988 | bbox_weights_list, pos_inds_list, neg_inds_list) = multi_apply(
989 | self._get_targets_single, gt_bboxes_3d, gt_labels_3d, preds_bboxes, preds_logits
990 | )
991 | task_num = len(labels_list[0])
992 | num_total_pos_tasks, num_total_neg_tasks = [], []
993 | task_labels_list, task_labels_weight_list, task_bbox_targets_list, \
994 | task_bbox_weights_list = [], [], [], []
995 |
996 | for task_id in range(task_num):
997 | num_total_pos_task = sum((inds[task_id].numel() for inds in pos_inds_list))
998 | num_total_neg_task = sum((inds[task_id].numel() for inds in neg_inds_list))
999 | num_total_pos_tasks.append(num_total_pos_task)
1000 | num_total_neg_tasks.append(num_total_neg_task)
1001 | task_labels_list.append([labels_list[batch_idx][task_id] for batch_idx in range(len(gt_bboxes_3d))])
1002 | task_labels_weight_list.append([labels_weight_list[batch_idx][task_id] for batch_idx in range(len(gt_bboxes_3d))])
1003 | task_bbox_targets_list.append([bbox_targets_list[batch_idx][task_id] for batch_idx in range(len(gt_bboxes_3d))])
1004 | task_bbox_weights_list.append([bbox_weights_list[batch_idx][task_id] for batch_idx in range(len(gt_bboxes_3d))])
1005 |
1006 | return (task_labels_list, task_labels_weight_list, task_bbox_targets_list,
1007 | task_bbox_weights_list, num_total_pos_tasks, num_total_neg_tasks)
1008 |
1009 | def _loss_single_task(self,
1010 | pred_bboxes,
1011 | pred_logits,
1012 | labels_list,
1013 | labels_weights_list,
1014 | bbox_targets_list,
1015 | bbox_weights_list,
1016 | num_total_pos,
1017 | num_total_neg):
1018 |         """Compute the loss for a single task.
1019 | Outputs from a single decoder layer of a single feature level are used.
1020 | Args:
1021 | pred_bboxes (Tensor): (batch_size, num_query, 10)
1022 | pred_logits (Tensor): (batch_size, num_query, task_classes)
1023 | labels_list (list[Tensor]): batch_size x (num_query, )
1024 | labels_weights_list (list[Tensor]): batch_size x (num_query, )
1025 | bbox_targets_list(list[Tensor]): batch_size x (num_query, 9)
1026 | bbox_weights_list(list[Tensor]): batch_size x (num_query, 10)
1027 | num_total_pos: int
1028 | num_total_neg: int
1029 | Returns:
1030 | loss_cls
1031 | loss_bbox
1032 | """
1033 | labels = torch.cat(labels_list, dim=0)
1034 | labels_weights = torch.cat(labels_weights_list, dim=0)
1035 | bbox_targets = torch.cat(bbox_targets_list, dim=0)
1036 | bbox_weights = torch.cat(bbox_weights_list, dim=0)
1037 |
1038 | pred_bboxes_flatten = pred_bboxes.flatten(0, 1)
1039 | pred_logits_flatten = pred_logits.flatten(0, 1)
1040 |
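     |         # Classification normalizer counts positives plus a down-weighted (0.1) share of
     |         # negatives, following the usual DETR-head convention.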
1041 | cls_avg_factor = num_total_pos * 1.0 + num_total_neg * 0.1
1042 | cls_avg_factor = max(cls_avg_factor, 1)
1043 | loss_cls = self.loss_cls(
1044 | pred_logits_flatten, labels, labels_weights, avg_factor=cls_avg_factor
1045 | )
1046 |
1047 | normalized_bbox_targets = normalize_bbox(bbox_targets, self.pc_range)
1048 | isnotnan = torch.isfinite(normalized_bbox_targets).all(dim=-1)
1049 | bbox_weights = bbox_weights * bbox_weights.new_tensor(self.train_cfg.code_weights)[None, :]
1050 |
1051 | loss_bbox = self.loss_bbox(
1052 | pred_bboxes_flatten[isnotnan, :10],
1053 | normalized_bbox_targets[isnotnan, :10],
1054 | bbox_weights[isnotnan, :10],
1055 | avg_factor=num_total_pos
1056 | )
1057 |
1058 | loss_cls = torch.nan_to_num(loss_cls)
1059 | loss_bbox = torch.nan_to_num(loss_bbox)
1060 | return loss_cls, loss_bbox
1061 |
1062 | def loss_single(self,
1063 | pred_bboxes,
1064 | pred_logits,
1065 | gt_bboxes_3d,
1066 | gt_labels_3d):
1067 |         """Loss function for outputs from a single decoder layer of a single
1068 |         feature level.
1069 |         Args:
1070 |             pred_bboxes (list[Tensor]): num_tasks x [bs, num_query, 10].
1071 |             pred_logits (list[Tensor]): num_tasks x [bs, num_query, task_classes]
1072 |             gt_bboxes_3d (list[LiDARInstance3DBoxes]): batch_size * (num_gts, 9)
1073 |             gt_labels_3d (list[Tensor]): Ground truth class indices. batch_size * (num_gts, )
1074 | Returns:
1075 | dict[str, Tensor]: A dictionary of loss components for outputs from
1076 | a single decoder layer.
1077 | """
1078 | batch_size = pred_bboxes[0].shape[0]
1079 | pred_bboxes_list, pred_logits_list = [], []
1080 | for idx in range(batch_size):
1081 | pred_bboxes_list.append([task_pred_bbox[idx] for task_pred_bbox in pred_bboxes])
1082 | pred_logits_list.append([task_pred_logits[idx] for task_pred_logits in pred_logits])
1083 | cls_reg_targets = self.get_targets(
1084 | gt_bboxes_3d, gt_labels_3d, pred_bboxes_list, pred_logits_list
1085 | )
1086 | (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list,
1087 | num_total_pos, num_total_neg) = cls_reg_targets
1088 | loss_cls_tasks, loss_bbox_tasks = multi_apply(
1089 | self._loss_single_task,
1090 | pred_bboxes,
1091 | pred_logits,
1092 | labels_list,
1093 | label_weights_list,
1094 | bbox_targets_list,
1095 | bbox_weights_list,
1096 | num_total_pos,
1097 | num_total_neg
1098 | )
1099 |
1100 |
1101 | return sum(loss_cls_tasks), sum(loss_bbox_tasks)
1102 |
1103 | def _dn_loss_single_task(self,
1104 | pred_bboxes,
1105 | pred_logits,
1106 | mask_dict):
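     |         """Denoising loss for one task: predictions are gathered at the known (noised GT)
     |         query positions; boxes whose raw label belongs to another task are dropped from
     |         the regression term, while classification uses all known labels."""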
1107 | known_labels, known_bboxs = mask_dict['known_lbs_bboxes']
1108 | map_known_indice = mask_dict['map_known_indice'].long()
1109 | known_indice = mask_dict['known_indice'].long()
1110 | batch_idx = mask_dict['batch_idx'].long()
1111 | bid = batch_idx[known_indice]
1112 | known_labels_raw = mask_dict['known_labels_raw']
1113 |
1114 | pred_logits = pred_logits[(bid, map_known_indice)]
1115 | pred_bboxes = pred_bboxes[(bid, map_known_indice)]
1116 | num_tgt = known_indice.numel()
1117 |
1118 | # filter task bbox
1119 | task_mask = known_labels_raw != pred_logits.shape[-1]
1120 | task_mask_sum = task_mask.sum()
1121 |
1122 | if task_mask_sum > 0:
1123 | # pred_logits = pred_logits[task_mask]
1124 | # known_labels = known_labels[task_mask]
1125 | pred_bboxes = pred_bboxes[task_mask]
1126 | known_bboxs = known_bboxs[task_mask]
1127 |
1128 | # classification loss
1129 | # construct weighted avg_factor to match with the official DETR repo
1130 | cls_avg_factor = num_tgt * 3.14159 / 6 * self.split * self.split * self.split
1131 | # if self.sync_cls_avg_factor:
1132 | # cls_avg_factor = reduce_mean(
1133 | # pred_logits.new_tensor([cls_avg_factor]))
1134 |
1135 | label_weights = torch.ones_like(known_labels)
1136 | cls_avg_factor = max(cls_avg_factor, 1)
1137 | loss_cls = self.loss_cls(
1138 | pred_logits, known_labels.long(), label_weights, avg_factor=cls_avg_factor)
1139 |
1140 |         # Compute the average number of gt boxes across all GPUs, for
1141 | # normalization purposes
1142 | num_tgt = loss_cls.new_tensor([num_tgt])
1143 | num_tgt = torch.clamp(reduce_mean(num_tgt), min=1).item()
1144 |
1145 | # regression L1 loss
1146 | normalized_bbox_targets = normalize_bbox(known_bboxs, self.pc_range)
1147 | isnotnan = torch.isfinite(normalized_bbox_targets).all(dim=-1)
1148 | bbox_weights = torch.ones_like(pred_bboxes)
1149 | bbox_weights = bbox_weights * bbox_weights.new_tensor(self.train_cfg.code_weights)[None, :]
1150 | # bbox_weights[:, 6:8] = 0
1151 | loss_bbox = self.loss_bbox(
1152 | pred_bboxes[isnotnan, :10], normalized_bbox_targets[isnotnan, :10], bbox_weights[isnotnan, :10], avg_factor=num_tgt)
1153 |
1154 | loss_cls = torch.nan_to_num(loss_cls)
1155 | loss_bbox = torch.nan_to_num(loss_bbox)
1156 |
1157 | if task_mask_sum == 0:
1158 | # loss_cls = loss_cls * 0.0
1159 | loss_bbox = loss_bbox * 0.0
1160 |
1161 | return self.dn_weight * loss_cls, self.dn_weight * loss_bbox
1162 |
1163 | def dn_loss_single(self,
1164 | pred_bboxes,
1165 | pred_logits,
1166 | dn_mask_dict):
1167 | loss_cls_tasks, loss_bbox_tasks = multi_apply(
1168 | self._dn_loss_single_task, pred_bboxes, pred_logits, dn_mask_dict
1169 | )
1170 | return sum(loss_cls_tasks), sum(loss_bbox_tasks)
1171 |
1172 | @force_fp32(apply_to=('preds_dicts'))
1173 | def get_bboxes(self, preds_dicts, img_metas, img=None, rescale=False):
1174 | preds_dicts = self.bbox_coder.decode(preds_dicts)
1175 | num_samples = len(preds_dicts)
1176 |
1177 | ret_list = []
1178 | for i in range(num_samples):
1179 | preds = preds_dicts[i]
1180 | bboxes = preds['bboxes']
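     |             # Shift z from the gravity center to the bottom center expected by box_type_3d.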
1181 | bboxes[:, 2] = bboxes[:, 2] - bboxes[:, 5] * 0.5
1182 | bboxes = img_metas[i]['box_type_3d'](bboxes, bboxes.size(-1))
1183 | scores = preds['scores']
1184 | labels = preds['labels']
1185 | ret_list.append([bboxes, scores, labels])
1186 | return ret_list
--------------------------------------------------------------------------------