├── .gitignore ├── LICENSE ├── MANIFEST.in ├── README.md ├── docs ├── ovuni3detr.png └── uni3detr.png ├── extra_tools ├── analysis_tools │ └── eval_metric.py ├── create_data.py ├── create_data.sh ├── data_converter │ ├── create_unified_gt_database.py │ └── nuscenes_converter.py ├── dist_test.sh ├── dist_train.sh ├── eval_metric.py ├── get_flops.py ├── test.py └── train.py ├── model-index.yml ├── projects ├── __init__.py ├── configs │ ├── ov_uni3detr │ │ ├── ov_uni3detr_sunrgbd_mm.py │ │ ├── ov_uni3detr_sunrgbd_pc.py │ │ └── ov_uni3detr_sunrgbd_rgb.py │ └── uni3detr │ │ ├── uni3detr_kitti_3classes.py │ │ ├── uni3detr_kitti_car.py │ │ ├── uni3detr_nuscenes.py │ │ ├── uni3detr_scannet.py │ │ ├── uni3detr_scannet_large.py │ │ └── uni3detr_sunrgbd.py └── mmdet3d_plugin │ ├── __init__.py │ ├── core │ ├── bbox │ │ ├── assigners │ │ │ ├── __init__.py │ │ │ └── hungarian_assigner_3d.py │ │ ├── bbox_merging.py │ │ ├── coders │ │ │ ├── __init__.py │ │ │ └── nms_free_coder.py │ │ ├── match_costs │ │ │ ├── __init__.py │ │ │ └── match_cost.py │ │ └── util.py │ ├── indoor_eval.py │ └── merge_all_augs.py │ ├── datasets │ ├── __init__.py │ ├── nuscenes_dataset.py │ ├── pipelines │ │ ├── __init__.py │ │ ├── dbsampler.py │ │ ├── formatting.py │ │ ├── loading_3d.py │ │ ├── test_time_aug.py │ │ └── transform_3d.py │ └── sunrgbd_dataset_ov.py │ └── models │ ├── backbones │ ├── __init__.py │ ├── second_3d.py │ └── vovnet.py │ ├── dense_heads │ ├── __init__.py │ ├── uni3detr_head.py │ └── uni3detr_head_clip.py │ ├── detectors │ ├── __init__.py │ ├── ov_uni3detr.py │ └── uni3detr.py │ ├── losses │ ├── __init__.py │ └── rdiouloss.py │ ├── necks │ ├── __init__.py │ └── second3d_fpn.py │ ├── pts_encoder │ ├── __init__.py │ └── sparse_encoder_hd.py │ └── utils │ ├── __init__.py │ ├── grid_mask.py │ ├── uni3d_viewtrans.py │ └── uni3detr_transformer.py ├── requirements.txt ├── setup.cfg └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | *.ipynb 6 | 7 | # C extensions 8 | *.so 9 | 10 | # Distribution / packaging 11 | .Python 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .coverage 43 | .coverage.* 44 | .cache 45 | nosetests.xml 46 | coverage.xml 47 | *.cover 48 | .hypothesis/ 49 | .pytest_cache/ 50 | 51 | # Translations 52 | *.mo 53 | *.pot 54 | 55 | # Django stuff: 56 | *.log 57 | local_settings.py 58 | db.sqlite3 59 | 60 | # Flask stuff: 61 | instance/ 62 | .webassets-cache 63 | 64 | # Scrapy stuff: 65 | .scrapy 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # Environments 83 | .env 84 | .venv 85 | env/ 86 | venv/ 87 | ENV/ 88 | env.bak/ 89 | venv.bak/ 90 | 91 | # Spyder project settings 92 | .spyderproject 93 | .spyproject 94 | 95 | # Rope project settings 96 | .ropeproject 97 | 98 | # mkdocs documentation 99 | /site 100 | 101 | # mypy 102 | .mypy_cache/ 103 | 104 | # cython generated cpp 105 | data 106 | .vscode 107 | .idea 108 | 109 | # custom 110 | *.pkl 111 | *.pkl.json 112 | *.log.json 113 | work_dirs/ 114 | exps/ 115 | *~ 116 | mmdet3d/.mim 117 | 118 | # Pytorch 119 | *.pth 120 | 121 | # demo 122 | data/ 123 | *.obj 124 | *.ply 125 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include mmdet3d/.mim/model-index.yml 2 | include requirements/*.txt 3 | recursive-include mmdet3d/.mim/ops *.cpp *.cu *.h *.cc 4 | recursive-include mmdet3d/.mim/configs *.py *.yml 5 | recursive-include mmdet3d/.mim/tools *.sh *.py 6 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | # Uni3DETR & OV-Uni3DETR 3 | 4 | This includes code for: 5 | our NeurIPS 2023 paper 6 | [**Uni3DETR: Unified 3D Detection Transformer**](https://arxiv.org/pdf/2310.05699) 7 | 8 |
 9 | ![Uni3DETR](docs/uni3detr.png) 10 | 
11 | 12 | our ECCV 2024 paper 13 | [**OV-Uni3DETR: Towards Unified Open-Vocabulary 3D Object Detection via Cycle-Modality Propagation**](https://arxiv.org/pdf/2403.19580) 14 | 15 |
 16 | ![OV-Uni3DETR](docs/ovuni3detr.png) 17 | 
18 | 19 | Uni3DETR provides a unified structure for both indoor and outdoor 3D object detection. 20 | Building on this architecture, OV-Uni3DETR further introduces multi-modal learning and open-vocabulary learning, unifying both modalities and categories within a single structure. 21 | 22 | ## Preparation 23 | This project is based on [MMDetection3D](https://github.com/open-mmlab/mmdetection3d) and can be set up as follows. 24 | * Install MMDetection3D [v1.0.0rc5](https://github.com/open-mmlab/mmdetection3d/tree/v1.0.0rc5) following [the instructions](https://github.com/open-mmlab/mmdetection3d/blob/v1.0.0rc5/docs/getting_started.md). 25 | * Copy our project and related files into the installed MMDetection3D directory: 26 | ```bash 27 | cp -r projects mmdetection3d/ 28 | cp -r extra_tools mmdetection3d/ 29 | ``` 30 | * Prepare the datasets following the [MMDetection3D dataset instructions](https://github.com/open-mmlab/mmdetection3d/tree/v1.0.0rc5/docs/en/datasets). 31 | * Uni3DETR dataset preparation: 32 | 33 | SUN RGB-D dataset: 34 | The directory structure after processing should be as follows: 35 | ``` 36 | sunrgbd 37 | ├── README.md 38 | ├── matlab 39 | │   ├── ... 40 | ├── OFFICIAL_SUNRGBD 41 | │   ├── ... 42 | ├── sunrgbd_trainval 43 | │   ├── ... 44 | ├── points 45 | ├── sunrgbd_infos_train.pkl 46 | ├── sunrgbd_infos_val.pkl 47 | ``` 48 | ScanNet dataset: 49 | 50 | After downloading the dataset following MMDetection3D, run ``python scripts/scannet_globalallign.py`` to perform global alignment in advance. Note that this operation modifies the data files in place; if you have any concerns, back them up first. 51 | 52 | The directory structure should be as follows: 53 | 54 | ``` 55 | scannet 56 | ├── meta_data 57 | ├── batch_load_scannet_data.py 58 | ├── load_scannet_data.py 59 | ├── scannet_utils.py 60 | ├── README.md 61 | ├── scans 62 | ├── scans_test 63 | ├── scannet_instance_data 64 | ├── points 65 | │   ├── xxxxx.bin 66 | ├── instance_mask 67 | │   ├── xxxxx.bin 68 | ├── semantic_mask 69 | │   ├── xxxxx.bin 70 | ├── seg_info 71 | │   ├── train_label_weight.npy 72 | │   ├── train_resampled_scene_idxs.npy 73 | │   ├── val_label_weight.npy 74 | │   ├── val_resampled_scene_idxs.npy 75 | ├── posed_images 76 | │   ├── scenexxxx_xx 77 | │   │   ├── xxxxxx.txt 78 | │   │   ├── xxxxxx.jpg 79 | │   │   ├── intrinsic.txt 80 | ├── scannet_infos_train.pkl 81 | ├── scannet_infos_val.pkl 82 | ├── scannet_infos_test.pkl 83 | ``` 84 | 85 | Preparation steps for the outdoor KITTI and nuScenes datasets are exactly the same as in MMDetection3D. 86 | 87 | * OV-Uni3DETR dataset preparation: 88 | 89 | SUN RGB-D dataset: 90 | 91 | The SUN RGB-D preparation steps are the same as the Uni3DETR steps above; the only difference is the annotation file, which can be downloaded directly from [GoogleDrive](https://drive.google.com/drive/folders/1ljh6quUw5gLyHbQiY68HDGtY6QLp_d6e?usp=sharing). We will release the code for generating the training annotation files soon. 92 | 93 | 94 | ## Training 95 | ```bash 96 | bash extra_tools/dist_train.sh ${CFG_FILE} ${NUM_GPUS} 97 | ``` 98 | 99 | ## Evaluation 100 | ```bash 101 | bash extra_tools/dist_test.sh ${CFG_FILE} ${CKPT} ${NUM_GPUS} --eval=bbox 102 | ``` 103 | 104 | ## Uni3DETR models 105 | We provide results and pretrained models on SUN RGB-D, ScanNet, KITTI and nuScenes (corresponding to Tab. 1, Tab. 2 and Tab. 3 of our paper). 
106 | | Dataset | mAP (%) | download | 107 | |---------------------------------------------|:-------:|:-------:| 108 | | **indoor** | 109 | | [SUN RGB-D](projects/configs/uni3detr/uni3detr_sunrgbd.py) | 67.0 | [GoogleDrive](https://drive.google.com/drive/folders/1ljh6quUw5gLyHbQiY68HDGtY6QLp_d6e?usp=sharing) | 110 | | [ScanNet](projects/configs/uni3detr/uni3detr_scannet_large.py) | 71.7 | [GoogleDrive](https://drive.google.com/drive/folders/1ljh6quUw5gLyHbQiY68HDGtY6QLp_d6e?usp=sharing) | 111 | | **outdoor** | 112 | | [KITTI (3 classes)](projects/configs/uni3detr/uni3detr_kitti_3classes.py) | 86.57 (moderate car) | [GoogleDrive](https://drive.google.com/drive/folders/1ljh6quUw5gLyHbQiY68HDGtY6QLp_d6e?usp=sharing) | 113 | | [KITTI (car)](projects/configs/uni3detr/uni3detr_kitti_car.py) | 86.74 (moderate car) | [GoogleDrive](https://drive.google.com/drive/folders/1ljh6quUw5gLyHbQiY68HDGtY6QLp_d6e?usp=sharing) | 114 | | [nuScenes](projects/configs/uni3detr/uni3detr_nuscenes.py) | 61.7 | [GoogleDrive](https://drive.google.com/drive/folders/1ljh6quUw5gLyHbQiY68HDGtY6QLp_d6e?usp=sharing) | 115 | 116 | -------------------------------------------------------------------------------- /docs/ovuni3detr.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhenyuw16/Uni3DETR/15cb08a7ddfc2e4f0ae5a5a7b9ec6a7be8175399/docs/ovuni3detr.png -------------------------------------------------------------------------------- /docs/uni3detr.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhenyuw16/Uni3DETR/15cb08a7ddfc2e4f0ae5a5a7b9ec6a7be8175399/docs/uni3detr.png -------------------------------------------------------------------------------- /extra_tools/analysis_tools/eval_metric.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | import argparse 3 | import os 4 | import mmcv 5 | from mmcv import Config, DictAction 6 | 7 | from mmdet3d.datasets import build_dataset 8 | from mmdet.utils import update_data_root 9 | 10 | 11 | def parse_args(): 12 | parser = argparse.ArgumentParser(description='Evaluate metric of the ' 13 | 'results saved in pkl format') 14 | parser.add_argument('config', help='Config of the model') 15 | parser.add_argument('pkl_results', help='Results in pickle format') 16 | parser.add_argument( 17 | '--format-only', 18 | action='store_true', 19 | help='Format the output results without performing evaluation. It is ' 20 | 'useful when you want to format the result to a specific format and ' 21 | 'submit it to the test server') 22 | parser.add_argument( 23 | '--eval', 24 | type=str, 25 | nargs='+', 26 | help='Evaluation metrics, which depends on the dataset, e.g., "bbox",' 27 | ' "segm", "proposal" for COCO, and "mAP", "recall" for PASCAL VOC') 28 | parser.add_argument( 29 | '--cfg-options', 30 | nargs='+', 31 | action=DictAction, 32 | help='override some settings in the used config, the key-value pair ' 33 | 'in xxx=yyy format will be merged into config file. If the value to ' 34 | 'be overwritten is a list, it should be like key="[a,b]" or key=a,b ' 35 | 'It also allows nested list/tuple values, e.g. 
key="[(a,b),(c,d)]" ' 36 | 'Note that the quotation marks are necessary and that no white space ' 37 | 'is allowed.') 38 | parser.add_argument( 39 | '--eval-options', 40 | nargs='+', 41 | action=DictAction, 42 | help='custom options for evaluation, the key-value pair in xxx=yyy ' 43 | 'format will be kwargs for dataset.evaluate() function') 44 | args = parser.parse_args() 45 | return args 46 | 47 | 48 | def main(): 49 | args = parse_args() 50 | 51 | cfg = Config.fromfile(args.config) 52 | 53 | # update data root according to MMDET_DATASETS 54 | update_data_root(cfg) 55 | 56 | # import modules from plguin/xx, registry will be updated 57 | if hasattr(cfg, 'plugin'): 58 | if cfg.plugin: 59 | import importlib 60 | if hasattr(cfg, 'plugin_dir'): 61 | plugin_dir = cfg.plugin_dir 62 | _module_dir = os.path.dirname(plugin_dir) 63 | _module_dir = _module_dir.split('/') 64 | _module_path = _module_dir[0] 65 | 66 | for m in _module_dir[1:]: 67 | _module_path = _module_path + '.' + m 68 | print(_module_path) 69 | plg_lib = importlib.import_module(_module_path) 70 | else: 71 | # import dir is the dirpath for the config file 72 | _module_dir = os.path.dirname(args.config) 73 | _module_dir = _module_dir.split('/') 74 | _module_path = _module_dir[0] 75 | for m in _module_dir[1:]: 76 | _module_path = _module_path + '.' + m 77 | print(_module_path) 78 | plg_lib = importlib.import_module(_module_path) 79 | 80 | assert args.eval or args.format_only, ( 81 | 'Please specify at least one operation (eval/format the results) with ' 82 | 'the argument "--eval", "--format-only"') 83 | if args.eval and args.format_only: 84 | raise ValueError('--eval and --format_only cannot be both specified') 85 | 86 | if args.cfg_options is not None: 87 | cfg.merge_from_dict(args.cfg_options) 88 | cfg.data.test.test_mode = True 89 | 90 | dataset = build_dataset(cfg.data.test) 91 | outputs = mmcv.load(args.pkl_results) 92 | 93 | kwargs = {} if args.eval_options is None else args.eval_options 94 | if args.format_only: 95 | dataset.format_results(outputs, **kwargs) 96 | if args.eval: 97 | eval_kwargs = cfg.get('evaluation', {}).copy() 98 | # hard-code way to remove EvalHook args 99 | for key in [ 100 | 'interval', 'tmpdir', 'start', 'gpu_collect', 'save_best', 101 | 'rule' 102 | ]: 103 | eval_kwargs.pop(key, None) 104 | eval_kwargs.update(dict(metric=args.eval, **kwargs)) 105 | print(dataset.evaluate(outputs, **eval_kwargs)) 106 | 107 | 108 | if __name__ == '__main__': 109 | main() 110 | -------------------------------------------------------------------------------- /extra_tools/create_data.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | import argparse 3 | from os import path as osp 4 | 5 | from data_converter import nuscenes_converter as nuscenes_converter 6 | from data_converter.create_unified_gt_database import create_groundtruth_database 7 | 8 | 9 | def nuscenes_data_prep(root_path, 10 | info_prefix, 11 | version, 12 | dataset_name, 13 | out_dir, 14 | max_sweeps=10): 15 | """Prepare data related to nuScenes dataset. 16 | 17 | Related data consists of '.pkl' files recording basic infos, 18 | 2D annotations and groundtruth database. 19 | 20 | Args: 21 | root_path (str): Path of dataset root. 22 | info_prefix (str): The prefix of info filenames. 23 | version (str): Dataset version. 24 | dataset_name (str): The dataset class name. 25 | out_dir (str): Output directory of the groundtruth database info. 
26 | max_sweeps (int): Number of input consecutive frames. Default: 10 27 | """ 28 | #nuscenes_converter.create_nuscenes_infos( 29 | # root_path, info_prefix, version=version, max_sweeps=max_sweeps) 30 | 31 | if version == 'v1.0-test': 32 | # info_test_path = osp.join(root_path, f'{info_prefix}_infos_test.pkl') 33 | # nuscenes_converter.export_2d_annotation( 34 | # root_path, info_test_path, version=version) 35 | return 36 | 37 | # info_train_path = osp.join(root_path, f'{info_prefix}_infos_train.pkl') 38 | # info_val_path = osp.join(root_path, f'{info_prefix}_infos_val.pkl') 39 | # nuscenes_converter.export_2d_annotation( 40 | # root_path, info_train_path, version=version) 41 | # nuscenes_converter.export_2d_annotation( 42 | # root_path, info_val_path, version=version) 43 | create_groundtruth_database(dataset_name, root_path, info_prefix, 44 | f'{out_dir}/{info_prefix}_infos_train.pkl') 45 | 46 | 47 | parser = argparse.ArgumentParser(description='Data converter arg parser') 48 | parser.add_argument('dataset', metavar='nuscenes', help='name of the dataset') 49 | parser.add_argument( 50 | '--root-path', 51 | type=str, 52 | default='./data/nuscenes', 53 | help='specify the root path of dataset') 54 | parser.add_argument( 55 | '--version', 56 | type=str, 57 | default='v1.0', 58 | required=False, 59 | help='specify the dataset version, no need for nuscenes') 60 | parser.add_argument( 61 | '--max-sweeps', 62 | type=int, 63 | default=10, 64 | required=False, 65 | help='specify sweeps of lidar per example') 66 | parser.add_argument( 67 | '--out-dir', 68 | type=str, 69 | default='./data/nuscenes', 70 | required='False', 71 | help='name of info pkl') 72 | parser.add_argument('--extra-tag', type=str, default='nuscenes') 73 | parser.add_argument( 74 | '--workers', type=int, default=4, help='number of threads to be used') 75 | args = parser.parse_args() 76 | 77 | if __name__ == '__main__': 78 | if args.dataset == 'nuscenes' and args.version != 'v1.0-mini': 79 | train_version = f'{args.version}-trainval' 80 | nuscenes_data_prep( 81 | root_path=args.root_path, 82 | info_prefix=args.extra_tag, 83 | version=train_version, 84 | dataset_name='NuScenesSweepDataset', 85 | out_dir=args.out_dir, 86 | max_sweeps=args.max_sweeps) 87 | test_version = f'{args.version}-test' 88 | nuscenes_data_prep( 89 | root_path=args.root_path, 90 | info_prefix=args.extra_tag, 91 | version=test_version, 92 | dataset_name='NuScenesSweepDataset', 93 | out_dir=args.out_dir, 94 | max_sweeps=args.max_sweeps) 95 | elif args.dataset == 'nuscenes' and args.version == 'v1.0-mini': 96 | train_version = f'{args.version}' 97 | nuscenes_data_prep( 98 | root_path=args.root_path, 99 | info_prefix=args.extra_tag, 100 | version=train_version, 101 | dataset_name='NuScenesSweepDataset', 102 | out_dir=args.out_dir, 103 | max_sweeps=args.max_sweeps) 104 | -------------------------------------------------------------------------------- /extra_tools/create_data.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -x 4 | export PYTHONPATH=`pwd`:$PYTHONPATH 5 | 6 | PARTITION=$1 7 | JOB_NAME=$2 8 | CONFIG=$3 9 | WORK_DIR=$4 10 | GPUS=${GPUS:-1} 11 | GPUS_PER_NODE=${GPUS_PER_NODE:-1} 12 | SRUN_ARGS=${SRUN_ARGS:-""} 13 | JOB_NAME=create_data 14 | 15 | srun -p ${PARTITION} \ 16 | --job-name=${JOB_NAME} \ 17 | --gres=gpu:${GPUS_PER_NODE} \ 18 | --ntasks=${GPUS} \ 19 | --ntasks-per-node=${GPUS_PER_NODE} \ 20 | --kill-on-bad-exit=1 \ 21 | ${SRUN_ARGS} \ 22 | python3 -u tools/create_data.py kitti 
\ 23 | --root-path ./data/kitti \ 24 | --out-dir ./data/kitti \ 25 | --extra-tag kitti 26 | -------------------------------------------------------------------------------- /extra_tools/data_converter/create_unified_gt_database.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | import mmcv 3 | import numpy as np 4 | import pickle 5 | import argparse 6 | import os 7 | import importlib 8 | 9 | from mmcv import track_iter_progress 10 | from os import path as osp 11 | 12 | from mmdet3d.core.bbox import box_np_ops as box_np_ops 13 | from mmdet3d.datasets import build_dataset 14 | from mmdet.core.evaluation.bbox_overlaps import bbox_overlaps 15 | 16 | 17 | def create_groundtruth_database(dataset_class_name, 18 | data_path, 19 | info_prefix, 20 | info_path=None, 21 | used_classes=None, 22 | database_save_path=None, 23 | db_info_save_path=None, 24 | with_mask=False): 25 | """Given the raw data, generate the ground truth database. 26 | 27 | Args: 28 | dataset_class_name (str): Name of the input dataset. 29 | data_path (str): Path of the data. 30 | info_prefix (str): Prefix of the info file. 31 | info_path (str): Path of the info file. 32 | Default: None. 33 | mask_anno_path (str): Path of the mask_anno. 34 | Default: None. 35 | used_classes (list[str]): Classes have been used. 36 | Default: None. 37 | database_save_path (str): Path to save database. 38 | Default: None. 39 | db_info_save_path (str): Path to save db_info. 40 | Default: None. 41 | relative_path (bool): Whether to use relative path. 42 | Default: True. 43 | with_mask (bool): Whether to use mask. 44 | Default: False. 45 | """ 46 | print(f'Create GT Database of {dataset_class_name}') 47 | dataset_cfg = dict( 48 | type=dataset_class_name, data_root=data_path, ann_file=info_path, return_gt_info=True) 49 | if dataset_class_name == 'NuScenesSweepDataset': 50 | dataset_cfg.update( 51 | use_valid_flag=True, 52 | pipeline=[ 53 | dict( 54 | type='LoadPointsFromFile', 55 | coord_type='LIDAR', 56 | load_dim=5, 57 | use_dim=5), 58 | dict( 59 | type='LoadPointsFromMultiSweeps', 60 | sweeps_num=10, 61 | use_dim=[0, 1, 2, 3, 4], 62 | pad_empty_sweeps=True, 63 | remove_close=True), 64 | dict( 65 | type='LoadAnnotations3D', 66 | with_bbox_3d=True, 67 | with_label_3d=True) 68 | ]) 69 | 70 | dataset = build_dataset(dataset_cfg) 71 | 72 | if database_save_path is None: 73 | database_save_path = osp.join(data_path, f'{info_prefix}_gt_database') 74 | if db_info_save_path is None: 75 | db_info_save_path = osp.join(data_path, 76 | f'{info_prefix}_dbinfos_train.pkl') 77 | database_pts_path = osp.join(database_save_path, 'pts_dir') 78 | database_img_path = osp.join(database_save_path, 'img_dir') 79 | mmcv.mkdir_or_exist(database_save_path) 80 | mmcv.mkdir_or_exist(database_pts_path) 81 | mmcv.mkdir_or_exist(database_img_path) 82 | all_db_infos = dict() 83 | 84 | group_counter = 0 85 | for j in track_iter_progress(list(range(len(dataset)))): 86 | 87 | input_dict = dataset.get_data_info(j) 88 | dataset.pre_pipeline(input_dict) 89 | example = dataset.pipeline(input_dict) 90 | annos = example['ann_info'] 91 | image_idx = example['sample_idx'] 92 | points = example['points'].tensor.numpy() 93 | gt_boxes_3d = annos['gt_bboxes_3d'].tensor.numpy() 94 | names = annos['gt_names'] 95 | group_dict = dict() 96 | if 'group_ids' in annos: 97 | group_ids = annos['group_ids'] 98 | else: 99 | group_ids = np.arange(gt_boxes_3d.shape[0], dtype=np.int64) 100 | difficulty = 
np.zeros(gt_boxes_3d.shape[0], dtype=np.int32) 101 | if 'difficulty' in annos: 102 | difficulty = annos['difficulty'] 103 | 104 | num_obj = gt_boxes_3d.shape[0] 105 | point_indices = box_np_ops.points_in_rbbox(points, gt_boxes_3d) 106 | 107 | # load multi-view image 108 | input_img = {} 109 | input_info = {} 110 | for _cam in example['info']['cams']: 111 | cam_info = example['info']['cams'][_cam] 112 | _path = cam_info['data_path'] 113 | _img = mmcv.imread(_path, 'unchanged') 114 | input_img[_cam] = _img 115 | 116 | # obtain lidar to image transformation matrix 117 | lidar2cam_r = np.linalg.inv(cam_info['sensor2lidar_rotation']) 118 | lidar2cam_t = cam_info[ 119 | 'sensor2lidar_translation'] @ lidar2cam_r.T 120 | lidar2cam_rt = np.eye(4) 121 | lidar2cam_rt[:3, :3] = lidar2cam_r.T 122 | lidar2cam_rt[3, :3] = -lidar2cam_t 123 | intrinsic = cam_info['cam_intrinsic'] 124 | viewpad = np.eye(4) 125 | viewpad[:intrinsic.shape[0], :intrinsic.shape[1]] = intrinsic 126 | lidar2img_rt = (viewpad @ lidar2cam_rt.T) 127 | 128 | input_info[_cam]={ 129 | 'lidar2img': lidar2img_rt, 130 | 'lidar2cam': lidar2cam_rt, 131 | 'cam_intrinsic': viewpad} 132 | 133 | for i in range(num_obj): 134 | pts_filename = f'{image_idx}_{names[i]}_{i}.bin' 135 | img_filename = f'{image_idx}_{names[i]}_{i}.png' 136 | abs_filepath = osp.join(database_pts_path, pts_filename) 137 | abs_img_filepath = osp.join(database_img_path, img_filename) 138 | rel_filepath = osp.join(f'{info_prefix}_gt_database', 'pts_dir', pts_filename) 139 | rel_img_filepath = osp.join(f'{info_prefix}_gt_database', 'img_dir', img_filename) 140 | 141 | # save point clouds and image patches for each object 142 | gt_points = points[point_indices[:, i]] 143 | gt_points[:, :3] -= gt_boxes_3d[i, :3] 144 | 145 | # with open(abs_filepath, 'w') as f: 146 | # gt_points.tofile(f) 147 | 148 | img_crop, crop_key, crop_depth = find_img_crop(annos['gt_bboxes_3d'][i].corners.numpy(), input_img, input_info, points[point_indices[:, i]]) 149 | if img_crop is not None: 150 | mmcv.imwrite(img_crop, abs_img_filepath) 151 | 152 | if (used_classes is None) or names[i] in used_classes: 153 | db_info = { 154 | 'name': names[i], 155 | 'path': rel_filepath, 156 | 'image_idx': image_idx, 157 | 'image_path': rel_img_filepath if img_crop is not None else '', 158 | 'image_crop_key': crop_key if img_crop is not None else '', 159 | 'image_crop_depth': crop_depth, 160 | 'gt_idx': i, 161 | 'box3d_lidar': gt_boxes_3d[i], 162 | 'num_points_in_gt': gt_points.shape[0], 163 | 'difficulty': difficulty[i], 164 | } 165 | local_group_id = group_ids[i] 166 | # if local_group_id >= 0: 167 | if local_group_id not in group_dict: 168 | group_dict[local_group_id] = group_counter 169 | group_counter += 1 170 | db_info['group_id'] = group_dict[local_group_id] 171 | if 'score' in annos: 172 | db_info['score'] = annos['score'][i] 173 | if names[i] in all_db_infos: 174 | all_db_infos[names[i]].append(db_info) 175 | else: 176 | all_db_infos[names[i]] = [db_info] 177 | 178 | for k, v in all_db_infos.items(): 179 | print(f'load {len(v)} {k} database infos') 180 | 181 | with open(db_info_save_path, 'wb') as f: 182 | pickle.dump(all_db_infos, f) 183 | 184 | 185 | def find_img_crop(gt_boxes_3d, input_img, input_info, points): 186 | coord_3d = np.concatenate([gt_boxes_3d, np.ones_like(gt_boxes_3d[..., :1])], -1) 187 | coord_3d = coord_3d.squeeze(0) 188 | max_crop, crop_key = None, None 189 | crop_area, crop_depth = 0, 0 190 | 191 | for _key in input_img: 192 | lidar2img = np.array(input_info[_key]['lidar2img']) 193 | 
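        # Project the 8 homogeneous box corners (rows of coord_3d, shape (8, 4)) into this
        # camera view: with the row-vector convention built above, each projected row is
        # (u*d, v*d, d, 1), so dividing the first two columns by the depth d below yields
        # pixel coordinates, and column 2 keeps the per-corner depth used for crop_depth.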
coord_img = coord_3d @ lidar2img.T 194 | coord_img[:,:2] /= coord_img[:,2,None] 195 | image_shape = input_img[_key].shape 196 | if (coord_img[2] <= 0).any(): 197 | continue 198 | 199 | avg_depth = coord_img[:,2].mean() 200 | minxy = np.min(coord_img[:,:2], axis=-2) 201 | maxxy = np.max(coord_img[:,:2], axis=-2) 202 | bbox = np.concatenate([minxy, maxxy], axis=-1) 203 | bbox[0::2] = np.clip(bbox[0::2], a_min=0, a_max=image_shape[1]-1) 204 | bbox[1::2] = np.clip(bbox[1::2], a_min=0, a_max=image_shape[0]-1) 205 | bbox = bbox.astype(int) 206 | if ((bbox[2:]-bbox[:2]) <= 10).any(): 207 | continue 208 | 209 | img_crop = input_img[_key][bbox[1]:bbox[3],bbox[0]:bbox[2]] 210 | if img_crop.shape[0] * img_crop.shape[1] > crop_area: 211 | max_crop = img_crop 212 | crop_key = _key 213 | crop_depth = avg_depth 214 | 215 | return max_crop, crop_key, crop_depth 216 | 217 | 218 | if __name__ == '__main__': 219 | parser = argparse.ArgumentParser(description='Data converter arg parser') 220 | parser.add_argument( 221 | '--dataset', 222 | type=str, 223 | default='NuScenesSweepDataset', 224 | required=False, 225 | help='specify dataset name') 226 | parser.add_argument( 227 | '--root-path', 228 | type=str, 229 | default='./data/nuscenes', 230 | help='specify the root path of dataset') 231 | parser.add_argument( 232 | '--version', 233 | type=str, 234 | default='v1.0', 235 | required=False, 236 | help='specify the dataset version, no need for kitti') 237 | parser.add_argument( 238 | '--out-dir', 239 | type=str, 240 | default='./data/nuscenes', 241 | required=False, 242 | help='output data dir') 243 | parser.add_argument( 244 | '--info-path', 245 | type=str, 246 | default='./data/nuscenes/nuscenes_img_pro_infos_train.pkl', 247 | required=False, 248 | help='name of info pkl') 249 | parser.add_argument('--extra-tag', type=str, default='nuscenes_unified') 250 | args = parser.parse_args() 251 | 252 | plugin_dir = 'projects/mmdet3d_plugin/' 253 | _module_dir = os.path.dirname(plugin_dir) 254 | _module_dir = _module_dir.split('/') 255 | _module_path = _module_dir[0] 256 | 257 | for m in _module_dir[1:]: 258 | _module_path = _module_path + '.' + m 259 | print(_module_path) 260 | plg_lib = importlib.import_module(_module_path) 261 | 262 | create_groundtruth_database(args.dataset, args.root_path, args.extra_tag, 263 | args.info_path) -------------------------------------------------------------------------------- /extra_tools/dist_test.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | CONFIG=$1 4 | CHECKPOINT=$2 5 | GPUS=$3 6 | PORT=${PORT:-29503} 7 | 8 | PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ 9 | python3 -m torch.distributed.launch --nproc_per_node=$GPUS --master_port=$PORT \ 10 | $(dirname "$0")/test.py $CONFIG $CHECKPOINT --launcher pytorch ${@:4} 11 | -------------------------------------------------------------------------------- /extra_tools/dist_train.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | CONFIG=$1 4 | GPUS=$2 5 | PORT=${PORT:-29501} 6 | 7 | PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ 8 | python3 -m torch.distributed.launch --nproc_per_node=$GPUS --master_port=$PORT \ 9 | $(dirname "$0")/train.py $CONFIG --launcher pytorch ${@:3} 10 | -------------------------------------------------------------------------------- /extra_tools/eval_metric.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. 
All rights reserved. 2 | import argparse 3 | import os 4 | import mmcv 5 | from mmcv import Config, DictAction 6 | 7 | from mmdet3d.datasets import build_dataset 8 | from mmdet.utils import update_data_root 9 | 10 | 11 | def parse_args(): 12 | parser = argparse.ArgumentParser(description='Evaluate metric of the ' 13 | 'results saved in pkl format') 14 | parser.add_argument('config', help='Config of the model') 15 | parser.add_argument('pkl_results', help='Results in pickle format') 16 | parser.add_argument( 17 | '--format-only', 18 | action='store_true', 19 | help='Format the output results without perform evaluation. It is' 20 | 'useful when you want to format the result to a specific format and ' 21 | 'submit it to the test server') 22 | parser.add_argument( 23 | '--eval', 24 | type=str, 25 | nargs='+', 26 | help='Evaluation metrics, which depends on the dataset, e.g., "bbox",' 27 | ' "segm", "proposal" for COCO, and "mAP", "recall" for PASCAL VOC') 28 | parser.add_argument( 29 | '--cfg-options', 30 | nargs='+', 31 | action=DictAction, 32 | help='override some settings in the used config, the key-value pair ' 33 | 'in xxx=yyy format will be merged into config file. If the value to ' 34 | 'be overwritten is a list, it should be like key="[a,b]" or key=a,b ' 35 | 'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" ' 36 | 'Note that the quotation marks are necessary and that no white space ' 37 | 'is allowed.') 38 | parser.add_argument( 39 | '--eval-options', 40 | nargs='+', 41 | action=DictAction, 42 | help='custom options for evaluation, the key-value pair in xxx=yyy ' 43 | 'format will be kwargs for dataset.evaluate() function') 44 | args = parser.parse_args() 45 | return args 46 | 47 | 48 | def main(): 49 | args = parse_args() 50 | 51 | cfg = Config.fromfile(args.config) 52 | 53 | # update data root according to MMDET_DATASETS 54 | update_data_root(cfg) 55 | 56 | import importlib 57 | if hasattr(cfg, 'plugin_dir'): 58 | plugin_dir = cfg.plugin_dir 59 | _module_dir = os.path.dirname(plugin_dir) 60 | _module_dir = _module_dir.split('/') 61 | _module_path = _module_dir[0] 62 | 63 | for m in _module_dir[1:]: 64 | _module_path = _module_path + '.' 
+ m 65 | print(_module_path) 66 | plg_lib = importlib.import_module(_module_path) 67 | 68 | assert args.eval or args.format_only, ( 69 | 'Please specify at least one operation (eval/format the results) with ' 70 | 'the argument "--eval", "--format-only"') 71 | if args.eval and args.format_only: 72 | raise ValueError('--eval and --format_only cannot be both specified') 73 | 74 | if args.cfg_options is not None: 75 | cfg.merge_from_dict(args.cfg_options) 76 | cfg.data.test.test_mode = True 77 | 78 | dataset = build_dataset(cfg.data.test) 79 | outputs = mmcv.load(args.pkl_results) 80 | 81 | kwargs = {} if args.eval_options is None else args.eval_options 82 | if args.format_only: 83 | dataset.format_results(outputs, **kwargs) 84 | if args.eval: 85 | eval_kwargs = cfg.get('evaluation', {}).copy() 86 | # hard-code way to remove EvalHook args 87 | for key in [ 88 | 'interval', 'tmpdir', 'start', 'gpu_collect', 'save_best', 89 | 'rule' 90 | ]: 91 | eval_kwargs.pop(key, None) 92 | eval_kwargs.update(dict(metric=args.eval, **kwargs)) 93 | print(dataset.evaluate(outputs, **eval_kwargs)) 94 | 95 | 96 | if __name__ == '__main__': 97 | main() 98 | -------------------------------------------------------------------------------- /extra_tools/get_flops.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | import argparse 3 | import os 4 | import torch 5 | from mmcv import Config, DictAction 6 | 7 | from mmdet3d.models import build_model 8 | 9 | try: 10 | from mmcv.cnn import get_model_complexity_info 11 | except ImportError: 12 | raise ImportError('Please upgrade mmcv to >0.6.2') 13 | 14 | 15 | def parse_args(): 16 | parser = argparse.ArgumentParser(description='Train a detector') 17 | parser.add_argument('config', help='train config file path') 18 | parser.add_argument( 19 | '--shape', 20 | type=int, 21 | nargs='+', 22 | default=[40000, 5], 23 | help='input point cloud size') 24 | parser.add_argument( 25 | '--modality', 26 | type=str, 27 | default='point', 28 | choices=['point', 'image', 'multi'], 29 | help='input data modality') 30 | parser.add_argument( 31 | '--cfg-options', 32 | nargs='+', 33 | action=DictAction, 34 | help='override some settings in the used config, the key-value pair ' 35 | 'in xxx=yyy format will be merged into config file. If the value to ' 36 | 'be overwritten is a list, it should be like key="[a,b]" or key=a,b ' 37 | 'It also allows nested list/tuple values, e.g. 
key="[(a,b),(c,d)]" ' 38 | 'Note that the quotation marks are necessary and that no white space ' 39 | 'is allowed.') 40 | args = parser.parse_args() 41 | return args 42 | 43 | 44 | def main(): 45 | 46 | args = parse_args() 47 | 48 | if args.modality == 'point': 49 | assert len(args.shape) == 2, 'invalid input shape' 50 | input_shape = tuple(args.shape) 51 | elif args.modality == 'image': 52 | if len(args.shape) == 1: 53 | input_shape = (3, args.shape[0], args.shape[0]) 54 | elif len(args.shape) == 2: 55 | input_shape = (3, ) + tuple(args.shape) 56 | else: 57 | raise ValueError('invalid input shape') 58 | elif args.modality == 'multi': 59 | raise NotImplementedError( 60 | 'FLOPs counter is currently not supported for models with ' 61 | 'multi-modality input') 62 | 63 | cfg = Config.fromfile(args.config) 64 | if args.cfg_options is not None: 65 | cfg.merge_from_dict(args.cfg_options) 66 | 67 | if hasattr(cfg, 'plugin'): 68 | if cfg.plugin: 69 | import importlib 70 | if hasattr(cfg, 'plugin_dir'): 71 | plugin_dir = cfg.plugin_dir 72 | _module_dir = os.path.dirname(plugin_dir) 73 | _module_dir = _module_dir.split('/') 74 | _module_path = _module_dir[0] 75 | 76 | for m in _module_dir[1:]: 77 | _module_path = _module_path + '.' + m 78 | print(_module_path) 79 | plg_lib = importlib.import_module(_module_path) 80 | else: 81 | # import dir is the dirpath for the config file 82 | _module_dir = os.path.dirname(args.config) 83 | _module_dir = _module_dir.split('/') 84 | _module_path = _module_dir[0] 85 | for m in _module_dir[1:]: 86 | _module_path = _module_path + '.' + m 87 | print(_module_path) 88 | plg_lib = importlib.import_module(_module_path) 89 | 90 | # set cudnn_benchmark 91 | if cfg.get('cudnn_benchmark', False): 92 | torch.backends.cudnn.benchmark = True 93 | 94 | # work_dir is determined in this priority: CLI > segment in file > filename 95 | #if args.work_dir is not None: 96 | # update configs according to CLI args if args.work_dir is not None 97 | 98 | model = build_model( 99 | cfg.model, 100 | train_cfg=cfg.get('train_cfg'), 101 | test_cfg=cfg.get('test_cfg')) 102 | if torch.cuda.is_available(): 103 | model.cuda() 104 | model.eval() 105 | 106 | if hasattr(model, 'forward_dummy'): 107 | model.forward = model.forward_dummy 108 | else: 109 | raise NotImplementedError( 110 | 'FLOPs counter is currently not supported for {}'.format( 111 | model.__class__.__name__)) 112 | 113 | flops, params = get_model_complexity_info(model, input_shape) 114 | split_line = '=' * 30 115 | print(f'{split_line}\nInput shape: {input_shape}\n' 116 | f'Flops: {flops}\nParams: {params}\n{split_line}') 117 | print('!!!Please be cautious if you use the results in papers. ' 118 | 'You may need to check if all ops are supported and verify that the ' 119 | 'flops computation is correct.') 120 | 121 | 122 | if __name__ == '__main__': 123 | main() 124 | -------------------------------------------------------------------------------- /extra_tools/test.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
2 | import argparse 3 | import mmcv 4 | import os 5 | import torch 6 | import warnings 7 | from mmcv import Config, DictAction 8 | from mmcv.cnn import fuse_conv_bn 9 | from mmcv.parallel import MMDataParallel, MMDistributedDataParallel 10 | from mmcv.runner import (get_dist_info, init_dist, load_checkpoint, 11 | wrap_fp16_model) 12 | 13 | from mmdet3d.apis import single_gpu_test 14 | from mmdet3d.datasets import build_dataloader, build_dataset 15 | from mmdet3d.models import build_model 16 | from mmdet.apis import multi_gpu_test, set_random_seed 17 | from mmdet.datasets import replace_ImageToTensor 18 | 19 | def parse_args(): 20 | parser = argparse.ArgumentParser( 21 | description='MMDet test (and eval) a model') 22 | parser.add_argument('config', help='test config file path') 23 | parser.add_argument('checkpoint', help='checkpoint file') 24 | parser.add_argument('--out', help='output result file in pickle format') 25 | parser.add_argument( 26 | '--fuse-conv-bn', 27 | action='store_true', 28 | help='Whether to fuse conv and bn, this will slightly increase' 29 | 'the inference speed') 30 | parser.add_argument( 31 | '--format-only', 32 | action='store_true', 33 | help='Format the output results without perform evaluation. It is' 34 | 'useful when you want to format the result to a specific format and ' 35 | 'submit it to the test server') 36 | parser.add_argument( 37 | '--eval', 38 | type=str, 39 | nargs='+', 40 | help='evaluation metrics, which depends on the dataset, e.g., "bbox",' 41 | ' "segm", "proposal" for COCO, and "mAP", "recall" for PASCAL VOC') 42 | parser.add_argument('--show', action='store_true', help='show results') 43 | parser.add_argument( 44 | '--show-dir', help='directory where results will be saved') 45 | parser.add_argument( 46 | '--gpu-collect', 47 | action='store_true', 48 | help='whether to use gpu to collect results.') 49 | parser.add_argument( 50 | '--tmpdir', 51 | help='tmp directory used for collecting results from multiple ' 52 | 'workers, available when gpu-collect is not specified') 53 | parser.add_argument('--seed', type=int, default=0, help='random seed') 54 | parser.add_argument( 55 | '--deterministic', 56 | action='store_true', 57 | help='whether to set deterministic options for CUDNN backend.') 58 | parser.add_argument( 59 | '--cfg-options', 60 | nargs='+', 61 | action=DictAction, 62 | help='override some settings in the used config, the key-value pair ' 63 | 'in xxx=yyy format will be merged into config file. If the value to ' 64 | 'be overwritten is a list, it should be like key="[a,b]" or key=a,b ' 65 | 'It also allows nested list/tuple values, e.g. 
key="[(a,b),(c,d)]" ' 66 | 'Note that the quotation marks are necessary and that no white space ' 67 | 'is allowed.') 68 | parser.add_argument( 69 | '--options', 70 | nargs='+', 71 | action=DictAction, 72 | help='custom options for evaluation, the key-value pair in xxx=yyy ' 73 | 'format will be kwargs for dataset.evaluate() function (deprecate), ' 74 | 'change to --eval-options instead.') 75 | parser.add_argument( 76 | '--eval-options', 77 | nargs='+', 78 | action=DictAction, 79 | help='custom options for evaluation, the key-value pair in xxx=yyy ' 80 | 'format will be kwargs for dataset.evaluate() function') 81 | parser.add_argument( 82 | '--launcher', 83 | choices=['none', 'pytorch', 'slurm', 'mpi'], 84 | default='none', 85 | help='job launcher') 86 | parser.add_argument('--local_rank', type=int, default=0) 87 | args = parser.parse_args() 88 | if 'LOCAL_RANK' not in os.environ: 89 | os.environ['LOCAL_RANK'] = str(args.local_rank) 90 | 91 | if args.options and args.eval_options: 92 | raise ValueError( 93 | '--options and --eval-options cannot be both specified, ' 94 | '--options is deprecated in favor of --eval-options') 95 | if args.options: 96 | warnings.warn('--options is deprecated in favor of --eval-options') 97 | args.eval_options = args.options 98 | return args 99 | 100 | 101 | def main(): 102 | args = parse_args() 103 | 104 | assert args.out or args.eval or args.format_only or args.show \ 105 | or args.show_dir, \ 106 | ('Please specify at least one operation (save/eval/format/show the ' 107 | 'results / save the results) with the argument "--out", "--eval"' 108 | ', "--format-only", "--show" or "--show-dir"') 109 | 110 | if args.eval and args.format_only: 111 | raise ValueError('--eval and --format_only cannot be both specified') 112 | 113 | if args.out is not None and not args.out.endswith(('.pkl', '.pickle')): 114 | raise ValueError('The output file must be a pkl file.') 115 | 116 | cfg = Config.fromfile(args.config) 117 | if args.cfg_options is not None: 118 | cfg.merge_from_dict(args.cfg_options) 119 | # import modules from string list. 120 | if cfg.get('custom_imports', None): 121 | from mmcv.utils import import_modules_from_strings 122 | import_modules_from_strings(**cfg['custom_imports']) 123 | 124 | # import modules from plguin/xx, registry will be updated 125 | if hasattr(cfg, 'plugin'): 126 | if cfg.plugin: 127 | import importlib 128 | if hasattr(cfg, 'plugin_dir'): 129 | plugin_dir = cfg.plugin_dir 130 | _module_dir = os.path.dirname(plugin_dir) 131 | _module_dir = _module_dir.split('/') 132 | _module_path = _module_dir[0] 133 | 134 | for m in _module_dir[1:]: 135 | _module_path = _module_path + '.' + m 136 | print(_module_path) 137 | plg_lib = importlib.import_module(_module_path) 138 | else: 139 | # import dir is the dirpath for the config file 140 | _module_dir = os.path.dirname(args.config) 141 | _module_dir = _module_dir.split('/') 142 | _module_path = _module_dir[0] 143 | for m in _module_dir[1:]: 144 | _module_path = _module_path + '.' 
+ m 145 | print(_module_path) 146 | plg_lib = importlib.import_module(_module_path) 147 | 148 | # set cudnn_benchmark 149 | if cfg.get('cudnn_benchmark', False): 150 | torch.backends.cudnn.benchmark = True 151 | 152 | cfg.model.pretrained = None 153 | # in case the test dataset is concatenated 154 | samples_per_gpu = 1 155 | if isinstance(cfg.data.test, dict): 156 | cfg.data.test.test_mode = True 157 | samples_per_gpu = cfg.data.test.pop('samples_per_gpu', 1) 158 | if samples_per_gpu > 1: 159 | # Replace 'ImageToTensor' to 'DefaultFormatBundle' 160 | cfg.data.test.pipeline = replace_ImageToTensor( 161 | cfg.data.test.pipeline) 162 | elif isinstance(cfg.data.test, list): 163 | for ds_cfg in cfg.data.test: 164 | ds_cfg.test_mode = True 165 | samples_per_gpu = max( 166 | [ds_cfg.pop('samples_per_gpu', 1) for ds_cfg in cfg.data.test]) 167 | if samples_per_gpu > 1: 168 | for ds_cfg in cfg.data.test: 169 | ds_cfg.pipeline = replace_ImageToTensor(ds_cfg.pipeline) 170 | 171 | # init distributed env first, since logger depends on the dist info. 172 | if args.launcher == 'none': 173 | distributed = False 174 | else: 175 | distributed = True 176 | init_dist(args.launcher, **cfg.dist_params) 177 | 178 | # set random seeds 179 | if args.seed is not None: 180 | set_random_seed(args.seed, deterministic=args.deterministic) 181 | 182 | # build the dataloader 183 | dataset = build_dataset(cfg.data.test) 184 | data_loader = build_dataloader( 185 | dataset, 186 | samples_per_gpu=samples_per_gpu, 187 | workers_per_gpu=cfg.data.workers_per_gpu, 188 | dist=distributed, 189 | shuffle=False) 190 | 191 | # build the model and load checkpoint 192 | cfg.model.train_cfg = None 193 | model = build_model(cfg.model, test_cfg=cfg.get('test_cfg')) 194 | fp16_cfg = cfg.get('fp16', None) 195 | if fp16_cfg is not None: 196 | wrap_fp16_model(model) 197 | checkpoint = load_checkpoint(model, args.checkpoint, map_location='cpu') 198 | if args.fuse_conv_bn: 199 | model = fuse_conv_bn(model) 200 | # old versions did not save class info in checkpoints, this walkaround is 201 | # for backward compatibility 202 | if 'CLASSES' in checkpoint.get('meta', {}): 203 | model.CLASSES = checkpoint['meta']['CLASSES'] 204 | else: 205 | model.CLASSES = dataset.CLASSES 206 | # palette for visualization in segmentation tasks 207 | if 'PALETTE' in checkpoint.get('meta', {}): 208 | model.PALETTE = checkpoint['meta']['PALETTE'] 209 | elif hasattr(dataset, 'PALETTE'): 210 | # segmentation dataset has `PALETTE` attribute 211 | model.PALETTE = dataset.PALETTE 212 | 213 | if not distributed: 214 | model = MMDataParallel(model, device_ids=[0]) 215 | outputs = single_gpu_test(model, data_loader, args.show, args.show_dir) 216 | else: 217 | model = MMDistributedDataParallel( 218 | model.cuda(), 219 | device_ids=[torch.cuda.current_device()], 220 | broadcast_buffers=False) 221 | outputs = multi_gpu_test(model, data_loader, args.tmpdir, 222 | args.gpu_collect) 223 | 224 | rank, _ = get_dist_info() 225 | if rank == 0: 226 | if args.out: 227 | print(f'\nwriting results to {args.out}') 228 | mmcv.dump(outputs, args.out) 229 | kwargs = {} if args.eval_options is None else args.eval_options 230 | if args.format_only: 231 | dataset.format_results(outputs, **kwargs) 232 | if args.eval: 233 | eval_kwargs = cfg.get('evaluation', {}).copy() 234 | # hard-code way to remove EvalHook args 235 | for key in [ 236 | 'interval', 'tmpdir', 'start', 'gpu_collect', 'save_best', 237 | 'rule' 238 | ]: 239 | eval_kwargs.pop(key, None) 240 | eval_kwargs.update(dict(metric=args.eval, 
**kwargs)) 241 | print(dataset.evaluate(outputs, **eval_kwargs)) 242 | 243 | 244 | if __name__ == '__main__': 245 | warnings.filterwarnings("ignore") 246 | torch.multiprocessing.set_start_method('fork') 247 | main() 248 | -------------------------------------------------------------------------------- /extra_tools/train.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from __future__ import division 3 | 4 | import argparse 5 | import copy 6 | import mmcv 7 | import os 8 | import time 9 | import torch 10 | import warnings 11 | from mmcv import Config, DictAction 12 | from mmcv.runner import get_dist_info, init_dist 13 | from os import path as osp 14 | 15 | from mmdet import __version__ as mmdet_version 16 | from mmdet3d import __version__ as mmdet3d_version 17 | from mmdet3d.apis import train_model 18 | from mmdet3d.datasets import build_dataset 19 | from mmdet3d.models import build_model 20 | from mmdet3d.utils import collect_env, get_root_logger 21 | from mmdet.apis import set_random_seed 22 | from mmseg import __version__ as mmseg_version 23 | 24 | 25 | def parse_args(): 26 | parser = argparse.ArgumentParser(description='Train a detector') 27 | parser.add_argument('config', help='train config file path') 28 | parser.add_argument('--work-dir', help='the dir to save logs and models') 29 | parser.add_argument( 30 | '--resume-from', help='the checkpoint file to resume from') 31 | parser.add_argument( 32 | '--no-validate', 33 | action='store_true', 34 | help='whether not to evaluate the checkpoint during training') 35 | group_gpus = parser.add_mutually_exclusive_group() 36 | group_gpus.add_argument( 37 | '--gpus', 38 | type=int, 39 | help='number of gpus to use ' 40 | '(only applicable to non-distributed training)') 41 | group_gpus.add_argument( 42 | '--gpu-ids', 43 | type=int, 44 | nargs='+', 45 | help='ids of gpus to use ' 46 | '(only applicable to non-distributed training)') 47 | parser.add_argument('--seed', type=int, default=0, help='random seed') 48 | parser.add_argument( 49 | '--deterministic', 50 | action='store_true', 51 | help='whether to set deterministic options for CUDNN backend.') 52 | parser.add_argument( 53 | '--options', 54 | nargs='+', 55 | action=DictAction, 56 | help='override some settings in the used config, the key-value pair ' 57 | 'in xxx=yyy format will be merged into config file (deprecate), ' 58 | 'change to --cfg-options instead.') 59 | parser.add_argument( 60 | '--cfg-options', 61 | nargs='+', 62 | action=DictAction, 63 | help='override some settings in the used config, the key-value pair ' 64 | 'in xxx=yyy format will be merged into config file. If the value to ' 65 | 'be overwritten is a list, it should be like key="[a,b]" or key=a,b ' 66 | 'It also allows nested list/tuple values, e.g. 
key="[(a,b),(c,d)]" ' 67 | 'Note that the quotation marks are necessary and that no white space ' 68 | 'is allowed.') 69 | parser.add_argument( 70 | '--launcher', 71 | choices=['none', 'pytorch', 'slurm', 'mpi'], 72 | default='none', 73 | help='job launcher') 74 | parser.add_argument('--local_rank', type=int, default=0) 75 | parser.add_argument( 76 | '--autoscale-lr', 77 | action='store_true', 78 | help='automatically scale lr with the number of gpus') 79 | args = parser.parse_args() 80 | if 'LOCAL_RANK' not in os.environ: 81 | os.environ['LOCAL_RANK'] = str(args.local_rank) 82 | 83 | if args.options and args.cfg_options: 84 | raise ValueError( 85 | '--options and --cfg-options cannot be both specified, ' 86 | '--options is deprecated in favor of --cfg-options') 87 | if args.options: 88 | warnings.warn('--options is deprecated in favor of --cfg-options') 89 | args.cfg_options = args.options 90 | 91 | return args 92 | 93 | 94 | def main(): 95 | args = parse_args() 96 | 97 | cfg = Config.fromfile(args.config) 98 | if args.cfg_options is not None: 99 | cfg.merge_from_dict(args.cfg_options) 100 | # import modules from string list. 101 | if cfg.get('custom_imports', None): 102 | from mmcv.utils import import_modules_from_strings 103 | import_modules_from_strings(**cfg['custom_imports']) 104 | 105 | # import modules from plguin/xx, registry will be updated 106 | if hasattr(cfg, 'plugin'): 107 | if cfg.plugin: 108 | import importlib 109 | if hasattr(cfg, 'plugin_dir'): 110 | plugin_dir = cfg.plugin_dir 111 | _module_dir = os.path.dirname(plugin_dir) 112 | _module_dir = _module_dir.split('/') 113 | _module_path = _module_dir[0] 114 | 115 | for m in _module_dir[1:]: 116 | _module_path = _module_path + '.' + m 117 | print(_module_path) 118 | plg_lib = importlib.import_module(_module_path) 119 | else: 120 | # import dir is the dirpath for the config file 121 | _module_dir = os.path.dirname(args.config) 122 | _module_dir = _module_dir.split('/') 123 | _module_path = _module_dir[0] 124 | for m in _module_dir[1:]: 125 | _module_path = _module_path + '.' + m 126 | print(_module_path) 127 | plg_lib = importlib.import_module(_module_path) 128 | 129 | # set cudnn_benchmark 130 | if cfg.get('cudnn_benchmark', False): 131 | torch.backends.cudnn.benchmark = True 132 | 133 | # work_dir is determined in this priority: CLI > segment in file > filename 134 | if args.work_dir is not None: 135 | # update configs according to CLI args if args.work_dir is not None 136 | cfg.work_dir = args.work_dir 137 | elif cfg.get('work_dir', None) is None: 138 | # use config filename as default work_dir if cfg.work_dir is None 139 | cfg.work_dir = osp.join('./work_dirs', 140 | osp.splitext(osp.basename(args.config))[0]) 141 | if args.resume_from is not None: 142 | cfg.resume_from = args.resume_from 143 | if args.gpu_ids is not None: 144 | cfg.gpu_ids = args.gpu_ids 145 | else: 146 | cfg.gpu_ids = range(1) if args.gpus is None else range(args.gpus) 147 | 148 | if args.autoscale_lr: 149 | # apply the linear scaling rule (https://arxiv.org/abs/1706.02677) 150 | cfg.optimizer['lr'] = cfg.optimizer['lr'] * len(cfg.gpu_ids) / 8 151 | 152 | # init distributed env first, since logger depends on the dist info. 
153 | if args.launcher == 'none': 154 | distributed = False 155 | else: 156 | distributed = True 157 | init_dist(args.launcher, **cfg.dist_params) 158 | # re-set gpu_ids with distributed training mode 159 | _, world_size = get_dist_info() 160 | cfg.gpu_ids = range(world_size) 161 | 162 | # create work_dir 163 | mmcv.mkdir_or_exist(osp.abspath(cfg.work_dir)) 164 | # dump config 165 | cfg.dump(osp.join(cfg.work_dir, osp.basename(args.config))) 166 | # init the logger before other steps 167 | timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime()) 168 | log_file = osp.join(cfg.work_dir, f'{timestamp}.log') 169 | # specify logger name, if we still use 'mmdet', the output info will be 170 | # filtered and won't be saved in the log_file 171 | # TODO: ugly workaround to judge whether we are training det or seg model 172 | if cfg.model.type in ['EncoderDecoder3D']: 173 | logger_name = 'mmseg' 174 | else: 175 | logger_name = 'mmdet' 176 | logger = get_root_logger( 177 | log_file=log_file, log_level=cfg.log_level, name=logger_name) 178 | 179 | # init the meta dict to record some important information such as 180 | # environment info and seed, which will be logged 181 | meta = dict() 182 | # log env info 183 | env_info_dict = collect_env() 184 | env_info = '\n'.join([(f'{k}: {v}') for k, v in env_info_dict.items()]) 185 | dash_line = '-' * 60 + '\n' 186 | logger.info('Environment info:\n' + dash_line + env_info + '\n' + 187 | dash_line) 188 | meta['env_info'] = env_info 189 | meta['config'] = cfg.pretty_text 190 | 191 | # log some basic info 192 | logger.info(f'Distributed training: {distributed}') 193 | logger.info(f'Config:\n{cfg.pretty_text}') 194 | 195 | # set random seeds 196 | if args.seed is not None: 197 | logger.info(f'Set random seed to {args.seed}, ' 198 | f'deterministic: {args.deterministic}') 199 | set_random_seed(args.seed, deterministic=args.deterministic) 200 | cfg.seed = args.seed 201 | meta['seed'] = args.seed 202 | meta['exp_name'] = osp.basename(args.config) 203 | 204 | model = build_model( 205 | cfg.model, 206 | train_cfg=cfg.get('train_cfg'), 207 | test_cfg=cfg.get('test_cfg')) 208 | model.init_weights() 209 | 210 | logger.info(f'Model:\n{model}') 211 | datasets = [build_dataset(cfg.data.train)] 212 | if len(cfg.workflow) == 2: 213 | val_dataset = copy.deepcopy(cfg.data.val) 214 | # in case we use a dataset wrapper 215 | if 'dataset' in cfg.data.train: 216 | val_dataset.pipeline = cfg.data.train.dataset.pipeline 217 | else: 218 | val_dataset.pipeline = cfg.data.train.pipeline 219 | # set test_mode=False here in deep copied config 220 | # which do not affect AP/AR calculation later 221 | # refer to https://mmdetection3d.readthedocs.io/en/latest/tutorials/customize_runtime.html#customize-workflow # noqa 222 | val_dataset.test_mode = False 223 | datasets.append(build_dataset(val_dataset)) 224 | if cfg.checkpoint_config is not None: 225 | # save mmdet version, config file content and class names in 226 | # checkpoints as meta data 227 | cfg.checkpoint_config.meta = dict( 228 | mmdet_version=mmdet_version, 229 | mmseg_version=mmseg_version, 230 | mmdet3d_version=mmdet3d_version, 231 | config=cfg.pretty_text, 232 | CLASSES=datasets[0].CLASSES, 233 | PALETTE=datasets[0].PALETTE # for segmentors 234 | if hasattr(datasets[0], 'PALETTE') else None) 235 | # add an attribute for visualization convenience 236 | model.CLASSES = datasets[0].CLASSES 237 | 238 | # for v in model.parameters(): 239 | # v.requires_grad = False 240 | # for v in model.pts_backbone.parameters(): 241 | # 
v.requires_grad = False 242 | # for v in model.pts_neck.parameters(): 243 | # v.requires_grad = False 244 | # for v in model.pts_middle_encoder.parameters(): 245 | # v.requires_grad = False 246 | 247 | train_model( 248 | model, 249 | datasets, 250 | cfg, 251 | distributed=distributed, 252 | validate=(not args.no_validate), 253 | timestamp=timestamp, 254 | meta=meta) 255 | 256 | 257 | if __name__ == '__main__': 258 | torch.multiprocessing.set_start_method('fork') 259 | main() 260 | -------------------------------------------------------------------------------- /model-index.yml: -------------------------------------------------------------------------------- 1 | Import: 2 | - configs/3dssd/metafile.yml 3 | - configs/centerpoint/metafile.yml 4 | - configs/dynamic_voxelization/metafile.yml 5 | - configs/fcaf3d/metafile.yml 6 | - configs/fcos3d/metafile.yml 7 | - configs/free_anchor/metafile.yml 8 | - configs/groupfree3d/metafile.yml 9 | - configs/h3dnet/metafile.yml 10 | - configs/imvotenet/metafile.yml 11 | - configs/imvoxelnet/metafile.yml 12 | - configs/mvxnet/metafile.yml 13 | - configs/nuimages/metafile.yml 14 | - configs/parta2/metafile.yml 15 | - configs/pgd/metafile.yml 16 | - configs/pointnet2/metafile.yml 17 | - configs/pointpillars/metafile.yml 18 | - configs/regnet/metafile.yml 19 | - configs/second/metafile.yml 20 | - configs/smoke/metafile.yml 21 | - configs/ssn/metafile.yml 22 | - configs/votenet/metafile.yml 23 | -------------------------------------------------------------------------------- /projects/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhenyuw16/Uni3DETR/15cb08a7ddfc2e4f0ae5a5a7b9ec6a7be8175399/projects/__init__.py -------------------------------------------------------------------------------- /projects/configs/ov_uni3detr/ov_uni3detr_sunrgbd_pc.py: -------------------------------------------------------------------------------- 1 | _base_ = [ 2 | '../../../configs/_base_/default_runtime.py' 3 | ] 4 | 5 | plugin=True 6 | plugin_dir='projects/mmdet3d_plugin/' 7 | 8 | # If point cloud range is changed, the models should also change their point 9 | # cloud range accordingly 10 | voxel_size = [0.02, 0.02, 0.02] 11 | grid_size = [128, 320, 320] 12 | point_cloud_range = [-3.2, -0.2, -2., 3.2, 6.2, 0.56] 13 | 14 | fp16_enabled = False 15 | bev_stride = 4 16 | sample_num = 5 17 | 18 | 19 | input_modality = dict( 20 | use_lidar=True, 21 | use_camera=False, 22 | use_radar=False, 23 | use_map=False, 24 | use_external=False) 25 | 26 | model = dict( 27 | type='OV_Uni3DETR', 28 | pts_voxel_layer=dict( 29 | max_num_points=5, voxel_size=voxel_size, max_voxels=(16000, 40000), 30 | point_cloud_range=point_cloud_range), 31 | pts_voxel_encoder=dict(type='HardSimpleVFE', num_features=4), 32 | pts_middle_encoder=dict( 33 | type='SparseEncoderHD', 34 | in_channels=4, 35 | sparse_shape=grid_size, 36 | output_channels=256, 37 | order=('conv', 'norm', 'act'), 38 | encoder_channels=((16, 16, 32), (32, 32, 64), (64, 64, 128), (128, 128)), 39 | encoder_paddings=((0, 0, 1), (0, 0, 1), (0, 0, [0, 1, 1]), (0, 0)), 40 | block_type='basicblock', 41 | fp16_enabled=False), # not enable FP16 here 42 | pts_backbone=dict( 43 | type='SECOND3D', 44 | in_channels=[256, 256, 256], 45 | out_channels=[128, 256, 512], 46 | layer_nums=[5, 5, 5], 47 | layer_strides=[1, 2, 4], 48 | is_cascade=False, 49 | norm_cfg=dict(type='BN3d', eps=1e-3, momentum=0.01), 50 | conv_cfg=dict(type='Conv3d', kernel=(1,3,3), bias=False)), 51 | 
pts_neck=dict( 52 | type='SECOND3DFPN', 53 | in_channels=[128, 256, 512], 54 | out_channels=[256, 256, 256], 55 | upsample_strides=[1, 2, 4], 56 | norm_cfg=dict(type='BN3d', eps=1e-3, momentum=0.01), 57 | upsample_cfg=dict(type='deconv3d', bias=False), 58 | extra_conv=dict(type='Conv3d', num_conv=3, bias=False), 59 | use_conv_for_no_stride=True), 60 | pts_bbox_head=dict( 61 | type='Uni3DETRHeadCLIP', 62 | num_query=300, 63 | zeroshot_path='clip_embed/sunrgbd_clip_a+cname_rn50_manyprompt_46c_coda.npy', 64 | num_classes=46, 65 | in_channels=256, 66 | sync_cls_avg_factor=True, 67 | with_box_refine=True, 68 | as_two_stage=False, 69 | code_size=8, 70 | transformer=dict( 71 | type='Uni3DETRTransformer', 72 | fp16_enabled=fp16_enabled, 73 | decoder=dict( 74 | type='Uni3DETRTransformerDecoder', 75 | num_layers=3, 76 | return_intermediate=True, 77 | transformerlayers=dict( 78 | type='BaseTransformerLayer', 79 | attn_cfgs=[ 80 | dict( 81 | type='MultiheadAttention', 82 | embed_dims=256, 83 | num_heads=8, 84 | dropout=0.1), 85 | dict( 86 | type='UniCrossAtten', 87 | num_points=1, 88 | embed_dims=256, 89 | num_sweeps=1, 90 | fp16_enabled=fp16_enabled), 91 | ], 92 | ffn_cfgs=dict( 93 | type='FFN', 94 | embed_dims=256, 95 | feedforward_channels=512, 96 | num_fcs=2, 97 | ffn_drop=0.1, 98 | act_cfg=dict(type='ReLU', inplace=True), 99 | ), 100 | norm_cfg=dict(type='LN'), 101 | operation_order=('self_attn', 'norm', 'cross_attn', 'norm', 'ffn', 'norm')) 102 | ) 103 | ), 104 | bbox_coder=dict( 105 | type='NMSFreeCoder', 106 | post_center_range=point_cloud_range, 107 | pc_range=point_cloud_range, 108 | max_num=1000, 109 | voxel_size=voxel_size, 110 | alpha=1.0, 111 | num_classes=46), 112 | post_processing=dict( 113 | type='nms', 114 | nms_thr=0.5), 115 | ######## soft nms can generate a little higher result 116 | # post_processing=dict( 117 | # type='soft_nms', 118 | # gaussian_sigma=0.3, 119 | # prune_threshold=1e-2), 120 | positional_encoding=dict( 121 | type='SinePositionalEncoding', 122 | num_feats=128, 123 | normalize=True, 124 | offset=-0.5), 125 | loss_cls=dict(type='SoftFocalLoss', use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=1.5), 126 | loss_bbox=dict(type='L1Loss', loss_weight=0.25), 127 | loss_iou=dict(type='IoU3DLoss', loss_weight=1.2), 128 | code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0] 129 | ), 130 | # model training and testing settings 131 | train_cfg=dict(pts=dict( 132 | grid_size=grid_size, 133 | voxel_size=voxel_size, 134 | point_cloud_range=point_cloud_range, 135 | out_size_factor=bev_stride, 136 | assigner=dict( 137 | type='HungarianAssigner3D', 138 | cls_cost=dict(type='FocalLossCost', weight=2.0), 139 | reg_cost=dict(type='BBox3DL1Cost', weight=0.25), 140 | iou_cost=dict(type='IoU3DCost', weight=1.2), 141 | pc_range=point_cloud_range)))) 142 | 143 | 144 | dataset_type = 'SUNRGBDDataset_OV' 145 | data_root = 'data/sunrgbd_coda/' 146 | class_names = ('chair', 'table', 'pillow', 'sofa_chair', 'desk', 'bed', 'sofa', 'computer', 'box', 147 | 'lamp', 'garbage_bin', 'cabinet', 'shelf', 'drawer', 'sink', 'night_stand', 'kitchen_counter', 148 | 'paper', 'end_table', 'kitchen_cabinet', 'picture', 'book', 'stool', 'coffee_table', 'bookshelf', 149 | 'painting', 'key_board', 'dresser', 'tv', 'whiteboard', 'cpu', 'toilet', 'file_cabinet', 'bench', 150 | 'ottoman', 'plant', 'monitor', 'printer', 'recycle_bin', 'door', 'fridge', 'towel', 'cup', 'mirror', 151 | 'laptop', 'cloth') 152 | 153 | seen_classes = ('chair', 'table', 'pillow', 'sofa_chair', 'desk', 'bed', 'sofa', 'computer', 
'lamp', 'box') 154 | 155 | file_client_args = dict(backend='disk') 156 | 157 | train_pipeline = [ 158 | dict( 159 | type='LoadPointsFromFile', 160 | coord_type='DEPTH', 161 | shift_height=True, 162 | load_dim=6, 163 | use_dim=[0, 1, 2], 164 | file_client_args=file_client_args), 165 | dict(type='LoadAnnotations3D'), 166 | dict( 167 | type='UnifiedRandomFlip3D', 168 | sync_2d=False, 169 | flip_ratio_bev_horizontal=0.5, 170 | ), 171 | dict( 172 | type='UnifiedRotScaleTrans', 173 | rot_range=[-0.523599, 0.523599], 174 | scale_ratio_range=[0.85, 1.15], 175 | shift_height=True), 176 | dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), 177 | # dict(type='PointSample', num_points=20000), 178 | dict(type='PointSample', num_points=200000), 179 | dict(type='DefaultFormatBundle3D', class_names=class_names), 180 | dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) 181 | ] 182 | test_pipeline = [ 183 | dict( 184 | type='LoadPointsFromFile', 185 | coord_type='DEPTH', 186 | shift_height=True, 187 | load_dim=6, 188 | use_dim=[0, 1, 2], 189 | file_client_args=file_client_args), 190 | dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), 191 | # dict(type='PointSample', num_points=50000), 192 | dict(type='PointSample', num_points=200000), 193 | dict( 194 | type='DefaultFormatBundle3D', 195 | class_names=class_names, 196 | with_label=False), 197 | dict(type='Collect3D', keys=['points']) 198 | ] 199 | 200 | data = dict( 201 | samples_per_gpu=8, 202 | workers_per_gpu=4, 203 | train=dict( 204 | type='RepeatDataset', 205 | times=2, #######5 206 | dataset=dict( 207 | type=dataset_type, 208 | data_root=data_root, 209 | ann_file=data_root + 'sunrgbd_infos_train_pls_ens_10c36c.pkl', 210 | pipeline=train_pipeline, 211 | classes=class_names, 212 | seen_classes=seen_classes, 213 | filter_empty_gt=True, 214 | box_type_3d='Depth', 215 | file_client_args=file_client_args)), 216 | val=dict( 217 | type=dataset_type, 218 | data_root=data_root, 219 | ann_file=data_root + 'sunrgbd_infos_val_withimg.pkl', 220 | pipeline=test_pipeline, 221 | classes=class_names, 222 | seen_classes=seen_classes, 223 | test_mode=True, 224 | box_type_3d='Depth', 225 | file_client_args=file_client_args), 226 | test=dict( 227 | type=dataset_type, 228 | data_root=data_root, 229 | ann_file=data_root + 'sunrgbd_infos_val_withimg.pkl', 230 | pipeline=test_pipeline, 231 | classes=class_names, 232 | seen_classes=seen_classes, 233 | test_mode=True, 234 | box_type_3d='Depth', 235 | file_client_args=file_client_args)) 236 | 237 | evaluation = dict(pipeline=test_pipeline, interval=5) 238 | 239 | 240 | # optimizer 241 | # This schedule is mainly used by models on indoor dataset, 242 | # e.g., VoteNet on SUNRGBD and ScanNet 243 | lr = 2e-5 *2/8 * 40 # max learning rate 244 | optimizer = dict(type='AdamW', lr=lr, weight_decay=0.01) 245 | optimizer_config = dict(grad_clip=dict(max_norm=10, norm_type=2)) 246 | 247 | 248 | lr_config = dict(policy='step', warmup=None, step=[32, 38]) 249 | runner = dict(type='EpochBasedRunner', max_epochs=40) 250 | 251 | # fp16 setting 252 | # fp16 = dict(loss_scale=32.) 
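# --- Editor's note (plain arithmetic, not from the original repo) ---
# The factored learning-rate expression above is just a scalar:
#   >>> 2e-5 * 2 / 8 * 40
#   0.0002
# i.e. this config trains with an effective max AdamW lr of 2e-4, decayed in
# steps at epochs 32 and 38.
# ---------------------------------------------------------------------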
253 | find_unused_parameters = True 254 | -------------------------------------------------------------------------------- /projects/configs/ov_uni3detr/ov_uni3detr_sunrgbd_rgb.py: -------------------------------------------------------------------------------- 1 | _base_ = [ 2 | '../../../configs/_base_/default_runtime.py' 3 | ] 4 | 5 | plugin=True 6 | plugin_dir='projects/mmdet3d_plugin/' 7 | 8 | # If point cloud range is changed, the models should also change their point 9 | # cloud range accordingly 10 | voxel_size = [0.02, 0.02, 0.02] 11 | grid_size = [128, 320, 320] 12 | point_cloud_range = [-3.2, -0.2, -2., 3.2, 6.2, 0.56] 13 | 14 | 15 | cam_sweep_num = 1 16 | fp16_enabled = False 17 | bev_stride = 8 18 | sample_num = 15 19 | voxel_shape = [int(((point_cloud_range[3]-point_cloud_range[0])/voxel_size[0])//bev_stride), 20 | int(((point_cloud_range[4]-point_cloud_range[1])/voxel_size[1])//bev_stride), 21 | sample_num] 22 | 23 | 24 | input_modality = dict( 25 | use_lidar=False, 26 | use_camera=True, 27 | use_radar=False, 28 | use_map=False, 29 | use_external=False, 30 | cam_sweep_num=cam_sweep_num) 31 | 32 | model = dict( 33 | type='OV_Uni3DETR', 34 | use_grid_mask=True, 35 | img_backbone=dict( 36 | type='ResNet', 37 | depth=50, 38 | num_stages=4, 39 | out_indices=(0, 1, 2, 3), 40 | frozen_stages=1, 41 | norm_cfg=dict(type='BN', requires_grad=True), 42 | norm_eval=True, 43 | style='pytorch', 44 | dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False), 45 | stage_with_dcn=(False, True, True, True) 46 | ), 47 | img_neck=dict( 48 | type='FPN', 49 | in_channels=[256, 512, 1024, 2048], 50 | out_channels=256, 51 | num_outs=5, 52 | ), 53 | depth_head=dict( 54 | type='SimpleDepth', 55 | model=dict( 56 | depth_dim=64, 57 | )), 58 | view_cfg=dict( 59 | num_cams=1, 60 | num_convs=3, 61 | num_points=sample_num, 62 | num_sweeps=cam_sweep_num, 63 | kernel_size=(3,3,3), 64 | keep_sweep_dim=True, 65 | num_feature_levels=4, 66 | embed_dims=256, 67 | pc_range=point_cloud_range, 68 | voxel_shape=voxel_shape, 69 | fp16_enabled=fp16_enabled, 70 | ), 71 | pts_bbox_head=dict( 72 | type='Uni3DETRHeadCLIP', 73 | num_query=300, 74 | zeroshot_path='clip_embed/sunrgbd_clip_a+cname_rn50_manyprompt_46c_coda.npy', 75 | num_classes=46, 76 | in_channels=256, 77 | sync_cls_avg_factor=True, 78 | with_box_refine=True, 79 | as_two_stage=False, 80 | code_size=8, 81 | transformer=dict( 82 | type='Uni3DETRTransformer', 83 | fp16_enabled=fp16_enabled, 84 | decoder=dict( 85 | type='Uni3DETRTransformerDecoder', 86 | num_layers=6, 87 | return_intermediate=True, 88 | transformerlayers=dict( 89 | type='BaseTransformerLayer', 90 | attn_cfgs=[ 91 | dict( 92 | type='MultiheadAttention', 93 | embed_dims=256, 94 | num_heads=8, 95 | dropout=0.1), 96 | dict( 97 | type='UniCrossAtten', 98 | num_points=1, 99 | embed_dims=256, 100 | num_sweeps=cam_sweep_num, 101 | fp16_enabled=fp16_enabled) 102 | ], 103 | ffn_cfgs=dict( 104 | type='FFN', 105 | embed_dims=256, 106 | feedforward_channels=512, 107 | num_fcs=2, 108 | ffn_drop=0.1, 109 | act_cfg=dict(type='ReLU', inplace=True), 110 | ), 111 | norm_cfg=dict(type='LN'), 112 | operation_order=('self_attn', 'norm', 'cross_attn', 'norm', 113 | 'ffn', 'norm')) 114 | ) 115 | ), 116 | bbox_coder=dict( 117 | type='NMSFreeCoder', 118 | post_center_range=point_cloud_range, 119 | pc_range=point_cloud_range, 120 | max_num=1000, 121 | voxel_size=voxel_size, 122 | alpha=1.0, 123 | num_classes=46 124 | ), 125 | post_processing=dict( 126 | type='nms', 127 | nms_thr=0.5), 128 | positional_encoding=dict( 
129 | type='SinePositionalEncoding', 130 | num_feats=128, 131 | normalize=True, 132 | offset=-0.5), 133 | loss_cls=dict(type='SoftFocalLoss', use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=1.5), 134 | loss_bbox=dict(type='L1Loss', loss_weight=0.25), 135 | loss_iou=dict(type='IoU3DLoss', loss_weight=1.2), 136 | code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]), 137 | # model training and testing settings 138 | train_cfg=dict(pts=dict( 139 | grid_size=grid_size, 140 | voxel_size=voxel_size, 141 | point_cloud_range=point_cloud_range, 142 | out_size_factor=bev_stride, 143 | assigner=dict( 144 | type='HungarianAssigner3D', 145 | cls_cost=dict(type='FocalLossCost', weight=2.0), 146 | reg_cost=dict(type='BBox3DL1Cost', weight=0.25), 147 | iou_cost=dict(type='IoU3DCost', weight=1.2), # Fake cost. This is just to make it compatible with DETR head. 148 | pc_range=point_cloud_range)))) 149 | 150 | dataset_type = 'SUNRGBDDataset' 151 | data_root = 'data/sunrgbd_coda/' 152 | 153 | # img_norm_cfg = dict( 154 | # mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False) 155 | img_norm_cfg = dict( 156 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 157 | 158 | class_names = ('chair', 'table', 'pillow', 'sofa_chair', 'desk', 'bed', 'sofa', 'computer', 'box', 159 | 'lamp', 'garbage_bin', 'cabinet', 'shelf', 'drawer', 'sink', 'night_stand', 'kitchen_counter', 160 | 'paper', 'end_table', 'kitchen_cabinet', 'picture', 'book', 'stool', 'coffee_table', 'bookshelf', 161 | 'painting', 'key_board', 'dresser', 'tv', 'whiteboard', 'cpu', 'toilet', 'file_cabinet', 'bench', 162 | 'ottoman', 'plant', 'monitor', 'printer', 'recycle_bin', 'door', 'fridge', 'towel', 'cup', 'mirror', 163 | 'laptop', 'cloth') 164 | 165 | 166 | file_client_args = dict(backend='disk') 167 | 168 | 169 | train_pipeline = [ 170 | dict(type='LoadMultiViewMultiSweepImageFromFilesIndoor', sweep_num=cam_sweep_num, to_float32=True), 171 | dict(type='PhotoMetricDistortionMultiViewImage'), 172 | dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, with_attr_label=False), 173 | dict( 174 | type='UnifiedRotScaleTrans', 175 | rot_range=[-0.3925, 0.3925], 176 | scale_ratio_range=[0.95, 1.05], 177 | ), 178 | dict(type='NormalizeMultiviewImage', **img_norm_cfg), 179 | dict(type='PadMultiViewImage', size_divisor=32), 180 | dict(type='DefaultFormatBundle3D', class_names=class_names), 181 | dict(type='CollectUnified3D', keys=['gt_bboxes_3d', 'gt_labels_3d', 'img']) 182 | ] 183 | test_pipeline = [ 184 | dict(type='LoadMultiViewMultiSweepImageFromFilesIndoor', sweep_num=cam_sweep_num, to_float32=True), 185 | dict(type='NormalizeMultiviewImage', **img_norm_cfg), 186 | dict(type='PadMultiViewImage', size_divisor=32), 187 | dict(type='DefaultFormatBundle3D', class_names=class_names), 188 | dict(type='CollectUnified3D', keys=['img']) 189 | ] 190 | 191 | 192 | 193 | data = dict( 194 | samples_per_gpu=2, 195 | workers_per_gpu=4, 196 | train=dict( 197 | type='RepeatDataset', 198 | times=2, #######5 199 | dataset=dict( 200 | type=dataset_type, 201 | data_root=data_root, 202 | ann_file = data_root + 'sunrgbd_infos_train_pls_ens_10c36c.pkl', 203 | pipeline=train_pipeline, 204 | classes=class_names, 205 | filter_empty_gt=True, 206 | box_type_3d='Depth', 207 | file_client_args=file_client_args)), 208 | val=dict( 209 | type=dataset_type, 210 | data_root=data_root, 211 | ann_file=data_root + 'sunrgbd_infos_val_withimg.pkl', 212 | pipeline=test_pipeline, 213 | classes=class_names, 214 | test_mode=True, 215 | 
box_type_3d='Depth', 216 | file_client_args=file_client_args), 217 | test=dict( 218 | type=dataset_type, 219 | data_root=data_root, 220 | ann_file=data_root + 'sunrgbd_infos_val_withimg.pkl', 221 | pipeline=test_pipeline, 222 | classes=class_names, 223 | test_mode=True, 224 | box_type_3d='Depth', 225 | file_client_args=file_client_args)) 226 | 227 | evaluation = dict(pipeline=test_pipeline, interval=5) 228 | 229 | 230 | # optimizer 231 | # This schedule is mainly used by models on indoor dataset, 232 | # e.g., VoteNet on SUNRGBD and ScanNet 233 | optimizer = dict( 234 | type='AdamW', 235 | lr=1.75e-4, 236 | # lr=2e-4, 237 | paramwise_cfg=dict( 238 | custom_keys={ 239 | 'img_backbone': dict(lr_mult=0.1), 240 | }), 241 | weight_decay=0.01) 242 | optimizer_config = dict(grad_clip=dict(max_norm=10, norm_type=2)) 243 | 244 | 245 | lr_config = dict(policy='step', warmup=None, step=[32, 38]) 246 | 247 | # runtime settings 248 | runner = dict(type='EpochBasedRunner', max_epochs=40) 249 | 250 | # fp16 setting 251 | # fp16 = dict(loss_scale=32.) 252 | load_from = 'faster_rcnn_r50_caffe_fpn_1x_coco_dcnv2_c.pth' 253 | 254 | find_unused_parameters = True 255 | -------------------------------------------------------------------------------- /projects/configs/uni3detr/uni3detr_kitti_3classes.py: -------------------------------------------------------------------------------- 1 | _base_ = [ 2 | '../../../configs/_base_/default_runtime.py' 3 | ] 4 | 5 | plugin=True 6 | plugin_dir='projects/mmdet3d_plugin/' 7 | 8 | # If point cloud range is changed, the models should also change their point 9 | # cloud range accordingly 10 | point_cloud_range = [0, -40, -3, 70.4, 40, 1] 11 | voxel_size = [0.05, 0.05, 0.1] 12 | fp16_enabled = True 13 | bev_stride = 4 14 | sample_num = 5 15 | # For nuScenes we usually do 10-class detection 16 | class_names = ['Pedestrian', 'Cyclist', 'Car'] 17 | 18 | input_modality = dict( 19 | use_lidar=True, 20 | use_camera=False, 21 | use_radar=False, 22 | use_map=False, 23 | use_external=False) 24 | 25 | use_dab = True 26 | 27 | model = dict( 28 | type='Uni3DETR', 29 | pts_voxel_layer=dict( 30 | max_num_points=5, voxel_size=voxel_size, max_voxels=(16000, 40000), 31 | point_cloud_range=point_cloud_range), 32 | pts_voxel_encoder=dict(type='HardSimpleVFE', num_features=4), 33 | pts_middle_encoder=dict( 34 | type='SparseEncoderHD', 35 | in_channels=4, 36 | sparse_shape=[41, 1600, 1408], 37 | output_channels=256, 38 | order=('conv', 'norm', 'act'), 39 | encoder_channels=((16, 16, 32), (32, 32, 64), (64, 64, 128), (128, 128)), 40 | encoder_paddings=((0, 0, 1), (0, 0, 1), (0, 0, [0, 1, 1]), (0, 0)), 41 | block_type='basicblock', 42 | fp16_enabled=False), # not enable FP16 here 43 | pts_backbone=dict( 44 | type='SECOND3D', 45 | in_channels=[256, 256, 256], 46 | out_channels=[128, 256, 512], 47 | layer_nums=[5, 5, 5], 48 | layer_strides=[1, 2, 4], 49 | is_cascade=False, 50 | norm_cfg=dict(type='BN3d', eps=1e-3, momentum=0.01), 51 | conv_cfg=dict(type='Conv3d', kernel=(1,3,3), bias=False)), 52 | pts_neck=dict( 53 | type='SECOND3DFPN', 54 | in_channels=[128, 256, 512], 55 | out_channels=[256, 256, 256], 56 | upsample_strides=[1, 2, 4], 57 | norm_cfg=dict(type='BN3d', eps=1e-3, momentum=0.01), 58 | upsample_cfg=dict(type='deconv3d', bias=False), 59 | extra_conv=dict(type='Conv3d', num_conv=3, bias=False), 60 | use_conv_for_no_stride=True), 61 | pts_bbox_head=dict( 62 | type='Uni3DETRHead', 63 | # transformer_cfg 64 | num_query=300, 65 | num_classes=3, 66 | in_channels=256, 67 | 
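    # --- Editor's note (interpretation, not part of the original config) ---
    # The gt_repeattimes=5 option a few lines below appears to enable a
    # one-to-many matching: the HungarianAssigner3D included later in this
    # repository tiles the cost columns before matching, roughly
    #   cost_tiled = cost.repeat(1, gt_repeattimes)   # (num_query, num_gt * 5)
    #   rows, cols = linear_sum_assignment(cost_tiled)
    #   gt_inds = cols % num_gt                        # fold back to real GT ids
    # so each ground-truth box can be assigned to several queries.
    # ------------------------------------------------------------------------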
sync_cls_avg_factor=True, 68 | with_box_refine=True, 69 | as_two_stage=False, 70 | code_size=8, 71 | gt_repeattimes=5, 72 | transformer=dict( 73 | type='Uni3DETRTransformer', 74 | fp16_enabled=fp16_enabled, 75 | decoder=dict( 76 | type='Uni3DETRTransformerDecoder', 77 | num_layers=9, 78 | return_intermediate=True, 79 | transformerlayers=dict( 80 | type='BaseTransformerLayer', 81 | attn_cfgs=[ 82 | dict( 83 | type='MultiheadAttention', 84 | embed_dims=256, 85 | num_heads=8, 86 | dropout=0.1), 87 | dict( 88 | type='UniCrossAtten', 89 | num_points=1, 90 | embed_dims=256, 91 | num_sweeps=1, 92 | fp16_enabled=fp16_enabled) 93 | ], 94 | ffn_cfgs=dict( 95 | type='FFN', 96 | embed_dims=256, 97 | feedforward_channels=512, 98 | num_fcs=2, 99 | ffn_drop=0.1, 100 | act_cfg=dict(type='ReLU', inplace=True), 101 | ), 102 | norm_cfg=dict(type='LN'), 103 | operation_order=('self_attn', 'norm', 'cross_attn', 'norm', 104 | 'ffn', 'norm')) 105 | ) 106 | ), 107 | bbox_coder=dict( 108 | type='NMSFreeCoder', 109 | post_center_range=[0, -40, -3, 70.4, 40, 1], 110 | pc_range=point_cloud_range, 111 | max_num=150, 112 | alpha=0.2, 113 | voxel_size=voxel_size, 114 | num_classes=3), 115 | post_processing=dict( 116 | type='box_merging', 117 | score_thr=[0., 0.3, 0.65]), 118 | positional_encoding=dict( 119 | type='SinePositionalEncoding', 120 | num_feats=128, 121 | normalize=True, 122 | offset=-0.5), 123 | loss_cls=dict(type='SoftFocalLoss', use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=1.5), 124 | loss_bbox=dict(type='L1Loss', loss_weight=0.25), 125 | loss_iou=dict(type='IoU3DLoss', loss_weight=1.2), 126 | code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0] 127 | ), 128 | # model training and testing settings 129 | train_cfg=dict(pts=dict( 130 | grid_size=[1408, 1600, 40], 131 | voxel_size=voxel_size, 132 | point_cloud_range=point_cloud_range, 133 | out_size_factor=bev_stride, 134 | assigner=dict( 135 | type='HungarianAssigner3D', 136 | cls_cost=dict(type='FocalLossCost', weight=2.0), 137 | reg_cost=dict(type='BBox3DL1Cost', weight=0.25), 138 | iou_cost=dict(type='IoU3DCost', weight=1.2), 139 | pc_range=point_cloud_range)))) 140 | 141 | 142 | # dataset settings 143 | dataset_type = 'KittiDataset' 144 | data_root = 'data/kitti/' 145 | class_names = ['Pedestrian', 'Cyclist', 'Car'] 146 | point_cloud_range = [0, -40, -3, 70.4, 40, 1] 147 | input_modality = dict(use_lidar=True, use_camera=False) 148 | 149 | db_sampler = dict( 150 | data_root=data_root, 151 | info_path=data_root + 'kitti_dbinfos_train.pkl', 152 | rate=1.0, 153 | prepare=dict( 154 | filter_by_difficulty=[-1], 155 | filter_by_min_points=dict(Car=5, Pedestrian=10, Cyclist=10)), 156 | classes=class_names, 157 | sample_groups=dict(Car=20, Pedestrian=6, Cyclist=6)) 158 | 159 | 160 | file_client_args = dict(backend='disk') 161 | # Uncomment the following if use ceph or other file clients. 162 | # See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient 163 | # for more details. 
164 | # file_client_args = dict( 165 | # backend='petrel', path_mapping=dict(data='s3://kitti_data/')) 166 | 167 | train_pipeline = [ 168 | dict( 169 | type='LoadPointsFromFile', 170 | coord_type='LIDAR', 171 | load_dim=4, 172 | use_dim=4, 173 | file_client_args=file_client_args), 174 | dict( 175 | type='LoadAnnotations3D', 176 | with_bbox_3d=True, 177 | with_label_3d=True, 178 | file_client_args=file_client_args), 179 | dict(type='ObjectSample', db_sampler=db_sampler), 180 | dict( 181 | type='ObjectNoise', 182 | num_try=100, 183 | translation_std=[1.0, 1.0, 0.5], 184 | global_rot_range=[0.0, 0.0], 185 | rot_range=[-0.78539816, 0.78539816]), 186 | dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5), 187 | dict( 188 | type='GlobalRotScaleTrans', 189 | rot_range=[-0.78539816, 0.78539816], 190 | scale_ratio_range=[0.95, 1.05]), 191 | dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), 192 | dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), 193 | dict(type='ObjectNameFilter', classes=class_names), 194 | dict(type='PointShuffle'), 195 | dict(type='PointSample', num_points=18000), 196 | dict(type='DefaultFormatBundle3D', class_names=class_names), 197 | dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) 198 | ] 199 | test_pipeline = [ 200 | dict( 201 | type='LoadPointsFromFile', 202 | coord_type='LIDAR', 203 | load_dim=4, 204 | use_dim=4, 205 | file_client_args=file_client_args), 206 | dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), 207 | dict( 208 | type='DefaultFormatBundle3D', 209 | class_names=class_names, 210 | with_label=False), 211 | dict(type='Collect3D', keys=['points']) 212 | # dict( 213 | # type='MultiScaleFlipAug3D', 214 | # img_scale=(1333, 800), 215 | # pts_scale_ratio=1, 216 | # flip=False, 217 | # transforms=[ 218 | # dict( 219 | # type='GlobalRotScaleTrans', 220 | # rot_range=[0, 0], 221 | # scale_ratio_range=[1., 1.], 222 | # translation_std=[0, 0, 0]), 223 | # dict(type='RandomFlip3D'), 224 | # dict( 225 | # type='PointsRangeFilter', point_cloud_range=point_cloud_range), 226 | # dict( 227 | # type='DefaultFormatBundle3D', 228 | # class_names=class_names, 229 | # with_label=False), 230 | # dict(type='Collect3D', keys=['points']) 231 | # ]) 232 | ] 233 | 234 | 235 | data = dict( 236 | samples_per_gpu=1, 237 | workers_per_gpu=2, 238 | train=dict( 239 | type='RepeatDataset', 240 | times=2, 241 | dataset=dict( 242 | type=dataset_type, 243 | data_root=data_root, 244 | ann_file=data_root + 'kitti_infos_train_van.pkl', 245 | split='training', 246 | pts_prefix='velodyne_reduced', 247 | pipeline=train_pipeline, 248 | modality=input_modality, 249 | classes=class_names, 250 | test_mode=False, 251 | # we use box_type_3d='LiDAR' in kitti and nuscenes dataset 252 | # and box_type_3d='Depth' in sunrgbd and scannet dataset. 
253 | box_type_3d='LiDAR')), 254 | val=dict( 255 | type=dataset_type, 256 | data_root=data_root, 257 | ann_file=data_root + 'kitti_infos_val.pkl', 258 | split='training', 259 | pts_prefix='velodyne_reduced', 260 | pipeline=test_pipeline, 261 | modality=input_modality, 262 | classes=class_names, 263 | test_mode=True, 264 | box_type_3d='LiDAR'), 265 | test=dict( 266 | type=dataset_type, 267 | data_root=data_root, 268 | ann_file=data_root + 'kitti_infos_val.pkl', 269 | split='training', 270 | pts_prefix='velodyne_reduced', 271 | pipeline=test_pipeline, 272 | modality=input_modality, 273 | classes=class_names, 274 | test_mode=True, 275 | box_type_3d='LiDAR')) 276 | 277 | evaluation = dict(interval=1, pipeline=test_pipeline) 278 | 279 | 280 | checkpoint_config = dict(interval=1) 281 | 282 | lr = 2e-5 *3/8 * 18 /2 # max learning rate 283 | optimizer = dict(type='AdamW', lr=lr, weight_decay=0.01) 284 | optimizer_config = dict(grad_clip=dict(max_norm=10, norm_type=2)) 285 | 286 | 287 | lr_config = dict(policy='step', warmup=None, step=[32, 38]) 288 | runner = dict(type='EpochBasedRunner', max_epochs=40) 289 | 290 | find_unused_parameters = True 291 | workflow = [('train', 1)] 292 | gpu_ids = range(0, 1) 293 | dist_params = dict(backend='nccl') 294 | log_level = 'INFO' 295 | 296 | # fp16 setting 297 | fp16 = dict(loss_scale=32.) 298 | -------------------------------------------------------------------------------- /projects/configs/uni3detr/uni3detr_scannet.py: -------------------------------------------------------------------------------- 1 | _base_ = [ 2 | '../../../configs/_base_/default_runtime.py' 3 | ] 4 | 5 | plugin=True 6 | plugin_dir='projects/mmdet3d_plugin/' 7 | 8 | 9 | voxel_size = [0.02, 0.02, 0.02] 10 | grid_size = [128, 640, 640] 11 | 12 | point_cloud_range = [-6.4, -6.4, -0.1, 6.4, 6.4, 2.46] 13 | 14 | 15 | fp16_enabled = True 16 | bev_stride = 4 17 | sample_num = 5 18 | 19 | input_modality = dict( 20 | use_lidar=True, 21 | use_camera=False, 22 | use_radar=False, 23 | use_map=False, 24 | use_external=False) 25 | 26 | model = dict( 27 | type='Uni3DETR', 28 | pts_voxel_layer=dict( 29 | max_num_points=5, voxel_size=voxel_size, max_voxels=(16000, 40000), ###16000 30 | point_cloud_range=point_cloud_range), 31 | pts_voxel_encoder=dict(type='HardSimpleVFE', num_features=4), 32 | pts_middle_encoder=dict( 33 | type='SparseEncoderHD', 34 | in_channels=4, 35 | sparse_shape=grid_size, 36 | output_channels=256, 37 | order=('conv', 'norm', 'act'), 38 | encoder_channels=((16, 16, 32), (32, 32, 64), (64, 64, 128), (128, 128)), 39 | encoder_paddings=((0, 0, 1), (0, 0, 1), (0, 0, [0, 1, 1]), (0, 0)), 40 | block_type='basicblock', 41 | fp16_enabled=False), # not enable FP16 here 42 | pts_backbone=dict( 43 | type='SECOND3D', 44 | in_channels=[256, 256, 256], 45 | out_channels=[128, 256, 512], 46 | layer_nums=[5, 5, 5], 47 | layer_strides=[1, 2, 4], 48 | is_cascade=False, 49 | norm_cfg=dict(type='BN3d', eps=1e-3, momentum=0.01), 50 | conv_cfg=dict(type='Conv3d', kernel=(1,3,3), bias=False)), 51 | pts_neck=dict( 52 | type='SECOND3DFPN', 53 | in_channels=[128, 256, 512], 54 | out_channels=[256, 256, 256], 55 | upsample_strides=[1, 2, 4], 56 | norm_cfg=dict(type='BN3d', eps=1e-3, momentum=0.01), 57 | upsample_cfg=dict(type='deconv3d', bias=False), 58 | extra_conv=dict(type='Conv3d', num_conv=3, bias=False), 59 | use_conv_for_no_stride=True), 60 | pts_bbox_head=dict( 61 | type='Uni3DETRHead', 62 | # transformer_cfg 63 | num_query=300, 64 | num_classes=18, 65 | in_channels=256, 66 | 
sync_cls_avg_factor=True, 67 | with_box_refine=True, 68 | as_two_stage=False, 69 | code_size=8, 70 | with_nms=True, 71 | transformer=dict( 72 | type='Uni3DETRTransformer', 73 | fp16_enabled=fp16_enabled, 74 | decoder=dict( 75 | type='Uni3DETRTransformerDecoder', 76 | num_layers=3, 77 | return_intermediate=True, 78 | transformerlayers=dict( 79 | type='BaseTransformerLayer', 80 | attn_cfgs=[ 81 | dict( 82 | type='MultiheadAttention', 83 | embed_dims=256, 84 | num_heads=8, 85 | dropout=0.1), 86 | dict( 87 | type='UniCrossAtten', 88 | num_points=1, 89 | embed_dims=256, 90 | num_sweeps=1, 91 | fp16_enabled=fp16_enabled), 92 | ], 93 | ffn_cfgs=dict( 94 | type='FFN', 95 | embed_dims=256, 96 | feedforward_channels=512, 97 | num_fcs=2, 98 | ffn_drop=0.1, ##0.1 99 | act_cfg=dict(type='ReLU', inplace=True), 100 | ), 101 | norm_cfg=dict(type='LN'), 102 | operation_order=('self_attn', 'norm', 'cross_attn', 'norm', 'ffn', 'norm')) 103 | # operation_order=('cross_attn', 'norm', 'self_attn', 'norm', 'ffn', 'norm')) 104 | ) 105 | ), 106 | bbox_coder=dict( 107 | type='NMSFreeCoder', 108 | post_center_range=point_cloud_range, 109 | pc_range=point_cloud_range, 110 | # max_num=1000, 111 | max_num=5000, 112 | voxel_size=voxel_size, 113 | num_classes=18), 114 | positional_encoding=dict( 115 | type='SinePositionalEncoding', 116 | num_feats=128, 117 | normalize=True, 118 | offset=-0.5), 119 | loss_cls=dict(type='SoftFocalLoss', use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=1.5), 120 | loss_bbox=dict(type='L1Loss', loss_weight=0.25), 121 | loss_iou=dict(type='IoU3DLoss', loss_weight=1.2), 122 | code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0] 123 | ), 124 | # model training and testing settings 125 | train_cfg=dict(pts=dict( 126 | grid_size=grid_size, 127 | voxel_size=voxel_size, 128 | point_cloud_range=point_cloud_range, 129 | out_size_factor=bev_stride, 130 | assigner=dict( 131 | type='HungarianAssigner3D', 132 | cls_cost=dict(type='FocalLossCost', weight=2.0), 133 | reg_cost=dict(type='BBox3DL1Cost', weight=0.25), 134 | iou_cost=dict(type='IoU3DCost', weight=1.2), 135 | pc_range=point_cloud_range)))) 136 | 137 | 138 | dataset_type = 'ScanNetDataset' 139 | data_root = './data/scannet/' 140 | class_names = ('cabinet', 'bed', 'chair', 'sofa', 'table', 'door', 'window', 141 | 'bookshelf', 'picture', 'counter', 'desk', 'curtain', 142 | 'refrigerator', 'showercurtrain', 'toilet', 'sink', 'bathtub', 143 | 'garbagebin') 144 | 145 | 146 | train_pipeline = [ 147 | dict( 148 | type='LoadPointsFromFile', 149 | coord_type='DEPTH', 150 | shift_height=False, 151 | load_dim=3, 152 | use_dim=[0, 1, 2]), 153 | dict(type='LoadAnnotations3D'), 154 | dict( 155 | type='RandomFlip3D', 156 | sync_2d=False, 157 | flip_ratio_bev_horizontal=0.5, 158 | flip_ratio_bev_vertical=0.5), 159 | dict( 160 | type='GlobalRotScaleTrans', 161 | rot_range=[-0.087266, 0.087266], 162 | scale_ratio_range=[.9, 1.1], 163 | translation_std=[.1, .1, .1], 164 | shift_height=False), 165 | dict(type='PointSample', num_points=200000), 166 | dict(type='DefaultFormatBundle3D', class_names=class_names), 167 | dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) 168 | ] 169 | 170 | test_pipeline = [ 171 | dict( 172 | type='LoadPointsFromFile', 173 | coord_type='DEPTH', 174 | shift_height=False, 175 | load_dim=3, 176 | use_dim=[0, 1, 2]), 177 | dict( 178 | type='DefaultFormatBundle3D', 179 | class_names=class_names, 180 | with_label=False), 181 | dict(type='Collect3D', keys=['points']) 182 | ] 183 | 184 | data = dict( 185 | 
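    # Editor's note (interpretation, not part of the original config): the
    # RepeatDataset wrapper below (times=6) makes one training epoch iterate
    # the ScanNet train split six times, so max_epochs=40 corresponds to
    # roughly 240 passes over the raw annotations.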
samples_per_gpu=3, ##### 3 186 | workers_per_gpu=4, 187 | train=dict( 188 | type='RepeatDataset', 189 | times=6, 190 | dataset=dict( 191 | type=dataset_type, 192 | data_root=data_root, 193 | ann_file=data_root + 'scannet_infos_train.pkl', 194 | pipeline=train_pipeline, 195 | filter_empty_gt=True, 196 | classes=class_names, 197 | box_type_3d='Depth') 198 | ), 199 | val=dict( 200 | type=dataset_type, 201 | data_root=data_root, 202 | ann_file=data_root + 'scannet_infos_val.pkl', 203 | pipeline=test_pipeline, 204 | classes=class_names, 205 | test_mode=True, 206 | box_type_3d='Depth'), 207 | test=dict( 208 | type=dataset_type, 209 | data_root=data_root, 210 | ann_file=data_root + 'scannet_infos_val.pkl', 211 | pipeline=test_pipeline, 212 | classes=class_names, 213 | test_mode=True, 214 | box_type_3d='Depth')) 215 | 216 | evaluation = dict(pipeline=test_pipeline, interval=5) 217 | 218 | 219 | # optimizer 220 | # This schedule is mainly used by models on indoor dataset, 221 | # e.g., VoteNet on SUNRGBD and ScanNet 222 | lr = 2e-5 *2/8 * 20 * 4/6 *6/8 *1.5 *8/6###########40 # max learning rate 223 | optimizer = dict(type='AdamW', lr=lr, weight_decay=0.01) 224 | optimizer_config = dict(grad_clip=dict(max_norm=10, norm_type=2)) 225 | 226 | 227 | lr_config = dict(policy='step', warmup=None, step=[32, 38]) 228 | runner = dict(type='EpochBasedRunner', max_epochs=40) ###40 229 | 230 | # fp16 setting 231 | fp16 = dict(loss_scale=32.) 232 | find_unused_parameters = True 233 | -------------------------------------------------------------------------------- /projects/configs/uni3detr/uni3detr_scannet_large.py: -------------------------------------------------------------------------------- 1 | _base_ = [ 2 | '../../../configs/_base_/default_runtime.py' 3 | ] 4 | 5 | plugin=True 6 | plugin_dir='projects/mmdet3d_plugin/' 7 | 8 | 9 | voxel_size = [0.02, 0.02, 0.02] 10 | grid_size = [128, 640, 640] 11 | 12 | point_cloud_range = [-6.4, -6.4, -0.1, 6.4, 6.4, 2.46] 13 | 14 | 15 | fp16_enabled = True 16 | bev_stride = 4 17 | sample_num = 5 18 | 19 | input_modality = dict( 20 | use_lidar=True, 21 | use_camera=False, 22 | use_radar=False, 23 | use_map=False, 24 | use_external=False) 25 | 26 | model = dict( 27 | type='Uni3DETR', 28 | dynamic_voxelization=True, 29 | pts_voxel_layer=dict( 30 | max_num_points=-1, point_cloud_range=point_cloud_range, voxel_size=voxel_size, max_voxels=(-1, -1)), 31 | pts_voxel_encoder=dict(type='DynamicSimpleVFE', voxel_size=voxel_size, point_cloud_range=point_cloud_range), 32 | pts_middle_encoder=dict( 33 | type='SparseEncoderHD', 34 | in_channels=4, 35 | sparse_shape=grid_size, 36 | base_channels=32, 37 | output_channels=512, 38 | order=('conv', 'norm', 'act'), 39 | encoder_channels=((32, 32, 64), (64, 64, 128), (128, 128, 256), (256, 256)), 40 | encoder_paddings=((0, 0, 1), (0, 0, 1), (0, 0, [0, 1, 1]), (0, 0)), 41 | block_type='basicblock', 42 | fp16_enabled=False), # not enable FP16 here 43 | pts_backbone=dict( 44 | type='SECOND3D', 45 | in_channels=[512, 512, 512], 46 | out_channels=[128, 256, 512], 47 | layer_nums=[5, 5, 5], 48 | layer_strides=[1, 2, 4], 49 | is_cascade=False, 50 | norm_cfg=dict(type='BN3d', eps=1e-3, momentum=0.01), 51 | conv_cfg=dict(type='Conv3d', kernel=(1,3,3), bias=False)), 52 | pts_neck=dict( 53 | type='SECOND3DFPN', 54 | in_channels=[128, 256, 512], 55 | out_channels=[256, 256, 256], 56 | upsample_strides=[1, 2, 4], 57 | norm_cfg=dict(type='BN3d', eps=1e-3, momentum=0.01), 58 | upsample_cfg=dict(type='deconv3d', bias=False), 59 | 
extra_conv=dict(type='Conv3d', num_conv=3, bias=False), 60 | use_conv_for_no_stride=True), 61 | pts_bbox_head=dict( 62 | type='Uni3DETRHead', 63 | # transformer_cfg 64 | num_query=300, 65 | num_classes=18, 66 | in_channels=256, 67 | sync_cls_avg_factor=True, 68 | with_box_refine=True, 69 | as_two_stage=False, 70 | code_size=8, 71 | transformer=dict( 72 | type='Uni3DETRTransformer', 73 | fp16_enabled=fp16_enabled, 74 | decoder=dict( 75 | type='Uni3DETRTransformerDecoder', 76 | num_layers=3, 77 | return_intermediate=True, 78 | transformerlayers=dict( 79 | type='BaseTransformerLayer', 80 | attn_cfgs=[ 81 | dict( 82 | type='MultiheadAttention', 83 | embed_dims=256, 84 | num_heads=8, 85 | dropout=0.1), 86 | dict( 87 | type='UniCrossAtten', 88 | num_points=1, 89 | embed_dims=256, 90 | num_sweeps=1, 91 | fp16_enabled=fp16_enabled), 92 | ], 93 | ffn_cfgs=dict( 94 | type='FFN', 95 | embed_dims=256, 96 | feedforward_channels=512, 97 | num_fcs=2, 98 | ffn_drop=0.1, ##0.1 99 | act_cfg=dict(type='ReLU', inplace=True), 100 | ), 101 | norm_cfg=dict(type='LN'), 102 | operation_order=('self_attn', 'norm', 'cross_attn', 'norm', 'ffn', 'norm')) 103 | # operation_order=('cross_attn', 'norm', 'self_attn', 'norm', 'ffn', 'norm')) 104 | ) 105 | ), 106 | bbox_coder=dict( 107 | type='NMSFreeCoder', 108 | post_center_range=point_cloud_range, 109 | pc_range=point_cloud_range, 110 | # max_num=1000, 111 | max_num=5000, 112 | alpha=1.0, 113 | voxel_size=voxel_size, 114 | num_classes=18), 115 | post_processing=dict( 116 | type='nms', 117 | nms_thr=0.5), 118 | ######## soft nms can generate a little higher result 119 | # post_processing=dict( 120 | # type='soft_nms', 121 | # gaussian_sigma=0.3, 122 | # prune_threshold=1e-2), 123 | positional_encoding=dict( 124 | type='SinePositionalEncoding', 125 | num_feats=128, 126 | normalize=True, 127 | offset=-0.5), 128 | loss_cls=dict(type='SoftFocalLoss', use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=1.5), 129 | loss_bbox=dict(type='L1Loss', loss_weight=0.25), 130 | loss_iou=dict(type='IoU3DLoss', loss_weight=1.2), 131 | code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0] 132 | ), 133 | # model training and testing settings 134 | train_cfg=dict(pts=dict( 135 | grid_size=grid_size, 136 | voxel_size=voxel_size, 137 | point_cloud_range=point_cloud_range, 138 | out_size_factor=bev_stride, 139 | assigner=dict( 140 | type='HungarianAssigner3D', 141 | cls_cost=dict(type='FocalLossCost', weight=2.0), 142 | reg_cost=dict(type='BBox3DL1Cost', weight=0.25), 143 | iou_cost=dict(type='IoU3DCost', weight=1.2), 144 | pc_range=point_cloud_range)))) 145 | 146 | 147 | dataset_type = 'ScanNetDataset' 148 | data_root = './data/scannet/' 149 | class_names = ('cabinet', 'bed', 'chair', 'sofa', 'table', 'door', 'window', 150 | 'bookshelf', 'picture', 'counter', 'desk', 'curtain', 151 | 'refrigerator', 'showercurtrain', 'toilet', 'sink', 'bathtub', 152 | 'garbagebin') 153 | 154 | 155 | train_pipeline = [ 156 | dict( 157 | type='LoadPointsFromFile', 158 | coord_type='DEPTH', 159 | shift_height=False, 160 | load_dim=3, 161 | use_dim=[0, 1, 2]), 162 | dict(type='LoadAnnotations3D'), 163 | dict( 164 | type='RandomFlip3D', 165 | sync_2d=False, 166 | flip_ratio_bev_horizontal=0.5, 167 | flip_ratio_bev_vertical=0.5), 168 | dict( 169 | type='GlobalRotScaleTrans', 170 | rot_range=[-0.087266, 0.087266], 171 | scale_ratio_range=[.9, 1.1], 172 | translation_std=[.1, .1, .1], 173 | shift_height=False), 174 | dict(type='PointSample', num_points=200000), 175 | dict(type='DefaultFormatBundle3D', 
class_names=class_names), 176 | dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) 177 | ] 178 | 179 | test_pipeline = [ 180 | dict( 181 | type='LoadPointsFromFile', 182 | coord_type='DEPTH', 183 | shift_height=False, 184 | load_dim=3, 185 | use_dim=[0, 1, 2]), 186 | dict( 187 | type='DefaultFormatBundle3D', 188 | class_names=class_names, 189 | with_label=False), 190 | dict(type='Collect3D', keys=['points']) 191 | ] 192 | 193 | data = dict( 194 | samples_per_gpu=3, ##### 3 195 | workers_per_gpu=4, 196 | train=dict( 197 | type='RepeatDataset', 198 | times=6, 199 | dataset=dict( 200 | type=dataset_type, 201 | data_root=data_root, 202 | ann_file=data_root + 'scannet_infos_train.pkl', 203 | pipeline=train_pipeline, 204 | filter_empty_gt=True, 205 | classes=class_names, 206 | box_type_3d='Depth') 207 | ), 208 | val=dict( 209 | type=dataset_type, 210 | data_root=data_root, 211 | ann_file=data_root + 'scannet_infos_val.pkl', 212 | pipeline=test_pipeline, 213 | classes=class_names, 214 | test_mode=True, 215 | box_type_3d='Depth'), 216 | test=dict( 217 | type=dataset_type, 218 | data_root=data_root, 219 | ann_file=data_root + 'scannet_infos_val.pkl', 220 | pipeline=test_pipeline, 221 | classes=class_names, 222 | test_mode=True, 223 | box_type_3d='Depth')) 224 | 225 | evaluation = dict(pipeline=test_pipeline, interval=5) 226 | 227 | 228 | # optimizer 229 | # This schedule is mainly used by models on indoor dataset, 230 | # e.g., VoteNet on SUNRGBD and ScanNet 231 | lr = 2e-5 *2/8 * 20 * 4/6 *6/8 *1.5 *8/6###########40 # max learning rate 232 | optimizer = dict(type='AdamW', lr=lr, weight_decay=0.01) 233 | optimizer_config = dict(grad_clip=dict(max_norm=10, norm_type=2)) 234 | 235 | 236 | lr_config = dict(policy='step', warmup=None, step=[32, 38]) 237 | runner = dict(type='EpochBasedRunner', max_epochs=40) ###40 238 | 239 | # fp16 setting 240 | fp16 = dict(loss_scale=32.) 
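# --- Editor's sketch (conceptual only; mmcv's FP16 hooks handle this internally) ---
# A static loss scale of 32 roughly corresponds to:
#   scaled_loss = loss * 32.0
#   scaled_loss.backward()
#   for p in model.parameters():
#       if p.grad is not None:
#           p.grad.div_(32.0)   # un-scale gradients before the optimizer step
# which keeps small half-precision gradients from underflowing to zero.
# ------------------------------------------------------------------------------------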
241 | find_unused_parameters = True 242 | -------------------------------------------------------------------------------- /projects/configs/uni3detr/uni3detr_sunrgbd.py: -------------------------------------------------------------------------------- 1 | _base_ = [ 2 | '../../../configs/_base_/default_runtime.py' 3 | ] 4 | 5 | plugin=True 6 | plugin_dir='projects/mmdet3d_plugin/' 7 | 8 | # If point cloud range is changed, the models should also change their point 9 | # cloud range accordingly 10 | voxel_size = [0.02, 0.02, 0.02] 11 | grid_size = [128, 320, 320] 12 | point_cloud_range = [-3.2, -0.2, -2., 3.2, 6.2, 0.56] 13 | 14 | fp16_enabled = True 15 | bev_stride = 4 16 | sample_num = 5 17 | 18 | 19 | input_modality = dict( 20 | use_lidar=True, 21 | use_camera=False, 22 | use_radar=False, 23 | use_map=False, 24 | use_external=False) 25 | 26 | model = dict( 27 | type='Uni3DETR', 28 | pts_voxel_layer=dict( 29 | max_num_points=5, voxel_size=voxel_size, max_voxels=(16000, 40000), 30 | point_cloud_range=point_cloud_range), 31 | pts_voxel_encoder=dict(type='HardSimpleVFE', num_features=4), 32 | pts_middle_encoder=dict( 33 | type='SparseEncoderHD', 34 | in_channels=4, 35 | sparse_shape=grid_size, 36 | output_channels=256, 37 | order=('conv', 'norm', 'act'), 38 | encoder_channels=((16, 16, 32), (32, 32, 64), (64, 64, 128), (128, 128)), 39 | encoder_paddings=((0, 0, 1), (0, 0, 1), (0, 0, [0, 1, 1]), (0, 0)), 40 | block_type='basicblock', 41 | fp16_enabled=False), # not enable FP16 here 42 | pts_backbone=dict( 43 | type='SECOND3D', 44 | in_channels=[256, 256, 256], 45 | out_channels=[128, 256, 512], 46 | layer_nums=[5, 5, 5], 47 | layer_strides=[1, 2, 4], 48 | is_cascade=False, 49 | norm_cfg=dict(type='BN3d', eps=1e-3, momentum=0.01), 50 | conv_cfg=dict(type='Conv3d', kernel=(1,3,3), bias=False)), 51 | pts_neck=dict( 52 | type='SECOND3DFPN', 53 | in_channels=[128, 256, 512], 54 | out_channels=[256, 256, 256], 55 | upsample_strides=[1, 2, 4], 56 | norm_cfg=dict(type='BN3d', eps=1e-3, momentum=0.01), 57 | upsample_cfg=dict(type='deconv3d', bias=False), 58 | extra_conv=dict(type='Conv3d', num_conv=3, bias=False), 59 | use_conv_for_no_stride=True), 60 | pts_bbox_head=dict( 61 | type='Uni3DETRHead', 62 | num_query=300, 63 | num_classes=10, 64 | in_channels=256, 65 | sync_cls_avg_factor=True, 66 | with_box_refine=True, 67 | as_two_stage=False, 68 | code_size=8, 69 | transformer=dict( 70 | type='Uni3DETRTransformer', 71 | fp16_enabled=fp16_enabled, 72 | decoder=dict( 73 | type='Uni3DETRTransformerDecoder', 74 | num_layers=3, 75 | return_intermediate=True, 76 | transformerlayers=dict( 77 | type='BaseTransformerLayer', 78 | attn_cfgs=[ 79 | dict( 80 | type='MultiheadAttention', 81 | embed_dims=256, 82 | num_heads=8, 83 | dropout=0.1), 84 | dict( 85 | type='UniCrossAtten', 86 | num_points=1, 87 | embed_dims=256, 88 | num_sweeps=1, 89 | fp16_enabled=fp16_enabled), 90 | ], 91 | ffn_cfgs=dict( 92 | type='FFN', 93 | embed_dims=256, 94 | feedforward_channels=512, 95 | num_fcs=2, 96 | ffn_drop=0.1, 97 | act_cfg=dict(type='ReLU', inplace=True), 98 | ), 99 | norm_cfg=dict(type='LN'), 100 | operation_order=('self_attn', 'norm', 'cross_attn', 'norm', 'ffn', 'norm')) 101 | ) 102 | ), 103 | bbox_coder=dict( 104 | type='NMSFreeCoder', 105 | post_center_range=point_cloud_range, 106 | pc_range=point_cloud_range, 107 | max_num=1000, 108 | voxel_size=voxel_size, 109 | alpha=1.0, 110 | num_classes=10), 111 | post_processing=dict( 112 | type='nms', 113 | nms_thr=0.5), 114 | ######## soft nms can generate a little higher 
result 115 | # post_processing=dict( 116 | # type='soft_nms', 117 | # gaussian_sigma=0.3, 118 | # prune_threshold=1e-2), 119 | positional_encoding=dict( 120 | type='SinePositionalEncoding', 121 | num_feats=128, 122 | normalize=True, 123 | offset=-0.5), 124 | loss_cls=dict(type='SoftFocalLoss', use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=1.5), 125 | loss_bbox=dict(type='L1Loss', loss_weight=0.25), 126 | loss_iou=dict(type='IoU3DLoss', loss_weight=1.2), 127 | code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0] 128 | ), 129 | # model training and testing settings 130 | train_cfg=dict(pts=dict( 131 | grid_size=grid_size, 132 | voxel_size=voxel_size, 133 | point_cloud_range=point_cloud_range, 134 | out_size_factor=bev_stride, 135 | assigner=dict( 136 | type='HungarianAssigner3D', 137 | cls_cost=dict(type='FocalLossCost', weight=2.0), 138 | reg_cost=dict(type='BBox3DL1Cost', weight=0.25), 139 | iou_cost=dict(type='IoU3DCost', weight=1.2), 140 | pc_range=point_cloud_range)))) 141 | 142 | 143 | dataset_type = 'SUNRGBDDataset' 144 | data_root = 'data/sunrgbd/' 145 | class_names = ('bed', 'table', 'sofa', 'chair', 'toilet', 'desk', 'dresser', 146 | 'night_stand', 'bookshelf', 'bathtub') 147 | 148 | file_client_args = dict(backend='disk') 149 | 150 | train_pipeline = [ 151 | dict( 152 | type='LoadPointsFromFile', 153 | coord_type='DEPTH', 154 | shift_height=True, 155 | load_dim=6, 156 | use_dim=[0, 1, 2], 157 | file_client_args=file_client_args), 158 | dict(type='LoadAnnotations3D', file_client_args=file_client_args), 159 | dict( 160 | type='RandomFlip3D', 161 | sync_2d=False, 162 | flip_ratio_bev_horizontal=0.5, 163 | ), 164 | dict( 165 | type='GlobalRotScaleTrans', 166 | rot_range=[-0.523599, 0.523599], 167 | scale_ratio_range=[0.85, 1.15], 168 | shift_height=True), 169 | dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), 170 | # dict(type='PointSample', num_points=20000), 171 | dict(type='PointSample', num_points=100000), 172 | dict(type='DefaultFormatBundle3D', class_names=class_names), 173 | dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) 174 | ] 175 | test_pipeline = [ 176 | dict( 177 | type='LoadPointsFromFile', 178 | coord_type='DEPTH', 179 | shift_height=True, 180 | load_dim=6, 181 | use_dim=[0, 1, 2], 182 | file_client_args=file_client_args), 183 | dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), 184 | # dict(type='PointSample', num_points=50000), 185 | dict(type='PointSample', num_points=100000), 186 | dict( 187 | type='DefaultFormatBundle3D', 188 | class_names=class_names, 189 | with_label=False), 190 | dict(type='Collect3D', keys=['points']) 191 | ] 192 | 193 | data = dict( 194 | samples_per_gpu=4, 195 | workers_per_gpu=4, 196 | train=dict( 197 | type='RepeatDataset', 198 | times=2, #######5 199 | dataset=dict( 200 | type=dataset_type, 201 | data_root=data_root, 202 | ann_file=data_root + 'sunrgbd_infos_train.pkl', 203 | pipeline=train_pipeline, 204 | classes=class_names, 205 | filter_empty_gt=True, 206 | box_type_3d='Depth', 207 | file_client_args=file_client_args)), 208 | val=dict( 209 | type=dataset_type, 210 | data_root=data_root, 211 | ann_file=data_root + 'sunrgbd_infos_val.pkl', 212 | pipeline=test_pipeline, 213 | classes=class_names, 214 | test_mode=True, 215 | box_type_3d='Depth', 216 | file_client_args=file_client_args), 217 | test=dict( 218 | type=dataset_type, 219 | data_root=data_root, 220 | ann_file=data_root + 'sunrgbd_infos_val.pkl', 221 | pipeline=test_pipeline, 222 | classes=class_names, 223 | 
test_mode=True, 224 | box_type_3d='Depth', 225 | file_client_args=file_client_args)) 226 | 227 | evaluation = dict(pipeline=test_pipeline, interval=5) 228 | 229 | 230 | # optimizer 231 | # This schedule is mainly used by models on indoor dataset, 232 | # e.g., VoteNet on SUNRGBD and ScanNet 233 | lr = 2e-5 *2/8 * 20 # max learning rate 234 | optimizer = dict(type='AdamW', lr=lr, weight_decay=0.01) 235 | optimizer_config = dict(grad_clip=dict(max_norm=10, norm_type=2)) 236 | 237 | 238 | lr_config = dict(policy='step', warmup=None, step=[32, 38]) 239 | runner = dict(type='EpochBasedRunner', max_epochs=40) 240 | 241 | # fp16 setting 242 | fp16 = dict(loss_scale=32.) 243 | find_unused_parameters = True 244 | -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/__init__.py: -------------------------------------------------------------------------------- 1 | from .core.bbox.assigners.hungarian_assigner_3d import HungarianAssigner3D 2 | from .core.bbox.coders.nms_free_coder import NMSFreeCoder 3 | from .core.bbox.match_costs import BBox3DL1Cost 4 | from .datasets import NuScenesSweepDataset 5 | from .datasets.pipelines import ( 6 | PhotoMetricDistortionMultiViewImage, PadMultiViewImage, NormalizeMultiviewImage, 7 | RandomScaleImageMultiViewImage, ImageRandomResizeCropFlip) 8 | from .models.backbones.vovnet import VoVNet 9 | from .models.detectors import Uni3DETR 10 | from .models.dense_heads import Uni3DETRHead 11 | from .models.pts_encoder import SparseEncoderHD 12 | from .models.necks import SECOND3DFPN 13 | from .models.losses import RDIoULoss, IoU3DLoss, SoftFocalLoss 14 | from .models.utils import Uni3DETRTransformer, Uni3DETRTransformerDecoder -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/core/bbox/assigners/__init__.py: -------------------------------------------------------------------------------- 1 | from .hungarian_assigner_3d import HungarianAssigner3D 2 | 3 | __all__ = ['HungarianAssigner3D'] 4 | -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/core/bbox/assigners/hungarian_assigner_3d.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | from mmdet.core.bbox.builder import BBOX_ASSIGNERS 4 | from mmdet.core.bbox.assigners import AssignResult 5 | from mmdet.core.bbox.assigners import BaseAssigner 6 | from mmdet.core.bbox.match_costs import build_match_cost 7 | from mmdet.models.utils.transformer import inverse_sigmoid 8 | from projects.mmdet3d_plugin.core.bbox.util import normalize_bbox, denormalize_bbox 9 | from mmdet3d.core.bbox.iou_calculators.iou3d_calculator import bbox_overlaps_3d, bbox_overlaps_nearest_3d 10 | 11 | try: 12 | from scipy.optimize import linear_sum_assignment 13 | except ImportError: 14 | linear_sum_assignment = None 15 | 16 | 17 | @BBOX_ASSIGNERS.register_module() 18 | class HungarianAssigner3D(BaseAssigner): 19 | """Computes one-to-one matching between predictions and ground truth. 20 | This class computes an assignment between the targets and the predictions 21 | based on the costs. The costs are weighted sum of three components: 22 | classification cost, regression L1 cost and regression iou cost. The 23 | targets don't include the no_object, so generally there are more 24 | predictions than targets. After the one-to-one matching, the un-matched 25 | are treated as backgrounds. 
Thus each query prediction will be assigned 26 | with `0` or a positive integer indicating the ground truth index: 27 | - 0: negative sample, no assigned gt 28 | - positive integer: positive sample, index (1-based) of assigned gt 29 | Args: 30 | cls_cost (dict, optional): Config of the classification matching 31 | cost. Default dict(type='ClassificationCost', weight=1.). 32 | reg_cost (dict, optional): Config of the regression L1 matching 33 | cost. Default dict(type='BBoxL1Cost', weight=1.0). 34 | iou_cost (dict, optional): Config of the 3D IoU matching 35 | cost. Default dict(type='IoUCost', weight=0.0). 36 | pc_range (list[float], optional): Point cloud range used to 37 | normalize the ground truth boxes and denormalize the 38 | predicted boxes before the regression and IoU costs 39 | are computed. Default None. 40 | 41 | """ 42 | 43 | def __init__(self, 44 | cls_cost=dict(type='ClassificationCost', weight=1.), 45 | reg_cost=dict(type='BBoxL1Cost', weight=1.0), 46 | iou_cost=dict(type='IoUCost', weight=0.0), 47 | pc_range=None): 48 | self.cls_cost = build_match_cost(cls_cost) 49 | self.reg_cost = build_match_cost(reg_cost) 50 | self.iou_cost = build_match_cost(iou_cost) 51 | self.pc_range = pc_range 52 | 53 | def assign(self, 54 | bbox_pred, 55 | cls_pred, 56 | gt_bboxes, 57 | gt_labels, 58 | num_query, 59 | gt_bboxes_ignore=None, 60 | eps=1e-7, gt_repeattimes=1): 61 | """Computes one-to-one matching based on the weighted costs. 62 | This method assigns each query prediction to a ground truth or 63 | background. The `assigned_gt_inds` with -1 means don't care, 64 | 0 means negative sample, and positive number is the index (1-based) 65 | of assigned gt. 66 | The assignment is done in the following steps, and the order matters. 67 | 1. assign every prediction to -1 68 | 2. compute the weighted costs 69 | 3. do Hungarian matching on CPU based on the costs 70 | 4. assign all to 0 (background) first, then for each matched pair 71 | between predictions and gts, treat this prediction as foreground 72 | and assign the corresponding gt index (plus 1) to it. 73 | Args: 74 | bbox_pred (Tensor): Predicted boxes with normalized coordinates, 75 | in range [0, 1] where applicable. Shape 76 | [num_query, code_size]. 77 | cls_pred (Tensor): Predicted classification logits, shape 78 | [num_query, num_class]. 79 | gt_bboxes (Tensor): Ground truth 3D boxes with unnormalized 80 | coordinates. Shape [num_gt, box_dim]. 81 | gt_labels (Tensor): Label of `gt_bboxes`, shape (num_gt,). 82 | gt_bboxes_ignore (Tensor, optional): Ground truth bboxes that are 83 | labelled as `ignored`. Default None. 84 | eps (int | float, optional): A value added to the denominator for 85 | numerical stability. Default 1e-7. 86 | Returns: 87 | :obj:`AssignResult`: The assigned result. 88 | """ 89 | assert gt_bboxes_ignore is None, \ 90 | 'Only case when gt_bboxes_ignore is None is supported.' 91 | num_gts, num_bboxes = gt_bboxes.size(0), bbox_pred.size(0) 92 | 93 | # 1. assign -1 by default 94 | assigned_gt_inds = bbox_pred.new_full((num_bboxes, ), 95 | -1, 96 | dtype=torch.long) 97 | assigned_labels = bbox_pred.new_full((num_bboxes, ), 98 | -1, 99 | dtype=torch.long) 100 | if num_gts == 0 or num_bboxes == 0: 101 | # No ground truth or boxes, return empty assignment 102 | if num_gts == 0: 103 | # No ground truth, assign all to background 104 | assigned_gt_inds[:] = 0 105 | return AssignResult( 106 | num_gts, assigned_gt_inds, None, labels=assigned_labels) 107 | 108 | # 2.
compute the weighted costs 109 | # classification and bboxcost. 110 | normalized_gt_bboxes = normalize_bbox(gt_bboxes, self.pc_range) 111 | bboxes3d = denormalize_bbox(bbox_pred, self.pc_range) 112 | iou3d = bbox_overlaps_nearest_3d(bboxes3d, gt_bboxes, coordinate='depth') 113 | 114 | cls_cost = self.cls_cost(cls_pred, gt_labels) 115 | #cls_cost = self.cls_cost(cls_pred, gt_labels, iou3d) 116 | 117 | # regression L1 cost 118 | reg_cost = self.reg_cost(bbox_pred[:, :8], normalized_gt_bboxes[:, :8]) 119 | iou_cost = self.iou_cost(bboxes3d, gt_bboxes) 120 | 121 | cost = cls_cost + reg_cost + iou_cost 122 | 123 | # 3. do Hungarian matching on CPU using linear_sum_assignment 124 | cost = cost.detach().cpu() 125 | # cost[torch.isnan(cost)] = 1e5 126 | if linear_sum_assignment is None: 127 | raise ImportError('Please run pip install scipy to install scipy first.') 128 | 129 | nq = num_query 130 | ng = int(cost.shape[0] // nq) 131 | matched_row_inds, matched_col_inds = [], [] 132 | for g in range(ng): 133 | # matched_row_inds1, matched_col_inds1 = linear_sum_assignment(cost[g*nq:(g+1)*nq]) 134 | matched_row_inds1, matched_col_inds1 = linear_sum_assignment(cost[g*nq:(g+1)*nq].repeat(1, gt_repeattimes) ) 135 | matched_row_inds.append(g*nq + matched_row_inds1) 136 | #matched_col_inds.append(matched_col_inds1) 137 | matched_col_inds.append(matched_col_inds1 % cost.shape[1]) 138 | matched_row_inds = np.concatenate(matched_row_inds) 139 | matched_col_inds = np.concatenate(matched_col_inds) 140 | 141 | matched_row_inds = torch.from_numpy(matched_row_inds).to(bbox_pred.device) 142 | matched_col_inds = torch.from_numpy(matched_col_inds).to(bbox_pred.device) 143 | 144 | # 4. assign backgrounds and foregrounds 145 | # assign all indices to backgrounds first 146 | assigned_gt_inds[:] = 0 147 | # assign foregrounds based on matching results 148 | assigned_gt_inds[matched_row_inds] = matched_col_inds + 1 149 | assigned_labels[matched_row_inds] = gt_labels[matched_col_inds] 150 | return AssignResult( 151 | num_gts, assigned_gt_inds, None, labels=assigned_labels) 152 | 153 | @staticmethod 154 | def _bbox_to_loss(bbox): 155 | # axis-aligned case: x, y, z, w, h, l -> x1, y1, z1, x2, y2, z2 156 | return torch.stack( 157 | (bbox[..., 0] - bbox[..., 3] / 2, bbox[..., 1] - bbox[..., 4] / 2, 158 | bbox[..., 2] - bbox[..., 5] / 2, bbox[..., 0] + bbox[..., 3] / 2, 159 | bbox[..., 1] + bbox[..., 4] / 2, bbox[..., 2] + bbox[..., 5] / 2), 160 | dim=-1) 161 | -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/core/bbox/bbox_merging.py: -------------------------------------------------------------------------------- 1 | """This file defines nms functions to merge boxes""" 2 | 3 | import time 4 | 5 | import cv2 6 | import numpy as np 7 | from shapely.geometry import Polygon 8 | 9 | from numba import jit 10 | 11 | def boxes_3d_to_corners(boxes_3d): 12 | all_corners = [] 13 | for box_3d in boxes_3d: 14 | x3d, y3d, z3d, l, h, w, yaw = box_3d 15 | R = np.array([[np.cos(yaw), 0, np.sin(yaw)], 16 | [0, 1, 0 ], 17 | [-np.sin(yaw), 0, np.cos(yaw)]]); 18 | corners = np.array([[ l/2, 0.0, w/2], # front up right 19 | [ l/2, 0.0, -w/2], # front up left 20 | [-l/2, 0.0, -w/2], # back up left 21 | [-l/2, 0.0, w/2], # back up right 22 | [ l/2, -h, w/2], # front down right 23 | [ l/2, -h, -w/2], # front down left 24 | [-l/2, -h, -w/2], # back down left 25 | [-l/2, -h, w/2]]) # back down right 26 | r_corners = corners.dot(np.transpose(R)) 27 | cam_points_xyz = r_corners+np.array([x3d, y3d, 
z3d]) 28 | all_corners.append(cam_points_xyz) 29 | return np.array(all_corners) 30 | 31 | def overlapped_boxes_3d(single_box, box_list): 32 | x0_max, y0_max, z0_max = np.max(single_box, axis=0) 33 | x0_min, y0_min, z0_min = np.min(single_box, axis=0) 34 | overlap = np.zeros(len(box_list)) 35 | for i, box in enumerate(box_list): 36 | x_max, y_max, z_max = np.max(box, axis=0) 37 | x_min, y_min, z_min = np.min(box, axis=0) 38 | if x0_max < x_min or x0_min > x_max: 39 | overlap[i] = 0 40 | continue 41 | if y0_max < y_min or y0_min > y_max: 42 | overlap[i] = 0 43 | continue 44 | if z0_max < z_min or z0_min > z_max: 45 | overlap[i] = 0 46 | continue 47 | x_draw_min = min(x0_min, x_min) 48 | x_draw_max = max(x0_max, x_max) 49 | z_draw_min = min(z0_min, z_min) 50 | z_draw_max = max(z0_max, z_max) 51 | offset = np.array([x_draw_min, z_draw_min]) 52 | buf1 = np.zeros((z_draw_max-z_draw_min, x_draw_max-x_draw_min), 53 | dtype=np.int32) 54 | buf2 = np.zeros_like(buf1) 55 | cv2.fillPoly(buf1, [single_box[:4, [0,2]]-offset], color=1) 56 | cv2.fillPoly(buf2, [box[:4, [0,2]]-offset], color=1) 57 | shared_area = cv2.countNonZero(buf1*buf2) 58 | area1 = cv2.countNonZero(buf1) 59 | area2 = cv2.countNonZero(buf2) 60 | shared_y = min(y_max, y0_max) - max(y_min, y0_min) 61 | intersection = shared_y * shared_area 62 | union = (y_max-y_min) * area2 + (y0_max-y0_min) * area1 63 | overlap[i] = np.float32(intersection) / (union - intersection) 64 | return overlap 65 | 66 | def overlapped_boxes_3d_fast_poly(single_box, box_list): 67 | single_box_max_corner = np.max(single_box, axis=0) 68 | single_box_min_corner = np.min(single_box, axis=0) 69 | x0_max, y0_max, z0_max = single_box_max_corner 70 | x0_min, y0_min, z0_min = single_box_min_corner 71 | max_corner = np.max(box_list, axis=1) 72 | min_corner = np.min(box_list, axis=1) 73 | overlap = np.zeros(len(box_list)) 74 | non_overlap_mask = np.logical_or(single_box_max_corner < min_corner, 75 | single_box_min_corner > max_corner) 76 | non_overlap_mask = np.any(non_overlap_mask, axis=1) 77 | p1 = Polygon(single_box[:4, [0,2]]) 78 | area1 = p1.area 79 | for i in range(len(box_list)): 80 | if not non_overlap_mask[i]: 81 | x_max, y_max, z_max = max_corner[i] 82 | x_min, y_min, z_min = min_corner[i] 83 | p2 = Polygon(box_list[i][:4, [0,2]]) 84 | shared_area = p1.intersection(p2).area 85 | area2 = p2.area 86 | shared_y = min(y_max, y0_max) - max(y_min, y0_min) 87 | intersection = shared_y * shared_area 88 | union = (y_max-y_min) * area2 + (y0_max-y0_min) * area1 89 | overlap[i] = np.float32(intersection) / (union - intersection) 90 | return overlap 91 | 92 | 93 | def bboxes_sort(classes, scores, bboxes, top_k=400, attributes=None): 94 | """Sort bounding boxes by decreasing order and keep only the top_k 95 | """ 96 | idxes = np.argsort(-scores) 97 | classes = classes[idxes] 98 | scores = scores[idxes] 99 | bboxes = bboxes[idxes] 100 | if attributes is not None: 101 | attributes = attributes[idxes] 102 | if top_k > 0: 103 | if len(idxes) > top_k: 104 | classes = classes[:top_k] 105 | scores = scores[:top_k] 106 | bboxes = bboxes[:top_k] 107 | if attributes is not None: 108 | attributes = attributes[:top_k] 109 | return classes, scores, bboxes, attributes 110 | 111 | 112 | def bboxes_nms_merge_only(classes, scores, bboxes, scores_threshold=0.25, 113 | nms_threshold=0.45, overlapped_fn=overlapped_boxes_3d_fast_poly, appr_factor=10.0, 114 | attributes=None): 115 | """Apply non-maximum selection to bounding boxes. 
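    Boxes of the same class whose overlap with a higher-scoring box exceeds
    ``nms_threshold`` are not discarded outright: the kept box is replaced by
    the coordinate-wise median of itself and every box it suppresses, so the
    step acts as box voting on top of NMS. Returns the kept
    (classes, scores, bboxes, idxes).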
116 | """ 117 | boxes_corners = boxes_3d_to_corners(bboxes) 118 | # convert to pixels 119 | keep_bboxes = np.ones(scores.shape, dtype=np.bool) 120 | for i in range(scores.size-1): 121 | if keep_bboxes[i]: 122 | # Only compute on the rest of bboxes 123 | valid = keep_bboxes[(i+1):] 124 | # Computer overlap with bboxes which are following. 125 | overlap = overlapped_fn(boxes_corners[i], 126 | boxes_corners[(i+1):][valid]) 127 | # Overlap threshold for keeping + checking part of the same class 128 | remove_overlap = np.logical_and(overlap > nms_threshold, 129 | classes[(i+1):][valid] == classes[i]) 130 | overlaped_bboxes = np.concatenate( 131 | [bboxes[(i+1):][valid][remove_overlap], bboxes[[i]]], axis=0) 132 | boxes_mean = np.median(overlaped_bboxes, axis=0) 133 | # boxes_mean = np.mean(overlaped_bboxes, axis=0) 134 | bboxes[i][:] = boxes_mean[:] 135 | keep_bboxes[(i+1):][valid] = np.logical_not(remove_overlap)## 136 | 137 | idxes = np.where(keep_bboxes) 138 | classes = classes[idxes] 139 | scores = scores[idxes] 140 | bboxes = bboxes[idxes] 141 | if attributes is not None: 142 | attributes = attributes[idxes] 143 | return classes, scores, bboxes, idxes, #attributes 144 | 145 | def nms_boxes_3d_merge_only(class_labels, detection_boxes_3d, detection_scores, 146 | overlapped_thres=0.5, overlapped_fn=overlapped_boxes_3d_fast_poly, appr_factor=10.0, 147 | top_k=-1, attributes=None): 148 | class_labels, detection_scores, detection_boxes_3d, attributes = \ 149 | bboxes_sort( 150 | class_labels, detection_scores, detection_boxes_3d, top_k=top_k, 151 | attributes=attributes) 152 | # nms 153 | class_labels, detection_scores, detection_boxes_3d, attributes = \ 154 | bboxes_nms_merge_only( 155 | class_labels, detection_scores, detection_boxes_3d, 156 | nms_threshold=overlapped_thres, overlapped_fn=overlapped_fn, 157 | appr_factor=appr_factor, attributes=attributes) 158 | return class_labels, detection_boxes_3d, detection_scores, attributes 159 | -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/core/bbox/coders/__init__.py: -------------------------------------------------------------------------------- 1 | from .nms_free_coder import NMSFreeCoder 2 | 3 | __all__ = ['NMSFreeCoder'] 4 | -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/core/bbox/coders/nms_free_coder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from mmdet.core.bbox import BaseBBoxCoder 4 | from mmdet.core.bbox.builder import BBOX_CODERS 5 | from projects.mmdet3d_plugin.core.bbox.util import denormalize_bbox 6 | from mmdet.models.utils.transformer import inverse_sigmoid 7 | 8 | 9 | @BBOX_CODERS.register_module() 10 | class NMSFreeCoder(BaseBBoxCoder): 11 | """Bbox coder for NMS-free detector. 12 | Args: 13 | pc_range (list[float]): Range of point cloud. 14 | post_center_range (list[float]): Limit of the center. 15 | Default: None. 16 | max_num (int): Max number to be kept. Default: 100. 17 | score_threshold (float): Threshold to filter boxes based on score. 18 | Default: None. 19 | code_size (int): Code size of bboxes. 
Default: 9 20 | """ 21 | 22 | def __init__(self, 23 | pc_range, 24 | voxel_size=None, 25 | post_center_range=None, 26 | max_num=100, 27 | score_threshold=None, 28 | alpha=0.5, 29 | num_classes=10): 30 | 31 | self.pc_range = pc_range 32 | self.voxel_size = voxel_size 33 | self.post_center_range = post_center_range 34 | self.max_num = max_num 35 | self.score_threshold = score_threshold 36 | self.num_classes = num_classes 37 | self.alpha = alpha 38 | 39 | def encode(self): 40 | pass 41 | 42 | def decode_single(self, cls_scores, bbox_preds, all_iou_preds): 43 | """Decode bboxes. 44 | Args: 45 | cls_scores (Tensor): Outputs from the classification head, \ 46 | shape [num_query, cls_out_channels]. Note \ 47 | cls_out_channels should includes background. 48 | bbox_preds (Tensor): Outputs from the regression \ 49 | head with normalized coordinate format (cx, cy, w, l, cz, h, rot_sine, rot_cosine, vx, vy). \ 50 | Shape [num_query, 9]. 51 | Returns: 52 | list[dict]: Decoded boxes. 53 | """ 54 | max_num = self.max_num 55 | #max_num = cls_scores.numel() 56 | 57 | cls_scores = cls_scores.sigmoid() 58 | ious = all_iou_preds.sigmoid() 59 | 60 | scores, indexs = cls_scores.view(-1).topk(max_num) 61 | labels = indexs % self.num_classes 62 | bbox_index = indexs // self.num_classes 63 | bbox_preds = bbox_preds[bbox_index] 64 | 65 | final_box_preds = denormalize_bbox(bbox_preds, self.pc_range) 66 | final_scores = scores 67 | final_preds = labels 68 | 69 | all_iou_preds = all_iou_preds.sigmoid() 70 | final_ious = all_iou_preds[bbox_index] 71 | 72 | # use score threshold 73 | if self.score_threshold is not None: 74 | thresh_mask = final_scores > self.score_threshold 75 | if self.post_center_range is not None: 76 | # self.post_center_range = torch.tensor(self.post_center_range, device=scores.device) 77 | self.post_center_range = scores.new_tensor(self.post_center_range) 78 | mask = (final_box_preds[..., :3] >= 79 | self.post_center_range[:3]).all(1) 80 | mask &= (final_box_preds[..., :3] <= 81 | self.post_center_range[3:]).all(1) 82 | 83 | if self.score_threshold: 84 | mask &= thresh_mask 85 | 86 | boxes3d = final_box_preds[mask] 87 | scores = final_scores[mask] 88 | labels = final_preds[mask] 89 | ious = final_ious[mask] 90 | 91 | predictions_dict = { 92 | 'bboxes': boxes3d, 93 | #'scores': scores, 94 | 'scores': scores ** self.alpha * ious.reshape(-1) ** (1-self.alpha), 95 | 'labels': labels, 96 | 'ious': ious.reshape(-1), 97 | } 98 | 99 | else: 100 | raise NotImplementedError( 101 | 'Need to reorganize output as a batch, only ' 102 | 'support post_center_range is not None for now!') 103 | return predictions_dict 104 | 105 | def decode(self, preds_dicts): 106 | """Decode bboxes. 107 | Args: 108 | all_cls_scores (Tensor): Outputs from the classification head, \ 109 | shape [nb_dec, bs, num_query, cls_out_channels]. Note \ 110 | cls_out_channels should includes background. 111 | all_bbox_preds (Tensor): Sigmoid outputs from the regression \ 112 | head with normalized coordinate format (cx, cy, w, l, cz, h, rot_sine, rot_cosine, vx, vy). \ 113 | Shape [nb_dec, bs, num_query, 9]. 114 | Returns: 115 | list[dict]: Decoded boxes. 
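                Each per-sample dict carries the decoded 'bboxes', the fused
                'scores' (classification score combined with the predicted IoU
                through the ``alpha`` exponent), 'labels' and 'ious'.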
116 | """ 117 | # all_cls_scores = preds_dicts['all_cls_scores'][-1] 118 | # all_bbox_preds = preds_dicts['all_bbox_preds'][-1] 119 | # all_iou_preds = preds_dicts['all_iou_preds'][-1] 120 | 121 | all_cls_scores = torch.mean(preds_dicts['all_cls_scores'][1:], 0) 122 | all_bbox_preds = torch.mean(preds_dicts['all_bbox_preds'][1:], 0) 123 | all_iou_preds = torch.mean(preds_dicts['all_iou_preds'][1:], 0) 124 | 125 | #all_centerness_preds = torch.mean(preds_dicts['all_centerness_preds'][1:], 0) 126 | # all_cls_scores = torch.mean(preds_dicts['all_cls_scores'], 0) 127 | # all_bbox_preds = torch.mean(preds_dicts['all_bbox_preds'], 0) 128 | # all_cls_scores = 0. * preds_dicts['all_cls_scores'][0] + 0.4 * preds_dicts['all_cls_scores'][1] + 0.6 * preds_dicts['all_cls_scores'][2] 129 | # all_bbox_preds = 0. * preds_dicts['all_bbox_preds'][0] + 0.4 * preds_dicts['all_bbox_preds'][1] + 0.6 * preds_dicts['all_bbox_preds'][2] 130 | 131 | batch_size = all_cls_scores.size()[0] 132 | predictions_list = [] 133 | for i in range(batch_size): 134 | predictions_list.append(self.decode_single(all_cls_scores[i], all_bbox_preds[i], all_iou_preds[i])) 135 | #predictions_list.append(self.decode_single(all_cls_scores[i], all_bbox_preds[i], all_iou_preds[i], all_centerness_preds[i])) 136 | return predictions_list 137 | -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/core/bbox/match_costs/__init__.py: -------------------------------------------------------------------------------- 1 | from mmdet.core.bbox.match_costs import build_match_cost 2 | from .match_cost import BBox3DL1Cost, RotatedIoU3DCost, AxisAlignedIoU3DCost, RDIoUCost, SoftFocalLossCost 3 | 4 | __all__ = ['build_match_cost', 'BBox3DL1Cost', 'RotatedIoU3DCost', 'AxisAlignedIoU3DCost', 'RDIoUCost', 'SoftFocalLossCost'] -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/core/bbox/match_costs/match_cost.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from mmdet.core.bbox.match_costs.builder import MATCH_COST 3 | from mmcv.ops import diff_iou_rotated_3d 4 | from mmdet3d.core.bbox import AxisAlignedBboxOverlaps3D 5 | from projects.mmdet3d_plugin.core.bbox.util import get_rdiou 6 | from mmdet3d.core.bbox.iou_calculators.iou3d_calculator import bbox_overlaps_3d, bbox_overlaps_nearest_3d 7 | import torch.nn.functional as F 8 | 9 | @MATCH_COST.register_module() 10 | class BBox3DL1Cost(object): 11 | """BBox3DL1Cost. 12 | Args: 13 | weight (int | float, optional): loss_weight 14 | """ 15 | 16 | def __init__(self, weight=1.): 17 | self.weight = weight 18 | 19 | def __call__(self, bbox_pred, gt_bboxes): 20 | """ 21 | Args: 22 | bbox_pred (Tensor): Predicted boxes with normalized coordinates 23 | (cx, cy, w, h), which are all in range [0, 1]. Shape 24 | [num_query, 4]. 25 | gt_bboxes (Tensor): Ground truth boxes with normalized 26 | coordinates (x1, y1, x2, y2). Shape [num_gt, 4]. 27 | Returns: 28 | torch.Tensor: bbox_cost value with weight 29 | """ 30 | bbox_cost = torch.cdist(bbox_pred, gt_bboxes, p=1) 31 | return bbox_cost * self.weight 32 | 33 | 34 | @MATCH_COST.register_module() 35 | class RotatedIoU3DCost(object): 36 | 37 | def __init__(self, weight=1.): 38 | self.weight = weight 39 | 40 | def __call__(self, bbox_pred, gt_bboxes): 41 | """ 42 | Args: 43 | bbox_pred (Tensor): Predicted boxes with normalized coordinates 44 | (cx, cy, w, h), which are all in range [0, 1]. Shape 45 | [num_query, 4]. 
46 | gt_bboxes (Tensor): Ground truth boxes with normalized 47 | coordinates (x1, y1, x2, y2). Shape [num_gt, 4]. 48 | Returns: 49 | torch.Tensor: bbox_cost value with weight 50 | """ 51 | #print(bbox_pred.shape, gt_bboxes.shape) 52 | N = gt_bboxes.shape[0] 53 | M = bbox_pred.shape[0] 54 | bbox_costs = [diff_iou_rotated_3d(bbox_pred.unsqueeze(0), gt_bboxes[[i], :].repeat(M, 1).unsqueeze(0))[0].unsqueeze(1) for i in range(N)] 55 | bbox_cost = torch.cat(bbox_costs, 1) 56 | 57 | return bbox_cost * self.weight 58 | 59 | 60 | @MATCH_COST.register_module() 61 | class AxisAlignedIoU3DCost(object): 62 | 63 | def __init__(self, weight=1.): 64 | self.weight = weight 65 | 66 | def __call__(self, bbox_pred, gt_bboxes): 67 | axis_aligned_iou = AxisAlignedBboxOverlaps3D()(bbox_pred, gt_bboxes) 68 | iou_loss = - axis_aligned_iou 69 | return iou_loss * self.weight 70 | 71 | @MATCH_COST.register_module() 72 | class RDIoUCost(object): 73 | 74 | def __init__(self, weight=1.): 75 | self.weight = weight 76 | 77 | def __call__(self, bbox_pred, gt_bboxes): 78 | u, rdiou = get_rdiou(bbox_pred.unsqueeze(1), gt_bboxes.unsqueeze(0)) 79 | 80 | rdiou_loss_n = rdiou - u 81 | rdiou_loss_n = torch.clamp(rdiou_loss_n,min=-1.0,max = 1.0) 82 | rdiou_loss_n = 1 - rdiou_loss_n 83 | return rdiou_loss_n * self.weight 84 | 85 | @MATCH_COST.register_module() 86 | class IoU3DCost(object): 87 | 88 | def __init__(self, weight=1.): 89 | self.weight = weight 90 | 91 | def __call__(self, bbox_pred, gt_bboxes): 92 | #iou3d = 1 - bbox_overlaps_3d(bbox_pred, gt_bboxes, coordinate='depth') 93 | #iou3d = (1 - bbox_overlaps_nearest_3d(bbox_pred, gt_bboxes, coordinate='depth') ) 94 | iou3d = (1 - bbox_overlaps_nearest_3d(bbox_pred, gt_bboxes, coordinate='lidar') ) ############ 95 | #iou3d += (1 - bbox_overlaps_nearest_3d(bbox_pred[:, [0,2,1,3,5,4,6]], gt_bboxes[:, [0,2,1,3,5,4,6]], coordinate='depth') ) * 0.1 96 | #iou3d += (1 - bbox_overlaps_nearest_3d(bbox_pred[:, [1,2,0,4,5,3,6]], gt_bboxes[:, [1,2,0,4,5,3,6]], coordinate='depth') ) * 0.1 97 | return iou3d * self.weight 98 | 99 | 100 | @MATCH_COST.register_module() 101 | class SoftFocalLossCost(object): 102 | 103 | def __init__(self, 104 | weight=1., 105 | alpha=0.25, 106 | gamma=2, 107 | eps=1e-12, 108 | binary_input=False): 109 | self.weight = weight 110 | self.alpha = alpha 111 | self.gamma = gamma 112 | self.eps = eps 113 | self.binary_input = binary_input 114 | 115 | 116 | def __call__(self, cls_pred, gt_labels, iou3d): 117 | 118 | cls_pred = cls_pred.sigmoid() 119 | 120 | iou3d = iou3d.pow(0.001) 121 | neg_cost = -(1 - cls_pred * iou3d + self.eps).log() * ( 122 | 1 - self.alpha) * (cls_pred * iou3d).pow(self.gamma) 123 | 124 | pos_cost = -(cls_pred * iou3d + self.eps).log() * self.alpha * ( 125 | 1 - cls_pred * iou3d).pow(self.gamma) 126 | 127 | cls_cost = pos_cost[:, gt_labels] - neg_cost[:, gt_labels] 128 | 129 | return cls_cost * self.weight -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/core/bbox/util.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | import mmdet3d 4 | 5 | __mmdet3d_version__ = float(mmdet3d.__version__[:3]) 6 | 7 | 8 | def normalize_bbox(bboxes, pc_range=None): 9 | 10 | cx = bboxes[..., 0:1] 11 | cy = bboxes[..., 1:2] 12 | cz = bboxes[..., 2:3] 13 | # align coord system with previous version 14 | if __mmdet3d_version__ < 1.0: 15 | # w = bboxes[..., 3:4] 16 | # l = bboxes[..., 4:5] 17 | # h = bboxes[..., 5:6] 18 | w = 
bboxes[..., 3:4].log() 19 | l = bboxes[..., 4:5].log() 20 | h = bboxes[..., 5:6].log() 21 | rot = bboxes[..., 6:7] 22 | else: 23 | # l = bboxes[..., 3:4] 24 | # w = bboxes[..., 4:5] 25 | # h = bboxes[..., 5:6] 26 | l = (bboxes[..., 3:4] + 1e-5).log() 27 | w = (bboxes[..., 4:5] + 1e-5).log() 28 | h = (bboxes[..., 5:6] + 1e-5).log() 29 | rot = bboxes[..., 6:7] 30 | rot = -rot - np.pi / 2 31 | 32 | if bboxes.size(-1) > 7: 33 | vx = bboxes[..., 7:8] 34 | vy = bboxes[..., 8:9] 35 | normalized_bboxes = torch.cat( 36 | (cx, cy, w, l, cz, h, rot.sin(), rot.cos(), vx, vy), dim=-1 37 | ) 38 | else: 39 | normalized_bboxes = torch.cat( 40 | (cx, cy, w, l, cz, h, rot.sin(), rot.cos()), dim=-1 41 | ) 42 | return normalized_bboxes 43 | 44 | def denormalize_bbox(normalized_bboxes, pc_range=None, version=0.8): 45 | # rotation 46 | rot_sine = normalized_bboxes[..., 6:7] 47 | 48 | rot_cosine = normalized_bboxes[..., 7:8] 49 | rot = torch.atan2(rot_sine, rot_cosine) 50 | 51 | # align coord system with previous version 52 | if __mmdet3d_version__ >= 1.0: 53 | rot = -rot - np.pi / 2 54 | # center in the bev 55 | cx = normalized_bboxes[..., 0:1] 56 | cy = normalized_bboxes[..., 1:2] 57 | cz = normalized_bboxes[..., 4:5] 58 | 59 | # size 60 | w = normalized_bboxes[..., 2:3] 61 | l = normalized_bboxes[..., 3:4] 62 | h = normalized_bboxes[..., 5:6] 63 | 64 | w = w.exp() 65 | l = l.exp() 66 | h = h.exp() 67 | if normalized_bboxes.size(-1) > 8: 68 | # velocity 69 | vx = normalized_bboxes[..., 8:9] 70 | vy = normalized_bboxes[..., 9:10] 71 | if __mmdet3d_version__ < 1.0: 72 | denormalized_bboxes = torch.cat([cx, cy, cz, w, l, h, rot, vx, vy], dim=-1) 73 | else: 74 | denormalized_bboxes = torch.cat([cx, cy, cz, l, w, h, rot, vx, vy], dim=-1) 75 | else: 76 | if __mmdet3d_version__ < 1.0: 77 | denormalized_bboxes = torch.cat([cx, cy, cz, w, l, h, rot], dim=-1) 78 | else: 79 | denormalized_bboxes = torch.cat([cx, cy, cz, l, w, h, rot], dim=-1) 80 | return denormalized_bboxes 81 | 82 | def bbox3d_mapping_back(bboxes, rot_degree, scale_factor, flip_horizontal, flip_vertical): 83 | """Map bboxes from testing scale to original image scale. 84 | 85 | Args: 86 | bboxes (:obj:`BaseInstance3DBoxes`): Boxes to be mapped back. 87 | scale_factor (float): Scale factor. 88 | flip_horizontal (bool): Whether to flip horizontally. 89 | flip_vertical (bool): Whether to flip vertically. 90 | 91 | Returns: 92 | :obj:`BaseInstance3DBoxes`: Boxes mapped back. 93 | """ 94 | new_bboxes = bboxes.clone() 95 | if flip_horizontal: 96 | new_bboxes.flip('horizontal') 97 | if flip_vertical: 98 | new_bboxes.flip('vertical') 99 | new_bboxes.scale(1 / scale_factor) 100 | new_bboxes.rotate(-rot_degree) 101 | 102 | return new_bboxes 103 | 104 | def get_rdiou(bboxes1, bboxes2): 105 | x1u, y1u, z1u = bboxes1[:,:,0], bboxes1[:,:,1], bboxes1[:,:,2] 106 | l1, w1, h1 = torch.exp(bboxes1[:,:,3]), torch.exp(bboxes1[:,:,4]), torch.exp(bboxes1[:,:,5]) 107 | t1 = torch.sin(bboxes1[:,:,6]) * torch.cos(bboxes2[:,:,6]) 108 | x2u, y2u, z2u = bboxes2[:,:,0], bboxes2[:,:,1], bboxes2[:,:,2] 109 | l2, w2, h2 = torch.exp(bboxes2[:,:,3]), torch.exp(bboxes2[:,:,4]), torch.exp(bboxes2[:,:,5]) 110 | t2 = torch.cos(bboxes1[:,:,6]) * torch.sin(bboxes2[:,:,6]) 111 | 112 | # we emperically scale the y/z to make their predictions more sensitive. 113 | x1 = x1u 114 | y1 = y1u * 2 115 | z1 = z1u * 2 116 | x2 = x2u 117 | y2 = y2u * 2 118 | z2 = z2u * 2 119 | 120 | # clamp is necessray to aviod inf. 
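    # (exp() of a large raw size prediction would otherwise blow up).
    # RDIoU treats the decoupled rotation term t as a fourth box axis with a
    # fixed unit extent j, so the intersection / union / enclosing-box terms
    # below are computed over a 4D box (x, y, z, t); u is the DIoU-style
    # center-distance penalty normalised by the enclosing diagonal.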
121 | l1, w1, h1 = torch.clamp(l1, max=10), torch.clamp(w1, max=10), torch.clamp(h1, max=10) 122 | j1, j2 = torch.ones_like(h2), torch.ones_like(h2) 123 | 124 | volume_1 = l1 * w1 * h1 * j1 125 | volume_2 = l2 * w2 * h2 * j2 126 | 127 | inter_l = torch.max(x1 - l1 / 2, x2 - l2 / 2) 128 | inter_r = torch.min(x1 + l1 / 2, x2 + l2 / 2) 129 | inter_t = torch.max(y1 - w1 / 2, y2 - w2 / 2) 130 | inter_b = torch.min(y1 + w1 / 2, y2 + w2 / 2) 131 | inter_u = torch.max(z1 - h1 / 2, z2 - h2 / 2) 132 | inter_d = torch.min(z1 + h1 / 2, z2 + h2 / 2) 133 | inter_m = torch.max(t1 - j1 / 2, t2 - j2 / 2) 134 | inter_n = torch.min(t1 + j1 / 2, t2 + j2 / 2) 135 | 136 | inter_volume = torch.clamp((inter_r - inter_l),min=0) * torch.clamp((inter_b - inter_t),min=0) \ 137 | * torch.clamp((inter_d - inter_u),min=0) * torch.clamp((inter_n - inter_m),min=0) 138 | 139 | c_l = torch.min(x1 - l1 / 2,x2 - l2 / 2) 140 | c_r = torch.max(x1 + l1 / 2,x2 + l2 / 2) 141 | c_t = torch.min(y1 - w1 / 2,y2 - w2 / 2) 142 | c_b = torch.max(y1 + w1 / 2,y2 + w2 / 2) 143 | c_u = torch.min(z1 - h1 / 2,z2 - h2 / 2) 144 | c_d = torch.max(z1 + h1 / 2,z2 + h2 / 2) 145 | c_m = torch.min(t1 - j1 / 2,t2 - j2 / 2) 146 | c_n = torch.max(t1 + j1 / 2,t2 + j2 / 2) 147 | 148 | inter_diag = (x2 - x1)**2 + (y2 - y1)**2 + (z2 - z1)**2 + (t2 - t1)**2 149 | c_diag = torch.clamp((c_r - c_l),min=0)**2 + torch.clamp((c_b - c_t),min=0)**2 + torch.clamp((c_d - c_u),min=0)**2 + torch.clamp((c_n - c_m),min=0)**2 150 | 151 | union = volume_1 + volume_2 - inter_volume 152 | u = (inter_diag) / c_diag 153 | rdiou = inter_volume / union 154 | return u, rdiou -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/core/merge_all_augs.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | import torch 3 | 4 | #from mmdet3d.ops.iou3d.iou3d_utils import nms_gpu, nms_normal_gpu 5 | from mmdet3d.core.bbox import bbox3d2result, xywhr2xyxyr 6 | from .bbox.util import bbox3d_mapping_back 7 | from mmdet3d.core.post_processing import nms_bev, nms_normal_bev 8 | 9 | def merge_all_aug_bboxes_3d(aug_results, img_metas, test_cfg): 10 | """Merge augmented detection 3D bboxes and scores. 11 | 12 | Args: 13 | aug_results (list[dict]): The dict of detection results. 14 | The dict contains the following keys 15 | 16 | - boxes_3d (:obj:`BaseInstance3DBoxes`): Detection bbox. 17 | - scores_3d (torch.Tensor): Detection scores. 18 | - labels_3d (torch.Tensor): Predicted box labels. 19 | img_metas (list[dict]): Meta information of each sample. 20 | test_cfg (dict): Test config. 21 | 22 | Returns: 23 | dict: Bounding boxes results in cpu mode, containing merged results. 24 | 25 | - boxes_3d (:obj:`BaseInstance3DBoxes`): Merged detection bbox. 26 | - scores_3d (torch.Tensor): Merged detection scores. 27 | - labels_3d (torch.Tensor): Merged predicted box labels. 
28 | """ 29 | 30 | assert len(aug_results) == len(img_metas), \ 31 | '"aug_results" should have the same length as "img_metas", got len(' \ 32 | f'aug_results)={len(aug_results)} and len(img_metas)={len(img_metas)}' 33 | 34 | recovered_bboxes = [] 35 | recovered_scores = [] 36 | recovered_labels = [] 37 | 38 | for bboxes, img_info in zip(aug_results, img_metas): 39 | scale_factor = img_info[0]['pcd_scale_factor'] 40 | # print(bboxes) 41 | rotate_degree = img_info[0].get('rot_degree', torch.tensor(0., device=bboxes['scores_3d'].device)) #img_info[0]['rot_degree'] 42 | pcd_horizontal_flip = img_info[0]['pcd_horizontal_flip'] 43 | pcd_vertical_flip = img_info[0]['pcd_vertical_flip'] 44 | # print(bboxes) 45 | recovered_scores.append(bboxes['scores_3d']) 46 | recovered_labels.append(bboxes['labels_3d']) 47 | bboxes = bbox3d_mapping_back(bboxes['boxes_3d'], rotate_degree, scale_factor, #boxes_3d 48 | pcd_horizontal_flip, pcd_vertical_flip) 49 | recovered_bboxes.append(bboxes) 50 | 51 | aug_bboxes = recovered_bboxes[0].cat(recovered_bboxes) 52 | aug_bboxes_for_nms = xywhr2xyxyr(aug_bboxes.bev) 53 | aug_scores = torch.cat(recovered_scores, dim=0) 54 | aug_labels = torch.cat(recovered_labels, dim=0) 55 | 56 | # TODO: use a more elegent way to deal with nms 57 | if True: #test_cfg.use_rotate_nms: 58 | nms_func = nms_bev #nms_gpu 59 | else: 60 | nms_func = nms_normal_gpu 61 | 62 | merged_bboxes = [] 63 | merged_scores = [] 64 | merged_labels = [] 65 | 66 | # Apply multi-class nms when merge bboxes 67 | if len(aug_labels) == 0: 68 | return bbox3d2result(aug_bboxes, aug_scores, aug_labels) 69 | 70 | for class_id in range(int(torch.max(aug_labels).item() + 1)): 71 | # print(aug_labels) 72 | class_inds = (aug_labels == class_id) 73 | bboxes_i = aug_bboxes[class_inds] 74 | bboxes_nms_i = aug_bboxes_for_nms[class_inds, :] 75 | scores_i = aug_scores[class_inds] 76 | labels_i = aug_labels[class_inds] 77 | if len(bboxes_nms_i) == 0: 78 | continue 79 | selected = nms_func(bboxes_nms_i, scores_i, 0.1) #test_cfg.nms_thr) 80 | # print('bbb', selected) 81 | merged_bboxes.append(bboxes_i[selected, :]) 82 | merged_scores.append(scores_i[selected]) 83 | merged_labels.append(labels_i[selected]) 84 | 85 | # print(merged_bboxes) 86 | merged_bboxes = merged_bboxes[0].cat(merged_bboxes) 87 | merged_scores = torch.cat(merged_scores, dim=0) 88 | merged_labels = torch.cat(merged_labels, dim=0) 89 | 90 | _, order = merged_scores.sort(0, descending=True) 91 | num = min(500, len(aug_bboxes)) # min(test_cfg.max_num, len(aug_bboxes)) 92 | order = order[:num] 93 | 94 | merged_bboxes = merged_bboxes[order] 95 | merged_scores = merged_scores[order] 96 | merged_labels = merged_labels[order] 97 | 98 | return bbox3d2result(merged_bboxes, merged_scores, merged_labels) 99 | -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | from .nuscenes_dataset import NuScenesSweepDataset 2 | from .sunrgbd_dataset_ov import SUNRGBDDataset_OV 3 | 4 | __all__ = [ 5 | 'NuScenesSweepDataset', 'SUNRGBDDataset_OV' 6 | ] 7 | -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/datasets/pipelines/__init__.py: -------------------------------------------------------------------------------- 1 | from .transform_3d import ( 2 | PadMultiViewImage, NormalizeMultiviewImage, 3 | PhotoMetricDistortionMultiViewImage, 4 | RandomScaleImageMultiViewImage, 5 | 
ImageRandomResizeCropFlip, 6 | UnifiedRandomFlip3D, UnifiedRotScaleTrans) 7 | from .loading_3d import (LoadMultiViewMultiSweepImageFromFiles, LoadMultiViewMultiSweepImageFromFilesIndoor) 8 | from .dbsampler import UnifiedDataBaseSampler 9 | from .formatting import CollectUnified3D 10 | from .test_time_aug import MultiRotScaleFlipAug3D 11 | 12 | __all__ = [ 13 | 'PadMultiViewImage', 'NormalizeMultiviewImage', 14 | 'PhotoMetricDistortionMultiViewImage', 'LoadMultiViewMultiSweepImageFromFilesIndoor', 15 | 'RandomScaleImageMultiViewImage', 'ImageRandomResizeCropFlip', 16 | 'LoadMultiViewMultiSweepImageFromFiles', 17 | 'UnifiedRandomFlip3D', 'UnifiedRotScaleTrans', 'UnifiedDataBaseSampler', 18 | 'MultiRotScaleFlipAug3D' 19 | ] -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/datasets/pipelines/formatting.py: -------------------------------------------------------------------------------- 1 | from mmdet.datasets.builder import PIPELINES 2 | from mmcv.parallel import DataContainer as DC 3 | 4 | @PIPELINES.register_module() 5 | class CollectUnified3D(object): 6 | """Collect data from the loader relevant to the specific task. 7 | 8 | This is usually the last stage of the data loader pipeline. Typically keys 9 | is set to some subset of "img", "proposals", "gt_bboxes", 10 | "gt_bboxes_ignore", "gt_labels", and/or "gt_masks". 11 | 12 | The "img_meta" item is always populated. The contents of the "img_meta" 13 | dictionary depends on "meta_keys". By default this includes: 14 | 15 | - 'img_shape': shape of the image input to the network as a tuple \ 16 | (h, w, c). Note that images may be zero padded on the \ 17 | bottom/right if the batch tensor is larger than this shape. 18 | - 'scale_factor': a float indicating the preprocessing scale 19 | - 'flip': a boolean indicating if image flip transform was used 20 | - 'filename': path to the image file 21 | - 'ori_shape': original shape of the image as a tuple (h, w, c) 22 | - 'pad_shape': image shape after padding 23 | - 'lidar2img': transform from lidar to image 24 | - 'depth2img': transform from depth to image 25 | - 'cam2img': transform from camera to image 26 | - 'pcd_horizontal_flip': a boolean indicating if point cloud is \ 27 | flipped horizontally 28 | - 'pcd_vertical_flip': a boolean indicating if point cloud is \ 29 | flipped vertically 30 | - 'box_mode_3d': 3D box mode 31 | - 'box_type_3d': 3D box type 32 | - 'img_norm_cfg': a dict of normalization information: 33 | - mean: per channel mean subtraction 34 | - std: per channel std divisor 35 | - to_rgb: bool indicating if bgr was converted to rgb 36 | - 'pcd_trans': point cloud transformations 37 | - 'sample_idx': sample index 38 | - 'pcd_scale_factor': point cloud scale factor 39 | - 'pcd_rotation': rotation applied to point cloud 40 | - 'pts_filename': path to point cloud file. 41 | 42 | Args: 43 | keys (Sequence[str]): Keys of results to be collected in ``data``. 44 | meta_keys (Sequence[str], optional): Meta keys to be converted to 45 | ``mmcv.DataContainer`` and collected in ``data[img_metas]``. 
46 | Default: ('filename', 'ori_shape', 'img_shape', 'lidar2img', 47 | 'depth2img', 'cam2img', 'pad_shape', 'scale_factor', 'flip', 48 | 'pcd_horizontal_flip', 'pcd_vertical_flip', 'box_mode_3d', 49 | 'box_type_3d', 'img_norm_cfg', 'pcd_trans', 50 | 'sample_idx', 'pcd_scale_factor', 'pcd_rotation', 'pts_filename') 51 | """ 52 | 53 | def __init__(self, 54 | keys, 55 | meta_keys=('filename', 'ori_shape', 'img_shape', 'lidar2img', 56 | 'depth2img', 'cam2img', 'pad_shape', 57 | 'scale_factor', 'flip', 'pcd_horizontal_flip', 58 | 'pcd_vertical_flip', 'box_mode_3d', 'box_type_3d', 59 | 'img_norm_cfg', 'pcd_trans', 'sample_idx', 60 | 'pcd_scale_factor', 'pcd_rotation', 'pts_filename', 61 | 'transformation_3d_flow', 'sweeps_paths', 'sweeps_ids', 62 | 'sweeps_time', 'uni_rot_aug', 'uni_trans_aug', 'uni_flip_aug', 63 | 'img_rot_aug', 'img_trans_aug', 'rot_degree')): 64 | self.keys = keys 65 | self.meta_keys = meta_keys 66 | 67 | def __call__(self, results): 68 | """Call function to collect keys in results. The keys in ``meta_keys`` 69 | will be converted to :obj:`mmcv.DataContainer`. 70 | 71 | Args: 72 | results (dict): Result dict contains the data to collect. 73 | 74 | Returns: 75 | dict: The result dict contains the following keys 76 | - keys in ``self.keys`` 77 | - ``img_metas`` 78 | """ 79 | data = {} 80 | img_metas = {} 81 | for key in self.meta_keys: 82 | if key in results: 83 | img_metas[key] = results[key] 84 | 85 | data['img_metas'] = DC(img_metas, cpu_only=True) 86 | for key in self.keys: 87 | data[key] = results[key] 88 | 89 | return data 90 | 91 | def __repr__(self): 92 | """str: Return a string that describes the module.""" 93 | return self.__class__.__name__ + \ 94 | f'(keys={self.keys}, meta_keys={self.meta_keys})' -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/datasets/pipelines/loading_3d.py: -------------------------------------------------------------------------------- 1 | from re import I 2 | import mmcv 3 | import numpy as np 4 | 5 | from mmdet.datasets.builder import PIPELINES 6 | 7 | 8 | @PIPELINES.register_module() 9 | class LoadMultiViewMultiSweepImageFromFilesIndoor(object): 10 | """Load multi channel images from a list of separate channel files. 11 | 12 | Expects results['img_filename'] to be a list of filenames. 13 | 14 | Args: 15 | to_float32 (bool): Whether to convert the img to float32. 16 | Defaults to False. 17 | color_type (str): Color type of the file. Defaults to 'unchanged'. 18 | """ 19 | 20 | def __init__(self, to_float32=False, sweep_num=1, random_sweep=False, color_type='unchanged'): 21 | self.to_float32 = to_float32 22 | self.color_type = color_type 23 | self.sweep_num = sweep_num 24 | self.random_sweep = random_sweep 25 | 26 | def __call__(self, results): 27 | """Call function to load multi-view image from files. 28 | 29 | Args: 30 | results (dict): Result dict containing multi-view image filenames. 31 | 32 | Returns: 33 | dict: The result dict containing the multi-view image data. \ 34 | Added keys and values are described below. 35 | 36 | - filename (str): Multi-view image filenames. 37 | - img (np.ndarray): Multi-view image arrays. 38 | - img_shape (tuple[int]): Shape of multi-view image arrays. 39 | - ori_shape (tuple[int]): Shape of original image arrays. 40 | - pad_shape (tuple[int]): Shape of padded image arrays. 41 | - scale_factor (float): Scale factor. 42 | - img_norm_cfg (dict): Normalization configuration of images. 
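                - lidar2img (list[np.ndarray]): 4x4 projection matrix built
                  from 'depth2img' (or 'lidar2img') in the input results.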
43 | """ 44 | # print(results) 45 | filename = [results['img_info']['filename']] 46 | results['filename'] = filename 47 | # img is of shape (h, w, c, num_views) 48 | img = np.stack( 49 | [mmcv.imread(name, self.color_type) for name in filename], axis=-1) 50 | 51 | if self.to_float32: 52 | img = img.astype(np.float32) 53 | 54 | # unravel to list, see `DefaultFormatBundle` in formating.py 55 | # which will transpose each image separately and then stack into array 56 | results['img'] = [img[..., i] for i in range(img.shape[-1])] 57 | # results['img'] = results['img'][0] 58 | results['img_shape'] = img.shape 59 | results['ori_shape'] = img.shape 60 | # Set initial values for default meta_keys 61 | results['pad_shape'] = img.shape 62 | results['scale_factor'] = 1.0 63 | num_channels = 1 if len(img.shape) < 3 else img.shape[2] 64 | results['img_norm_cfg'] = dict( 65 | mean=np.zeros(num_channels, dtype=np.float32), 66 | std=np.ones(num_channels, dtype=np.float32), 67 | to_rgb=False) 68 | 69 | if 'depth2img' in results: 70 | lidar2img = np.eye(4) 71 | if results['depth2img'].shape[0] == 3: 72 | lidar2img[:3, :3] = results['depth2img'] 73 | else: 74 | lidar2img[:4, :4] = results['depth2img'] 75 | else: 76 | lidar2img = np.eye(4) 77 | if results['lidar2img'].shape[0] == 3: 78 | lidar2img[:3, :3] = results['lidar2img'] 79 | else: 80 | lidar2img[:4, :4] = results['lidar2img'] 81 | results['lidar2img'] = [lidar2img] 82 | 83 | return results 84 | 85 | def __repr__(self): 86 | """str: Return a string that describes the module.""" 87 | repr_str = self.__class__.__name__ 88 | repr_str += f'(to_float32={self.to_float32}, ' 89 | repr_str += f"color_type='{self.color_type}')" 90 | return repr_str 91 | 92 | @PIPELINES.register_module() 93 | class LoadMultiViewMultiSweepImageFromFiles(object): 94 | """Load multi channel images from a list of separate channel files. 95 | 96 | Expects results['img_filename'] to be a list of filenames. 97 | 98 | Args: 99 | to_float32 (bool): Whether to convert the img to float32. 100 | Defaults to False. 101 | color_type (str): Color type of the file. Defaults to 'unchanged'. 102 | """ 103 | 104 | def __init__(self, to_float32=False, sweep_num=1, random_sweep=False, color_type='unchanged'): 105 | self.to_float32 = to_float32 106 | self.color_type = color_type 107 | self.sweep_num = sweep_num 108 | self.random_sweep = random_sweep 109 | 110 | def __call__(self, results): 111 | """Call function to load multi-view image from files. 112 | 113 | Args: 114 | results (dict): Result dict containing multi-view image filenames. 115 | 116 | Returns: 117 | dict: The result dict containing the multi-view image data. \ 118 | Added keys and values are described below. 119 | 120 | - filename (str): Multi-view image filenames. 121 | - img (np.ndarray): Multi-view image arrays. 122 | - img_shape (tuple[int]): Shape of multi-view image arrays. 123 | - ori_shape (tuple[int]): Shape of original image arrays. 124 | - pad_shape (tuple[int]): Shape of padded image arrays. 125 | - scale_factor (float): Scale factor. 126 | - img_norm_cfg (dict): Normalization configuration of images. 
127 | """ 128 | filename = results['img_filename'] 129 | results['filename'] = filename 130 | # img is of shape (h, w, c, num_views) 131 | img = np.stack( 132 | [mmcv.imread(name, self.color_type) for name in filename], axis=-1) 133 | 134 | img_sweeps = [] 135 | sweeps_paths = results['cam_sweeps_paths'] 136 | sweeps_ids = results['cam_sweeps_id'] 137 | sweeps_time = results['cam_sweeps_time'] 138 | if self.random_sweep: 139 | random_num = np.random.randint(0, self.sweep_num) 140 | sweeps_paths = [_sweep[:random_num] for _sweep in sweeps_paths] 141 | sweeps_ids = [_sweep[:random_num] for _sweep in sweeps_ids] 142 | else: 143 | random_num = self.sweep_num 144 | 145 | for _idx in range(len(sweeps_paths[0])): 146 | _sweep = np.stack( 147 | [mmcv.imread(name_list[_idx], self.color_type) for name_list in sweeps_paths], axis=-1) 148 | img_sweeps.append(_sweep) 149 | 150 | # add img sweeps to raw image 151 | img = np.stack([img, *img_sweeps], axis=-1) 152 | # img is of shape (h, w, c, num_views * sweep_num) 153 | img = img.reshape(*img.shape[:-2], -1) 154 | 155 | if self.to_float32: 156 | img = img.astype(np.float32) 157 | 158 | results['sweeps_paths'] = [[filename[_idx]] + sweeps_paths[_idx] for _idx in range(len(filename))] 159 | results['sweeps_ids'] = np.stack([[0]+_id for _id in sweeps_ids], axis=-1) 160 | results['sweeps_time'] = np.stack([[0]+_time for _time in sweeps_time], axis=-1) 161 | # unravel to list, see `DefaultFormatBundle` in formating.py 162 | # which will transpose each image separately and then stack into array 163 | results['img'] = [img[..., i] for i in range(img.shape[-1])] 164 | results['img_shape'] = img.shape 165 | results['ori_shape'] = img.shape 166 | # Set initial values for default meta_keys 167 | results['pad_shape'] = img.shape 168 | results['scale_factor'] = 1.0 169 | num_channels = 1 if len(img.shape) < 3 else img.shape[2] 170 | results['img_norm_cfg'] = dict( 171 | mean=np.zeros(num_channels, dtype=np.float32), 172 | std=np.ones(num_channels, dtype=np.float32), 173 | to_rgb=False) 174 | 175 | # add sweep matrix to raw matrix 176 | results['lidar2img'] = [np.stack([results['lidar2img'][_idx], 177 | *results['lidar2img_sweeps'][_idx][:random_num]], axis=0) 178 | for _idx in range(len(results['lidar2img']))] 179 | results['lidar2cam'] = [np.stack([results['lidar2cam'][_idx], 180 | *results['lidar2cam_sweeps'][_idx][:random_num]], axis=0) 181 | for _idx in range(len(results['lidar2cam']))] 182 | results['cam_intrinsic'] = [np.stack([results['cam_intrinsic'][_idx], 183 | *results['cam_sweeps_intrinsics'][_idx][:random_num]], axis=0) 184 | for _idx in range(len(results['cam_intrinsic']))] 185 | results.pop('lidar2img_sweeps') 186 | results.pop('lidar2cam_sweeps') 187 | results.pop('cam_sweeps_intrinsics') 188 | 189 | return results 190 | 191 | def __repr__(self): 192 | """str: Return a string that describes the module.""" 193 | repr_str = self.__class__.__name__ 194 | repr_str += f'(to_float32={self.to_float32}, ' 195 | repr_str += f"color_type='{self.color_type}')" 196 | return repr_str -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/datasets/pipelines/test_time_aug.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
2 | import mmcv 3 | import warnings 4 | from copy import deepcopy 5 | 6 | from mmdet.datasets.builder import PIPELINES 7 | from mmdet.datasets.pipelines import Compose 8 | 9 | 10 | @PIPELINES.register_module() 11 | class MultiRotScaleFlipAug3D(object): 12 | """Test-time augmentation with multiple scales and flipping. 13 | 14 | Args: 15 | transforms (list[dict]): Transforms to apply in each augmentation. 16 | img_scale (tuple | list[tuple]: Images scales for resizing. 17 | pts_scale_ratio (float | list[float]): Points scale ratios for 18 | resizing. 19 | flip (bool): Whether apply flip augmentation. Defaults to False. 20 | flip_direction (str | list[str]): Flip augmentation directions 21 | for images, options are "horizontal" and "vertical". 22 | If flip_direction is list, multiple flip augmentations will 23 | be applied. It has no effect when ``flip == False``. 24 | Defaults to "horizontal". 25 | pcd_horizontal_flip (bool): Whether apply horizontal flip augmentation 26 | to point cloud. Defaults to True. Note that it works only when 27 | 'flip' is turned on. 28 | pcd_vertical_flip (bool): Whether apply vertical flip augmentation 29 | to point cloud. Defaults to True. Note that it works only when 30 | 'flip' is turned on. 31 | """ 32 | 33 | def __init__(self, 34 | transforms, 35 | img_scale, 36 | pts_scale_ratio, 37 | rotate_degree=[0.0], 38 | flip=False, 39 | flip_direction='horizontal', 40 | pcd_horizontal_flip=False, 41 | pcd_vertical_flip=False): 42 | self.transforms = Compose(transforms) 43 | self.img_scale = img_scale if isinstance(img_scale, 44 | list) else [img_scale] 45 | self.pts_scale_ratio = pts_scale_ratio \ 46 | if isinstance(pts_scale_ratio, list) else[float(pts_scale_ratio)] 47 | 48 | assert mmcv.is_list_of(self.img_scale, tuple) 49 | assert mmcv.is_list_of(self.pts_scale_ratio, float) 50 | 51 | self.rotate_degree = rotate_degree 52 | 53 | self.flip = flip 54 | self.pcd_horizontal_flip = pcd_horizontal_flip 55 | self.pcd_vertical_flip = pcd_vertical_flip 56 | 57 | self.flip_direction = flip_direction if isinstance( 58 | flip_direction, list) else [flip_direction] 59 | assert mmcv.is_list_of(self.flip_direction, str) 60 | if not self.flip and self.flip_direction != ['horizontal']: 61 | warnings.warn( 62 | 'flip_direction has no effect when flip is set to False') 63 | if (self.flip and not any([(t['type'] == 'RandomFlip3D' 64 | or t['type'] == 'RandomFlip') 65 | for t in transforms])): 66 | warnings.warn( 67 | 'flip has no effect when RandomFlip is not in transforms') 68 | 69 | def __call__(self, results): 70 | """Call function to augment common fields in results. 71 | 72 | Args: 73 | results (dict): Result dict contains the data to augment. 74 | 75 | Returns: 76 | dict: The result dict contains the data that is augmented with \ 77 | different scales and flips. 
78 | """ 79 | aug_data = [] 80 | 81 | # modified from `flip_aug = [False, True] if self.flip else [False]` 82 | # to reduce unnecessary scenes when using double flip augmentation 83 | # during test time 84 | flip_aug = [True] if self.flip else [False] 85 | pcd_horizontal_flip_aug = [False, True] \ 86 | if self.flip and self.pcd_horizontal_flip else [False] 87 | pcd_vertical_flip_aug = [False, True] \ 88 | if self.flip and self.pcd_vertical_flip else [False] 89 | for rot_degree in self.rotate_degree: 90 | for scale in self.img_scale: 91 | for pts_scale_ratio in self.pts_scale_ratio: 92 | for flip in flip_aug: 93 | for pcd_horizontal_flip in pcd_horizontal_flip_aug: 94 | for pcd_vertical_flip in pcd_vertical_flip_aug: 95 | for direction in self.flip_direction: 96 | # results.copy will cause bug 97 | # since it is shallow copy 98 | _results = deepcopy(results) 99 | _results['rot_degree'] = rot_degree 100 | _results['scale'] = scale 101 | _results['flip'] = flip 102 | _results['pcd_scale_factor'] = \ 103 | pts_scale_ratio 104 | _results['flip_direction'] = direction 105 | _results['pcd_horizontal_flip'] = \ 106 | pcd_horizontal_flip 107 | _results['pcd_vertical_flip'] = \ 108 | pcd_vertical_flip 109 | data = self.transforms(_results) 110 | aug_data.append(data) 111 | # list of dict to dict of list 112 | aug_data_dict = {key: [] for key in aug_data[0]} 113 | for data in aug_data: 114 | for key, val in data.items(): 115 | aug_data_dict[key].append(val) 116 | return aug_data_dict 117 | 118 | def __repr__(self): 119 | """str: Return a string that describes the module.""" 120 | repr_str = self.__class__.__name__ 121 | repr_str += f'(transforms={self.transforms}, ' 122 | repr_str += f'img_scale={self.img_scale}, flip={self.flip}, ' 123 | repr_str += f'pts_scale_ratio={self.pts_scale_ratio}, ' 124 | repr_str += f'flip_direction={self.flip_direction})' 125 | return repr_str 126 | -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/datasets/sunrgbd_dataset_ov.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
2 | import mmcv 3 | import numpy as np 4 | import pyquaternion 5 | import tempfile 6 | from nuscenes.utils.data_classes import Box as NuScenesBox 7 | from os import path as osp 8 | 9 | from ..core.indoor_eval import indoor_eval_ov 10 | 11 | import mmdet3d 12 | #from mmdet.datasets import DATASETS 13 | from mmdet3d.datasets import DATASETS 14 | from mmdet3d.core import show_result 15 | from mmdet3d.core.bbox import Box3DMode, Coord3DMode, LiDARInstance3DBoxes 16 | from mmdet3d.datasets import SUNRGBDDataset 17 | 18 | __mmdet3d_version__ = float(mmdet3d.__version__[:3]) 19 | 20 | @DATASETS.register_module() 21 | class SUNRGBDDataset_OV(SUNRGBDDataset): 22 | 23 | def __init__(self, 24 | data_root, 25 | ann_file, 26 | pipeline=None, 27 | classes=None, 28 | seen_classes=None, 29 | modality=dict(use_camera=True, use_lidar=True), 30 | box_type_3d='Depth', 31 | filter_empty_gt=True, 32 | test_mode=False, 33 | **kwargs): 34 | super().__init__( 35 | data_root=data_root, 36 | ann_file=ann_file, 37 | pipeline=pipeline, 38 | classes=classes, 39 | modality=modality, 40 | box_type_3d=box_type_3d, 41 | filter_empty_gt=filter_empty_gt, 42 | test_mode=test_mode, 43 | **kwargs) 44 | 45 | self.seen_classes = seen_classes 46 | self.classes = seen_classes 47 | 48 | def evaluate(self, 49 | results, 50 | metric=None, 51 | iou_thr=(0.25, 0.5), 52 | iou_thr_2d=(0.25, 0.5), 53 | logger=None, 54 | show=False, 55 | out_dir=None, 56 | pipeline=None, 57 | axis_aligned_lw=False): 58 | """Evaluate. 59 | 60 | Evaluation in indoor protocol. 61 | 62 | Args: 63 | results (list[dict]): List of results. 64 | metric (str | list[str], optional): Metrics to be evaluated. 65 | Default: None. 66 | iou_thr (list[float], optional): AP IoU thresholds for 3D 67 | evaluation. Default: (0.25, 0.5). 68 | iou_thr_2d (list[float], optional): AP IoU thresholds for 2D 69 | evaluation. Default: (0.5, ). 70 | show (bool, optional): Whether to visualize. 71 | Default: False. 72 | out_dir (str, optional): Path to save the visualization results. 73 | Default: None. 74 | pipeline (list[dict], optional): raw data loading for showing. 75 | Default: None. 76 | 77 | Returns: 78 | dict: Evaluation results. 79 | """ 80 | assert isinstance( 81 | results, list), f'Expect results to be list, got {type(results)}.' 82 | assert len(results) > 0, 'Expect length of results > 0.' 83 | assert len(results) == len(self.data_infos) 84 | assert isinstance( 85 | results[0], dict 86 | ), f'Expect elements in results to be dict, got {type(results[0])}.' 
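        # Collect the raw GT annotations and map label indices back to category
        # names before handing everything to the open-vocabulary indoor evaluator.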
87 | gt_annos = [info['annos'] for info in self.data_infos] 88 | label2cat = {i: cat_id for i, cat_id in enumerate(self.CLASSES)} 89 | ret_dict = indoor_eval_ov( 90 | self.seen_classes, 91 | gt_annos, 92 | results, 93 | iou_thr, 94 | label2cat, 95 | logger=logger, 96 | box_type_3d=self.box_type_3d, 97 | box_mode_3d=self.box_mode_3d, 98 | axis_aligned_lw=axis_aligned_lw) 99 | if show: 100 | self.show(results, out_dir, pipeline=pipeline) 101 | 102 | return ret_dict 103 | -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/models/backbones/__init__.py: -------------------------------------------------------------------------------- 1 | from .vovnet import VoVNet 2 | from .second_3d import SECOND3D 3 | 4 | __all__ = ['VoVNet', 'SECOND3D'] -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/models/backbones/second_3d.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from symbol import import_from 3 | import warnings 4 | from mmcv.cnn import build_conv_layer, build_norm_layer 5 | from mmcv.runner import BaseModule 6 | from torch import nn as nn 7 | 8 | from mmdet.models import BACKBONES 9 | 10 | 11 | @BACKBONES.register_module() 12 | class SECOND3D(BaseModule): 13 | """Modified Backbone network for SECOND. 14 | 15 | Args: 16 | in_channels (int): Input channels. 17 | out_channels (list[int]): Output channels for multi-scale feature maps. 18 | layer_nums (list[int]): Number of layers in each stage. 19 | layer_strides (list[int]): Strides of each stage. 20 | norm_cfg (dict): Config dict of normalization layers. 21 | conv_cfg (dict): Config dict of convolutional layers. 22 | """ 23 | 24 | def __init__(self, 25 | in_channels=128, 26 | out_channels=[128, 128, 256], 27 | layer_nums=[3, 5, 5], 28 | layer_strides=[2, 2, 2], 29 | is_cascade=True, 30 | norm_cfg=dict(type='BN3d', eps=1e-3, momentum=0.01), 31 | conv_cfg=dict(type='Conv3d', bias=False), 32 | init_cfg=None, 33 | pretrained=None): 34 | super(SECOND3D, self).__init__(init_cfg=init_cfg) 35 | assert len(layer_strides) == len(layer_nums) 36 | assert len(out_channels) == len(layer_nums) 37 | 38 | if isinstance(in_channels, list): 39 | in_filters = in_channels 40 | else: 41 | in_filters = [in_channels, *out_channels[:-1]] 42 | # note that when stride > 1, conv2d with same padding isn't 43 | # equal to pad-conv2d. we should use pad-conv2d. 
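        # Each stage below starts with a strided conv (with a 3D kernel such as
        # the default (1, 3, 3), the stride only touches the spatial H/W axes),
        # followed by ``layer_num`` conv-norm-ReLU layers; ``is_cascade``
        # decides in forward() whether stages are chained or all fed the same
        # input.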
44 | blocks = [] 45 | self.is_cascade = is_cascade 46 | self.kernel_type = conv_cfg.type 47 | if "kernel" in conv_cfg: 48 | kernel = conv_cfg.pop("kernel") 49 | else: 50 | kernel = (1,3,3) 51 | padding = tuple([(_kernel-1)//2 for _kernel in kernel]) 52 | for i, layer_num in enumerate(layer_nums): 53 | block = [ 54 | build_conv_layer( 55 | conv_cfg, 56 | in_filters[i], 57 | out_channels[i], 58 | kernel, 59 | stride=(1,layer_strides[i],layer_strides[i]) if len(padding)==3 else (layer_strides[i],layer_strides[i]), 60 | padding=padding), 61 | build_norm_layer(norm_cfg, out_channels[i])[1], 62 | nn.ReLU(inplace=True), 63 | ] 64 | for j in range(layer_num): 65 | block.append( 66 | build_conv_layer( 67 | conv_cfg, 68 | out_channels[i], 69 | out_channels[i], 70 | kernel, 71 | padding=padding)) 72 | block.append(build_norm_layer(norm_cfg, out_channels[i])[1]) 73 | block.append(nn.ReLU(inplace=True)) 74 | 75 | block = nn.Sequential(*block) 76 | blocks.append(block) 77 | 78 | self.blocks = nn.ModuleList(blocks) 79 | 80 | assert not (init_cfg and pretrained), \ 81 | 'init_cfg and pretrained cannot be setting at the same time' 82 | if isinstance(pretrained, str): 83 | warnings.warn('DeprecationWarning: pretrained is a deprecated, ' 84 | 'please use "init_cfg" instead') 85 | self.init_cfg = dict(type='Pretrained', checkpoint=pretrained) 86 | else: 87 | self.init_cfg = dict(type='Kaiming', layer=self.kernel_type) 88 | 89 | def forward(self, x): 90 | """Forward function. 91 | 92 | Args: 93 | x (torch.Tensor): Input with shape (N, C, H, W). 94 | 95 | Returns: 96 | tuple[torch.Tensor]: Multi-scale features. 97 | """ 98 | outs = [] 99 | batch = x.shape[0] 100 | if self.kernel_type == "Conv2d": 101 | x = x.transpose(1,2).flatten(0,1) 102 | 103 | for i in range(len(self.blocks)): 104 | if self.is_cascade: 105 | x = self.blocks[i](x) 106 | outs.append(x) 107 | else: 108 | out = self.blocks[i](x) 109 | outs.append(out) 110 | 111 | if self.kernel_type == "Conv2d": 112 | outs = [_out.reshape(batch, -1, *_out.shape[-3:]).transpose(1,2) for _out in outs] 113 | 114 | return tuple(outs) 115 | -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/models/dense_heads/__init__.py: -------------------------------------------------------------------------------- 1 | from .uni3detr_head import Uni3DETRHead 2 | from .uni3detr_head_clip import Uni3DETRHeadCLIP 3 | 4 | __all__ = ['Uni3DETRHead', 'Uni3DETRHeadCLIP'] -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/models/detectors/__init__.py: -------------------------------------------------------------------------------- 1 | from .uni3detr import Uni3DETR 2 | from .ov_uni3detr import OV_Uni3DETR 3 | 4 | __all__ = ['Uni3DETR', 'OV_Uni3DETR'] 5 | -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/models/losses/__init__.py: -------------------------------------------------------------------------------- 1 | from .rdiouloss import RDIoULoss, IoU3DLoss, SoftFocalLoss 2 | 3 | __all__ = ['RDIoULoss', 'IoU3DLoss', 'SoftFocalLoss'] 4 | -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/models/losses/rdiouloss.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
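# Rotation-decoupled IoU (RDIoU) and auxiliary 3D IoU / soft focal losses.
# They are registered in mmdet's LOSSES registry, so configs reference them by
# type name; an illustrative snippet (assumed keys and weights, not values
# taken from this repository):
#
#   loss_bbox=dict(type='RDIoULoss', loss_weight=1.0),
#   loss_iou=dict(type='IoU3DLoss', loss_weight=1.0),
#   loss_cls=dict(type='SoftFocalLoss', gamma=2.0, alpha=0.25, loss_weight=1.0),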
2 | import torch 3 | from projects.mmdet3d_plugin.core.bbox.util import get_rdiou 4 | from torch import nn as nn 5 | import torch.nn.functional as F 6 | 7 | from mmdet.models.losses.utils import weighted_loss 8 | from mmdet.models.losses.utils import weight_reduce_loss 9 | from mmdet.models import LOSSES 10 | from mmdet3d.core.bbox.iou_calculators.iou3d_calculator import bbox_overlaps_3d, bbox_overlaps_nearest_3d 11 | 12 | @weighted_loss 13 | def rd_iou_loss(pred, target): 14 | """Calculate the IoU loss (1-IoU) of two sets of rotated bounding boxes. 15 | Note that predictions and targets are one-to-one corresponded. 16 | 17 | Args: 18 | pred (torch.Tensor): Bbox predictions with shape [N, 7] 19 | (x, y, z, w, l, h, alpha). 20 | target (torch.Tensor): Bbox targets (gt) with shape [N, 7] 21 | (x, y, z, w, l, h, alpha). 22 | 23 | Returns: 24 | torch.Tensor: IoU loss between predictions and targets. 25 | """ 26 | u, rdiou = get_rdiou(pred.unsqueeze(0), target.unsqueeze(0)) 27 | u, rdiou = u[0], rdiou[0] 28 | 29 | rdiou_loss_n = rdiou - u 30 | rdiou_loss_n = torch.clamp(rdiou_loss_n,min=-1.0,max = 1.0) 31 | rdiou_loss_n = 1 - rdiou_loss_n 32 | return rdiou_loss_n 33 | 34 | 35 | @LOSSES.register_module() 36 | class RDIoULoss(nn.Module): 37 | """Calculate the IoU loss (1-IoU) of rotated bounding boxes. 38 | 39 | Args: 40 | reduction (str): Method to reduce losses. 41 | The valid reduction method are none, sum or mean. 42 | loss_weight (float, optional): Weight of loss. Defaults to 1.0. 43 | """ 44 | 45 | def __init__(self, reduction='mean', loss_weight=1.0): 46 | super().__init__() 47 | self.reduction = reduction 48 | self.loss_weight = loss_weight 49 | 50 | def forward(self, 51 | pred, 52 | target, 53 | weight=None, 54 | avg_factor=None, 55 | reduction_override=None, 56 | **kwargs): 57 | """Forward function of loss calculation. 58 | 59 | Args: 60 | pred (torch.Tensor): Bbox predictions with shape [..., 7] 61 | (x, y, z, w, l, h, alpha). 62 | target (torch.Tensor): Bbox targets (gt) with shape [..., 7] 63 | (x, y, z, w, l, h, alpha). 64 | weight (torch.Tensor | float, optional): Weight of loss. 65 | Defaults to None. 66 | avg_factor (int, optional): Average factor that is used to average 67 | the loss. Defaults to None. 68 | reduction_override (str, optional): Method to reduce losses. 69 | The valid reduction method are 'none', 'sum' or 'mean'. 70 | Defaults to None. 71 | 72 | Returns: 73 | torch.Tensor: IoU loss between predictions and targets. 
74 | """ 75 | if weight is not None and not torch.any(weight > 0): 76 | return pred.sum() * weight.sum() # 0 77 | assert reduction_override in (None, 'none', 'mean', 'sum') 78 | reduction = ( 79 | reduction_override if reduction_override else self.reduction) 80 | if weight is not None and weight.dim() > 1: 81 | weight = weight.mean(-1) 82 | loss = self.loss_weight * rd_iou_loss( 83 | pred, 84 | target, 85 | weight, 86 | reduction=reduction, 87 | avg_factor=avg_factor, 88 | **kwargs) 89 | 90 | return loss 91 | 92 | 93 | @weighted_loss 94 | def iou3d_loss(pred, target): 95 | #iou3d = bbox_overlaps_3d(pred, target, coordinate='depth') 96 | #iou3d = 1 - torch.diag(iou3d) 97 | 98 | #iou3d = (1 - bbox_overlaps_nearest_3d(pred, target, is_aligned=True, coordinate='depth') ) 99 | iou3d = (1 - bbox_overlaps_nearest_3d(pred, target, is_aligned=True, coordinate='lidar') ) 100 | #iou3d += (1 - bbox_overlaps_nearest_3d(pred[:, [0,2,1,3,5,4,6]], target[:, [0,2,1,3,5,4,6]], is_aligned=True, coordinate='depth') ) * 0.1 101 | #iou3d += (1 - bbox_overlaps_nearest_3d(pred[:, [1,2,0,4,5,3,6]], target[:, [1,2,0,4,5,3,6]], is_aligned=True, coordinate='depth') ) * 0.1 102 | return iou3d 103 | 104 | 105 | @LOSSES.register_module() 106 | class IoU3DLoss(nn.Module): 107 | """Calculate the IoU loss (1-IoU) of rotated bounding boxes. 108 | 109 | Args: 110 | reduction (str): Method to reduce losses. 111 | The valid reduction method are none, sum or mean. 112 | loss_weight (float, optional): Weight of loss. Defaults to 1.0. 113 | """ 114 | 115 | def __init__(self, reduction='mean', loss_weight=1.0): 116 | super().__init__() 117 | self.reduction = reduction 118 | self.loss_weight = loss_weight 119 | 120 | def forward(self, 121 | pred, 122 | target, 123 | weight=None, 124 | avg_factor=None, 125 | reduction_override=None, 126 | **kwargs): 127 | """Forward function of loss calculation. 128 | 129 | Args: 130 | pred (torch.Tensor): Bbox predictions with shape [..., 7] 131 | (x, y, z, w, l, h, alpha). 132 | target (torch.Tensor): Bbox targets (gt) with shape [..., 7] 133 | (x, y, z, w, l, h, alpha). 134 | weight (torch.Tensor | float, optional): Weight of loss. 135 | Defaults to None. 136 | avg_factor (int, optional): Average factor that is used to average 137 | the loss. Defaults to None. 138 | reduction_override (str, optional): Method to reduce losses. 139 | The valid reduction method are 'none', 'sum' or 'mean'. 140 | Defaults to None. 141 | 142 | Returns: 143 | torch.Tensor: IoU loss between predictions and targets. 
144 | """ 145 | if weight is not None and not torch.any(weight > 0): 146 | return pred.sum() * weight.sum() # 0 147 | assert reduction_override in (None, 'none', 'mean', 'sum') 148 | reduction = ( 149 | reduction_override if reduction_override else self.reduction) 150 | if weight is not None and weight.dim() > 1: 151 | weight = weight.mean(-1) 152 | loss = self.loss_weight * iou3d_loss( 153 | pred, 154 | target, 155 | weight, 156 | reduction=reduction, 157 | avg_factor=avg_factor, 158 | **kwargs) 159 | 160 | return loss 161 | 162 | def soft_focal_loss(pred, 163 | target, 164 | weight=None, 165 | gamma=2.0, 166 | alpha=0.25, 167 | reduction='mean', 168 | avg_factor=None): 169 | pred_sigmoid = pred.sigmoid() 170 | 171 | target, target_score = target[0], target[1] 172 | target_oh = torch.zeros((pred_sigmoid.shape[0], pred.shape[1] + 1)).type_as(pred).to(pred.device) 173 | target_oh.scatter_(1, target[:,None], 1) 174 | target_oh = target_oh[:,0:-1] 175 | target = target[:,None] 176 | 177 | target_soft = (target_oh > 0).float() * target_score[:,None] 178 | pt = target_soft - pred_sigmoid 179 | focal_weight = ((1 - alpha) + (2*alpha - 1) * target_soft) * pt.pow(gamma) 180 | loss = F.binary_cross_entropy_with_logits(pred, target_soft, reduction='none') * focal_weight 181 | 182 | weight = weight.view(-1,1) 183 | loss = weight_reduce_loss(loss, weight, reduction, avg_factor) 184 | return loss 185 | 186 | @LOSSES.register_module() 187 | class SoftFocalLoss(nn.Module): 188 | 189 | def __init__(self, 190 | use_sigmoid=True, 191 | gamma=2.0, 192 | alpha=0.25, 193 | reduction='mean', 194 | loss_weight=1.0): 195 | super(SoftFocalLoss, self).__init__() 196 | assert use_sigmoid is True, 'Only sigmoid focal loss supported now.' 197 | self.use_sigmoid = use_sigmoid 198 | self.gamma = gamma 199 | self.alpha = alpha 200 | self.reduction = reduction 201 | self.loss_weight = loss_weight 202 | 203 | def forward(self, 204 | pred, 205 | target, 206 | weight=None, 207 | avg_factor=None, 208 | reduction_override=None): 209 | assert reduction_override in (None, 'none', 'mean', 'sum') 210 | reduction = ( 211 | reduction_override if reduction_override else self.reduction) 212 | if self.use_sigmoid: 213 | loss_cls = self.loss_weight * soft_focal_loss( 214 | pred, 215 | target, 216 | weight, 217 | gamma=self.gamma, 218 | alpha=self.alpha, 219 | reduction=reduction, 220 | avg_factor=avg_factor) 221 | else: 222 | raise NotImplementedError 223 | return loss_cls -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/models/necks/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .second3d_fpn import SECOND3DFPN 3 | 4 | __all__ = ['SECOND3DFPN'] 5 | -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/models/necks/second3d_fpn.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | import numpy as np 3 | import torch 4 | from mmcv.cnn import build_conv_layer, build_norm_layer, build_upsample_layer 5 | from mmcv.runner import BaseModule, auto_fp16 6 | from torch import nn as nn 7 | 8 | from mmdet.models import NECKS 9 | 10 | 11 | @NECKS.register_module() 12 | class SECOND3DFPN(BaseModule): 13 | """Modified FPN used in SECOND. 14 | 15 | Args: 16 | in_channels (list[int]): Input channels of multi-scale feature maps. 
17 | out_channels (list[int]): Output channels of feature maps. 18 | upsample_strides (list[int]): Strides used to upsample the 19 | feature maps. 20 | norm_cfg (dict): Config dict of normalization layers. 21 | upsample_cfg (dict): Config dict of upsample layers. 22 | conv_cfg (dict): Config dict of conv layers. 23 | use_conv_for_no_stride (bool): Whether to use conv when stride is 1. 24 | use_for_distill (bool): Whether to use for cross-modality distillation. 25 | """ 26 | 27 | def __init__(self, 28 | in_channels=[128, 128, 256], 29 | out_channels=[256, 256, 256], 30 | upsample_strides=[1, 2, 4], 31 | norm_cfg=dict(type='BN3d', eps=1e-3, momentum=0.01), 32 | upsample_cfg=dict(type='deconv3d', bias=False), 33 | conv_cfg=dict(type='Conv3d', bias=False), 34 | extra_conv=None, 35 | use_conv_for_no_stride=False, 36 | use_for_distill=False, 37 | init_cfg=None): 38 | # if for GroupNorm, 39 | # cfg is dict(type='GN', num_groups=num_groups, eps=1e-3, affine=True) 40 | super(SECOND3DFPN, self).__init__(init_cfg=init_cfg) 41 | assert len(out_channels) == len(upsample_strides) == len(in_channels) 42 | self.in_channels = in_channels 43 | self.out_channels = out_channels 44 | self.extra_conv = extra_conv 45 | self.fp16_enabled = False 46 | self.use_for_distill = use_for_distill 47 | 48 | deblocks = [] 49 | for i, out_channel in enumerate(out_channels): 50 | stride = upsample_strides[i] 51 | if stride > 1 or (stride == 1 and not use_conv_for_no_stride): 52 | upsample_layer = build_upsample_layer( 53 | upsample_cfg, 54 | in_channels=in_channels[i], 55 | out_channels=out_channel, 56 | kernel_size=(1,stride,stride) if '3d' in upsample_cfg['type'] else (stride,stride), 57 | stride=(1,stride,stride) if '3d' in upsample_cfg['type'] else (stride,stride)) 58 | else: 59 | stride = np.round(1 / stride).astype(np.int64) 60 | upsample_layer = build_conv_layer( 61 | conv_cfg, 62 | in_channels=in_channels[i], 63 | out_channels=out_channel, 64 | kernel_size=(1,stride,stride) if '3d' in conv_cfg['type'] else (stride,stride), 65 | stride=(1,stride,stride) if '3d' in conv_cfg['type'] else (stride,stride)) 66 | 67 | deblock = nn.Sequential(upsample_layer, 68 | build_norm_layer(norm_cfg, out_channel)[1], 69 | nn.ReLU(inplace=True)) 70 | deblocks.append(deblock) 71 | self.deblocks = nn.ModuleList(deblocks) 72 | 73 | if self.extra_conv is not None: 74 | extra_blocks = [] 75 | self.layer_num = self.extra_conv.pop('num_conv') 76 | if "kernel" in self.extra_conv: 77 | kernel = self.extra_conv.pop("kernel") 78 | else: 79 | kernel = (3,3,3) 80 | padding = tuple([(_k-1)//2 for _k in kernel]) 81 | if "sep_kernel" in self.extra_conv: 82 | sep_kernel = self.extra_conv.pop("sep_kernel") 83 | sep_padding = tuple([(_k-1)//2 for _k in sep_kernel]) 84 | else: 85 | sep_kernel = None 86 | for j in range(self.layer_num): 87 | extra_blocks.append( 88 | build_conv_layer( 89 | self.extra_conv, 90 | out_channels[-1], 91 | out_channels[-1], 92 | kernel, 93 | padding=padding)) 94 | if sep_kernel: 95 | extra_blocks.append( 96 | build_conv_layer( 97 | self.extra_conv, 98 | out_channels[-1], 99 | out_channels[-1], 100 | sep_kernel, 101 | padding=sep_padding)) 102 | extra_blocks.append(build_norm_layer(norm_cfg, out_channels[-1])[1]) 103 | extra_blocks.append(nn.ReLU(inplace=True)) 104 | self.extra_blocks = nn.Sequential(*extra_blocks) 105 | 106 | if init_cfg is None: 107 | self.init_cfg = [ 108 | dict(type='Kaiming', layer='ConvTranspose2d'), 109 | dict(type='Constant', layer='NaiveSyncBatchNorm2d', val=1.0) 110 | ] 111 | 112 | @auto_fp16() 113 | def 
forward(self, x): 114 | """Forward function. 115 | 116 | Args: 117 | x (torch.Tensor): 4D Tensor in (N, C, H, W) shape. 118 | 119 | Returns: 120 | list[torch.Tensor]: Multi-level feature maps. 121 | """ 122 | assert len(x) == len(self.in_channels) 123 | ups = [deblock(x[i]) for i, deblock in enumerate(self.deblocks)] 124 | 125 | if len(ups) > 1: 126 | out = sum(ups) 127 | else: 128 | out = ups[0] 129 | 130 | if self.extra_conv is not None: 131 | if self.use_for_distill: 132 | out_final = out 133 | before_relu_list = [] 134 | for _idx in range(self.layer_num): 135 | out_mid = self.extra_blocks[_idx*3:(_idx+1)*3-1](out_final) 136 | out_before_relu = out_mid.clone() 137 | out_final = self.extra_blocks[(_idx+1)*3-1](out_mid) 138 | before_relu_list.append(out_before_relu) 139 | 140 | out = {'final':out_final, 'before_relu':before_relu_list} 141 | else: 142 | out = self.extra_blocks(out) 143 | return out 144 | -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/models/pts_encoder/__init__.py: -------------------------------------------------------------------------------- 1 | from .sparse_encoder_hd import SparseEncoderHD 2 | 3 | __all__ = ['SparseEncoderHD'] -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/models/pts_encoder/sparse_encoder_hd.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from mmcv.runner import auto_fp16 3 | from torch import nn as nn 4 | 5 | from mmdet3d.ops import SparseBasicBlock, make_sparse_convmodule 6 | from mmdet3d.ops.spconv import IS_SPCONV2_AVAILABLE 7 | from mmdet3d.models.builder import MIDDLE_ENCODERS 8 | 9 | if IS_SPCONV2_AVAILABLE: 10 | from spconv.pytorch import SparseConvTensor, SparseSequential 11 | else: 12 | from mmcv.ops import SparseConvTensor, SparseSequential 13 | 14 | @MIDDLE_ENCODERS.register_module() 15 | class SparseEncoderHD(nn.Module): 16 | r"""Sparse encoder for SECOND and Part-A2. 17 | 18 | Args: 19 | in_channels (int): The number of input channels. 20 | sparse_shape (list[int]): The sparse shape of input tensor. 21 | order (list[str]): Order of conv module. Defaults to ('conv', 22 | 'norm', 'act'). 23 | norm_cfg (dict): Config of normalization layer. Defaults to 24 | dict(type='BN1d', eps=1e-3, momentum=0.01). 25 | base_channels (int): Out channels for conv_input layer. 26 | Defaults to 16. 27 | output_channels (int): Out channels for conv_out layer. 28 | Defaults to 128. 29 | encoder_channels (tuple[tuple[int]]): 30 | Convolutional channels of each encode block. 31 | encoder_paddings (tuple[tuple[int]]): Paddings of each encode block. 32 | Defaults to ((16, ), (32, 32, 32), (64, 64, 64), (64, 64, 64)). 33 | block_type (str): Type of the block to use. Defaults to 'conv_module'. 
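        encoder_strides (tuple[int]): Strides of each encode block.
            Defaults to (2, 2, 2, 1).
        keep_depth (bool): Whether to keep the depth (z) dimension in the
            dense output. If False, the dense features are summed over the
            depth axis before being returned. Defaults to True.
        fp16_enabled (bool): Whether to enable fp16 for this encoder.
            Defaults to False.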
34 | """ 35 | 36 | def __init__(self, 37 | in_channels, 38 | sparse_shape, 39 | order=('conv', 'norm', 'act'), 40 | norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.01), 41 | base_channels=16, 42 | output_channels=128, 43 | encoder_channels=((16, ), (32, 32, 32), (64, 64, 64), (64, 64, 44 | 64)), 45 | encoder_paddings=((1, ), (1, 1, 1), (1, 1, 1), ((0, 1, 1), 1, 46 | 1)), 47 | encoder_strides=(2, 2, 2, 1), 48 | block_type='conv_module', 49 | keep_depth=True, 50 | fp16_enabled=False): 51 | super().__init__() 52 | assert block_type in ['conv_module', 'basicblock'] 53 | self.sparse_shape = sparse_shape 54 | self.in_channels = in_channels 55 | self.order = order 56 | self.base_channels = base_channels 57 | self.output_channels = output_channels 58 | self.encoder_channels = encoder_channels 59 | self.encoder_paddings = encoder_paddings 60 | self.encoder_strides = encoder_strides 61 | self.stage_num = len(self.encoder_channels) 62 | self.keep_depth = keep_depth 63 | if fp16_enabled: 64 | self.fp16_enabled = fp16_enabled 65 | # Spconv init all weight on its own 66 | 67 | assert isinstance(order, tuple) and len(order) == 3 68 | assert set(order) == {'conv', 'norm', 'act'} 69 | 70 | if self.order[0] != 'conv': # pre activate 71 | self.conv_input = make_sparse_convmodule( 72 | in_channels, 73 | self.base_channels, 74 | 3, 75 | norm_cfg=norm_cfg, 76 | padding=1, 77 | indice_key='subm1', 78 | conv_type='SubMConv3d', 79 | order=('conv', )) 80 | else: # post activate 81 | self.conv_input = make_sparse_convmodule( 82 | in_channels, 83 | self.base_channels, 84 | 3, 85 | norm_cfg=norm_cfg, 86 | padding=1, 87 | indice_key='subm1', 88 | conv_type='SubMConv3d') 89 | 90 | encoder_out_channels = self.make_encoder_layers( 91 | make_sparse_convmodule, 92 | norm_cfg, 93 | self.base_channels, 94 | block_type=block_type) 95 | 96 | self.conv_out = make_sparse_convmodule( 97 | encoder_out_channels, 98 | self.output_channels, 99 | kernel_size=(1, 1, 1), 100 | stride=(1, 1, 1), 101 | norm_cfg=norm_cfg, 102 | padding=0, 103 | indice_key='spconv_down2', 104 | conv_type='SparseConv3d') 105 | 106 | @auto_fp16(apply_to=('voxel_features', )) 107 | def forward(self, voxel_features, coors, batch_size): 108 | """Forward of SparseEncoder. 109 | 110 | Args: 111 | voxel_features (torch.float32): Voxel features in shape (N, C). 112 | coors (torch.int32): Coordinates in shape (N, 4), \ 113 | the columns in the order of (batch_idx, z_idx, y_idx, x_idx). 114 | batch_size (int): Batch size. 115 | 116 | Returns: 117 | dict: Backbone features. 118 | """ 119 | coors = coors.int() 120 | input_sp_tensor = SparseConvTensor(voxel_features, coors, 121 | self.sparse_shape, 122 | batch_size) 123 | x = self.conv_input(input_sp_tensor) 124 | 125 | encode_features = [] 126 | for encoder_layer in self.encoder_layers: 127 | x = encoder_layer(x) 128 | encode_features.append(x) 129 | 130 | # for detection head 131 | # [200, 176, 5] -> [200, 176, 5] 132 | out = self.conv_out(encode_features[-1]) 133 | spatial_features = out.dense() 134 | 135 | if not self.keep_depth: 136 | spatial_features = spatial_features.sum(dim=2) 137 | 138 | return spatial_features 139 | 140 | def make_encoder_layers(self, 141 | make_block, 142 | norm_cfg, 143 | in_channels, 144 | block_type='conv_module', 145 | conv_cfg=dict(type='SubMConv3d')): 146 | """make encoder layers using sparse convs. 147 | 148 | Args: 149 | make_block (method): A bounded function to build blocks. 150 | norm_cfg (dict[str]): Config of normalization layer. 
151 | in_channels (int): The number of encoder input channels. 152 | block_type (str): Type of the block to use. Defaults to 153 | 'conv_module'. 154 | conv_cfg (dict): Config of conv layer. Defaults to 155 | dict(type='SubMConv3d'). 156 | 157 | Returns: 158 | int: The number of encoder output channels. 159 | """ 160 | assert block_type in ['conv_module', 'basicblock'] 161 | self.encoder_layers = SparseSequential() 162 | 163 | for i, blocks in enumerate(self.encoder_channels): 164 | blocks_list = [] 165 | for j, out_channels in enumerate(tuple(blocks)): 166 | padding = tuple(self.encoder_paddings[i])[j] 167 | # each stage started with a spconv layer 168 | # except the first stage 169 | if i != 0 and j == 0 and block_type == 'conv_module': 170 | blocks_list.append( 171 | make_block( 172 | in_channels, 173 | out_channels, 174 | 3, 175 | norm_cfg=norm_cfg, 176 | stride=self.encoder_strides[i], 177 | padding=padding, 178 | indice_key=f'spconv{i + 1}', 179 | conv_type='SparseConv3d')) 180 | elif block_type == 'basicblock': 181 | if j == len(blocks) - 1 and i != len( 182 | self.encoder_channels) - 1: 183 | blocks_list.append( 184 | make_block( 185 | in_channels, 186 | out_channels, 187 | 3, 188 | norm_cfg=norm_cfg, 189 | stride=self.encoder_strides[i], 190 | padding=padding, 191 | indice_key=f'spconv{i + 1}', 192 | conv_type='SparseConv3d')) 193 | else: 194 | blocks_list.append( 195 | SparseBasicBlock( 196 | out_channels, 197 | out_channels, 198 | norm_cfg=norm_cfg, 199 | conv_cfg=conv_cfg)) 200 | else: 201 | blocks_list.append( 202 | make_block( 203 | in_channels, 204 | out_channels, 205 | 3, 206 | norm_cfg=norm_cfg, 207 | padding=padding, 208 | indice_key=f'subm{i + 1}', 209 | conv_type='SubMConv3d')) 210 | in_channels = out_channels 211 | stage_name = f'encoder_layer{i + 1}' 212 | stage_layers = SparseSequential(*blocks_list) 213 | self.encoder_layers.add_module(stage_name, stage_layers) 214 | return out_channels 215 | -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/models/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .uni3detr_transformer import Uni3DETRTransformer, Uni3DETRTransformerDecoder, UniCrossAtten 2 | from .uni3d_viewtrans import Uni3DViewTrans 3 | 4 | __all__ = ['Uni3DETRTransformer', 'Uni3DETRTransformerDecoder', 'UniCrossAtten', 'Uni3DViewTrans'] 5 | -------------------------------------------------------------------------------- /projects/mmdet3d_plugin/models/utils/grid_mask.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import numpy as np 4 | from PIL import Image 5 | 6 | class Grid(object): 7 | def __init__(self, use_h, use_w, rotate = 1, offset=False, ratio = 0.5, mode=0, prob = 1.): 8 | self.use_h = use_h 9 | self.use_w = use_w 10 | self.rotate = rotate 11 | self.offset = offset 12 | self.ratio = ratio 13 | self.mode=mode 14 | self.st_prob = prob 15 | self.prob = prob 16 | 17 | def set_prob(self, epoch, max_epoch): 18 | self.prob = self.st_prob * epoch / max_epoch 19 | 20 | def __call__(self, img, label): 21 | if np.random.rand() > self.prob: 22 | return img, label 23 | h = img.size(1) 24 | w = img.size(2) 25 | self.d1 = 2 26 | self.d2 = min(h, w) 27 | hh = int(1.5*h) 28 | ww = int(1.5*w) 29 | d = np.random.randint(self.d1, self.d2) 30 | if self.ratio == 1: 31 | self.l = np.random.randint(1, d) 32 | else: 33 | self.l = min(max(int(d*self.ratio+0.5),1),d-1) 34 | mask = 
np.ones((hh, ww), np.float32) 35 | st_h = np.random.randint(d) 36 | st_w = np.random.randint(d) 37 | if self.use_h: 38 | for i in range(hh//d): 39 | s = d*i + st_h 40 | t = min(s+self.l, hh) 41 | mask[s:t,:] *= 0 42 | if self.use_w: 43 | for i in range(ww//d): 44 | s = d*i + st_w 45 | t = min(s+self.l, ww) 46 | mask[:,s:t] *= 0 47 | 48 | r = np.random.randint(self.rotate) 49 | mask = Image.fromarray(np.uint8(mask)) 50 | mask = mask.rotate(r) 51 | mask = np.asarray(mask) 52 | mask = mask[(hh-h)//2:(hh-h)//2+h, (ww-w)//2:(ww-w)//2+w] 53 | 54 | mask = torch.from_numpy(mask).float() 55 | if self.mode == 1: 56 | mask = 1-mask 57 | 58 | mask = mask.expand_as(img) 59 | if self.offset: 60 | offset = torch.from_numpy(2 * (np.random.rand(h,w) - 0.5)).float() 61 | offset = (1 - mask) * offset 62 | img = img * mask + offset 63 | else: 64 | img = img * mask 65 | 66 | return img, label 67 | 68 | 69 | class GridMask(nn.Module): 70 | def __init__(self, use_h, use_w, rotate = 1, offset=False, ratio = 0.5, mode=0, prob = 1.): 71 | super(GridMask, self).__init__() 72 | self.use_h = use_h 73 | self.use_w = use_w 74 | self.rotate = rotate 75 | self.offset = offset 76 | self.ratio = ratio 77 | self.mode = mode 78 | self.st_prob = prob 79 | self.prob = prob 80 | 81 | def set_prob(self, epoch, max_epoch): 82 | self.prob = self.st_prob * epoch / max_epoch #+ 1.#0.5 83 | 84 | def forward(self, x): 85 | if np.random.rand() > self.prob or not self.training: 86 | return x 87 | n,c,h,w = x.size() 88 | x = x.view(-1,h,w) 89 | hh = int(1.5*h) 90 | ww = int(1.5*w) 91 | d = np.random.randint(2, h) 92 | self.l = min(max(int(d*self.ratio+0.5),1),d-1) 93 | mask = np.ones((hh, ww), np.float32) 94 | st_h = np.random.randint(d) 95 | st_w = np.random.randint(d) 96 | if self.use_h: 97 | for i in range(hh//d): 98 | s = d*i + st_h 99 | t = min(s+self.l, hh) 100 | mask[s:t,:] *= 0 101 | if self.use_w: 102 | for i in range(ww//d): 103 | s = d*i + st_w 104 | t = min(s+self.l, ww) 105 | mask[:,s:t] *= 0 106 | 107 | r = np.random.randint(self.rotate) 108 | mask = Image.fromarray(np.uint8(mask)) 109 | mask = mask.rotate(r) 110 | mask = np.asarray(mask) 111 | mask = mask[(hh-h)//2:(hh-h)//2+h, (ww-w)//2:(ww-w)//2+w] 112 | 113 | mask = torch.from_numpy(mask).float().cuda() 114 | if self.mode == 1: 115 | mask = 1-mask 116 | mask = mask.expand_as(x) 117 | if self.offset: 118 | offset = torch.from_numpy(2 * (np.random.rand(h,w) - 0.5)).float().cuda() 119 | x = x * mask + offset * (1 - mask) 120 | else: 121 | x = x * mask 122 | 123 | return x.view(n,c,h,w) -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | -r requirements/build.txt 2 | -r requirements/optional.txt 3 | -r requirements/runtime.txt 4 | -r requirements/tests.txt 5 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [yapf] 2 | BASED_ON_STYLE = pep8 3 | BLANK_LINE_BEFORE_NESTED_CLASS_OR_DEF = true 4 | SPLIT_BEFORE_EXPRESSION_AFTER_OPENING_PAREN = true 5 | 6 | [isort] 7 | line_length = 79 8 | multi_line_output = 0 9 | extra_standard_library = setuptools 10 | known_first_party = mmdet,mmseg,mmdet3d 11 | known_third_party = 
cv2,imageio,indoor3d_util,load_scannet_data,lyft_dataset_sdk,m2r,matplotlib,mmcv,nuimages,numba,numpy,nuscenes,pandas,plyfile,pycocotools,pyquaternion,pytest,pytorch_sphinx_theme,recommonmark,requests,scannet_utils,scipy,seaborn,shapely,skimage,sphinx,tensorflow,terminaltables,torch,trimesh,ts,waymo_open_dataset 12 | no_lines_before = STDLIB,LOCALFOLDER 13 | default_section = THIRDPARTY 14 | 15 | [codespell] 16 | ignore-words-list = ans,refridgerator,crate,hist,formating,dout,wan,nd,fo,avod,AVOD,warmup 17 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | import platform 3 | import shutil 4 | import sys 5 | import warnings 6 | from os import path as osp 7 | from setuptools import find_packages, setup 8 | 9 | import torch 10 | from torch.utils.cpp_extension import (BuildExtension, CppExtension, 11 | CUDAExtension) 12 | 13 | 14 | def readme(): 15 | with open('README.md', encoding='utf-8') as f: 16 | content = f.read() 17 | return content 18 | 19 | 20 | version_file = 'mmdet3d/version.py' 21 | 22 | 23 | def get_version(): 24 | with open(version_file, 'r') as f: 25 | exec(compile(f.read(), version_file, 'exec')) 26 | import sys 27 | 28 | # return short version for sdist 29 | if 'sdist' in sys.argv or 'bdist_wheel' in sys.argv: 30 | return locals()['short_version'] 31 | else: 32 | return locals()['__version__'] 33 | 34 | 35 | def make_cuda_ext(name, 36 | module, 37 | sources, 38 | sources_cuda=[], 39 | extra_args=[], 40 | extra_include_path=[]): 41 | 42 | define_macros = [] 43 | extra_compile_args = {'cxx': [] + extra_args} 44 | 45 | if torch.cuda.is_available() or os.getenv('FORCE_CUDA', '0') == '1': 46 | define_macros += [('WITH_CUDA', None)] 47 | extension = CUDAExtension 48 | extra_compile_args['nvcc'] = extra_args + [ 49 | '-D__CUDA_NO_HALF_OPERATORS__', 50 | '-D__CUDA_NO_HALF_CONVERSIONS__', 51 | '-D__CUDA_NO_HALF2_OPERATORS__', 52 | ] 53 | sources += sources_cuda 54 | else: 55 | print('Compiling {} without CUDA'.format(name)) 56 | extension = CppExtension 57 | # raise EnvironmentError('CUDA is required to compile MMDetection!') 58 | 59 | return extension( 60 | name='{}.{}'.format(module, name), 61 | sources=[os.path.join(*module.split('.'), p) for p in sources], 62 | include_dirs=extra_include_path, 63 | define_macros=define_macros, 64 | extra_compile_args=extra_compile_args) 65 | 66 | 67 | def parse_requirements(fname='requirements.txt', with_version=True): 68 | """Parse the package dependencies listed in a requirements file but strips 69 | specific versioning information. 
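    ``-r <other-file>`` lines are followed recursively, ``-e`` lines keep only
    the ``#egg=`` package name, and environment markers after ``;`` are kept as
    platform-specific dependencies.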
70 | 71 | Args: 72 | fname (str): path to requirements file 73 | with_version (bool, default=False): if True include version specs 74 | 75 | Returns: 76 | list[str]: list of requirements items 77 | 78 | CommandLine: 79 | python -c "import setup; print(setup.parse_requirements())" 80 | """ 81 | import re 82 | import sys 83 | from os.path import exists 84 | require_fpath = fname 85 | 86 | def parse_line(line): 87 | """Parse information from a line in a requirements text file.""" 88 | if line.startswith('-r '): 89 | # Allow specifying requirements in other files 90 | target = line.split(' ')[1] 91 | for info in parse_require_file(target): 92 | yield info 93 | else: 94 | info = {'line': line} 95 | if line.startswith('-e '): 96 | info['package'] = line.split('#egg=')[1] 97 | else: 98 | # Remove versioning from the package 99 | pat = '(' + '|'.join(['>=', '==', '>']) + ')' 100 | parts = re.split(pat, line, maxsplit=1) 101 | parts = [p.strip() for p in parts] 102 | 103 | info['package'] = parts[0] 104 | if len(parts) > 1: 105 | op, rest = parts[1:] 106 | if ';' in rest: 107 | # Handle platform specific dependencies 108 | # http://setuptools.readthedocs.io/en/latest/setuptools.html#declaring-platform-specific-dependencies 109 | version, platform_deps = map(str.strip, 110 | rest.split(';')) 111 | info['platform_deps'] = platform_deps 112 | else: 113 | version = rest # NOQA 114 | info['version'] = (op, version) 115 | yield info 116 | 117 | def parse_require_file(fpath): 118 | with open(fpath, 'r') as f: 119 | for line in f.readlines(): 120 | line = line.strip() 121 | if line and not line.startswith('#'): 122 | for info in parse_line(line): 123 | yield info 124 | 125 | def gen_packages_items(): 126 | if exists(require_fpath): 127 | for info in parse_require_file(require_fpath): 128 | parts = [info['package']] 129 | if with_version and 'version' in info: 130 | parts.extend(info['version']) 131 | if not sys.version.startswith('3.4'): 132 | # apparently package_deps are broken in 3.4 133 | platform_deps = info.get('platform_deps') 134 | if platform_deps is not None: 135 | parts.append(';' + platform_deps) 136 | item = ''.join(parts) 137 | yield item 138 | 139 | packages = list(gen_packages_items()) 140 | return packages 141 | 142 | 143 | def add_mim_extension(): 144 | """Add extra files that are required to support MIM into the package. 145 | 146 | These files will be added by creating a symlink to the originals if the 147 | package is installed in `editable` mode (e.g. pip install -e .), or by 148 | copying from the originals otherwise. 149 | """ 150 | 151 | # parse installment mode 152 | if 'develop' in sys.argv: 153 | # installed by `pip install -e .` 154 | if platform.system() == 'Windows': 155 | # set `copy` mode here since symlink fails on Windows. 
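            # (a plain copy, unlike a symlink, will not pick up later edits to
            # the source files, but it avoids Windows' symlink permission
            # requirements)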
156 | mode = 'copy' 157 | else: 158 | mode = 'symlink' 159 | elif 'sdist' in sys.argv or 'bdist_wheel' in sys.argv: 160 | # installed by `pip install .` 161 | # or create source distribution by `python setup.py sdist` 162 | mode = 'copy' 163 | else: 164 | return 165 | 166 | filenames = ['tools', 'configs', 'model-index.yml'] 167 | repo_path = osp.dirname(__file__) 168 | mim_path = osp.join(repo_path, 'mmdet3d', '.mim') 169 | os.makedirs(mim_path, exist_ok=True) 170 | 171 | for filename in filenames: 172 | if osp.exists(filename): 173 | src_path = osp.join(repo_path, filename) 174 | tar_path = osp.join(mim_path, filename) 175 | 176 | if osp.isfile(tar_path) or osp.islink(tar_path): 177 | os.remove(tar_path) 178 | elif osp.isdir(tar_path): 179 | shutil.rmtree(tar_path) 180 | 181 | if mode == 'symlink': 182 | src_relpath = osp.relpath(src_path, osp.dirname(tar_path)) 183 | os.symlink(src_relpath, tar_path) 184 | elif mode == 'copy': 185 | if osp.isfile(src_path): 186 | shutil.copyfile(src_path, tar_path) 187 | elif osp.isdir(src_path): 188 | shutil.copytree(src_path, tar_path) 189 | else: 190 | warnings.warn(f'Cannot copy file {src_path}.') 191 | else: 192 | raise ValueError(f'Invalid mode {mode}') 193 | 194 | 195 | if __name__ == '__main__': 196 | add_mim_extension() 197 | setup( 198 | name='mmdet3d', 199 | version=get_version(), 200 | description=("OpenMMLab's next-generation platform" 201 | 'for general 3D object detection.'), 202 | long_description=readme(), 203 | long_description_content_type='text/markdown', 204 | author='MMDetection3D Contributors', 205 | author_email='zwwdev@gmail.com', 206 | keywords='computer vision, 3D object detection', 207 | url='https://github.com/open-mmlab/mmdetection3d', 208 | packages=find_packages(), 209 | include_package_data=True, 210 | package_data={'mmdet3d.ops': ['*/*.so']}, 211 | classifiers=[ 212 | 'Development Status :: 4 - Beta', 213 | 'License :: OSI Approved :: Apache Software License', 214 | 'Operating System :: OS Independent', 215 | 'Programming Language :: Python :: 3', 216 | 'Programming Language :: Python :: 3.6', 217 | 'Programming Language :: Python :: 3.7', 218 | ], 219 | license='Apache License 2.0', 220 | install_requires=parse_requirements('requirements/runtime.txt'), 221 | extras_require={ 222 | 'all': parse_requirements('requirements.txt'), 223 | 'tests': parse_requirements('requirements/tests.txt'), 224 | 'build': parse_requirements('requirements/build.txt'), 225 | 'optional': parse_requirements('requirements/optional.txt'), 226 | 'mim': parse_requirements('requirements/mminstall.txt'), 227 | }, 228 | cmdclass={'build_ext': BuildExtension}, 229 | zip_safe=False) 230 | --------------------------------------------------------------------------------
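For orientation, the sketch below illustrates how the plugin modules dumped above (`RDIoULoss`, `IoU3DLoss`, `SoftFocalLoss`, `SECOND3DFPN`, `SparseEncoderHD`) can be referenced by their registered type names from an mmdet3d-style config. The `plugin`/`plugin_dir` convention, the nesting keys, and all channel sizes and loss weights shown here are illustrative assumptions, not values copied from the shipped `projects/configs` files.

```python
# Hypothetical mmdet3d-style config excerpt. Only the registered type names
# ('SparseEncoderHD', 'SECOND3DFPN', 'RDIoULoss', 'IoU3DLoss', 'SoftFocalLoss')
# come from this repository; every other key and number is a placeholder.
plugin = True
plugin_dir = 'projects/mmdet3d_plugin/'

model = dict(
    pts_middle_encoder=dict(
        type='SparseEncoderHD',            # registered in MIDDLE_ENCODERS
        in_channels=4,
        sparse_shape=[41, 1440, 1440],     # placeholder voxel grid shape
        keep_depth=True),
    pts_neck=dict(
        type='SECOND3DFPN',                # registered in NECKS
        in_channels=[128, 128, 256],
        out_channels=[256, 256, 256],
        upsample_strides=[1, 2, 4]),
    pts_bbox_head=dict(
        # the three losses registered in LOSSES by rdiouloss.py
        loss_bbox=dict(type='RDIoULoss', loss_weight=2.0),
        loss_iou=dict(type='IoU3DLoss', loss_weight=1.0),
        loss_cls=dict(type='SoftFocalLoss', gamma=2.0, alpha=0.25, loss_weight=1.0)))
```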