├── v2xvit ├── __init__.py ├── loss │ ├── __init__.py │ ├── voxel_net_loss.py │ └── pixor_loss.py ├── models │ ├── __init__.py │ ├── fuse_modules │ │ ├── __init__.py │ │ ├── f_cooper_fuse.py │ │ ├── fuse_utils.py │ │ ├── self_attn.py │ │ └── mswin.py │ ├── sub_modules │ │ ├── __init__.py │ │ ├── f_cooper_fuse.py │ │ ├── naive_compress.py │ │ ├── point_pillar_scatter.py │ │ ├── downsample_conv.py │ │ ├── fuse_utils.py │ │ ├── split_attn.py │ │ ├── auto_encoder.py │ │ ├── self_attn.py │ │ ├── psm_mask.py │ │ ├── base_transformer.py │ │ └── mswin.py │ ├── point_pillar_intermediate.py │ ├── point_pillar.py │ ├── point_pillar_fcooper.py │ ├── point_pillar_opv2v.py │ ├── point_pillar_single.py │ ├── point_pillar_cobevt.py │ ├── point_pillar_v2vnet.py │ ├── point_pillar_when2com.py │ └── point_pillar_transformer.py ├── tools │ ├── __init__.py │ ├── loop_inference.py │ ├── debug_utils.py │ └── inference_utils.py ├── utils │ ├── __init__.py │ ├── setup.py │ ├── transformation_utils.py │ ├── pose_utils.py │ ├── eval_utils.py │ ├── common_utils.py │ └── box_overlaps.pyx ├── data_utils │ ├── __init__.py │ ├── augmentor │ │ ├── __init__.py │ │ ├── augment_utils.py │ │ └── data_augmentor.py │ ├── datasets │ │ ├── .intermediate_fusion_dataset.py.swp │ │ └── __init__.py │ ├── post_processor │ │ └── __init__.py │ └── pre_processor │ │ ├── __init__.py │ │ ├── base_preprocessor.py │ │ ├── bev_preprocessor.py │ │ ├── voxel_preprocessor.py │ │ └── sp_voxel_preprocessor.py ├── hypes_yaml │ ├── __init__.py │ ├── visualization.yaml │ ├── point_pillar_early_fusion.yaml │ ├── point_pillar_late_fusion.yaml │ ├── point_pillar_cobevt.yaml │ ├── point_pillar_fcooper.yaml │ ├── point_pillar_v2vnet.yaml │ ├── point_pillar_when2com.yaml │ ├── point_pillar_single.yaml │ ├── point_pillar_opv2v.yaml │ ├── point_pillar_where2comm_ori.yaml │ ├── point_pillar_where2comm_ori_multi.yaml │ ├── point_pillar_where2comm_ori_single.yaml │ ├── where2comm_transformer_multiscale_resnet.yaml │ └── point_pillar_v2xvit.yaml ├── visualization │ ├── __init__.py │ ├── simple_plot3d │ │ └── __init__.py │ ├── vis_data_sequence.py │ └── simple_vis.py └── version.py ├── images └── Overview.png ├── requirements.txt ├── setup.py ├── LICENSE ├── Env.yaml ├── .gitignore ├── docs ├── data_annotation_tutorial.md └── data_intro.md └── README.md /v2xvit/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /v2xvit/loss/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /v2xvit/models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /v2xvit/tools/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /v2xvit/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /v2xvit/data_utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /v2xvit/hypes_yaml/__init__.py: 
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /v2xvit/visualization/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /v2xvit/models/fuse_modules/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /v2xvit/models/sub_modules/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /v2xvit/data_utils/augmentor/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /v2xvit/data_utils/datasets/.intermediate_fusion_dataset.py.swp: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /images/Overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jmgu0212/FeaCo/HEAD/images/Overview.png -------------------------------------------------------------------------------- /v2xvit/version.py: -------------------------------------------------------------------------------- 1 | """Specifies the current version number of v2xvit.""" 2 | 3 | __version__ = "0.1.0" 4 | -------------------------------------------------------------------------------- /v2xvit/visualization/simple_plot3d/__init__.py: -------------------------------------------------------------------------------- 1 | from .canvas_3d import Canvas_3D 2 | from .canvas_bev import Canvas_BEV 3 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | matplotlib 2 | numpy 3 | open3d 4 | opencv-python 5 | cython 6 | tensorboardX 7 | shapely 8 | einops 9 | 10 | -------------------------------------------------------------------------------- /v2xvit/tools/loop_inference.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | for index in range(15,40,2): 4 | cmd = f"CUDA_VISIBLE_DEVICES=1 python /home/gaojing/zjy/v2x-vit/v2xvit/tools/inference.py --eval_epoch {index} " 5 | print(f"Running command: {cmd}") 6 | os.system(cmd) -------------------------------------------------------------------------------- /v2xvit/utils/setup.py: -------------------------------------------------------------------------------- 1 | from distutils.core import setup 2 | from Cython.Build import cythonize 3 | import numpy 4 | setup( 5 | name='box overlaps', 6 | ext_modules=cythonize('v2xvit/utils/box_overlaps.pyx'), 7 | include_dirs=[numpy.get_include()] 8 | ) -------------------------------------------------------------------------------- /v2xvit/data_utils/post_processor/__init__.py: -------------------------------------------------------------------------------- 1 | from v2xvit.data_utils.post_processor.voxel_postprocessor import VoxelPostprocessor 2 | from v2xvit.data_utils.post_processor.bev_postprocessor import BevPostprocessor 3 | 4 | __all__ = { 5 | 'VoxelPostprocessor': VoxelPostprocessor, 6 | 
'BevPostprocessor': BevPostprocessor, 7 | } 8 | 9 | 10 | def build_postprocessor(anchor_cfg, train): 11 | process_method_name = anchor_cfg['core_method'] 12 | assert process_method_name in ['VoxelPostprocessor', 'BevPostprocessor'] 13 | anchor_generator = __all__[process_method_name]( 14 | anchor_params=anchor_cfg, 15 | train=train 16 | ) 17 | 18 | return anchor_generator 19 | -------------------------------------------------------------------------------- /v2xvit/models/sub_modules/f_cooper_fuse.py: -------------------------------------------------------------------------------- 1 | """ 2 | Implementation of F-cooper maxout fusing. 3 | """ 4 | import torch 5 | import torch.nn as nn 6 | 7 | 8 | class SpatialFusion(nn.Module): 9 | def __init__(self): 10 | super(SpatialFusion, self).__init__() 11 | 12 | def regroup(self, x, record_len): 13 | cum_sum_len = torch.cumsum(record_len, dim=0) 14 | split_x = torch.tensor_split(x, cum_sum_len[:-1].cpu()) 15 | return split_x 16 | 17 | def forward(self, x, record_len): 18 | # x: B, C, H, W, split x:[(B1, C, W, H), (B2, C, W, H)] 19 | split_x = self.regroup(x, record_len) 20 | out = [] 21 | 22 | for xx in split_x: 23 | xx = torch.max(xx, dim=0, keepdim=True)[0] 24 | out.append(xx) 25 | return torch.cat(out, dim=0) -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Author: Runsheng Xu 3 | # License: TDG-Attribution-NonCommercial-NoDistrib 4 | 5 | 6 | from os.path import dirname, realpath 7 | from setuptools import setup, find_packages, Distribution 8 | from v2xvit.version import __version__ 9 | 10 | 11 | def _read_requirements_file(): 12 | """Return the elements in requirements.txt.""" 13 | req_file_path = '%s/requirements.txt' % dirname(realpath(__file__)) 14 | with open(req_file_path) as f: 15 | return [line.strip() for line in f] 16 | 17 | 18 | setup( 19 | name='V2XViT', 20 | version=__version__, 21 | packages=find_packages(), 22 | url='https://github.com/ucla-mobility/OpenCDA.git', 23 | license='MIT', 24 | author='Runsheng Xu, Hao Xiang, Zhengzhong Tu', 25 | author_email='rxx3386@ucla.edu', 26 | description='An opensource pytorch framework for autonomous driving ' 27 | 'cooperative detection', 28 | long_description=open("README.md").read(), 29 | install_requires=_read_requirements_file(), 30 | ) 31 | -------------------------------------------------------------------------------- /v2xvit/models/sub_modules/naive_compress.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | class NaiveCompressor(nn.Module): 6 | def __init__(self, input_dim, compress_raito): 7 | super().__init__() 8 | self.encoder = nn.Sequential( 9 | nn.Conv2d(input_dim, input_dim//compress_raito, kernel_size=3, 10 | stride=1, padding=1), 11 | nn.BatchNorm2d(input_dim//compress_raito, eps=1e-3, momentum=0.01), 12 | nn.ReLU() 13 | ) 14 | self.decoder = nn.Sequential( 15 | nn.Conv2d(input_dim//compress_raito, input_dim, kernel_size=3, 16 | stride=1, padding=1), 17 | nn.BatchNorm2d(input_dim, eps=1e-3, momentum=0.01), 18 | nn.ReLU(), 19 | nn.Conv2d(input_dim, input_dim, kernel_size=3, stride=1, padding=1), 20 | nn.BatchNorm2d(input_dim, eps=1e-3, 21 | momentum=0.01), 22 | nn.ReLU() 23 | ) 24 | 25 | def forward(self, x): 26 | x = self.encoder(x) 27 | x = self.decoder(x) 28 | 29 | return x 
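Note: the compressor above is a plain autoencoder-style bottleneck — the encoder shrinks the channel dimension by the `compress_raito` factor (spelled as in the source) and the decoder restores it, so the output tensor has the same shape as the input. A minimal usage sketch, assuming PyTorch is installed and the `NaiveCompressor` class above is importable; the channel count and ratio below are illustrative values, not taken from any config in this repository:

import torch

compressor = NaiveCompressor(input_dim=256, compress_raito=4)
features = torch.randn(2, 256, 100, 352)       # B, C, H, W feature map (illustrative sizes)
restored = compressor(features)                # encoder squeezes channels to 256 // 4 = 64,
assert restored.shape == features.shape        # decoder brings them back to 256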
-------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Runsheng Xu 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /v2xvit/data_utils/pre_processor/__init__.py: -------------------------------------------------------------------------------- 1 | from v2xvit.data_utils.pre_processor.base_preprocessor import BasePreprocessor 2 | from v2xvit.data_utils.pre_processor.voxel_preprocessor import VoxelPreprocessor 3 | from v2xvit.data_utils.pre_processor.bev_preprocessor import BevPreprocessor 4 | from v2xvit.data_utils.pre_processor.sp_voxel_preprocessor import SpVoxelPreprocessor 5 | 6 | __all__ = { 7 | 'BasePreprocessor': BasePreprocessor, 8 | 'VoxelPreprocessor': VoxelPreprocessor, 9 | 'BevPreprocessor': BevPreprocessor, 10 | 'SpVoxelPreprocessor': SpVoxelPreprocessor 11 | } 12 | 13 | 14 | def build_preprocessor(preprocess_cfg, train): 15 | process_method_name = preprocess_cfg['core_method'] 16 | error_message = f"{process_method_name} is not found. 
" \ 17 | f"Please add your processor file's name in opencood/" \ 18 | f"data_utils/processor/init.py" 19 | assert process_method_name in ['BasePreprocessor', 'VoxelPreprocessor', 20 | 'BevPreprocessor', 'SpVoxelPreprocessor'], \ 21 | error_message 22 | 23 | processor = __all__[process_method_name]( 24 | preprocess_params=preprocess_cfg, 25 | train=train 26 | ) 27 | 28 | return processor 29 | -------------------------------------------------------------------------------- /v2xvit/data_utils/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | from v2xvit.data_utils.datasets.late_fusion_dataset import LateFusionDataset 2 | from v2xvit.data_utils.datasets.early_fusion_dataset import EarlyFusionDataset 3 | from v2xvit.data_utils.datasets.intermediate_fusion_dataset import IntermediateFusionDataset 4 | 5 | __all__ = { 6 | 'LateFusionDataset': LateFusionDataset, 7 | 'EarlyFusionDataset': EarlyFusionDataset, 8 | 'IntermediateFusionDataset': IntermediateFusionDataset 9 | } 10 | 11 | # the final range for evaluation 12 | GT_RANGE = [-140, -40, -3, 140, 40, 1] 13 | # The communication range for cavs 14 | COM_RANGE = 70 15 | 16 | 17 | def build_dataset(dataset_cfg, visualize=False, train=True): 18 | dataset_name = dataset_cfg['fusion']['core_method'] 19 | error_message = f"{dataset_name} is not found. " \ 20 | f"Please add your processor file's name in opencood/" \ 21 | f"data_utils/datasets/init.py" 22 | assert dataset_name in ['LateFusionDataset', 'EarlyFusionDataset', 23 | 'IntermediateFusionDataset'], error_message 24 | 25 | dataset = __all__[dataset_name]( 26 | params=dataset_cfg, 27 | visualize=visualize, 28 | train=train 29 | ) 30 | 31 | return dataset 32 | -------------------------------------------------------------------------------- /Env.yaml: -------------------------------------------------------------------------------- 1 | usage: conda-env [-h] {attach,create,export,list,remove,upload,update} ... 2 | 3 | positional arguments: 4 | {attach,create,export,list,remove,upload,update} 5 | attach WARNING: This command is deprecated in conda 4.4 and 6 | scheduled for removal in conda 4.5. Embeds information 7 | describing your conda environment into the notebook 8 | metadata 9 | create Create an environment based on an environment file 10 | export Export a given environment 11 | list List the Conda environments 12 | remove Remove an environment 13 | upload WARNING: This command is deprecated in conda 4.4 and 14 | scheduled for removal in conda 4.5. Upload an 15 | environment to anaconda.org 16 | update Update the current environment based on environment 17 | file 18 | 19 | optional arguments: 20 | -h, --help Show this help message and exit. 
21 | 22 | conda commands available from other packages: 23 | build 24 | convert 25 | develop 26 | env 27 | index 28 | inspect 29 | metapackage 30 | render 31 | server 32 | skeleton 33 | -------------------------------------------------------------------------------- /v2xvit/visualization/vis_data_sequence.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Author: Runsheng Xu 3 | # License: TDG-Attribution-NonCommercial-NoDistrib 4 | 5 | 6 | import os 7 | import argparse 8 | from torch.utils.data import DataLoader 9 | 10 | from v2xvit.hypes_yaml.yaml_utils import load_yaml 11 | from v2xvit.visualization import vis_utils 12 | from v2xvit.data_utils.datasets.early_fusion_vis_dataset import \ 13 | EarlyFusionVisDataset 14 | 15 | 16 | def vis_parser(): 17 | parser = argparse.ArgumentParser(description="data visualization") 18 | parser.add_argument('--color_mode', type=str, default="intensity", 19 | help='lidar color rendering mode, e.g. intensity,' 20 | 'z-value or constant.') 21 | opt = parser.parse_args() 22 | return opt 23 | 24 | 25 | if __name__ == '__main__': 26 | current_path = os.path.dirname(os.path.realpath(__file__)) 27 | params = load_yaml(os.path.join(current_path, 28 | '../hypes_yaml/visualization.yaml')) 29 | 30 | opencda_dataset = EarlyFusionVisDataset(params, visualize=True, 31 | train=False) 32 | data_loader = DataLoader(opencda_dataset, batch_size=1, num_workers=8, 33 | collate_fn=opencda_dataset.collate_batch_train, 34 | shuffle=False, 35 | pin_memory=False) 36 | 37 | opt = vis_parser() 38 | vis_utils.visualize_sequence_dataloader(data_loader, 39 | params['postprocess']['order'], 40 | color_mode=opt.color_mode) 41 | -------------------------------------------------------------------------------- /v2xvit/hypes_yaml/visualization.yaml: -------------------------------------------------------------------------------- 1 | # this yaml is only for visualization 2 | name: visualization 3 | 4 | yaml_parser: "load_voxel_params" 5 | root_dir: 'v2xset/train' 6 | validate_dir: 'v2xset/validate' 7 | 8 | train_params: 9 | batch_size: &batch_size 4 10 | epoches: 100 11 | eval_freq: 1 12 | save_freq: 1 13 | 14 | fusion: 15 | core_method: 'EarlyFusionDataset' # LateFusionDataset, EarlyFusionDataset, IntermediateFusionDataset supported 16 | args: [] 17 | 18 | # preprocess-related 19 | preprocess: 20 | # options: BasePreprocessor, VoxelPreprocessor, BevPreprocessor 21 | core_method: 'SpVoxelPreprocessor' 22 | args: 23 | voxel_size: &voxel_size [0.4, 0.4, 0.4] 24 | max_points_per_voxel: &T 32 25 | max_voxel_train: 36000 26 | max_voxel_test: 70000 27 | # lidar range for each individual cav. 28 | cav_lidar_range: &cav_lidar [-140.8, -40, -3, 140.8, 40, 1] 29 | 30 | data_augment: 31 | - NAME: random_world_flip 32 | ALONG_AXIS_LIST: [ 'x' ] 33 | 34 | - NAME: random_world_rotation 35 | WORLD_ROT_ANGLE: [ -0.78539816, 0.78539816 ] 36 | 37 | - NAME: random_world_scaling 38 | WORLD_SCALE_RANGE: [ 0.95, 1.05 ] 39 | 40 | # anchor box related 41 | postprocess: 42 | core_method: 'VoxelPostprocessor' # VoxelPostprocessor, BevPostprocessor supported 43 | anchor_args: 44 | cav_lidar_range: *cav_lidar 45 | l: 3.9 46 | w: 1.6 47 | h: 1.56 48 | r: [0, 90] 49 | num: &achor_num 2 50 | target_args: 51 | pos_threshold: 0.6 52 | neg_threshold: 0.45 53 | score_threshold: 0.96 54 | order: 'hwl' # hwl or lwh 55 | max_num: 100 # maximum number of objects in a single frame. 
use this number to make sure different frames has the same dimension in the same batch 56 | nms_thresh: 0.15 -------------------------------------------------------------------------------- /v2xvit/models/fuse_modules/f_cooper_fuse.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Author: Runsheng Xu 3 | # License: TDG-Attribution-NonCommercial-NoDistrib 4 | 5 | 6 | """ 7 | Implementation of F-cooper maxout fusing. 8 | """ 9 | import torch 10 | import torch.nn as nn 11 | import math 12 | from collections import OrderedDict 13 | 14 | class SpatialFusion(nn.Module): 15 | def __init__(self): 16 | super(SpatialFusion, self).__init__() 17 | # self.conv1 = nn.Conv3d(in_channels=2, out_channels=1, kernel_size=(3,3,3), stride=1, padding=1, groups=1) 18 | self.conv1 = nn.Sequential( 19 | OrderedDict( 20 | [ 21 | ('conv', nn.Conv3d(2, 1, kernel_size=(3,3,3),stride=1, padding=1, groups=1)), 22 | ('activation', nn.ReLU()), 23 | ] 24 | ) 25 | ) 26 | def regroup(self, x, record_len): 27 | cum_sum_len = torch.cumsum(record_len, dim=0) 28 | split_x = torch.tensor_split(x, cum_sum_len[:-1].cpu()) 29 | return split_x 30 | 31 | def forward(self, x, record_len): 32 | # x: B, C, H, W, split x:[(B1, C, W, H), (B2, C, W, H)] 33 | split_x = self.regroup(x, record_len) 34 | out = [] 35 | 36 | for xx in split_x: 37 | xx_max = torch.max(xx, dim=0, keepdim=True)[0] 38 | xx_avg = torch.mean(xx, dim=0, keepdim=True) 39 | F_Sp = torch.cat((xx_max,xx_avg),dim = 0).unsqueeze(0) 40 | # F_Sp = F_Sp.permute(0,2,1,3,4) 41 | # print(F_Sp.shape) 42 | # exit() 43 | # conv = nn.Conv3d(2, 1, kernel_size=(3,3,3), stride=1, padding=1) 44 | xx = self.conv1(F_Sp)[0] 45 | # print(aa.shape) 46 | # exit() 47 | out.append(xx) 48 | return torch.cat(out, dim=0) -------------------------------------------------------------------------------- /v2xvit/models/sub_modules/point_pillar_scatter.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | class PointPillarScatter(nn.Module): 6 | def __init__(self, model_cfg): 7 | super().__init__() 8 | 9 | self.model_cfg = model_cfg 10 | self.num_bev_features = self.model_cfg['num_features'] 11 | self.nx, self.ny, self.nz = model_cfg['grid_size'] 12 | assert self.nz == 1 13 | 14 | def forward(self, batch_dict): 15 | pillar_features, coords = batch_dict['pillar_features'], batch_dict[ 16 | 'voxel_coords'] 17 | batch_spatial_features = [] 18 | batch_size = coords[:, 0].max().int().item() + 1 19 | 20 | for batch_idx in range(batch_size): 21 | spatial_feature = torch.zeros( 22 | self.num_bev_features, 23 | self.nz * self.nx * self.ny, 24 | dtype=pillar_features.dtype, 25 | device=pillar_features.device) 26 | 27 | batch_mask = coords[:, 0] == batch_idx 28 | this_coords = coords[batch_mask, :] 29 | 30 | indices = this_coords[:, 1] + \ 31 | this_coords[:, 2] * self.nx + \ 32 | this_coords[:, 3] 33 | indices = indices.type(torch.long) 34 | 35 | pillars = pillar_features[batch_mask, :] 36 | pillars = pillars.t() 37 | spatial_feature[:, indices] = pillars 38 | batch_spatial_features.append(spatial_feature) 39 | 40 | batch_spatial_features = \ 41 | torch.stack(batch_spatial_features, 0) 42 | batch_spatial_features = \ 43 | batch_spatial_features.view(batch_size, self.num_bev_features * 44 | self.nz, self.ny, self.nx) 45 | batch_dict['spatial_features'] = batch_spatial_features 46 | 47 | return batch_dict 48 | 49 | 
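Note: PointPillarScatter above converts the sparse per-pillar features into a dense BEV pseudo-image — each pillar's grid coordinates are flattened into a single column index (with nz == 1 the formula z + y * nx + x reduces to y * nx + x), and the pillar's feature vector is written into that column before the canvas is reshaped to (C, ny, nx). A toy sketch of that index math, assuming the OpenPCDet-style (batch_idx, z, y, x) coordinate order and using made-up grid sizes rather than values from any repository config:

import torch

nx, ny, nz = 4, 3, 1                               # illustrative grid, not from a config
num_features = 2
coords = torch.tensor([[0, 0, 1, 2],               # (batch_idx, z, y, x) for pillar 0
                       [0, 0, 2, 3]])              # and pillar 1
pillar_features = torch.tensor([[1.0, 1.0],
                                [2.0, 2.0]])       # one feature row per pillar

canvas = torch.zeros(num_features, nz * ny * nx)
indices = (coords[:, 1] + coords[:, 2] * nx + coords[:, 3]).long()
canvas[:, indices] = pillar_features.t()           # scatter each pillar's feature column
bev = canvas.view(num_features * nz, ny, nx)       # pillar 0 lands at (y=1, x=2), pillar 1 at (y=2, x=3)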
-------------------------------------------------------------------------------- /v2xvit/models/sub_modules/downsample_conv.py: -------------------------------------------------------------------------------- 1 | """ 2 | Class used to downsample features by 3*3 conv 3 | """ 4 | 5 | import torch 6 | import torch.nn as nn 7 | 8 | 9 | class DoubleConv(nn.Module): 10 | """ 11 | Double convoltuion 12 | Args: 13 | in_channels: input channel num 14 | out_channels: output channel num 15 | """ 16 | 17 | def __init__(self, in_channels, out_channels, kernel_size, 18 | stride, padding): 19 | super().__init__() 20 | self.double_conv = nn.Sequential( 21 | nn.Conv2d(in_channels, out_channels, kernel_size=kernel_size, 22 | stride=stride, padding=padding), 23 | nn.ReLU(inplace=True), 24 | nn.Conv2d(out_channels, out_channels, kernel_size=3, padding=1), 25 | nn.ReLU(inplace=True) 26 | ) 27 | 28 | def forward(self, x): 29 | return self.double_conv(x) 30 | 31 | 32 | class DownsampleConv(nn.Module): 33 | def __init__(self, config): 34 | super(DownsampleConv, self).__init__() 35 | self.layers = nn.ModuleList([]) 36 | input_dim = config['input_dim'] 37 | 38 | for (ksize, dim, stride, padding) in zip(config['kernal_size'], 39 | config['dim'], 40 | config['stride'], 41 | config['padding']): 42 | self.layers.append(DoubleConv(input_dim, 43 | dim, 44 | kernel_size=ksize, 45 | stride=stride, 46 | padding=padding)) 47 | input_dim = dim 48 | 49 | def forward(self, x): 50 | for i in range(len(self.layers)): 51 | x = self.layers[i](x) 52 | return x -------------------------------------------------------------------------------- /v2xvit/models/sub_modules/fuse_utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | 4 | from einops import rearrange 5 | from v2xvit.utils.common_utils import torch_tensor_to_numpy 6 | 7 | 8 | def regroup(dense_feature, record_len, max_len): 9 | """ 10 | Regroup the data based on the record_len. 11 | 12 | Parameters 13 | ---------- 14 | dense_feature : torch.Tensor 15 | N, C, H, W 16 | record_len : list 17 | [sample1_len, sample2_len, ...] 
18 | max_len : int 19 | Maximum cav number 20 | 21 | Returns 22 | ------- 23 | regroup_feature : torch.Tensor 24 | B, L, C, H, W 25 | """ 26 | cum_sum_len = list(np.cumsum(torch_tensor_to_numpy(record_len))) 27 | split_features = torch.tensor_split(dense_feature, 28 | cum_sum_len[:-1]) 29 | regroup_features = [] 30 | mask = [] 31 | 32 | for split_feature in split_features: 33 | # M, C, H, W 34 | feature_shape = split_feature.shape 35 | 36 | # the maximum M is 5 as most 5 cavs 37 | padding_len = max_len - feature_shape[0] 38 | mask.append([1] * feature_shape[0] + [0] * padding_len) 39 | 40 | padding_tensor = torch.zeros(padding_len, feature_shape[1], 41 | feature_shape[2], feature_shape[3]) 42 | padding_tensor = padding_tensor.to(split_feature.device) 43 | 44 | split_feature = torch.cat([split_feature, padding_tensor], 45 | dim=0) 46 | 47 | # 1, 5C, H, W 48 | split_feature = split_feature.view(-1, 49 | feature_shape[2], 50 | feature_shape[3]).unsqueeze(0) 51 | regroup_features.append(split_feature) 52 | 53 | # B, 5C, H, W 54 | regroup_features = torch.cat(regroup_features, dim=0) 55 | # B, L, C, H, W 56 | regroup_features = rearrange(regroup_features, 57 | 'b (l c) h w -> b l c h w', 58 | l=max_len) 59 | mask = torch.from_numpy(np.array(mask)).to(regroup_features.device) 60 | 61 | return regroup_features, mask 62 | -------------------------------------------------------------------------------- /v2xvit/models/sub_modules/split_attn.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | 6 | class RadixSoftmax(nn.Module): 7 | def __init__(self, radix, cardinality): 8 | super(RadixSoftmax, self).__init__() 9 | self.radix = radix 10 | self.cardinality = cardinality 11 | 12 | def forward(self, x): 13 | # x: (B, L, 1, 1, 3C) 14 | batch = x.size(0) 15 | cav_num = x.size(1) 16 | 17 | if self.radix > 1: 18 | # x: (B, L, 1, 3, C) 19 | x = x.view(batch, 20 | cav_num, 21 | self.cardinality, self.radix, -1) 22 | x = F.softmax(x, dim=3) 23 | # B, 3LC 24 | x = x.reshape(batch, -1) 25 | else: 26 | x = torch.sigmoid(x) 27 | return x 28 | 29 | 30 | class SplitAttn(nn.Module): 31 | def __init__(self, input_dim): 32 | super(SplitAttn, self).__init__() 33 | self.input_dim = input_dim 34 | 35 | self.fc1 = nn.Linear(input_dim, input_dim, bias=False) 36 | self.bn1 = nn.LayerNorm(input_dim) 37 | self.act1 = nn.ReLU() 38 | self.fc2 = nn.Linear(input_dim, input_dim * 3, bias=False) 39 | 40 | self.rsoftmax = RadixSoftmax(3, 1) 41 | 42 | def forward(self, window_list): 43 | # window list: [(B, L, H, W, C) * 3] 44 | assert len(window_list) == 3, 'only 3 windows are supported' 45 | 46 | sw, mw, bw = window_list[0], window_list[1], window_list[2] 47 | B, L = sw.shape[0], sw.shape[1] 48 | 49 | # global average pooling, B, L, H, W, C 50 | x_gap = sw + mw + bw 51 | # B, L, 1, 1, C 52 | x_gap = x_gap.mean((2, 3), keepdim=True) 53 | x_gap = self.act1(self.bn1(self.fc1(x_gap))) 54 | # B, L, 1, 1, 3C 55 | x_attn = self.fc2(x_gap) 56 | # B L 1 1 3C 57 | x_attn = self.rsoftmax(x_attn).view(B, L, 1, 1, -1) 58 | 59 | out = sw * x_attn[:, :, :, :, 0:self.input_dim] + \ 60 | mw * x_attn[:, :, :, :, self.input_dim:2*self.input_dim] +\ 61 | bw * x_attn[:, :, :, :, self.input_dim*2:] 62 | 63 | return out 64 | -------------------------------------------------------------------------------- /v2xvit/models/point_pillar_intermediate.py: -------------------------------------------------------------------------------- 1 | # -*- 
coding: utf-8 -*- 2 | # Author: Runsheng Xu 3 | # License: TDG-Attribution-NonCommercial-NoDistrib 4 | 5 | 6 | import torch 7 | import torch.nn as nn 8 | 9 | 10 | from v2xvit.models.sub_modules.pillar_vfe import PillarVFE 11 | from v2xvit.models.sub_modules.point_pillar_scatter import PointPillarScatter 12 | from v2xvit.models.sub_modules.att_bev_backbone import AttBEVBackbone 13 | 14 | 15 | class PointPillarIntermediate(nn.Module): 16 | def __init__(self, args): 17 | super(PointPillarIntermediate, self).__init__() 18 | 19 | # PIllar VFE 20 | self.pillar_vfe = PillarVFE(args['pillar_vfe'], 21 | num_point_features=4, 22 | voxel_size=args['voxel_size'], 23 | point_cloud_range=args['lidar_range']) 24 | 25 | self.scatter = PointPillarScatter(args['point_pillar_scatter']) 26 | self.backbone = AttBEVBackbone(args['base_bev_backbone'], 64) 27 | 28 | self.cls_head = nn.Conv2d(128 * 3, args['anchor_number'], 29 | kernel_size=1) 30 | self.reg_head = nn.Conv2d(128 * 3, 7 * args['anchor_num'], 31 | kernel_size=1) 32 | 33 | def forward(self, data_dict): 34 | 35 | voxel_features = data_dict['processed_lidar']['voxel_features'] 36 | voxel_coords = data_dict['processed_lidar']['voxel_coords'] 37 | voxel_num_points = data_dict['processed_lidar']['voxel_num_points'] 38 | record_len = data_dict['record_len'] 39 | 40 | batch_dict = {'voxel_features': voxel_features, 41 | 'voxel_coords': voxel_coords, 42 | 'voxel_num_points': voxel_num_points, 43 | 'record_len': record_len} 44 | 45 | batch_dict = self.pillar_vfe(batch_dict) 46 | batch_dict = self.scatter(batch_dict) 47 | batch_dict = self.backbone(batch_dict) 48 | 49 | spatial_features_2d = batch_dict['spatial_features_2d'] 50 | 51 | psm = self.cls_head(spatial_features_2d) 52 | rm = self.reg_head(spatial_features_2d) 53 | 54 | output_dict = {'psm': psm, 55 | 'rm': rm} 56 | 57 | return output_dict -------------------------------------------------------------------------------- /v2xvit/models/fuse_modules/fuse_utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Author: Runsheng Xu 3 | # License: TDG-Attribution-NonCommercial-NoDistrib 4 | 5 | 6 | import torch 7 | import numpy as np 8 | 9 | from einops import rearrange 10 | from v2xvit.utils.common_utils import torch_tensor_to_numpy 11 | 12 | 13 | def regroup(dense_feature, record_len, max_len): 14 | """ 15 | Regroup the data based on the record_len. 16 | 17 | Parameters 18 | ---------- 19 | dense_feature : torch.Tensor 20 | N, C, H, W 21 | record_len : list 22 | [sample1_len, sample2_len, ...] 
23 | max_len : int 24 | Maximum cav number 25 | 26 | Returns 27 | ------- 28 | regroup_feature : torch.Tensor 29 | B, L, C, H, W 30 | """ 31 | cum_sum_len = list(np.cumsum(torch_tensor_to_numpy(record_len))) 32 | split_features = torch.tensor_split(dense_feature, 33 | cum_sum_len[:-1]) 34 | regroup_features = [] 35 | mask = [] 36 | 37 | for split_feature in split_features: 38 | # M, C, H, W 39 | feature_shape = split_feature.shape 40 | 41 | # the maximum M is 5 as most 5 cavs 42 | padding_len = max_len - feature_shape[0] 43 | mask.append([1] * feature_shape[0] + [0] * padding_len) 44 | 45 | padding_tensor = torch.zeros(padding_len, feature_shape[1], 46 | feature_shape[2], feature_shape[3]) 47 | padding_tensor = padding_tensor.to(split_feature.device) 48 | 49 | split_feature = torch.cat([split_feature, padding_tensor], 50 | dim=0) 51 | 52 | # 1, 5C, H, W 53 | split_feature = split_feature.view(-1, 54 | feature_shape[2], 55 | feature_shape[3]).unsqueeze(0) 56 | regroup_features.append(split_feature) 57 | 58 | # B, 5C, H, W 59 | regroup_features = torch.cat(regroup_features, dim=0) 60 | # B, L, C, H, W 61 | regroup_features = rearrange(regroup_features, 62 | 'b (l c) h w -> b l c h w', 63 | l=max_len) 64 | mask = torch.from_numpy(np.array(mask)).to(regroup_features.device) 65 | 66 | return regroup_features, mask 67 | -------------------------------------------------------------------------------- /v2xvit/data_utils/pre_processor/base_preprocessor.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from v2xvit.utils import pcd_utils 4 | 5 | 6 | class BasePreprocessor(object): 7 | """ 8 | Basic Lidar pre-processor. 9 | 10 | Parameters 11 | ---------- 12 | preprocess_params : dict 13 | The dictionary containing all parameters of the preprocessing. 14 | 15 | train : bool 16 | Train or test mode. 17 | """ 18 | 19 | def __init__(self, preprocess_params, train): 20 | self.params = preprocess_params 21 | self.train = train 22 | 23 | def preprocess(self, pcd_np): 24 | """ 25 | Preprocess the lidar points by simple sampling. 26 | 27 | Parameters 28 | ---------- 29 | pcd_np : np.ndarray 30 | The raw lidar. 31 | 32 | Returns 33 | ------- 34 | data_dict : the output dictionary. 35 | """ 36 | data_dict = {} 37 | sample_num = self.params['args']['sample_num'] 38 | 39 | pcd_np = pcd_utils.downsample_lidar(pcd_np, sample_num) 40 | data_dict['downsample_lidar'] = pcd_np 41 | 42 | return data_dict 43 | 44 | def project_points_to_bev_map(self, points, ratio=0.1): 45 | """ 46 | Project points to BEV occupancy map with default ratio=0.1. 47 | 48 | Parameters 49 | ---------- 50 | points : np.ndarray 51 | (N, 3) / (N, 4) 52 | 53 | ratio : float 54 | Discretization parameters. Default is 0.1. 55 | 56 | Returns 57 | ------- 58 | bev_map : np.ndarray 59 | BEV occupancy map including projected points with shape 60 | (img_row, img_col). 
61 | 62 | """ 63 | L1, W1, H1, L2, W2, H2 = self.params["cav_lidar_range"] 64 | img_row = int((L2 - L1) / ratio) 65 | img_col = int((W2 - W1) / ratio) 66 | bev_map = np.zeros((img_row, img_col)) 67 | bev_origin = np.array([L1, W1, H1]).reshape(1, -1) 68 | # (N, 3) 69 | indices = ((points[:, :3] - bev_origin) / ratio).astype(int) 70 | mask = np.logical_and(indices[:, 0] > 0, indices[:, 0] < img_row) 71 | mask = np.logical_and(mask, np.logical_and(indices[:, 1] > 0, 72 | indices[:, 1] < img_col)) 73 | indices = indices[mask, :] 74 | bev_map[indices[:, 0], indices[:, 1]] = 1 75 | return bev_map 76 | -------------------------------------------------------------------------------- /v2xvit/models/sub_modules/auto_encoder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | class AutoEncoder(nn.Module): 6 | def __init__(self, feature_num, layer_num): 7 | super().__init__() 8 | self.feature_num = feature_num 9 | self.feature_stride = 2 10 | 11 | self.encoder = nn.ModuleList() 12 | self.decoder = nn.ModuleList() 13 | 14 | for i in range(layer_num): 15 | cur_layers = [ 16 | nn.ZeroPad2d(1), 17 | nn.Conv2d( 18 | feature_num, feature_num, kernel_size=3, 19 | stride=2, padding=0, bias=False 20 | ), 21 | nn.BatchNorm2d(feature_num, eps=1e-3, momentum=0.01), 22 | nn.ReLU()] 23 | 24 | cur_layers.extend([ 25 | nn.Conv2d(feature_num, feature_num // self.feature_stride, 26 | kernel_size=3, padding=1, bias=False), 27 | nn.BatchNorm2d(feature_num // self.feature_stride, 28 | eps=1e-3, momentum=0.01), 29 | nn.ReLU() 30 | ]) 31 | 32 | self.encoder.append(nn.Sequential(*cur_layers)) 33 | feature_num = feature_num // self.feature_stride 34 | 35 | feature_num = self.feature_num 36 | for i in range(layer_num): 37 | cur_layers = [nn.Sequential( 38 | nn.ConvTranspose2d( 39 | feature_num // 2, feature_num, 40 | kernel_size=2, 41 | stride=2, bias=False 42 | ), 43 | nn.BatchNorm2d(feature_num, 44 | eps=1e-3, momentum=0.01), 45 | nn.ReLU() 46 | )] 47 | 48 | cur_layers.extend([nn.Sequential( 49 | nn.Conv2d( 50 | feature_num, feature_num, kernel_size=3, 51 | stride=1, bias=False, padding=1 52 | ), 53 | nn.BatchNorm2d(feature_num, eps=1e-3, 54 | momentum=0.01), 55 | nn.ReLU() 56 | )]) 57 | self.decoder.append(nn.Sequential(*cur_layers)) 58 | feature_num //= 2 59 | 60 | def forward(self, x): 61 | for i in range(len(self.encoder)): 62 | x = self.encoder[i](x) 63 | 64 | for i in range(len(self.decoder)-1, -1, -1): 65 | x = self.decoder[i](x) 66 | 67 | return x -------------------------------------------------------------------------------- /v2xvit/models/sub_modules/self_attn.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | 6 | 7 | class ScaledDotProductAttention(nn.Module): 8 | """ 9 | Scaled Dot-Product Attention proposed in "Attention Is All You Need" 10 | Compute the dot products of the query with all keys, divide each by sqrt(dim), 11 | and apply a softmax function to obtain the weights on the values 12 | Args: dim, mask 13 | dim (int): dimention of attention 14 | mask (torch.Tensor): tensor containing indices to be masked 15 | Inputs: query, key, value, mask 16 | - **query** (batch, q_len, d_model): tensor containing projection vector for decoder. 17 | - **key** (batch, k_len, d_model): tensor containing projection vector for encoder. 
18 | - **value** (batch, v_len, d_model): tensor containing features of the encoded input sequence. 19 | - **mask** (-): tensor containing indices to be masked 20 | Returns: context, attn 21 | - **context**: tensor containing the context vector from attention mechanism. 22 | - **attn**: tensor containing the attention (alignment) from the encoder outputs. 23 | """ 24 | 25 | def __init__(self, dim): 26 | super(ScaledDotProductAttention, self).__init__() 27 | self.sqrt_dim = np.sqrt(dim) 28 | 29 | def forward(self, query, key, value): 30 | score = torch.bmm(query, key.transpose(1, 2)) / self.sqrt_dim 31 | attn = F.softmax(score, -1) 32 | context = torch.bmm(attn, value) 33 | return context 34 | 35 | 36 | class AttFusion(nn.Module): 37 | def __init__(self, feature_dim): 38 | super(AttFusion, self).__init__() 39 | self.att = ScaledDotProductAttention(feature_dim) 40 | 41 | def forward(self, x, record_len): 42 | split_x = self.regroup(x, record_len) 43 | batch_size = len(record_len) 44 | C, W, H = split_x[0].shape[1:] 45 | out = [] 46 | for xx in split_x: 47 | cav_num = xx.shape[0] 48 | xx = xx.view(cav_num, C, -1).permute(2, 0, 1) 49 | h = self.att(xx, xx, xx) 50 | h = h.permute(1, 2, 0).view(cav_num, C, W, H)[0, ...].unsqueeze(0) 51 | out.append(h) 52 | return torch.cat(out, dim=0) 53 | 54 | def regroup(self, x, record_len): 55 | cum_sum_len = torch.cumsum(record_len, dim=0) 56 | split_x = torch.tensor_split(x, cum_sum_len[:-1].cpu()) 57 | return split_x 58 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | logs/ 131 | *.c 132 | *.so 133 | .idea 134 | opv2x 135 | .DS_Store 136 | -------------------------------------------------------------------------------- /v2xvit/models/point_pillar.py: -------------------------------------------------------------------------------- 1 | """ 2 | Vanilla pointpillar for early and late fusion. 3 | """ 4 | import torch.nn as nn 5 | 6 | from v2xvit.models.sub_modules.pillar_vfe import PillarVFE 7 | from v2xvit.models.sub_modules.point_pillar_scatter import PointPillarScatter 8 | from v2xvit.models.sub_modules.base_bev_backbone import BaseBEVBackbone 9 | from v2xvit.models.sub_modules.downsample_conv import DownsampleConv 10 | 11 | 12 | class PointPillar(nn.Module): 13 | def __init__(self, args): 14 | super(PointPillar, self).__init__() 15 | 16 | # PIllar VFE 17 | self.pillar_vfe = PillarVFE(args['pillar_vfe'], 18 | num_point_features=4, 19 | voxel_size=args['voxel_size'], 20 | point_cloud_range=args['lidar_range']) 21 | self.scatter = PointPillarScatter(args['point_pillar_scatter']) 22 | self.backbone = BaseBEVBackbone(args['base_bev_backbone'], 64) 23 | # used to downsample the feature map for efficient computation 24 | self.shrink_flag = False 25 | if 'shrink_header' in args: 26 | self.shrink_flag = True 27 | self.shrink_conv = DownsampleConv(args['shrink_header']) 28 | 29 | self.cls_head = nn.Conv2d(args['cls_head_dim'], args['anchor_number'], 30 | kernel_size=1) 31 | self.reg_head = nn.Conv2d(args['cls_head_dim'], 32 | 7 * args['anchor_number'], 33 | kernel_size=1) 34 | 35 | def forward(self, data_dict): 36 | 37 | voxel_features = data_dict['processed_lidar']['voxel_features'] 38 | voxel_coords = data_dict['processed_lidar']['voxel_coords'] 39 | voxel_num_points = data_dict['processed_lidar']['voxel_num_points'] 40 | 41 | batch_dict = {'voxel_features': voxel_features, 42 | 'voxel_coords': voxel_coords, 43 | 'voxel_num_points': voxel_num_points} 44 | 45 | batch_dict = self.pillar_vfe(batch_dict) 46 | batch_dict = self.scatter(batch_dict) 47 | batch_dict = self.backbone(batch_dict) 48 | 49 | spatial_features_2d = batch_dict['spatial_features_2d'] 50 | if self.shrink_flag: 51 | spatial_features_2d = self.shrink_conv(spatial_features_2d) 52 | 53 | psm = self.cls_head(spatial_features_2d) 54 | rm = self.reg_head(spatial_features_2d) 55 | 56 | output_dict = {'psm': psm, 57 | 'rm': rm} 58 | 59 | return output_dict -------------------------------------------------------------------------------- /v2xvit/models/fuse_modules/self_attn.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Author: Hao Xiang , Runsheng Xu 3 | # License: TDG-Attribution-NonCommercial-NoDistrib 4 | 5 | 6 | import numpy as np 7 | import torch 8 | import torch.nn as nn 9 | import torch.nn.functional as F 10 | 11 | 12 | class ScaledDotProductAttention(nn.Module): 13 | """ 14 | Scaled Dot-Product Attention proposed in "Attention Is All You Need" 
15 | Compute the dot products of the query with all keys, divide each by sqrt(dim), 16 | and apply a softmax function to obtain the weights on the values 17 | Args: dim, mask 18 | dim (int): dimention of attention 19 | mask (torch.Tensor): tensor containing indices to be masked 20 | Inputs: query, key, value, mask 21 | - **query** (batch, q_len, d_model): tensor containing projection 22 | vector for decoder. 23 | - **key** (batch, k_len, d_model): tensor containing projection 24 | vector for encoder. 25 | - **value** (batch, v_len, d_model): tensor containing features of the 26 | encoded input sequence. 27 | - **mask** (-): tensor containing indices to be masked 28 | Returns: context, attn 29 | - **context**: tensor containing the context vector from 30 | attention mechanism. 31 | - **attn**: tensor containing the attention (alignment) from the 32 | encoder outputs. 33 | """ 34 | 35 | def __init__(self, dim): 36 | super(ScaledDotProductAttention, self).__init__() 37 | self.sqrt_dim = np.sqrt(dim) 38 | 39 | def forward(self, query, key, value): 40 | score = torch.bmm(query, key.transpose(1, 2)) / self.sqrt_dim 41 | attn = F.softmax(score, -1) 42 | context = torch.bmm(attn, value) 43 | return context 44 | 45 | 46 | class AttFusion(nn.Module): 47 | def __init__(self, feature_dim): 48 | super(AttFusion, self).__init__() 49 | self.att = ScaledDotProductAttention(feature_dim) 50 | 51 | def forward(self, x, record_len): 52 | split_x = self.regroup(x, record_len) 53 | batch_size = len(record_len) 54 | C, W, H = split_x[0].shape[1:] 55 | out = [] 56 | for xx in split_x: 57 | cav_num = xx.shape[0] 58 | xx = xx.view(cav_num, C, -1).permute(2, 0, 1) 59 | h = self.att(xx, xx, xx) 60 | h = h.permute(1, 2, 0).view(cav_num, C, W, H)[0, ...].unsqueeze(0) 61 | out.append(h) 62 | return torch.cat(out, dim=0) 63 | 64 | def regroup(self, x, record_len): 65 | cum_sum_len = torch.cumsum(record_len, dim=0) 66 | split_x = torch.tensor_split(x, cum_sum_len[:-1].cpu()) 67 | return split_x 68 | -------------------------------------------------------------------------------- /v2xvit/tools/debug_utils.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import torch 4 | from torch.utils.data import DataLoader 5 | 6 | import v2xvit.hypes_yaml.yaml_utils as yaml_utils 7 | from v2xvit.tools import train_utils 8 | from v2xvit.data_utils.datasets import build_dataset 9 | from v2xvit.visualization import vis_utils 10 | 11 | 12 | def test_parser(): 13 | parser = argparse.ArgumentParser(description="synthetic data generation") 14 | parser.add_argument('--model_dir', type=str, required=True, 15 | help='Continued training path') 16 | parser.add_argument('--fusion_method', type=str, default='late', 17 | help='late, early or intermediate') 18 | opt = parser.parse_args() 19 | return opt 20 | 21 | 22 | def test_bev_post_processing(): 23 | opt = test_parser() 24 | assert opt.fusion_method in ['late', 'early', 'intermediate'] 25 | 26 | hypes = yaml_utils.load_yaml(None, opt) 27 | 28 | print('Dataset Building') 29 | opencood_dataset = build_dataset(hypes, visualize=True, train=False) 30 | data_loader = DataLoader(opencood_dataset, 31 | batch_size=1, 32 | num_workers=0, 33 | collate_fn=opencood_dataset.collate_batch_test, 34 | shuffle=False, 35 | pin_memory=False, 36 | drop_last=False) 37 | 38 | print('Creating Model') 39 | model = train_utils.create_model(hypes) 40 | # we assume gpu is necessary 41 | if torch.cuda.is_available(): 42 | model.cuda() 43 | device = 
torch.device('cuda' if torch.cuda.is_available() else 'cpu') 44 | 45 | print('Loading Model from checkpoint') 46 | saved_path = opt.model_dir 47 | _, model = train_utils.load_saved_model(saved_path, model) 48 | model.eval() 49 | for i, batch_data in enumerate(data_loader): 50 | batch_data = train_utils.to_device(batch_data, device) 51 | label_map = batch_data["ego"]["label_dict"]["label_map"] 52 | output_dict = { 53 | "cls": label_map[:, 0, :, :], 54 | "reg": label_map[:, 1:, :, :] 55 | } 56 | gt_box_tensor, _ = opencood_dataset.post_processor.post_process_debug( 57 | batch_data["ego"], output_dict) 58 | vis_utils.visualize_single_sample_output_bev(gt_box_tensor, 59 | batch_data['ego'][ 60 | 'origin_lidar'].squeeze( 61 | 0), 62 | opencood_dataset) 63 | 64 | 65 | if __name__ == '__main__': 66 | test_bev_post_processing() 67 | -------------------------------------------------------------------------------- /docs/data_annotation_tutorial.md: -------------------------------------------------------------------------------- 1 | ## Data Annotation Introduction 2 | 3 | --- 4 | We save all groundtruth annotations per agent per timestamp in the yaml files. For instance, 5 | `2021_08_24_21_29_28/4805/000069.yaml` refers to the data annotations with the perspective of te 6 | agent 4805 at timestamp 69 in the scenario database `2021_08_24_21_29_28`. Here we go through an example: 7 | 8 | ```yaml 9 | camera0: # parameters for frontal camera 10 | cords: # the x,y,z,roll,yaw,pitch under CARLA map coordinate 11 | - 141.35067749023438 12 | - -388.642578125 13 | - 1.0410505533218384 14 | - 0.07589337974786758 15 | - 174.18048095703125 16 | - 0.20690691471099854 17 | extrinsic: # extrinsic matrix from camera to LiDAR 18 | - - 0.9999999999999999 19 | - -5.1230071481984265e-18 20 | - 9.322129061605055e-20 21 | - -2.999993025731527 22 | - - -2.5011383190939924e-18 23 | - 1.0 24 | - 1.1458579204685086e-19 25 | - -3.934422863949294e-06 26 | - - 2.7713237218713775e-20 27 | - 3.7310309839064755e-20 28 | - 1.0 29 | - 0.8999999040861146 30 | - - 0.0 31 | - 0.0 32 | - 0.0 33 | - 1.0 34 | intrinsic: # camera intrinsic matrix 35 | - - 335.639852470912 36 | - 0.0 37 | - 400.0 38 | - - 0.0 39 | - 335.639852470912 40 | - 300.0 41 | - - 0.0 42 | - 0.0 43 | - 1.0 44 | camera1: ... # params of right rear camera 45 | camera2: ... # params of left rear camera 46 | canera3: ... # params of back camera 47 | ego_speed: 18.13 # agent's current speed, km/h 48 | lidar_pose: # LiDAR pose under CARLA map coordinate system 49 | - 144.33 50 | - -388.94 51 | - 1.93 52 | - 0.078 53 | - 174.18 54 | - 0.21 55 | plan_trajectory: # agent's planning trajectory 56 | - - 140. 57 | - -388 58 | - 87 59 | predicted_ego_pos: # agent's localization (x,y,z,roll,yaw,pitch) gained from GPS 60 | - 143.78 61 | - -388.94 62 | - 0.036 63 | - 0.080 64 | - -185.95 65 | - 0.18 66 | true_ego_pos: # agent's true localization 67 | - 143.83 68 | - -388.89 69 | - 0.032 70 | - 0.075 71 | - 174.18 72 | - 0.21 73 | vehicles: # the surrounding vehicles that have at least one LiDAR point hit from the agent 74 | 4796: # the id of the vehicle (i.e. 
object) 75 | angle: # roll, yaw, pitch under CARLA map coordinate system 76 | - 0.096 77 | - -177.86 78 | - 0.197 79 | center: # the relative position from bounding box center to the frontal axis of this vehicle 80 | - 0.0004 81 | - 0.0005 82 | - 0.71 83 | extent: # half length, width and height of the vehicle in meter 84 | - 2.45 85 | - 1.06 86 | - 0.75 87 | location: # x, y ,z position of the center in the frontal axis of the vehicle under CARLA map coordinate system 88 | - 158.55 89 | - -385.75 90 | - 0.032 91 | speed: 19.47 # vehicle's speed 92 | 4880: ... 93 | ``` 94 | 95 | -------------------------------------------------------------------------------- /v2xvit/data_utils/augmentor/augment_utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from v2xvit.utils import common_utils 4 | 5 | 6 | def random_flip_along_x(gt_boxes, points): 7 | """ 8 | Args: 9 | gt_boxes: (N, 7 + C), [x, y, z, dx, dy, dz, heading, [vx], [vy]] 10 | points: (M, 3 + C) 11 | Returns: 12 | """ 13 | enable = np.random.choice([False, True], replace=False, p=[0.5, 0.5]) 14 | if enable: 15 | gt_boxes[:, 1] = -gt_boxes[:, 1] 16 | gt_boxes[:, 6] = -gt_boxes[:, 6] 17 | points[:, 1] = -points[:, 1] 18 | 19 | if gt_boxes.shape[1] > 7: 20 | gt_boxes[:, 8] = -gt_boxes[:, 8] 21 | 22 | return gt_boxes, points 23 | 24 | 25 | def random_flip_along_y(gt_boxes, points): 26 | """ 27 | Args: 28 | gt_boxes: (N, 7 + C), [x, y, z, dx, dy, dz, heading, [vx], [vy]] 29 | points: (M, 3 + C) 30 | Returns: 31 | """ 32 | enable = np.random.choice([False, True], replace=False, p=[0.5, 0.5]) 33 | if enable: 34 | gt_boxes[:, 0] = -gt_boxes[:, 0] 35 | gt_boxes[:, 6] = -(gt_boxes[:, 6] + np.pi) 36 | points[:, 0] = -points[:, 0] 37 | 38 | if gt_boxes.shape[1] > 7: 39 | gt_boxes[:, 7] = -gt_boxes[:, 7] 40 | 41 | return gt_boxes, points 42 | 43 | 44 | def global_rotation(gt_boxes, points, rot_range): 45 | """ 46 | Args: 47 | gt_boxes: (N, 7 + C), [x, y, z, dx, dy, dz, heading, [vx], [vy]] 48 | points: (M, 3 + C), 49 | rot_range: [min, max] 50 | Returns: 51 | """ 52 | noise_rotation = np.random.uniform(rot_range[0], 53 | rot_range[1]) 54 | points = common_utils.rotate_points_along_z(points[np.newaxis, :, :], 55 | np.array([noise_rotation]))[0] 56 | 57 | gt_boxes[:, 0:3] = \ 58 | common_utils.rotate_points_along_z(gt_boxes[np.newaxis, :, 0:3], 59 | np.array([noise_rotation]))[0] 60 | gt_boxes[:, 6] += noise_rotation 61 | 62 | if gt_boxes.shape[1] > 7: 63 | gt_boxes[:, 7:9] = common_utils.rotate_points_along_z( 64 | np.hstack((gt_boxes[:, 7:9], np.zeros((gt_boxes.shape[0], 1))))[ 65 | np.newaxis, :, :], 66 | np.array([noise_rotation]))[0][:, 0:2] 67 | 68 | return gt_boxes, points 69 | 70 | 71 | def global_scaling(gt_boxes, points, scale_range): 72 | """ 73 | Args: 74 | gt_boxes: (N, 7), [x, y, z, dx, dy, dz, heading] 75 | points: (M, 3 + C), 76 | scale_range: [min, max] 77 | Returns: 78 | """ 79 | if scale_range[1] - scale_range[0] < 1e-3: 80 | return gt_boxes, points 81 | noise_scale = np.random.uniform(scale_range[0], scale_range[1]) 82 | points[:, :3] *= noise_scale 83 | gt_boxes[:, :6] *= noise_scale 84 | 85 | return gt_boxes, points 86 | -------------------------------------------------------------------------------- /v2xvit/utils/transformation_utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Transformation utils 3 | """ 4 | 5 | import numpy as np 6 | 7 | 8 | def x_to_world(pose): 9 | """ 10 | The transformation matrix from 
x-coordinate system to carla world system 11 | 12 | Parameters 13 | ---------- 14 | pose : list 15 | [x, y, z, roll, yaw, pitch] 16 | 17 | Returns 18 | ------- 19 | matrix : np.ndarray 20 | The transformation matrix. 21 | """ 22 | x, y, z, roll, yaw, pitch = pose[:] 23 | 24 | # used for rotation matrix 25 | c_y = np.cos(np.radians(yaw)) 26 | s_y = np.sin(np.radians(yaw)) 27 | c_r = np.cos(np.radians(roll)) 28 | s_r = np.sin(np.radians(roll)) 29 | c_p = np.cos(np.radians(pitch)) 30 | s_p = np.sin(np.radians(pitch)) 31 | 32 | matrix = np.identity(4) 33 | # translation matrix 34 | matrix[0, 3] = x 35 | matrix[1, 3] = y 36 | matrix[2, 3] = z 37 | 38 | # rotation matrix 39 | matrix[0, 0] = c_p * c_y 40 | matrix[0, 1] = c_y * s_p * s_r - s_y * c_r 41 | matrix[0, 2] = -c_y * s_p * c_r - s_y * s_r 42 | matrix[1, 0] = s_y * c_p 43 | matrix[1, 1] = s_y * s_p * s_r + c_y * c_r 44 | matrix[1, 2] = -s_y * s_p * c_r + c_y * s_r 45 | matrix[2, 0] = s_p 46 | matrix[2, 1] = -c_p * s_r 47 | matrix[2, 2] = c_p * c_r 48 | 49 | return matrix 50 | 51 | 52 | def x1_to_x2(x1, x2): 53 | """ 54 | Transformation matrix from x1 to x2. 55 | 56 | Parameters 57 | ---------- 58 | x1 : list 59 | The pose of x1 under world coordinates. 60 | x2 : list 61 | The pose of x2 under world coordinates. 62 | 63 | Returns 64 | ------- 65 | transformation_matrix : np.ndarray 66 | The transformation matrix. 67 | 68 | """ 69 | x1_to_world = x_to_world(x1) 70 | x2_to_world = x_to_world(x2) 71 | world_to_x2 = np.linalg.inv(x2_to_world) 72 | 73 | transformation_matrix = np.dot(world_to_x2, x1_to_world) 74 | return transformation_matrix 75 | 76 | 77 | def dist_to_continuous(p_dist, displacement_dist, res, downsample_rate): 78 | """ 79 | Convert points discretized format to continuous space for BEV representation. 80 | Parameters 81 | ---------- 82 | p_dist : numpy.array 83 | Points in discretized coorindates. 84 | 85 | displacement_dist : numpy.array 86 | Discretized coordinates of bottom left origin. 87 | 88 | res : float 89 | Discretization resolution. 90 | 91 | downsample_rate : int 92 | Dowmsamping rate. 93 | 94 | Returns 95 | ------- 96 | p_continuous : numpy.array 97 | Points in continuous coorindates. 98 | 99 | """ 100 | p_dist = np.copy(p_dist) 101 | p_dist = p_dist + displacement_dist 102 | p_continuous = p_dist * res * downsample_rate 103 | return p_continuous 104 | -------------------------------------------------------------------------------- /v2xvit/tools/inference_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | from collections import OrderedDict 3 | 4 | import numpy as np 5 | import torch 6 | 7 | from v2xvit.utils.common_utils import torch_tensor_to_numpy 8 | 9 | 10 | def inference_late_fusion(batch_data, model, dataset): 11 | """ 12 | Model inference for late fusion. 13 | 14 | Parameters 15 | ---------- 16 | batch_data : dict 17 | model : opencood.object 18 | dataset : opencood.LateFusionDataset 19 | 20 | Returns 21 | ------- 22 | pred_box_tensor : torch.Tensor 23 | The tensor of prediction bounding box after NMS. 24 | gt_box_tensor : torch.Tensor 25 | The tensor of gt bounding box. 
26 | """ 27 | output_dict = OrderedDict() 28 | 29 | for cav_id, cav_content in batch_data.items(): 30 | output_dict[cav_id] = model(cav_content) 31 | 32 | pred_box_tensor, pred_score, gt_box_tensor = \ 33 | dataset.post_process(batch_data, 34 | output_dict) 35 | 36 | return pred_box_tensor, pred_score, gt_box_tensor 37 | 38 | 39 | def inference_early_fusion(batch_data, model, dataset): 40 | """ 41 | Model inference for early fusion. 42 | 43 | Parameters 44 | ---------- 45 | batch_data : dict 46 | model : opencood.object 47 | dataset : opencood.EarlyFusionDataset 48 | 49 | Returns 50 | ------- 51 | pred_box_tensor : torch.Tensor 52 | The tensor of prediction bounding box after NMS. 53 | gt_box_tensor : torch.Tensor 54 | The tensor of gt bounding box. 55 | """ 56 | output_dict = OrderedDict() 57 | cav_content = batch_data['ego'] 58 | 59 | output_dict['ego'] = model(cav_content) 60 | 61 | pred_box_tensor, pred_score, gt_box_tensor = \ 62 | dataset.post_process(batch_data, 63 | output_dict) 64 | 65 | return pred_box_tensor, pred_score, gt_box_tensor 66 | 67 | 68 | def inference_intermediate_fusion(batch_data, model, dataset): 69 | """ 70 | Model inference for early fusion. 71 | 72 | Parameters 73 | ---------- 74 | batch_data : dict 75 | model : opencood.object 76 | dataset : opencood.EarlyFusionDataset 77 | 78 | Returns 79 | ------- 80 | pred_box_tensor : torch.Tensor 81 | The tensor of prediction bounding box after NMS. 82 | gt_box_tensor : torch.Tensor 83 | The tensor of gt bounding box. 84 | """ 85 | return inference_early_fusion(batch_data, model, dataset) 86 | 87 | 88 | def save_prediction_gt(pred_tensor, gt_tensor, pcd, timestamp, save_path): 89 | """ 90 | Save prediction and gt tensor to txt file. 91 | """ 92 | pred_np = torch_tensor_to_numpy(pred_tensor) 93 | gt_np = torch_tensor_to_numpy(gt_tensor) 94 | pcd_np = torch_tensor_to_numpy(pcd) 95 | 96 | np.save(os.path.join(save_path, '%04d_pcd.npy' % timestamp), pcd_np) 97 | np.save(os.path.join(save_path, '%04d_pred.npy' % timestamp), pred_np) 98 | np.save(os.path.join(save_path, '%04d_gt.npy' % timestamp), gt_np) 99 | -------------------------------------------------------------------------------- /v2xvit/hypes_yaml/point_pillar_early_fusion.yaml: -------------------------------------------------------------------------------- 1 | name: point_pillar_early_fusion 2 | root_dir: '/data/opv2v/train' 3 | validate_dir: '/data/opv2v/validate' 4 | yaml_parser: "load_point_pillar_params" 5 | 6 | wild_setting: 7 | async: false 8 | async_overhead: 60 9 | seed: 20 10 | loc_err: false 11 | xyz_std: 0.2 12 | ryp_std: 0.2 13 | 14 | train_params: 15 | batch_size: &batch_size 4 16 | epoches: 22 17 | eval_freq: 1 18 | save_freq: 1 19 | 20 | fusion: 21 | core_method: 'EarlyFusionDataset' # LateFusionDataset, EarlyFusionDataset, IntermediateFusionDataset supported 22 | args: [] 23 | 24 | # preprocess-related 25 | preprocess: 26 | # options: BasePreprocessor, VoxelPreprocessor, BevPreprocessor 27 | core_method: 'SpVoxelPreprocessor' 28 | args: 29 | voxel_size: &voxel_size [0.4, 0.4, 4] 30 | max_points_per_voxel: 32 31 | max_voxel_train: 32000 32 | max_voxel_test: 70000 33 | # lidar range for each individual cav. 
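# Format: [x_min, y_min, z_min, x_max, y_max, z_max] in meters ("xyzxyz minmax", as spelled out in point_pillar_when2com.yaml).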
34 | cav_lidar_range: &cav_lidar [-140.8, -40, -3, 140.8, 40, 1] 35 | 36 | data_augment: 37 | - NAME: random_world_flip 38 | ALONG_AXIS_LIST: [ 'x' ] 39 | 40 | - NAME: random_world_rotation 41 | WORLD_ROT_ANGLE: [ -0.78539816, 0.78539816 ] 42 | 43 | - NAME: random_world_scaling 44 | WORLD_SCALE_RANGE: [ 0.95, 1.05 ] 45 | 46 | # anchor box related 47 | postprocess: 48 | core_method: 'VoxelPostprocessor' # VoxelPostprocessor, BevPostprocessor supported 49 | anchor_args: 50 | cav_lidar_range: *cav_lidar 51 | l: 3.9 52 | w: 1.6 53 | h: 1.56 54 | r: [0, 90] 55 | num: &achor_num 2 56 | feature_stride: 4 57 | target_args: 58 | pos_threshold: 0.6 59 | neg_threshold: 0.45 60 | score_threshold: 0.20 61 | order: 'hwl' # hwl or lwh 62 | max_num: 100 # maximum number of objects in a single frame. use this number to make sure different frames has the same dimension in the same batch 63 | nms_thresh: 0.15 64 | 65 | # model related 66 | model: 67 | core_method: point_pillar 68 | args: 69 | voxel_size: *voxel_size 70 | lidar_range: *cav_lidar 71 | anchor_number: *achor_num 72 | pillar_vfe: 73 | use_norm: true 74 | with_distance: false 75 | use_absolute_xyz: true 76 | num_filters: [64] 77 | point_pillar_scatter: 78 | num_features: 64 79 | 80 | base_bev_backbone: 81 | layer_nums: [3, 5, 8] 82 | layer_strides: [2, 2, 2] 83 | num_filters: [64, 128, 256] 84 | upsample_strides: [1, 2, 4] 85 | num_upsample_filter: [128, 128, 128] 86 | 87 | shrink_header: 88 | kernal_size: [ 3 ] 89 | stride: [ 2 ] 90 | padding: [ 1 ] 91 | dim: [ 256 ] 92 | input_dim: 384 # 128 * 3 93 | 94 | cls_head_dim: 256 95 | 96 | anchor_num: *achor_num 97 | 98 | loss: 99 | core_method: point_pillar_loss 100 | args: 101 | cls_weight: 1.0 102 | reg: 2.0 103 | 104 | optimizer: 105 | core_method: Adam 106 | lr: 0.002 107 | args: 108 | eps: 1e-10 109 | weight_decay: 1e-4 110 | 111 | lr_scheduler: 112 | core_method: multistep #step, multistep and Exponential support 113 | gamma: 0.1 114 | step_size: [20, 30] 115 | 116 | -------------------------------------------------------------------------------- /docs/data_intro.md: -------------------------------------------------------------------------------- 1 | ## Data Introduction 2 | 3 | --- 4 | 5 | V2XSet data is structured as following: 6 | 7 | ```sh 8 | V2XSet 9 | ├── train # data for training 10 | │ ├── 2021_08_22_21_41_24 # scenario folder 11 | │ ├── data_protocol.yaml # the simulation parameters used to collect the data in Carla 12 | │ └── -1 # The infra's id 13 | │ └── 00000.pcd - 00700.pcd # the point clouds data from timestamp 0 to 700 14 | │ ├── 00000.yaml - 00700.yaml # corresponding metadata for each timestamp 15 | │ ├── 00000_camera0.png - 00700_camera0.png # frontal camera images 16 | │ ├── 00000_camera1.png - 00700_camera1.png # right rear camera images 17 | │ ├── 00000_camera2.png - 00700_camera2.png # left rear camera images 18 | │ └── 00000_camera3.png - 00700_camera3.png # back camera images 19 | | └── 112 # The connected vehicle id 20 | ├── validate 21 | ├── test 22 | ``` 23 | 24 | ### 1. Data Split 25 | OPV2V dataset can be divided into 4 different folders: `train`, `validation`, `test` 26 | - `train`: contains all training data 27 | - `validate`: used for validation during training 28 | - `test`: test set 29 | 30 | ### 2. Scenario Database 31 | V2XSet has 58 scenarios in total, where each of them contains data stream from different agents across different timestamps. 32 | Each scenario is named by the time it was gathered, e.g., `2021_08_22_21_41_24`. 33 | 34 | ### 3. 
Agent Contents 35 | Under each scenario folder, the data of every intelligent agent~(i.e. infrastructure or connected automated vehicle) appearing in the current scenario is saved in different folders. Each folder is named by the agent's unique id, e.g., 1732. A negative id means infrastructure. 36 | 37 | In each agent folder, data across different timestamps will be saved. Those timestamps are represented by five digits integers 38 | as the prefix of the filenames (e.g., 00700.pcd). There are three types of files inside the agent folders: LiDAR point clouds (`.pcd` files), camera images (`.png` files), and metadata (`.yaml` files). 39 | 40 | #### 3.1 Lidar point cloud 41 | The LiDAR data is saved with Open3d package and has a postfix ".pcd" in the name. 42 | 43 | #### 3.2 Camera images 44 | Each CAV and Infra is equipped with 4 RGB cameras to capture the 360 degree of view of the surrounding scene.`camera0`, `camera1`, `camera2`, and `camera3` represent the front, right rear, left rear, and back cameras respectively. 45 | 46 | #### 3.3 Data Annotation 47 | All the metadata is saved in yaml files. It records the following important information at the current timestamp: 48 | - **ego information**: Current ego pose with and without GPS noise under Carla world coordinates, ego speed in km/h, the LiDAR pose, and future planning trajectories. 49 | - **calibration**: The intrinsic matrix and extrinsic matrix from each camera to the LiDAR sensor. 50 | - **objects annotation**: The pose and velocity of each surrounding human driving vehicle that has at least one point hit by the agent's LiDAR sensor. See [data annotation section](data_annotation_tutorial.md) for more details. 51 | 52 | ### 4. Data Collection Protocol 53 | Besides agent contents, every scenario database also has a yaml file named `data_protocol.yaml`. 54 | This yaml file records the simulation configuration to collect the current scenario. 55 | 56 | -------------------------------------------------------------------------------- /v2xvit/hypes_yaml/point_pillar_late_fusion.yaml: -------------------------------------------------------------------------------- 1 | name: point_pillar_late_fusion 2 | root_dir: '/data/opv2v/train' 3 | validate_dir: '/data/opv2v/validate' 4 | yaml_parser: "load_point_pillar_params" 5 | 6 | wild_setting: 7 | async: false 8 | async_overhead: 100 9 | seed: 20 10 | loc_err: false 11 | xyz_std: 0.2 12 | ryp_std: 0.2 13 | 14 | noise_setting: 15 | add_noise: false 16 | args: 17 | pos_std: 0 18 | rot_std: 0 19 | pos_mean: 0 20 | rot_mean: 0 21 | 22 | 23 | train_params: 24 | batch_size: &batch_size 8 25 | epoches: 25 26 | eval_freq: 1 27 | save_freq: 1 28 | 29 | fusion: 30 | core_method: 'LateFusionDataset' # LateFusionDataset, EarlyFusionDataset, IntermediateFusionDataset supported 31 | args: [] 32 | 33 | # preprocess-related 34 | preprocess: 35 | # options: BasePreprocessor, VoxelPreprocessor, BevPreprocessor 36 | core_method: 'SpVoxelPreprocessor' 37 | args: 38 | voxel_size: &voxel_size [0.4, 0.4, 4] 39 | max_points_per_voxel: 32 40 | max_voxel_train: 16000 41 | max_voxel_test: 40000 42 | # lidar range for each individual cav. 
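# Note: late fusion runs the detector on each CAV independently (see inference_late_fusion), which is presumably why this config uses a +/-70.4 m x-range rather than the +/-140.8 m of the cooperative-fusion configs.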
43 | cav_lidar_range: &cav_lidar [-70.4, -40, -3, 70.4, 40, 1] 44 | 45 | data_augment: 46 | - NAME: random_world_flip 47 | ALONG_AXIS_LIST: [ 'x' ] 48 | 49 | - NAME: random_world_rotation 50 | WORLD_ROT_ANGLE: [ -0.78539816, 0.78539816 ] 51 | 52 | - NAME: random_world_scaling 53 | WORLD_SCALE_RANGE: [ 0.95, 1.05 ] 54 | 55 | # anchor box related 56 | postprocess: 57 | core_method: 'VoxelPostprocessor' # VoxelPostprocessor, BevPostprocessor supported 58 | anchor_args: 59 | cav_lidar_range: *cav_lidar 60 | l: 3.9 61 | w: 1.6 62 | h: 1.56 63 | r: [0, 90] 64 | feature_stride: 4 65 | num: &achor_num 2 66 | target_args: 67 | pos_threshold: 0.6 68 | neg_threshold: 0.45 69 | score_threshold: 0.20 70 | order: 'hwl' # hwl or lwh 71 | max_num: 100 # maximum number of objects in a single frame. use this number to make sure different frames has the same dimension in the same batch 72 | nms_thresh: 0.15 73 | 74 | # model related 75 | model: 76 | core_method: point_pillar 77 | args: 78 | voxel_size: *voxel_size 79 | lidar_range: *cav_lidar 80 | anchor_number: *achor_num 81 | pillar_vfe: 82 | use_norm: true 83 | with_distance: false 84 | use_absolute_xyz: true 85 | num_filters: [64] 86 | point_pillar_scatter: 87 | num_features: 64 88 | 89 | base_bev_backbone: 90 | layer_nums: [3, 5, 8] 91 | layer_strides: [2, 2, 2] 92 | num_filters: [64, 128, 256] 93 | upsample_strides: [1, 2, 4] 94 | num_upsample_filter: [128, 128, 128] 95 | 96 | shrink_header: 97 | kernal_size: [ 3 ] 98 | stride: [ 2 ] 99 | padding: [ 1 ] 100 | dim: [ 256 ] 101 | input_dim: 384 # 128 * 3 102 | 103 | cls_head_dim: 256 104 | 105 | anchor_num: *achor_num 106 | 107 | loss: 108 | core_method: point_pillar_loss 109 | args: 110 | cls_weight: 1.0 111 | reg: 2.0 112 | 113 | optimizer: 114 | core_method: Adam 115 | lr: 0.002 116 | args: 117 | eps: 1e-10 118 | weight_decay: 1e-4 119 | 120 | lr_scheduler: 121 | core_method: multistep #step, multistep and Exponential support 122 | gamma: 0.1 123 | step_size: [20, 30] 124 | 125 | -------------------------------------------------------------------------------- /v2xvit/loss/voxel_net_loss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | 6 | class VoxelNetLoss(nn.Module): 7 | def __init__(self, args): 8 | super(VoxelNetLoss, self).__init__() 9 | self.smoothl1loss = nn.SmoothL1Loss(size_average=False) 10 | self.alpha = args['alpha'] 11 | self.beta = args['beta'] 12 | self.reg_coe = args['reg'] 13 | self.loss_dict = {} 14 | 15 | def forward(self, output_dict, target_dict): 16 | """ 17 | Parameters 18 | ---------- 19 | output_dict : dict 20 | target_dict : dict 21 | """ 22 | rm = output_dict['rm'] 23 | psm = output_dict['psm'] 24 | 25 | pos_equal_one = target_dict['pos_equal_one'] 26 | neg_equal_one = target_dict['neg_equal_one'] 27 | targets = target_dict['targets'] 28 | 29 | p_pos = F.sigmoid(psm.permute(0, 2, 3, 1)) 30 | rm = rm.permute(0, 2, 3, 1).contiguous() 31 | rm = rm.view(rm.size(0), rm.size(1), rm.size(2), -1, 7) 32 | targets = targets.view(targets.size(0), targets.size(1), 33 | targets.size(2), -1, 7) 34 | pos_equal_one_for_reg = pos_equal_one.unsqueeze( 35 | pos_equal_one.dim()).expand(-1, -1, -1, -1, 7) 36 | 37 | rm_pos = rm * pos_equal_one_for_reg 38 | targets_pos = targets * pos_equal_one_for_reg 39 | 40 | cls_pos_loss = -pos_equal_one * torch.log(p_pos + 1e-6) 41 | cls_pos_loss = cls_pos_loss.sum() / (pos_equal_one.sum() + 1e-6) 42 | 43 | cls_neg_loss = -neg_equal_one * 
torch.log(1 - p_pos + 1e-6) 44 | cls_neg_loss = cls_neg_loss.sum() / (neg_equal_one.sum() + 1e-6) 45 | 46 | reg_loss = self.smoothl1loss(rm_pos, targets_pos) 47 | reg_loss = reg_loss / (pos_equal_one.sum() + 1e-6) 48 | conf_loss = self.alpha * cls_pos_loss + self.beta * cls_neg_loss 49 | 50 | total_loss = self.reg_coe * reg_loss + conf_loss 51 | 52 | self.loss_dict.update({'total_loss': total_loss, 53 | 'reg_loss': reg_loss, 54 | 'conf_loss': conf_loss}) 55 | 56 | return total_loss 57 | 58 | def logging(self, epoch, batch_id, batch_len, writer): 59 | """ 60 | Print out the loss function for current iteration. 61 | 62 | Parameters 63 | ---------- 64 | epoch : int 65 | Current epoch for training. 66 | batch_id : int 67 | The current batch. 68 | batch_len : int 69 | Total batch length in one iteration of training, 70 | writer : SummaryWriter 71 | Used to visualize on tensorboard 72 | """ 73 | total_loss = self.loss_dict['total_loss'] 74 | reg_loss = self.loss_dict['reg_loss'] 75 | conf_loss = self.loss_dict['conf_loss'] 76 | 77 | print("[epoch %d][%d/%d], || Loss: %.4f || Conf Loss: %.4f" 78 | " || Loc Loss: %.4f" % ( 79 | epoch, batch_id + 1, batch_len, 80 | total_loss.item(), conf_loss.item(), reg_loss.item())) 81 | 82 | writer.add_scalar('Regression_loss', reg_loss.item(), 83 | epoch*batch_len + batch_id) 84 | writer.add_scalar('Confidence_loss', conf_loss.item(), 85 | epoch*batch_len + batch_id) 86 | -------------------------------------------------------------------------------- /v2xvit/hypes_yaml/point_pillar_cobevt.yaml: -------------------------------------------------------------------------------- 1 | name: corpbevtlidar 2 | root_dir: '/data/opv2v/train' 3 | validate_dir: '/data/opv2v/validate' 4 | 5 | wild_setting: 6 | async: false 7 | async_overhead: 100 8 | seed: 20 9 | loc_err: false 10 | xyz_std: 0.2 11 | ryp_std: 0.2 12 | data_size: 1.06 # Mb!! 13 | transmission_speed: 27 # Mbps!! 14 | backbone_delay: 10 # ms 15 | 16 | yaml_parser: "load_point_pillar_params" 17 | train_params: 18 | batch_size: &batch_size 2 19 | epoches: &epoches 90 20 | eval_freq: 2 21 | save_freq: 1 22 | max_cav: &max_cav 5 23 | 24 | fusion: 25 | core_method: 'IntermediateFusionDataset' # LateFusionDataset, EarlyFusionDataset, IntermediateFusionDataset supported 26 | args: 27 | cur_ego_pose_flag: true 28 | 29 | # preprocess-related 30 | preprocess: 31 | # options: BasePreprocessor, VoxelPreprocessor, BevPreprocessor 32 | core_method: 'SpVoxelPreprocessor' 33 | args: 34 | voxel_size: &voxel_size [0.4, 0.4, 4] 35 | max_points_per_voxel: 32 36 | max_voxel_train: 32000 37 | max_voxel_test: 70000 38 | # lidar range for each individual cav. 39 | cav_lidar_range: &cav_lidar [-140.8, -38.4, -3, 140.8, 38.4, 1] 40 | 41 | data_augment: 42 | - NAME: random_world_flip 43 | ALONG_AXIS_LIST: [ 'x' ] 44 | 45 | - NAME: random_world_rotation 46 | WORLD_ROT_ANGLE: [ -0.78539816, 0.78539816 ] 47 | 48 | - NAME: random_world_scaling 49 | WORLD_SCALE_RANGE: [ 0.95, 1.05 ] 50 | 51 | # anchor box related 52 | postprocess: 53 | core_method: 'VoxelPostprocessor' # VoxelPostprocessor, BevPostprocessor supported 54 | anchor_args: 55 | cav_lidar_range: *cav_lidar 56 | l: 3.9 57 | w: 1.6 58 | h: 1.56 59 | r: [0, 90] 60 | feature_stride: 4 61 | num: &achor_num 2 62 | target_args: 63 | pos_threshold: 0.6 64 | neg_threshold: 0.45 65 | score_threshold: 0.20 66 | order: 'hwl' # hwl or lwh 67 | max_num: 100 # maximum number of objects in a single frame. 
use this number to make sure different frames has the same dimension in the same batch 68 | nms_thresh: 0.15 69 | 70 | # model related 71 | model: 72 | core_method: point_pillar_cobevt 73 | args: 74 | voxel_size: *voxel_size 75 | lidar_range: *cav_lidar 76 | anchor_number: *achor_num 77 | max_cav: *max_cav 78 | compression: 0 # compression rate 79 | backbone_fix: false 80 | 81 | pillar_vfe: 82 | use_norm: true 83 | with_distance: false 84 | use_absolute_xyz: true 85 | num_filters: [64] 86 | point_pillar_scatter: 87 | num_features: 64 88 | 89 | base_bev_backbone: 90 | layer_nums: [3, 5, 8] 91 | layer_strides: [2, 2, 2] 92 | num_filters: [64, 128, 256] 93 | upsample_strides: [1, 2, 4] 94 | num_upsample_filter: [128, 128, 128] 95 | shrink_header: 96 | kernal_size: [3] 97 | stride: [2] 98 | padding: [1] 99 | dim: [256] 100 | input_dim: 384 # 128 * 3 101 | 102 | fax_fusion: 103 | input_dim: 256 104 | mlp_dim: 256 105 | agent_size: *max_cav 106 | window_size: 4 107 | dim_head: 32 108 | drop_out: 0.1 109 | depth: 3 110 | mask: true 111 | 112 | 113 | # add decoder later 114 | 115 | loss: 116 | core_method: point_pillar_loss 117 | args: 118 | cls_weight: 1.0 119 | reg: 2.0 120 | 121 | optimizer: 122 | core_method: Adam 123 | lr: 0.001 124 | args: 125 | eps: 1e-10 126 | weight_decay: 1e-4 127 | 128 | lr_scheduler: 129 | core_method: multistep #step, multistep and Exponential support 130 | gamma: 0.1 131 | step_size: [15, 50] 132 | 133 | -------------------------------------------------------------------------------- /v2xvit/hypes_yaml/point_pillar_fcooper.yaml: -------------------------------------------------------------------------------- 1 | name: point_pillar_fcooper 2 | root_dir: 'v2xset/train' 3 | validate_dir: 'v2xset/validate' 4 | wild_setting: 5 | async: false 6 | async_overhead: 100 7 | seed: 20 8 | loc_err: false 9 | xyz_std: 0.2 10 | ryp_std: 0.2 11 | data_size: 1.06 # Mb!! 12 | transmission_speed: 27 # Mbps!! 13 | backbone_delay: 10 # ms 14 | 15 | yaml_parser: "load_point_pillar_params" 16 | train_params: 17 | batch_size: &batch_size 4 18 | epoches: 60 19 | eval_freq: 1 20 | save_freq: 1 21 | max_cav: &max_cav 5 22 | 23 | fusion: 24 | core_method: 'IntermediateFusionDataset' # LateFusionDataset, EarlyFusionDataset, IntermediateFusionDataset supported 25 | args: 26 | cur_ego_pose_flag: True 27 | # when the cur_ego_pose_flag is set to True, there is no time gap 28 | # between the time when the LiDAR data is captured by connected 29 | # agents and when the extracted features are received by 30 | # the ego vehicle, which is equal to implement STCM. When set to False, 31 | # STCM has to be used. 32 | 33 | 34 | # preprocess-related 35 | preprocess: 36 | # options: BasePreprocessor, VoxelPreprocessor, BevPreprocessor 37 | core_method: 'SpVoxelPreprocessor' 38 | args: 39 | voxel_size: &voxel_size [0.4, 0.4, 4] 40 | max_points_per_voxel: 32 41 | max_voxel_train: 32000 42 | max_voxel_test: 70000 43 | # lidar range for each individual cav. 
44 | cav_lidar_range: &cav_lidar [-140.8, -38.4, -3, 140.8, 38.4, 1] 45 | 46 | data_augment: 47 | - NAME: random_world_flip 48 | ALONG_AXIS_LIST: [ 'x' ] 49 | 50 | - NAME: random_world_rotation 51 | WORLD_ROT_ANGLE: [ -0.78539816, 0.78539816 ] 52 | 53 | - NAME: random_world_scaling 54 | WORLD_SCALE_RANGE: [ 0.95, 1.05 ] 55 | 56 | # anchor box related 57 | postprocess: 58 | core_method: 'VoxelPostprocessor' # VoxelPostprocessor, BevPostprocessor supported 59 | anchor_args: 60 | cav_lidar_range: *cav_lidar 61 | l: 3.9 62 | w: 1.6 63 | h: 1.56 64 | r: [0, 90] 65 | feature_stride: 4 66 | num: &achor_num 2 67 | target_args: 68 | pos_threshold: 0.6 69 | neg_threshold: 0.45 70 | score_threshold: 0.20 71 | order: 'hwl' # hwl or lwh 72 | max_num: 100 # maximum number of objects in a single frame. use this number to make sure different frames has the same dimension in the same batch 73 | nms_thresh: 0.15 74 | 75 | # model related 76 | model: 77 | core_method: point_pillar_fcooper 78 | args: 79 | voxel_size: *voxel_size 80 | lidar_range: *cav_lidar 81 | anchor_number: *achor_num 82 | max_cav: *max_cav 83 | compression: 0 # compression rate 84 | backbone_fix: false 85 | 86 | pillar_vfe: 87 | use_norm: true 88 | with_distance: false 89 | use_absolute_xyz: true 90 | num_filters: [64] 91 | point_pillar_scatter: 92 | num_features: 64 93 | 94 | base_bev_backbone: 95 | layer_nums: [3, 5, 8] 96 | layer_strides: [2, 2, 2] 97 | num_filters: [64, 128, 256] 98 | upsample_strides: [1, 2, 4] 99 | num_upsample_filter: [128, 128, 128] 100 | shrink_header: 101 | kernal_size: [3] 102 | stride: [2] 103 | padding: [1] 104 | dim: [256] 105 | input_dim: 384 # 128 * 3 106 | 107 | # add decoder later 108 | 109 | loss: 110 | core_method: point_pillar_loss 111 | args: 112 | cls_weight: 1.0 113 | reg: 2.0 114 | 115 | optimizer: 116 | core_method: Adam 117 | lr: 0.001 118 | args: 119 | eps: 1e-10 120 | weight_decay: 1e-4 121 | 122 | lr_scheduler: 123 | core_method: multistep #step, multistep and Exponential support 124 | gamma: 0.1 125 | step_size: [15, 50] 126 | 127 | -------------------------------------------------------------------------------- /v2xvit/models/sub_modules/psm_mask.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torch 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | class Communication(nn.Module): 7 | def __init__(self, args): 8 | super(Communication, self).__init__() 9 | 10 | self.smooth = False 11 | self.thre = args['thre'] 12 | # if 'gaussian_smooth' in args: 13 | # # Gaussian Smooth 14 | # self.smooth = True 15 | # kernel_size = args['gaussian_smooth']['k_size'] 16 | # c_sigma = args['gaussian_smooth']['c_sigma'] 17 | # self.gaussian_filter = nn.Conv2d(1, 1, kernel_size=kernel_size, 18 | # stride=1, 19 | # padding=(kernel_size - 1) // 2) 20 | # self.init_gaussian_filter(kernel_size, c_sigma) 21 | # self.gaussian_filter.requires_grad = False 22 | 23 | def init_gaussian_filter(self, k_size=5, sigma=1): 24 | def _gen_gaussian_kernel(k_size=5, sigma=1): 25 | center = k_size // 2 26 | x, y = np.mgrid[0 - center: k_size - center, 27 | 0 - center: k_size - center] 28 | g = 1 / (2 * np.pi * sigma) * np.exp( 29 | -(np.square(x) + np.square(y)) / (2 * np.square(sigma))) 30 | return g 31 | 32 | gaussian_kernel = _gen_gaussian_kernel(k_size, sigma) 33 | self.gaussian_filter.weight.data = torch.Tensor(gaussian_kernel).to( 34 | self.gaussian_filter.weight.device).unsqueeze(0).unsqueeze(0) 35 | 
self.gaussian_filter.bias.data.zero_() 36 | 37 | def forward(self, batch_confidence_maps, record_len): 38 | # batch_confidence_maps:[(L1, H, W), (L2, H, W), ...] 39 | # pairwise_t_matrix: (B,L,L,2,3) 40 | # thre: threshold of objectiveness 41 | # a_ji = (1 - q_i)*q_ji 42 | B = len(record_len) 43 | _, _, H, W = batch_confidence_maps[0].shape 44 | 45 | communication_masks = [] 46 | communication_rates = [] 47 | batch_communication_maps = [] 48 | for b in range(B): 49 | # number of valid agent 50 | N = record_len[b] 51 | 52 | # 在通道方向取max 53 | ori_communication_maps = \ 54 | batch_confidence_maps[b].sigmoid().max(dim=1)[0].unsqueeze(1) # dim1=2 represents the confidence of two anchors 55 | 56 | if self.smooth: 57 | communication_maps = self.gaussian_filter(ori_communication_maps) 58 | else: 59 | communication_maps = ori_communication_maps 60 | 61 | ones_mask = torch.ones_like(communication_maps).to(communication_maps.device) 62 | zeros_mask = torch.zeros_like(communication_maps).to(communication_maps.device) 63 | communication_mask = torch.where(communication_maps > self.thre,ones_mask, zeros_mask) 64 | # 符合thre的部分占有的比例 65 | communication_rate = communication_mask[0].sum()/(H * W) 66 | 67 | ones_mask = torch.ones_like(communication_mask).to( 68 | communication_mask.device) 69 | # communication_mask_nodiag[::2] = ones_mask[::2] 70 | 71 | communication_masks.append(communication_mask) 72 | communication_rates.append(communication_rate) 73 | batch_communication_maps.append( 74 | ori_communication_maps * communication_mask) 75 | communication_rates = sum(communication_rates) / B 76 | # communication_masks = torch.cat(communication_masks, dim=0) 77 | return batch_communication_maps, communication_masks, 78 | 79 | 80 | # def save_mask_0(mask, i, cnt): 81 | # plt.imsave('/data2/gjm/tmp/pi/'+str(cnt)+'_'+str(i)+'.png', mask) -------------------------------------------------------------------------------- /v2xvit/hypes_yaml/point_pillar_v2vnet.yaml: -------------------------------------------------------------------------------- 1 | name: point_pillar_v2vnet 2 | root_dir: 'v2xset/train' 3 | validate_dir: 'v2xset/validate' 4 | 5 | wild_setting: 6 | async: false 7 | async_overhead: 100 8 | seed: 20 9 | loc_err: false 10 | xyz_std: 0.2 11 | ryp_std: 0.2 12 | data_size: 1.06 # Mb!! 13 | transmission_speed: 27 # Mbps!! 14 | backbone_delay: 10 # ms 15 | 16 | yaml_parser: "load_point_pillar_params" 17 | train_params: 18 | batch_size: &batch_size 4 19 | epoches: 60 20 | eval_freq: 1 21 | save_freq: 1 22 | max_cav: &max_cav 5 23 | 24 | fusion: 25 | core_method: 'IntermediateFusionDataset' # LateFusionDataset, EarlyFusionDataset, IntermediateFusionDataset supported 26 | args: 27 | cur_ego_pose_flag: True 28 | 29 | # preprocess-related 30 | preprocess: 31 | # options: BasePreprocessor, VoxelPreprocessor, BevPreprocessor 32 | core_method: 'SpVoxelPreprocessor' 33 | args: 34 | voxel_size: &voxel_size [0.4, 0.4, 4] 35 | max_points_per_voxel: 32 36 | max_voxel_train: 32000 37 | max_voxel_test: 70000 38 | # lidar range for each individual cav. 
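# With the 0.4 m voxel_size above, the range below gives a 704 x 192 BEV grid ((2*140.8)/0.4 by (2*38.4)/0.4); after the 4x downsample in v2vfusion this matches the conv_gru H: 48, W: 176 set further down.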
39 | cav_lidar_range: &cav_lidar [-140.8, -38.4, -3, 140.8, 38.4, 1] 40 | 41 | data_augment: 42 | - NAME: random_world_flip 43 | ALONG_AXIS_LIST: [ 'x' ] 44 | 45 | - NAME: random_world_rotation 46 | WORLD_ROT_ANGLE: [ -0.78539816, 0.78539816 ] 47 | 48 | - NAME: random_world_scaling 49 | WORLD_SCALE_RANGE: [ 0.95, 1.05 ] 50 | 51 | # anchor box related 52 | postprocess: 53 | core_method: 'VoxelPostprocessor' # VoxelPostprocessor, BevPostprocessor supported 54 | anchor_args: 55 | cav_lidar_range: *cav_lidar 56 | l: 3.9 57 | w: 1.6 58 | h: 1.56 59 | r: [0, 90] 60 | feature_stride: 4 61 | num: &achor_num 2 62 | target_args: 63 | pos_threshold: 0.6 64 | neg_threshold: 0.45 65 | score_threshold: 0.20 66 | order: 'hwl' # hwl or lwh 67 | max_num: 100 # maximum number of objects in a single frame. use this number to make sure different frames has the same dimension in the same batch 68 | nms_thresh: 0.15 69 | 70 | # model related 71 | model: 72 | core_method: point_pillar_v2vnet 73 | args: 74 | voxel_size: *voxel_size 75 | lidar_range: *cav_lidar 76 | anchor_number: *achor_num 77 | max_cav: *max_cav 78 | compression: 0 # compression rate 79 | backbone_fix: false 80 | 81 | pillar_vfe: 82 | use_norm: true 83 | with_distance: false 84 | use_absolute_xyz: true 85 | num_filters: [64] 86 | point_pillar_scatter: 87 | num_features: 64 88 | 89 | base_bev_backbone: 90 | layer_nums: [3, 5, 8] 91 | layer_strides: [2, 2, 2] 92 | num_filters: [64, 128, 256] 93 | upsample_strides: [1, 2, 4] 94 | num_upsample_filter: [128, 128, 128] 95 | shrink_header: 96 | kernal_size: [3] 97 | stride: [2] 98 | padding: [1] 99 | dim: [256] 100 | input_dim: 384 # 128 * 3 101 | 102 | v2vfusion: 103 | use_temporal_encoding: true 104 | voxel_size: *voxel_size 105 | downsample_rate: 4 106 | num_iteration: 3 107 | in_channels: 256 108 | gru_flag: false 109 | agg_operator: "avg" # max or avg 110 | conv_gru: 111 | H: 48 112 | W: 176 113 | num_layers: 1 114 | kernel_size: [[3,3]] 115 | 116 | 117 | # add decoder later 118 | 119 | loss: 120 | core_method: point_pillar_loss 121 | args: 122 | cls_weight: 1.0 123 | reg: 2.0 124 | 125 | optimizer: 126 | core_method: Adam 127 | lr: 0.001 128 | args: 129 | eps: 1e-10 130 | weight_decay: 1e-4 131 | 132 | lr_scheduler: 133 | core_method: multistep #step, multistep and Exponential support 134 | gamma: 0.1 135 | step_size: [15, 50] 136 | 137 | -------------------------------------------------------------------------------- /v2xvit/hypes_yaml/point_pillar_when2com.yaml: -------------------------------------------------------------------------------- 1 | name: opv2v_when2comm 2 | root_dir: '/data/opv2v/train' 3 | validate_dir: '/data/opv2v/validate' 4 | 5 | wild_setting: 6 | async: false 7 | async_overhead: 100 8 | seed: 20 9 | loc_err: True 10 | xyz_std: 0.2 11 | ryp_std: 0.2 12 | # data_size: 1.06 # Mb!! 13 | # transmission_speed: 27 # Mbps!! 
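# (when async is enabled, data_size / transmission_speed plus backbone_delay roughly models the simulated feature-transmission latency; all three stay commented out in this config)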
14 | # backbone_delay: 10 # ms 15 | 16 | noise_setting: 17 | add_noise: false 18 | args: 19 | pos_std: 0 20 | rot_std: 0 21 | pos_mean: 0 22 | rot_mean: 0 23 | 24 | yaml_parser: "load_point_pillar_params" 25 | train_params: 26 | batch_size: &batch_size 1 27 | epoches: 60 28 | eval_freq: 60 29 | save_freq: 2 30 | max_cav: &max_cav 5 31 | 32 | 33 | fusion: 34 | core_method: 'IntermediateFusionDataset' # LateFusionDataset, EarlyFusionDataset, IntermediateFusionDataset supported 35 | args: 36 | cur_ego_pose_flag: true 37 | 38 | # preprocess-related 39 | preprocess: 40 | # options: BasePreprocessor, VoxelPreprocessor, BevPreprocessor 41 | core_method: 'SpVoxelPreprocessor' 42 | args: 43 | voxel_size: &voxel_size [0.4, 0.4, 4] 44 | max_points_per_voxel: 32 45 | max_voxel_train: 32000 46 | max_voxel_test: 70000 47 | # lidar range for each individual cav. Format: xyzxyz minmax 48 | cav_lidar_range: &cav_lidar [-140.8, -40, -3, 140.8, 40, 1] 49 | 50 | data_augment: 51 | - NAME: random_world_flip 52 | ALONG_AXIS_LIST: [ 'x' ] 53 | 54 | - NAME: random_world_rotation 55 | WORLD_ROT_ANGLE: [ -0.78539816, 0.78539816 ] 56 | 57 | - NAME: random_world_scaling 58 | WORLD_SCALE_RANGE: [ 0.95, 1.05 ] 59 | 60 | # anchor box related 61 | postprocess: 62 | core_method: 'VoxelPostprocessor' # VoxelPostprocessor, BevPostprocessor supported 63 | gt_range: *cav_lidar 64 | anchor_args: 65 | cav_lidar_range: *cav_lidar 66 | l: 3.9 67 | w: 1.6 68 | h: 1.56 69 | r: [0, 90] 70 | feature_stride: 2 71 | num: &achor_num 2 72 | target_args: 73 | pos_threshold: 0.6 74 | neg_threshold: 0.45 75 | score_threshold: 0.2 76 | order: 'hwl' # hwl or lwh 77 | max_num: 100 # maximum number of objects in a single frame. use this number to make sure different frames has the same dimension in the same batch 78 | nms_thresh: 0.15 79 | 80 | # model related 81 | 82 | model: 83 | core_method: point_pillar_when2com 84 | args: 85 | voxel_size: *voxel_size 86 | lidar_range: *cav_lidar 87 | anchor_number: *achor_num 88 | max_cav: *max_cav 89 | compression: 0 # compression rate 90 | backbone_fix: false 91 | 92 | pillar_vfe: 93 | use_norm: true 94 | with_distance: false 95 | use_absolute_xyz: true 96 | num_filters: [64] 97 | point_pillar_scatter: 98 | num_features: 64 99 | 100 | base_bev_backbone: 101 | layer_nums: [3, 5, 8] 102 | layer_strides: [2, 2, 2] 103 | num_filters: [64, 128, 256] 104 | upsample_strides: [1, 2, 4] 105 | num_upsample_filter: [128, 128, 128] 106 | shrink_header: 107 | kernal_size: [ 3 ] 108 | stride: [ 1 ] 109 | padding: [ 1 ] 110 | dim: [ 256 ] 111 | input_dim: 384 # 128 * 3 112 | 113 | v2vfusion: 114 | voxel_size: *voxel_size 115 | downsample_rate: 2 116 | in_channels: 256 117 | H: 100 118 | W: 352 119 | query_size: 32 120 | key_size: 256 121 | mode: 'softmax' 122 | 123 | loss: 124 | core_method: point_pillar_loss 125 | args: 126 | cls_weight: 1.0 127 | reg: 2.0 128 | 129 | optimizer: 130 | core_method: Adam 131 | lr: 0.001 132 | args: 133 | eps: 1e-10 134 | weight_decay: 1e-4 135 | 136 | lr_scheduler: 137 | core_method: multistep #step, multistep and Exponential support 138 | gamma: 0.1 139 | step_size: [10, 30, 50] 140 | -------------------------------------------------------------------------------- /v2xvit/hypes_yaml/point_pillar_single.yaml: -------------------------------------------------------------------------------- 1 | name: point_pillar_single 2 | root_dir: '/data/opv2v/train' 3 | validate_dir: '/data/opv2v/validate' 4 | wild_setting: 5 | async: false 6 | async_overhead: 100 7 | seed: 20 8 | loc_err: false 9 | 
xyz_std: 0.2 10 | ryp_std: 0.2 11 | data_size: 1.06 # Mb!! 12 | transmission_speed: 27 # Mbps!! 13 | backbone_delay: 10 # ms 14 | 15 | noise_setting: 16 | add_noise: false 17 | args: 18 | pos_std: 1 19 | rot_std: 0 20 | pos_mean: 0 21 | rot_mean: 0 22 | 23 | yaml_parser: "load_point_pillar_params" 24 | train_params: 25 | batch_size: &batch_size 4 26 | epoches: 60 27 | eval_freq: 1 28 | save_freq: 1 29 | max_cav: &max_cav 5 30 | 31 | fusion: 32 | core_method: 'IntermediateFusionDataset' # LateFusionDataset, EarlyFusionDataset, IntermediateFusionDataset supported 33 | args: 34 | cur_ego_pose_flag: True 35 | # when the cur_ego_pose_flag is set to True, there is no time gap 36 | # between the time when the LiDAR data is captured by connected 37 | # agents and when the extracted features are received by 38 | # the ego vehicle, which is equal to implement STCM. When set to False, 39 | # STCM has to be used. 40 | 41 | 42 | # preprocess-related 43 | preprocess: 44 | # options: BasePreprocessor, VoxelPreprocessor, BevPreprocessor 45 | core_method: 'SpVoxelPreprocessor' 46 | args: 47 | voxel_size: &voxel_size [0.4, 0.4, 4] 48 | max_points_per_voxel: 32 49 | max_voxel_train: 32000 50 | max_voxel_test: 70000 51 | # lidar range for each individual cav. 52 | cav_lidar_range: &cav_lidar [-140.8, -38.4, -3, 140.8, 38.4, 1] 53 | 54 | data_augment: 55 | - NAME: random_world_flip 56 | ALONG_AXIS_LIST: [ 'x' ] 57 | 58 | - NAME: random_world_rotation 59 | WORLD_ROT_ANGLE: [ -0.78539816, 0.78539816 ] 60 | 61 | - NAME: random_world_scaling 62 | WORLD_SCALE_RANGE: [ 0.95, 1.05 ] 63 | 64 | # anchor box related 65 | postprocess: 66 | core_method: 'VoxelPostprocessor' # VoxelPostprocessor, BevPostprocessor supported 67 | anchor_args: 68 | cav_lidar_range: *cav_lidar 69 | l: 3.9 70 | w: 1.6 71 | h: 1.56 72 | r: [0, 90] 73 | feature_stride: 4 74 | num: &achor_num 2 75 | target_args: 76 | pos_threshold: 0.6 77 | neg_threshold: 0.45 78 | score_threshold: 0.20 79 | order: 'hwl' # hwl or lwh 80 | max_num: 100 # maximum number of objects in a single frame. 
use this number to make sure different frames has the same dimension in the same batch 81 | nms_thresh: 0.15 82 | 83 | # model related 84 | model: 85 | core_method: point_pillar_single 86 | args: 87 | voxel_size: *voxel_size 88 | lidar_range: *cav_lidar 89 | anchor_number: *achor_num 90 | max_cav: *max_cav 91 | compression: 0 # compression rate 92 | backbone_fix: false 93 | 94 | pillar_vfe: 95 | use_norm: true 96 | with_distance: false 97 | use_absolute_xyz: true 98 | num_filters: [64] 99 | point_pillar_scatter: 100 | num_features: 64 101 | 102 | base_bev_backbone: 103 | layer_nums: [3, 5, 8] 104 | layer_strides: [2, 2, 2] 105 | num_filters: [64, 128, 256] 106 | upsample_strides: [1, 2, 4] 107 | num_upsample_filter: [128, 128, 128] 108 | shrink_header: 109 | kernal_size: [3] 110 | stride: [2] 111 | padding: [1] 112 | dim: [256] 113 | input_dim: 384 # 128 * 3 114 | 115 | # add decoder later 116 | 117 | loss: 118 | core_method: point_pillar_loss 119 | args: 120 | cls_weight: 1.0 121 | reg: 2.0 122 | 123 | optimizer: 124 | core_method: Adam 125 | lr: 0.001 126 | args: 127 | eps: 1e-10 128 | weight_decay: 1e-4 129 | 130 | lr_scheduler: 131 | core_method: multistep #step, multistep and Exponential support 132 | gamma: 0.1 133 | step_size: [15, 50] 134 | 135 | -------------------------------------------------------------------------------- /v2xvit/loss/pixor_loss.py: -------------------------------------------------------------------------------- 1 | from functools import reduce 2 | 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | 7 | 8 | class PixorLoss(nn.Module): 9 | def __init__(self, args): 10 | super(PixorLoss, self).__init__() 11 | self.alpha = args["alpha"] 12 | self.beta = args["beta"] 13 | self.loss_dict = {} 14 | 15 | def forward(self, output_dict, target_dict): 16 | """ 17 | Compute loss for pixor network 18 | Parameters 19 | ---------- 20 | output_dict : dict 21 | The dictionary that contains the output. 22 | 23 | target_dict : dict 24 | The dictionary that contains the target. 25 | 26 | Returns 27 | ------- 28 | total_loss : torch.Tensor 29 | Total loss. 30 | 31 | """ 32 | targets = target_dict["label_map"] 33 | cls_preds, loc_preds = output_dict["cls"], output_dict["reg"] 34 | 35 | cls_targets, loc_targets = targets.split([1, 6], dim=1) 36 | pos_count = cls_targets.sum() 37 | neg_count = (cls_targets == 0).sum() 38 | w1, w2 = neg_count / (pos_count + neg_count), pos_count / ( 39 | pos_count + neg_count) 40 | weights = torch.ones_like(cls_preds.reshape(-1)) 41 | weights[cls_targets.reshape(-1) == 1] = w1 42 | weights[cls_targets.reshape(-1) == 0] = w2 43 | # cls_targets = cls_targets.float() 44 | # cls_loss = F.binary_cross_entropy_with_logits(input=cls_preds.reshape(-1), target=cls_targets.reshape(-1), weight=weights, 45 | # reduction='mean') 46 | cls_loss = F.binary_cross_entropy_with_logits( 47 | input=cls_preds, target=cls_targets, 48 | reduction='mean') 49 | pos_pixels = cls_targets.sum() 50 | 51 | loc_loss = F.smooth_l1_loss(cls_targets * loc_preds, 52 | cls_targets * loc_targets, 53 | reduction='sum') 54 | loc_loss = loc_loss / pos_pixels if pos_pixels > 0 else loc_loss 55 | 56 | total_loss = self.alpha * cls_loss + self.beta * loc_loss 57 | 58 | self.loss_dict.update({'total_loss': total_loss, 59 | 'reg_loss': loc_loss, 60 | 'cls_loss': cls_loss}) 61 | 62 | return total_loss 63 | 64 | def logging(self, epoch, batch_id, batch_len, writer): 65 | """ 66 | Print out the loss function for current iteration. 
67 | 68 | Parameters 69 | ---------- 70 | epoch : int 71 | Current epoch for training. 72 | batch_id : int 73 | The current batch. 74 | batch_len : int 75 | Total batch length in one iteration of training, 76 | writer : SummaryWriter 77 | Used to visualize on tensorboard 78 | """ 79 | total_loss = self.loss_dict['total_loss'] 80 | reg_loss = self.loss_dict['reg_loss'] 81 | cls_loss = self.loss_dict['cls_loss'] 82 | 83 | print("[epoch %d][%d/%d], || Loss: %.4f || cls Loss: %.4f" 84 | " || reg Loss: %.4f" % ( 85 | epoch, batch_id + 1, batch_len, 86 | total_loss.item(), cls_loss.item(), reg_loss.item())) 87 | 88 | writer.add_scalar('Regression_loss', reg_loss.item(), 89 | epoch * batch_len + batch_id) 90 | writer.add_scalar('Confidence_loss', cls_loss.item(), 91 | epoch * batch_len + batch_id) 92 | 93 | 94 | def test(): 95 | torch.manual_seed(0) 96 | loss = PixorLoss(None) 97 | pred = torch.sigmoid(torch.randn(1, 7, 2, 3)) 98 | label = torch.zeros(1, 7, 2, 3) 99 | loss = loss(pred, label) 100 | print(loss) 101 | 102 | 103 | if __name__ == "__main__": 104 | test() 105 | -------------------------------------------------------------------------------- /v2xvit/hypes_yaml/point_pillar_opv2v.yaml: -------------------------------------------------------------------------------- 1 | name: point_pillar_opv2v 2 | #root_dir: '/home/runshengxu/project/Cooperative_perception/opencood/tmp' 3 | root_dir: '/data/opv2v/train' 4 | validate_dir: '/data/opv2v/validate' 5 | 6 | wild_setting: 7 | async: false 8 | async_overhead: 100 9 | seed: 20 10 | loc_err: false 11 | xyz_std: 0.2 12 | ryp_std: 0.2 13 | data_size: 1.06 # Mb!! 14 | transmission_speed: 27 # Mbps!! 15 | backbone_delay: 10 # ms 16 | 17 | noise_setting: 18 | add_noise: True 19 | args: 20 | pos_std: 0 21 | rot_std: 0 22 | pos_mean: 0 23 | rot_mean: 0 24 | 25 | yaml_parser: "load_point_pillar_params" 26 | train_params: 27 | batch_size: &batch_size 2 28 | epoches: 60 29 | eval_freq: 1 30 | save_freq: 1 31 | max_cav: &max_cav 5 32 | 33 | fusion: 34 | core_method: 'IntermediateFusionDataset' # LateFusionDataset, EarlyFusionDataset, IntermediateFusionDataset supported 35 | args: 36 | cur_ego_pose_flag: True 37 | # when the cur_ego_pose_flag is set to True, there is no time gap 38 | # between the time when the LiDAR data is captured by connected 39 | # agents and when the extracted features are received by 40 | # the ego vehicle, which is equal to implement STCM. When set to False, 41 | # STCM has to be used. 42 | 43 | 44 | # preprocess-related 45 | preprocess: 46 | # options: BasePreprocessor, VoxelPreprocessor, BevPreprocessor 47 | core_method: 'SpVoxelPreprocessor' 48 | args: 49 | voxel_size: &voxel_size [0.4, 0.4, 4] 50 | max_points_per_voxel: 32 51 | max_voxel_train: 32000 52 | max_voxel_test: 70000 53 | # lidar range for each individual cav. 
54 | cav_lidar_range: &cav_lidar [-140.8, -38.4, -3, 140.8, 38.4, 1] 55 | 56 | data_augment: 57 | - NAME: random_world_flip 58 | ALONG_AXIS_LIST: [ 'x' ] 59 | 60 | - NAME: random_world_rotation 61 | WORLD_ROT_ANGLE: [ -0.78539816, 0.78539816 ] 62 | 63 | - NAME: random_world_scaling 64 | WORLD_SCALE_RANGE: [ 0.95, 1.05 ] 65 | 66 | # anchor box related 67 | postprocess: 68 | core_method: 'VoxelPostprocessor' # VoxelPostprocessor, BevPostprocessor supported 69 | anchor_args: 70 | cav_lidar_range: *cav_lidar 71 | l: 3.9 72 | w: 1.6 73 | h: 1.56 74 | r: [0, 90] 75 | feature_stride: 4 76 | num: &achor_num 2 77 | target_args: 78 | pos_threshold: 0.6 79 | neg_threshold: 0.45 80 | score_threshold: 0.20 81 | order: 'hwl' # hwl or lwh 82 | max_num: 100 # maximum number of objects in a single frame. use this number to make sure different frames has the same dimension in the same batch 83 | nms_thresh: 0.15 84 | 85 | # model related 86 | model: 87 | core_method: point_pillar_opv2v 88 | args: 89 | voxel_size: *voxel_size 90 | lidar_range: *cav_lidar 91 | anchor_number: *achor_num 92 | max_cav: *max_cav 93 | compression: 32 # compression rate 94 | backbone_fix: false 95 | 96 | pillar_vfe: 97 | use_norm: true 98 | with_distance: false 99 | use_absolute_xyz: true 100 | num_filters: [64] 101 | point_pillar_scatter: 102 | num_features: 64 103 | 104 | base_bev_backbone: 105 | layer_nums: [3, 5, 8] 106 | layer_strides: [2, 2, 2] 107 | num_filters: [64, 128, 256] 108 | upsample_strides: [1, 2, 4] 109 | num_upsample_filter: [128, 128, 128] 110 | shrink_header: 111 | kernal_size: [3] 112 | stride: [2] 113 | padding: [1] 114 | dim: [256] 115 | input_dim: 384 # 128 * 3 116 | 117 | # add decoder later 118 | 119 | loss: 120 | core_method: point_pillar_loss 121 | args: 122 | cls_weight: 1.0 123 | reg: 2.0 124 | 125 | optimizer: 126 | core_method: Adam 127 | lr: 0.001 128 | args: 129 | eps: 1e-10 130 | weight_decay: 1e-4 131 | 132 | lr_scheduler: 133 | core_method: multistep #step, multistep and Exponential support 134 | gamma: 0.1 135 | step_size: [15, 50] 136 | 137 | -------------------------------------------------------------------------------- /v2xvit/hypes_yaml/point_pillar_where2comm_ori.yaml: -------------------------------------------------------------------------------- 1 | name: point_pillar_where2comm 2 | root_dir: '/data/opv2v/train' 3 | validate_dir: '/data/opv2v/validate' 4 | 5 | 6 | wild_setting: 7 | seed: 20 8 | async: False 9 | async_mode: 'sim' 10 | async_overhead: 100 11 | loc_err: False 12 | xyz_std: 0.2 13 | ryp_std: 0.2 14 | data_size: 1.06 # Mb 15 | transmission_speed: 27 # Mbps 16 | backbone_delay: 10 # ms 17 | 18 | yaml_parser: 'load_point_pillar_params' 19 | train_params: 20 | batch_size: &batch_size 4 21 | epoches: &epoches 50 22 | eval_freq: 1 23 | save_freq: 1 24 | max_cav: &max_cav 5 25 | 26 | fusion: 27 | core_method: 'IntermediateFusionDataset' # LateFusionDataset, EarlyFusionDataset, IntermediateFusionDataset supported 28 | args: 29 | cur_ego_pose_flag: true 30 | 31 | # Preprocess-related 32 | preprocess: 33 | # Options: BasePreprocessor, VoxelPreprocessor, BevPreprocessor 34 | core_method: 'SpVoxelPreprocessor' 35 | args: 36 | voxel_size: &voxel_size [ 0.4, 0.4, 4 ] 37 | max_points_per_voxel: 32 38 | max_voxel_train: 32000 39 | max_voxel_test: 70000 40 | # LiDAR range for each individual CAV 41 | cav_lidar_range: &cav_lidar [-140.8, -40, -3, 140.8, 40, 1] # [-140.8, -38.4, -5, 140.8, 38.4, 3] 42 | 43 | data_augment: 44 | - NAME: random_world_flip 45 | ALONG_AXIS_LIST: [ 'x' ] 46 | - 
NAME: random_world_rotation 47 | WORLD_ROT_ANGLE: [ -0.78539816, 0.78539816 ] 48 | - NAME: random_world_scaling 49 | WORLD_SCALE_RANGE: [ 0.95, 1.05 ] 50 | 51 | # Anchor box related 52 | postprocess: 53 | core_method: 'VoxelPostprocessor' # VoxelPostprocessor, BevPostprocessor supported 54 | anchor_args: 55 | cav_lidar_range: *cav_lidar 56 | l: 3.9 57 | w: 1.6 58 | h: 1.56 59 | r: [ 0, 90 ] 60 | num: &achor_num 2 61 | feature_stride: 4 62 | target_args: 63 | pos_threshold: 0.6 64 | neg_threshold: 0.45 65 | score_threshold: 0.2 66 | order: 'hwl' # hwl or lwh 67 | max_num: 100 # Maximum number of objects in a single frame. Use this number to make sure different frames has the same dimension in the same batch 68 | nms_thresh: 0.15 69 | 70 | # Model related 71 | model: 72 | core_method: point_pillar_where2comm_ori # point_pillar_where2comm_ori 73 | args: 74 | voxel_size: *voxel_size 75 | lidar_range: *cav_lidar 76 | anchor_number: *achor_num 77 | head_dim: 256 78 | max_cav: *max_cav 79 | compression: 0 # Compression rate 80 | backbone_fix: False 81 | pillar_vfe: 82 | use_norm: True 83 | with_distance: False 84 | use_absolute_xyz: True 85 | num_filters: [ 64 ] 86 | point_pillar_scatter: 87 | num_features: 64 88 | base_bev_backbone: 89 | layer_nums: &layer_nums [3, 4, 5] # [3, 5, 8] 90 | layer_strides: [ 2, 2, 2 ] 91 | num_filters: &num_filters [ 64, 128, 256 ] 92 | upsample_strides: [ 1, 2, 4 ] 93 | num_upsample_filter: [ 128, 128, 128 ] 94 | shrink_header: 95 | kernal_size: [ 3 ] 96 | stride: [ 2 ] 97 | padding: [ 1 ] 98 | dim: [ 256 ] 99 | input_dim: 384 # 128 * 3 100 | where2comm_fusion: 101 | fully: False 102 | voxel_size: *voxel_size 103 | downsample_rate: 4 104 | in_channels: 256 105 | multi_scale: True 106 | layer_nums: *layer_nums 107 | num_filters: *num_filters 108 | communication: 109 | round: 1 110 | threshold: 0.01 111 | gaussian_smooth: 112 | k_size: 5 113 | c_sigma: 1.0 114 | 115 | loss: 116 | core_method: point_pillar_loss 117 | args: 118 | cls_weight: 1.0 119 | reg: 2.0 120 | 121 | optimizer: 122 | core_method: Adam 123 | lr: 2e-4 124 | args: 125 | eps: 1e-10 126 | weight_decay: 1e-2 127 | 128 | lr_scheduler: 129 | # core_method: cosineannealwarm # step, multistep, exponential and cosineannealwarm support 130 | # epoches: *epoches 131 | # warmup_lr: 2e-5 132 | # warmup_epoches: 10 133 | # lr_min: 5e-6 134 | core_method: multistep #step, multistep and Exponential support 135 | gamma: 0.1 136 | step_size: [10, 20] -------------------------------------------------------------------------------- /v2xvit/hypes_yaml/point_pillar_where2comm_ori_multi.yaml: -------------------------------------------------------------------------------- 1 | name: point_pillar_where2comm 2 | root_dir: '/data/opv2v/train' 3 | validate_dir: '/data/opv2v/validate' 4 | 5 | 6 | wild_setting: 7 | seed: 20 8 | async: False 9 | async_mode: 'sim' 10 | async_overhead: 100 11 | loc_err: False 12 | xyz_std: 0.2 13 | ryp_std: 0.2 14 | data_size: 1.06 # Mb 15 | transmission_speed: 27 # Mbps 16 | backbone_delay: 10 # ms 17 | 18 | yaml_parser: 'load_point_pillar_params' 19 | train_params: 20 | batch_size: &batch_size 4 21 | epoches: &epoches 30 22 | eval_freq: 1 23 | save_freq: 1 24 | max_cav: &max_cav 5 25 | 26 | fusion: 27 | core_method: 'IntermediateFusionDataset' # LateFusionDataset, EarlyFusionDataset, IntermediateFusionDataset supported 28 | args: 29 | cur_ego_pose_flag: true 30 | 31 | # Preprocess-related 32 | preprocess: 33 | # Options: BasePreprocessor, VoxelPreprocessor, BevPreprocessor 34 | core_method: 
'SpVoxelPreprocessor' 35 | args: 36 | voxel_size: &voxel_size [ 0.4, 0.4, 4 ] 37 | max_points_per_voxel: 32 38 | max_voxel_train: 32000 39 | max_voxel_test: 70000 40 | # LiDAR range for each individual CAV 41 | cav_lidar_range: &cav_lidar [-140.8, -40, -3, 140.8, 40, 1] # [-140.8, -38.4, -5, 140.8, 38.4, 3] 42 | 43 | data_augment: 44 | - NAME: random_world_flip 45 | ALONG_AXIS_LIST: [ 'x' ] 46 | - NAME: random_world_rotation 47 | WORLD_ROT_ANGLE: [ -0.78539816, 0.78539816 ] 48 | - NAME: random_world_scaling 49 | WORLD_SCALE_RANGE: [ 0.95, 1.05 ] 50 | 51 | # Anchor box related 52 | postprocess: 53 | core_method: 'VoxelPostprocessor' # VoxelPostprocessor, BevPostprocessor supported 54 | anchor_args: 55 | cav_lidar_range: *cav_lidar 56 | l: 3.9 57 | w: 1.6 58 | h: 1.56 59 | r: [ 0, 90 ] 60 | num: &achor_num 2 61 | feature_stride: 4 62 | target_args: 63 | pos_threshold: 0.6 64 | neg_threshold: 0.45 65 | score_threshold: 0.2 66 | order: 'hwl' # hwl or lwh 67 | max_num: 100 # Maximum number of objects in a single frame. Use this number to make sure different frames has the same dimension in the same batch 68 | nms_thresh: 0.15 69 | 70 | # Model related 71 | model: 72 | core_method: point_pillar_where2comm_ori # point_pillar_where2comm_ori 73 | args: 74 | voxel_size: *voxel_size 75 | lidar_range: *cav_lidar 76 | anchor_number: *achor_num 77 | head_dim: 256 78 | max_cav: *max_cav 79 | compression: 0 # Compression rate 80 | backbone_fix: False 81 | pillar_vfe: 82 | use_norm: True 83 | with_distance: False 84 | use_absolute_xyz: True 85 | num_filters: [ 64 ] 86 | point_pillar_scatter: 87 | num_features: 64 88 | base_bev_backbone: 89 | layer_nums: &layer_nums [3, 4, 5] # [3, 5, 8] 90 | layer_strides: [ 2, 2, 2 ] 91 | num_filters: &num_filters [ 64, 128, 256 ] 92 | upsample_strides: [ 1, 2, 4 ] 93 | num_upsample_filter: [ 128, 128, 128 ] 94 | shrink_header: 95 | kernal_size: [ 3 ] 96 | stride: [ 2 ] 97 | padding: [ 1 ] 98 | dim: [ 256 ] 99 | input_dim: 384 # 128 * 3 100 | where2comm_fusion: 101 | fully: False 102 | voxel_size: *voxel_size 103 | downsample_rate: 4 104 | in_channels: 256 105 | multi_scale: True 106 | layer_nums: *layer_nums 107 | num_filters: *num_filters 108 | communication: 109 | round: 1 110 | threshold: 0.01 111 | gaussian_smooth: 112 | k_size: 5 113 | c_sigma: 1.0 114 | 115 | loss: 116 | core_method: point_pillar_loss 117 | args: 118 | cls_weight: 1.0 119 | reg: 2.0 120 | 121 | optimizer: 122 | core_method: Adam 123 | lr: 2e-4 124 | args: 125 | eps: 1e-10 126 | weight_decay: 1e-2 127 | 128 | lr_scheduler: 129 | # core_method: cosineannealwarm # step, multistep, exponential and cosineannealwarm support 130 | # epoches: *epoches 131 | # warmup_lr: 2e-5 132 | # warmup_epoches: 10 133 | # lr_min: 5e-6 134 | core_method: multistep #step, multistep and Exponential support 135 | gamma: 0.1 136 | step_size: [10, 20] -------------------------------------------------------------------------------- /v2xvit/hypes_yaml/point_pillar_where2comm_ori_single.yaml: -------------------------------------------------------------------------------- 1 | name: point_pillar_where2comm 2 | root_dir: '/data/opv2v/train' 3 | validate_dir: '/data/opv2v/validate' 4 | 5 | 6 | wild_setting: 7 | seed: 20 8 | async: False 9 | async_mode: 'sim' 10 | async_overhead: 100 11 | loc_err: False 12 | xyz_std: 0.2 13 | ryp_std: 0.2 14 | data_size: 1.06 # Mb 15 | transmission_speed: 27 # Mbps 16 | backbone_delay: 10 # ms 17 | 18 | yaml_parser: 'load_point_pillar_params' 19 | train_params: 20 | batch_size: &batch_size 4 21 | 
epoches: &epoches 30 22 | eval_freq: 1 23 | save_freq: 1 24 | max_cav: &max_cav 5 25 | 26 | fusion: 27 | core_method: 'IntermediateFusionDataset' # LateFusionDataset, EarlyFusionDataset, IntermediateFusionDataset supported 28 | args: 29 | cur_ego_pose_flag: true 30 | 31 | # Preprocess-related 32 | preprocess: 33 | # Options: BasePreprocessor, VoxelPreprocessor, BevPreprocessor 34 | core_method: 'SpVoxelPreprocessor' 35 | args: 36 | voxel_size: &voxel_size [ 0.4, 0.4, 4 ] 37 | max_points_per_voxel: 32 38 | max_voxel_train: 32000 39 | max_voxel_test: 70000 40 | # LiDAR range for each individual CAV 41 | cav_lidar_range: &cav_lidar [-140.8, -40, -3, 140.8, 40, 1] # [-140.8, -38.4, -5, 140.8, 38.4, 3] 42 | 43 | data_augment: 44 | - NAME: random_world_flip 45 | ALONG_AXIS_LIST: [ 'x' ] 46 | - NAME: random_world_rotation 47 | WORLD_ROT_ANGLE: [ -0.78539816, 0.78539816 ] 48 | - NAME: random_world_scaling 49 | WORLD_SCALE_RANGE: [ 0.95, 1.05 ] 50 | 51 | # Anchor box related 52 | postprocess: 53 | core_method: 'VoxelPostprocessor' # VoxelPostprocessor, BevPostprocessor supported 54 | anchor_args: 55 | cav_lidar_range: *cav_lidar 56 | l: 3.9 57 | w: 1.6 58 | h: 1.56 59 | r: [ 0, 90 ] 60 | num: &achor_num 2 61 | feature_stride: 4 62 | target_args: 63 | pos_threshold: 0.6 64 | neg_threshold: 0.45 65 | score_threshold: 0.2 66 | order: 'hwl' # hwl or lwh 67 | max_num: 100 # Maximum number of objects in a single frame. Use this number to make sure different frames has the same dimension in the same batch 68 | nms_thresh: 0.15 69 | 70 | # Model related 71 | model: 72 | core_method: point_pillar_where2comm_ori # point_pillar_where2comm_ori 73 | args: 74 | voxel_size: *voxel_size 75 | lidar_range: *cav_lidar 76 | anchor_number: *achor_num 77 | head_dim: 256 78 | max_cav: *max_cav 79 | compression: 0 # Compression rate 80 | backbone_fix: False 81 | pillar_vfe: 82 | use_norm: True 83 | with_distance: False 84 | use_absolute_xyz: True 85 | num_filters: [ 64 ] 86 | point_pillar_scatter: 87 | num_features: 64 88 | base_bev_backbone: 89 | layer_nums: &layer_nums [3, 4, 5] # [3, 5, 8] 90 | layer_strides: [ 2, 2, 2 ] 91 | num_filters: &num_filters [ 64, 128, 256 ] 92 | upsample_strides: [ 1, 2, 4 ] 93 | num_upsample_filter: [ 128, 128, 128 ] 94 | shrink_header: 95 | kernal_size: [ 3 ] 96 | stride: [ 2 ] 97 | padding: [ 1 ] 98 | dim: [ 256 ] 99 | input_dim: 384 # 128 * 3 100 | where2comm_fusion: 101 | fully: False 102 | voxel_size: *voxel_size 103 | downsample_rate: 4 104 | in_channels: 256 105 | multi_scale: False # True 106 | layer_nums: *layer_nums 107 | num_filters: *num_filters 108 | communication: 109 | round: 1 110 | threshold: 0.01 111 | gaussian_smooth: 112 | k_size: 5 113 | c_sigma: 1.0 114 | 115 | loss: 116 | core_method: point_pillar_loss 117 | args: 118 | cls_weight: 1.0 119 | reg: 2.0 120 | 121 | optimizer: 122 | core_method: Adam 123 | lr: 2e-4 124 | args: 125 | eps: 1e-10 126 | weight_decay: 1e-2 127 | 128 | lr_scheduler: 129 | # core_method: cosineannealwarm # step, multistep, exponential and cosineannealwarm support 130 | # epoches: *epoches 131 | # warmup_lr: 2e-5 132 | # warmup_epoches: 10 133 | # lr_min: 5e-6 134 | core_method: multistep #step, multistep and Exponential support 135 | gamma: 0.1 136 | step_size: [10, 20] -------------------------------------------------------------------------------- /v2xvit/utils/pose_utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.distributions as dist 4 | 5 | 
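# A minimal illustration of the expected `noise_setting` argument, mirroring the
# `noise_setting` block in the hypes_yaml configs (the values are examples only):
# noise_setting = {
#     'add_noise': True,
#     'args': {'pos_std': 0.2, 'rot_std': 0.2, 'pos_mean': 0, 'rot_mean': 0}
# }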
def add_noise_data_dict(data_dict, noise_setting): 6 | """ Update the base data dict. 7 | We retrieve lidar_pose and add_noise to it. 8 | And set a clean pose. 9 | """ 10 | # path = 'pose_noise.txt' 11 | # file = open(path,'w+') 12 | if noise_setting['add_noise']: 13 | for cav_id, cav_content in data_dict.items(): 14 | cav_content['params']['lidar_pose_clean'] = cav_content['params']['lidar_pose'] # 6 dof pose 15 | if cav_content['ego']: 16 | continue 17 | pose_noise = generate_noise( 18 | noise_setting['args']['pos_std'], 19 | noise_setting['args']['rot_std'], 20 | noise_setting['args']['pos_mean'], 21 | noise_setting['args']['rot_mean'] 22 | ) 23 | cav_content['params']['lidar_pose'] = cav_content['params']['lidar_pose'] + pose_noise 24 | # print(pose_noise) 25 | # file.write(str(pose_noise[0])+' ') 26 | else: 27 | for cav_id, cav_content in data_dict.items(): 28 | cav_content['params']['lidar_pose_clean'] = cav_content['params']['lidar_pose'] # 6 dof pose 29 | 30 | # file.write('\n') 31 | # file.close() 32 | return data_dict 33 | 34 | def generate_noise(pos_std, rot_std, pos_mean=0, rot_mean=0): 35 | """ Add localization error to the 6dof pose 36 | Noise includes position (x,y) and rotation (yaw). 37 | We use gaussian distribution to generate noise. 38 | 39 | Args: 40 | 41 | pos_std : float 42 | std of gaussian dist, in meter 43 | 44 | rot_std : float 45 | std of gaussian dist, in degree 46 | 47 | pos_mean : float 48 | mean of gaussian dist, in meter 49 | 50 | rot_mean : float 51 | mean of gaussian dist, in degree 52 | 53 | Returns: 54 | pose_noise: np.ndarray, [6,] 55 | [x, y, z, roll, yaw, pitch] 56 | """ 57 | 58 | xy = np.random.normal(pos_mean, pos_std, size=(2)) 59 | yaw = np.random.normal(rot_mean, rot_std, size=(1)) 60 | 61 | pose_noise = np.array([xy[0], xy[1], 0, 0, yaw[0], 0]) 62 | # pose_noise = np.array([xy[0], xy[1], 0, 0, 0, yaw[0]]) 63 | # print(pose_noise) 64 | return pose_noise 65 | 66 | 67 | 68 | def generate_noise_torch(pose, pos_std, rot_std, pos_mean=0, rot_mean=0): 69 | """ only used for v2vnet robust. 70 | rotation noise is sampled from von_mises distribution 71 | 72 | Args: 73 | pose : Tensor, [N. 
6] 74 | including [x, y, z, roll, yaw, pitch] 75 | 76 | pos_std : float 77 | std of gaussian dist, in meter 78 | 79 | rot_std : float 80 | std of gaussian dist, in degree 81 | 82 | pos_mean : float 83 | mean of gaussian dist, in meter 84 | 85 | rot_mean : float 86 | mean of gaussian dist, in degree 87 | 88 | Returns: 89 | pose_noisy: Tensor, [N, 6] 90 | noisy pose 91 | """ 92 | 93 | N = pose.shape[0] 94 | noise = torch.zeros_like(pose, device=pose.device) 95 | concentration = (180 / (np.pi * rot_std)) ** 2 96 | 97 | noise[:, :2] = torch.normal(pos_mean, pos_std, size=(N, 2), device=pose.device) 98 | noise[:, 4] = dist.von_mises.VonMises(loc=rot_mean, concentration=concentration).sample((N,)).to(noise.device) 99 | 100 | 101 | return noise 102 | 103 | 104 | def remove_z_axis(T): 105 | """ remove rotation/translation related to z-axis 106 | Args: 107 | T: np.ndarray 108 | [4, 4] 109 | Returns: 110 | T: np.ndarray 111 | [4, 4] 112 | """ 113 | T[2,3] = 0 # z-trans 114 | T[0,2] = 0 115 | T[1,2] = 0 116 | T[2,0] = 0 117 | T[2,1] = 0 118 | T[2,2] = 1 119 | 120 | return T -------------------------------------------------------------------------------- /v2xvit/data_utils/pre_processor/bev_preprocessor.py: -------------------------------------------------------------------------------- 1 | """ 2 | Convert lidar to bev 3 | """ 4 | 5 | import numpy as np 6 | import torch 7 | from v2xvit.data_utils.pre_processor.base_preprocessor import \ 8 | BasePreprocessor 9 | 10 | class BevPreprocessor(BasePreprocessor): 11 | def __init__(self, preprocess_params, train): 12 | super(BevPreprocessor, self).__init__(preprocess_params, train) 13 | self.lidar_range = self.params['cav_lidar_range'] 14 | self.geometry_param = preprocess_params["geometry_param"] 15 | 16 | def preprocess(self, pcd_raw): 17 | """ 18 | Preprocess the lidar points to BEV representations. 19 | 20 | Parameters 21 | ---------- 22 | pcd_raw : np.ndarray 23 | The raw lidar. 24 | 25 | Returns 26 | ------- 27 | data_dict : the structured output dictionary. 28 | """ 29 | bev = np.zeros(self.geometry_param['input_shape'], dtype=np.float32) 30 | intensity_map_count = np.zeros((bev.shape[0], bev.shape[1]), dtype=np.int) 31 | bev_origin = np.array( 32 | [self.geometry_param["L1"], self.geometry_param["W1"], 33 | self.geometry_param["H1"]]).reshape(1, -1) 34 | 35 | indices = ((pcd_raw[:, :3] - bev_origin) / self.geometry_param[ 36 | "res"]).astype(int) 37 | ## bev[indices[:, 0], indices[:, 1], indices[:, 2]] = 1 38 | # np.add.at(bev, (indices[:, 0], indices[:, 1], indices[:, 2]), 1) 39 | # bev[indices[:, 0], indices[:, 1], -1] += pcd_raw[:, 3] 40 | # intensity_map_count[indices[:, 0], indices[:, 1]] += 1 41 | 42 | for i in range(indices.shape[0]): 43 | bev[indices[i, 0], indices[i, 1], indices[i, 2]] = 1 44 | bev[indices[i, 0], indices[i, 1], -1] += pcd_raw[i, 3] 45 | intensity_map_count[indices[i, 0], indices[i, 1]] += 1 46 | divide_mask = intensity_map_count!=0 47 | bev[divide_mask, -1] = np.divide(bev[divide_mask, -1], intensity_map_count[divide_mask]) 48 | 49 | data_dict = { 50 | "bev_input": np.transpose(bev, (2, 0, 1)) 51 | } 52 | return data_dict 53 | 54 | @staticmethod 55 | def collate_batch_list(batch): 56 | """ 57 | Customized pytorch data loader collate function. 58 | 59 | Parameters 60 | ---------- 61 | batch : list 62 | List of dictionary. Each dictionary represent a single frame. 63 | 64 | Returns 65 | ------- 66 | processed_batch : dict 67 | Updated lidar batch. 68 | """ 69 | bev_input_list = [ 70 | x["bev_input"][np.newaxis, ...] 
for x in batch 71 | ] 72 | processed_batch = { 73 | "bev_input": torch.from_numpy( 74 | np.concatenate(bev_input_list, axis=0)) 75 | } 76 | return processed_batch 77 | @staticmethod 78 | def collate_batch_dict(batch): 79 | """ 80 | Customized pytorch data loader collate function. 81 | 82 | Parameters 83 | ---------- 84 | batch : dict 85 | Dict of list. Each element represents a CAV. 86 | 87 | Returns 88 | ------- 89 | processed_batch : dict 90 | Updated lidar batch. 91 | """ 92 | bev_input_list = [ 93 | x[np.newaxis, ...] for x in batch["bev_input"] 94 | ] 95 | processed_batch = { 96 | "bev_input": torch.from_numpy( 97 | np.concatenate(bev_input_list, axis=0)) 98 | } 99 | return processed_batch 100 | 101 | def collate_batch(self, batch): 102 | """ 103 | Customized pytorch data loader collate function. 104 | 105 | Parameters 106 | ---------- 107 | batch : list / dict 108 | Batched data. 109 | Returns 110 | ------- 111 | processed_batch : dict 112 | Updated lidar batch. 113 | """ 114 | if isinstance(batch, list): 115 | return self.collate_batch_list(batch) 116 | elif isinstance(batch, dict): 117 | return self.collate_batch_dict(batch) 118 | else: 119 | raise NotImplemented 120 | 121 | -------------------------------------------------------------------------------- /v2xvit/data_utils/augmentor/data_augmentor.py: -------------------------------------------------------------------------------- 1 | """ 2 | Class for data augmentation 3 | """ 4 | from functools import partial 5 | 6 | from v2xvit.data_utils.augmentor import augment_utils 7 | 8 | 9 | class DataAugmentor(object): 10 | """ 11 | Data Augmentor. 12 | 13 | Parameters 14 | ---------- 15 | augment_config : list 16 | A list of augmentation configuration. 17 | 18 | Attributes 19 | ---------- 20 | data_augmentor_queue : list 21 | The list of data augmented functions. 
22 | """ 23 | 24 | def __init__(self, augment_config, train=True): 25 | self.data_augmentor_queue = [] 26 | self.train = train 27 | 28 | for cur_cfg in augment_config: 29 | cur_augmentor = getattr(self, cur_cfg['NAME'])(config=cur_cfg) 30 | self.data_augmentor_queue.append(cur_augmentor) 31 | 32 | def random_world_flip(self, data_dict=None, config=None): 33 | if data_dict is None: 34 | return partial(self.random_world_flip, config=config) 35 | 36 | gt_boxes, gt_mask, points = data_dict['object_bbx_center'], \ 37 | data_dict['object_bbx_mask'], \ 38 | data_dict['lidar_np'] 39 | gt_boxes_valid = gt_boxes[gt_mask == 1] 40 | 41 | for cur_axis in config['ALONG_AXIS_LIST']: 42 | assert cur_axis in ['x', 'y'] 43 | gt_boxes_valid, points = getattr(augment_utils, 44 | 'random_flip_along_%s' % cur_axis)( 45 | gt_boxes_valid, points, 46 | ) 47 | 48 | gt_boxes[:gt_boxes_valid.shape[0], :] = gt_boxes_valid 49 | 50 | data_dict['object_bbx_center'] = gt_boxes 51 | data_dict['object_bbx_mask'] = gt_mask 52 | data_dict['lidar_np'] = points 53 | 54 | return data_dict 55 | 56 | def random_world_rotation(self, data_dict=None, config=None): 57 | if data_dict is None: 58 | return partial(self.random_world_rotation, config=config) 59 | 60 | rot_range = config['WORLD_ROT_ANGLE'] 61 | if not isinstance(rot_range, list): 62 | rot_range = [-rot_range, rot_range] 63 | 64 | gt_boxes, gt_mask, points = data_dict['object_bbx_center'], \ 65 | data_dict['object_bbx_mask'], \ 66 | data_dict['lidar_np'] 67 | gt_boxes_valid = gt_boxes[gt_mask == 1] 68 | gt_boxes_valid, points = augment_utils.global_rotation( 69 | gt_boxes_valid, points, rot_range=rot_range 70 | ) 71 | gt_boxes[:gt_boxes_valid.shape[0], :] = gt_boxes_valid 72 | 73 | data_dict['object_bbx_center'] = gt_boxes 74 | data_dict['object_bbx_mask'] = gt_mask 75 | data_dict['lidar_np'] = points 76 | 77 | return data_dict 78 | 79 | def random_world_scaling(self, data_dict=None, config=None): 80 | if data_dict is None: 81 | return partial(self.random_world_scaling, config=config) 82 | 83 | gt_boxes, gt_mask, points = data_dict['object_bbx_center'], \ 84 | data_dict['object_bbx_mask'], \ 85 | data_dict['lidar_np'] 86 | gt_boxes_valid = gt_boxes[gt_mask == 1] 87 | 88 | gt_boxes_valid, points = augment_utils.global_scaling( 89 | gt_boxes_valid, points, config['WORLD_SCALE_RANGE'] 90 | ) 91 | gt_boxes[:gt_boxes_valid.shape[0], :] = gt_boxes_valid 92 | 93 | data_dict['object_bbx_center'] = gt_boxes 94 | data_dict['object_bbx_mask'] = gt_mask 95 | data_dict['lidar_np'] = points 96 | 97 | return data_dict 98 | 99 | def forward(self, data_dict): 100 | """ 101 | Args: 102 | data_dict: 103 | points: (N, 3 + C_in) 104 | gt_boxes: optional, (N, 7) [x, y, z, dx, dy, dz, heading] 105 | gt_names: optional, (N), string 106 | ... 
107 | 108 | Returns: 109 | """ 110 | if self.train: 111 | for cur_augmentor in self.data_augmentor_queue: 112 | data_dict = cur_augmentor(data_dict=data_dict) 113 | 114 | return data_dict 115 | -------------------------------------------------------------------------------- /v2xvit/models/point_pillar_fcooper.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | 3 | from v2xvit.models.sub_modules.pillar_vfe import PillarVFE 4 | from v2xvit.models.sub_modules.point_pillar_scatter import PointPillarScatter 5 | from v2xvit.models.sub_modules.base_bev_backbone import BaseBEVBackbone 6 | from v2xvit.models.sub_modules.downsample_conv import DownsampleConv 7 | from v2xvit.models.sub_modules.naive_compress import NaiveCompressor 8 | from v2xvit.models.sub_modules.f_cooper_fuse import SpatialFusion 9 | 10 | 11 | class PointPillarFCooper(nn.Module): 12 | def __init__(self, args): 13 | super(PointPillarFCooper, self).__init__() 14 | 15 | self.max_cav = args['max_cav'] 16 | # PIllar VFE 17 | self.pillar_vfe = PillarVFE(args['pillar_vfe'], 18 | num_point_features=4, 19 | voxel_size=args['voxel_size'], 20 | point_cloud_range=args['lidar_range']) 21 | self.scatter = PointPillarScatter(args['point_pillar_scatter']) 22 | self.backbone = BaseBEVBackbone(args['base_bev_backbone'], 64) 23 | # used to downsample the feature map for efficient computation 24 | self.shrink_flag = False 25 | if 'shrink_header' in args: 26 | self.shrink_flag = True 27 | self.shrink_conv = DownsampleConv(args['shrink_header']) 28 | self.compression = False 29 | 30 | if args['compression'] > 0: 31 | self.compression = True 32 | self.naive_compressor = NaiveCompressor(256, args['compression']) 33 | 34 | self.fusion_net = SpatialFusion() 35 | 36 | self.cls_head = nn.Conv2d(128 * 2, args['anchor_number'], 37 | kernel_size=1) 38 | self.reg_head = nn.Conv2d(128 * 2, 7 * args['anchor_number'], 39 | kernel_size=1) 40 | 41 | if args['backbone_fix']: 42 | self.backbone_fix() 43 | 44 | def backbone_fix(self): 45 | """ 46 | Fix the parameters of backbone during finetune on timedelay。 47 | """ 48 | for p in self.pillar_vfe.parameters(): 49 | p.requires_grad = False 50 | 51 | for p in self.scatter.parameters(): 52 | p.requires_grad = False 53 | 54 | for p in self.backbone.parameters(): 55 | p.requires_grad = False 56 | 57 | if self.compression: 58 | for p in self.naive_compressor.parameters(): 59 | p.requires_grad = False 60 | if self.shrink_flag: 61 | for p in self.shrink_conv.parameters(): 62 | p.requires_grad = False 63 | 64 | for p in self.cls_head.parameters(): 65 | p.requires_grad = False 66 | for p in self.reg_head.parameters(): 67 | p.requires_grad = False 68 | 69 | def forward(self, data_dict): 70 | voxel_features = data_dict['processed_lidar']['voxel_features'] 71 | voxel_coords = data_dict['processed_lidar']['voxel_coords'] 72 | voxel_num_points = data_dict['processed_lidar']['voxel_num_points'] 73 | record_len = data_dict['record_len'] 74 | spatial_correction_matrix = data_dict['spatial_correction_matrix'] 75 | 76 | batch_dict = {'voxel_features': voxel_features, 77 | 'voxel_coords': voxel_coords, 78 | 'voxel_num_points': voxel_num_points, 79 | 'record_len': record_len} 80 | # n, 4 -> n, c 81 | batch_dict = self.pillar_vfe(batch_dict) 82 | # n, c -> N, C, H, W 83 | batch_dict = self.scatter(batch_dict) 84 | batch_dict = self.backbone(batch_dict) 85 | 86 | spatial_features_2d = batch_dict['spatial_features_2d'] 87 | # downsample feature to reduce memory 88 | if 
self.shrink_flag: 89 | spatial_features_2d = self.shrink_conv(spatial_features_2d) 90 | # compressor 91 | if self.compression: 92 | spatial_features_2d = self.naive_compressor(spatial_features_2d) 93 | 94 | fused_feature = self.fusion_net(spatial_features_2d, record_len) 95 | 96 | psm = self.cls_head(fused_feature) 97 | rm = self.reg_head(fused_feature) 98 | 99 | output_dict = {'psm': psm, 100 | 'rm': rm} 101 | 102 | return output_dict 103 | -------------------------------------------------------------------------------- /v2xvit/models/sub_modules/base_transformer.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | 4 | from einops import rearrange 5 | 6 | 7 | class PreNormResidual(nn.Module): 8 | def __init__(self, dim, fn): 9 | super().__init__() 10 | self.norm = nn.LayerNorm(dim) 11 | self.fn = fn 12 | 13 | def forward(self, x, **kwargs): 14 | return self.fn(self.norm(x), **kwargs) + x 15 | 16 | 17 | class PreNorm(nn.Module): 18 | def __init__(self, dim, fn): 19 | super().__init__() 20 | self.norm = nn.LayerNorm(dim) 21 | self.fn = fn 22 | 23 | def forward(self, x, **kwargs): 24 | return self.fn(self.norm(x), **kwargs) 25 | 26 | 27 | class FeedForward(nn.Module): 28 | def __init__(self, dim, hidden_dim, dropout=0.): 29 | super().__init__() 30 | self.net = nn.Sequential( 31 | nn.Linear(dim, hidden_dim), 32 | nn.GELU(), 33 | nn.Dropout(dropout), 34 | nn.Linear(hidden_dim, dim), 35 | nn.Dropout(dropout) 36 | ) 37 | 38 | def forward(self, x): 39 | return self.net(x) 40 | 41 | 42 | class CavAttention(nn.Module): 43 | """ 44 | Vanilla CAV attention. 45 | """ 46 | def __init__(self, dim, heads, dim_head=64, dropout=0.1): 47 | super().__init__() 48 | inner_dim = heads * dim_head 49 | 50 | self.heads = heads 51 | self.scale = dim_head ** -0.5 52 | 53 | self.attend = nn.Softmax(dim=-1) 54 | self.to_qkv = nn.Linear(dim, inner_dim * 3, bias=False) 55 | 56 | self.to_out = nn.Sequential( 57 | nn.Linear(inner_dim, dim), 58 | nn.Dropout(dropout) 59 | ) 60 | 61 | def forward(self, x, mask, prior_encoding): 62 | # x: (B, L, H, W, C) -> (B, H, W, L, C) 63 | # mask: (B, L) 64 | x = x.permute(0, 2, 3, 1, 4) 65 | # mask: (B, 1, H, W, L, 1) 66 | mask = mask.unsqueeze(1) 67 | 68 | # qkv: [(B, H, W, L, C_inner) *3] 69 | qkv = self.to_qkv(x).chunk(3, dim=-1) 70 | # q: (B, M, H, W, L, C) 71 | q, k, v = map(lambda t: rearrange(t, 'b h w l (m c) -> b m h w l c', 72 | m=self.heads), qkv) 73 | 74 | # attention, (B, M, H, W, L, L) 75 | att_map = torch.einsum('b m h w i c, b m h w j c -> b m h w i j', 76 | q, k) * self.scale 77 | # add mask 78 | att_map = att_map.masked_fill(mask == 0, -float('inf')) 79 | # softmax 80 | att_map = self.attend(att_map) 81 | 82 | # out:(B, M, H, W, L, C_head) 83 | out = torch.einsum('b m h w i j, b m h w j c -> b m h w i c', att_map, 84 | v) 85 | out = rearrange(out, 'b m h w l c -> b h w l (m c)', 86 | m=self.heads) 87 | out = self.to_out(out) 88 | # (B L H W C) 89 | out = out.permute(0, 3, 1, 2, 4) 90 | return out 91 | 92 | 93 | class BaseEncoder(nn.Module): 94 | def __init__(self, dim, depth, heads, dim_head, mlp_dim, dropout=0.): 95 | super().__init__() 96 | self.layers = nn.ModuleList([]) 97 | for _ in range(depth): 98 | self.layers.append(nn.ModuleList([ 99 | PreNorm(dim, CavAttention(dim, 100 | heads=heads, 101 | dim_head=dim_head, 102 | dropout=dropout)), 103 | PreNorm(dim, FeedForward(dim, mlp_dim, dropout=dropout)) 104 | ])) 105 | 106 | def forward(self, x, mask): 107 | for attn, ff in self.layers: 108 
| x = attn(x, mask=mask) + x 109 | x = ff(x) + x 110 | return x 111 | 112 | 113 | class BaseTransformer(nn.Module): 114 | def __init__(self, args): 115 | super().__init__() 116 | 117 | dim = args['dim'] 118 | depth = args['depth'] 119 | heads = args['heads'] 120 | dim_head = args['dim_head'] 121 | mlp_dim = args['mlp_dim'] 122 | dropout = args['dropout'] 123 | max_cav = args['max_cav'] 124 | 125 | self.encoder = BaseEncoder(dim, depth, heads, dim_head, mlp_dim, 126 | dropout) 127 | 128 | def forward(self, x, mask): 129 | # B, L, H, W, C 130 | output = self.encoder(x, mask) 131 | # B, H, W, C 132 | output = output[:, 0] 133 | 134 | return output -------------------------------------------------------------------------------- /v2xvit/models/point_pillar_opv2v.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | 3 | from v2xvit.models.sub_modules.pillar_vfe import PillarVFE 4 | from v2xvit.models.sub_modules.point_pillar_scatter import PointPillarScatter 5 | from v2xvit.models.sub_modules.base_bev_backbone import BaseBEVBackbone 6 | from v2xvit.models.sub_modules.downsample_conv import DownsampleConv 7 | from v2xvit.models.sub_modules.naive_compress import NaiveCompressor 8 | from v2xvit.models.sub_modules.self_attn import AttFusion 9 | 10 | 11 | class PointPillarOPV2V(nn.Module): 12 | def __init__(self, args): 13 | super(PointPillarOPV2V, self).__init__() 14 | 15 | self.max_cav = 5 16 | # PIllar VFE 17 | self.pillar_vfe = PillarVFE(args['pillar_vfe'], 18 | num_point_features=4, 19 | voxel_size=args['voxel_size'], 20 | point_cloud_range=args['lidar_range']) 21 | self.scatter = PointPillarScatter(args['point_pillar_scatter']) 22 | self.backbone = BaseBEVBackbone(args['base_bev_backbone'], 64) 23 | # used to downsample the feature map for efficient computation 24 | self.shrink_flag = False 25 | if 'shrink_header' in args: 26 | self.shrink_flag = True 27 | self.shrink_conv = DownsampleConv(args['shrink_header']) 28 | self.compression = False 29 | 30 | if args['compression'] > 0: 31 | self.compression = True 32 | self.naive_compressor = NaiveCompressor(256, args['compression']) 33 | 34 | self.fusion_net = AttFusion(256) 35 | 36 | self.cls_head = nn.Conv2d(192 * 2, args['anchor_number'], 37 | kernel_size=1) 38 | self.reg_head = nn.Conv2d(192 * 2, 7 * args['anchor_number'], 39 | kernel_size=1) 40 | 41 | if args['backbone_fix']: 42 | self.backbone_fix() 43 | 44 | def backbone_fix(self): 45 | """ 46 | Fix the parameters of backbone during finetune on timedelay。 47 | """ 48 | for p in self.pillar_vfe.parameters(): 49 | p.requires_grad = False 50 | 51 | for p in self.scatter.parameters(): 52 | p.requires_grad = False 53 | 54 | for p in self.backbone.parameters(): 55 | p.requires_grad = False 56 | 57 | if self.compression: 58 | for p in self.naive_compressor.parameters(): 59 | p.requires_grad = False 60 | if self.shrink_flag: 61 | for p in self.shrink_conv.parameters(): 62 | p.requires_grad = False 63 | 64 | for p in self.cls_head.parameters(): 65 | p.requires_grad = False 66 | for p in self.reg_head.parameters(): 67 | p.requires_grad = False 68 | 69 | def forward(self, data_dict): 70 | voxel_features = data_dict['processed_lidar']['voxel_features'] 71 | voxel_coords = data_dict['processed_lidar']['voxel_coords'] 72 | voxel_num_points = data_dict['processed_lidar']['voxel_num_points'] 73 | record_len = data_dict['record_len'] 74 | spatial_correction_matrix = data_dict['spatial_correction_matrix'] 75 | 76 | # B, max_cav, 3(dt dv infra), 1, 1 
77 | prior_encoding =\ 78 | data_dict['prior_encoding'].unsqueeze(-1).unsqueeze(-1) 79 | 80 | batch_dict = {'voxel_features': voxel_features, 81 | 'voxel_coords': voxel_coords, 82 | 'voxel_num_points': voxel_num_points, 83 | 'record_len': record_len} 84 | # n, 4 -> n, c 85 | batch_dict = self.pillar_vfe(batch_dict) 86 | # n, c -> N, C, H, W 87 | batch_dict = self.scatter(batch_dict) 88 | batch_dict = self.backbone(batch_dict) 89 | 90 | spatial_features_2d = batch_dict['spatial_features_2d'] 91 | # downsample feature to reduce memory 92 | if self.shrink_flag: 93 | spatial_features_2d = self.shrink_conv(spatial_features_2d) 94 | # compressor 95 | if self.compression: 96 | spatial_features_2d = self.naive_compressor(spatial_features_2d) 97 | 98 | fused_feature = self.fusion_net(spatial_features_2d, record_len) 99 | 100 | psm = self.cls_head(fused_feature) 101 | rm = self.reg_head(fused_feature) 102 | 103 | output_dict = {'psm': psm, 104 | 'rm': rm} 105 | 106 | return output_dict 107 | -------------------------------------------------------------------------------- /v2xvit/hypes_yaml/where2comm_transformer_multiscale_resnet.yaml: -------------------------------------------------------------------------------- 1 | name: opv2v_where2comm_transformer_multiscale_resnet 2 | 3 | root_dir: '/data/gjm/OPV2V/train' # '/data2/gjm/v2xset/train' 4 | validate_dir: '/data/gjm/OPV2V/validate' # '/data2/gjm/v2xset/validate' 5 | 6 | wild_setting: 7 | async: False 8 | async_overhead: 100 9 | seed: 20 10 | loc_err: false 11 | xyz_std: 0.2 12 | ryp_std: 0.2 13 | data_size: 1.06 # Mb!! 14 | transmission_speed: 27 # Mbps!! 15 | backbone_delay: 10 # ms 16 | 17 | noise_setting: 18 | add_noise: false 19 | args: 20 | pos_std: 1 21 | rot_std: 0 22 | pos_mean: 0 23 | rot_mean: 0 24 | 25 | yaml_parser: "load_point_pillar_params" 26 | train_params: 27 | batch_size: &batch_size 4 28 | epoches: 100 29 | eval_freq: 2 30 | save_freq: 2 31 | max_cav: &max_cav 5 32 | 33 | 34 | fusion: 35 | core_method: 'IntermediateFusionDataset' # LateFusionDataset, EarlyFusionDataset, IntermediateFusionDataset supported 36 | args: 37 | cur_ego_pose_flag: true 38 | # preprocess-related 39 | preprocess: 40 | # options: BasePreprocessor, VoxelPreprocessor, BevPreprocessor 41 | core_method: 'SpVoxelPreprocessor' 42 | args: 43 | voxel_size: &voxel_size [0.4, 0.4, 4] 44 | max_points_per_voxel: 32 45 | max_voxel_train: 32000 46 | max_voxel_test: 70000 47 | # lidar range for each individual cav. Format: xyzxyz minmax 48 | cav_lidar_range: &cav_lidar [-140.8, -40, -3, 140.8, 40, 1] 49 | 50 | data_augment: 51 | - NAME: random_world_flip 52 | ALONG_AXIS_LIST: [ 'x' ] 53 | 54 | - NAME: random_world_rotation 55 | WORLD_ROT_ANGLE: [ -0.78539816, 0.78539816 ] 56 | 57 | - NAME: random_world_scaling 58 | WORLD_SCALE_RANGE: [ 0.95, 1.05 ] 59 | 60 | # anchor box related 61 | postprocess: 62 | core_method: 'VoxelPostprocessor' # VoxelPostprocessor, BevPostprocessor supported 63 | gt_range: *cav_lidar 64 | anchor_args: 65 | cav_lidar_range: *cav_lidar 66 | l: 3.9 67 | w: 1.6 68 | h: 1.56 69 | r: [0, 90] 70 | feature_stride: 2 71 | num: &achor_num 2 72 | target_args: 73 | pos_threshold: 0.6 74 | neg_threshold: 0.45 75 | score_threshold: 0.25 76 | order: 'hwl' # hwl or lwh 77 | max_num: 100 # maximum number of objects in a single frame. 
use this number to make sure different frames has the same dimension in the same batch 78 | nms_thresh: 0.15 79 | 80 | # model related 81 | model: 82 | core_method: point_pillar_where2comm 83 | 84 | args: 85 | communication: 86 | thre: 0.01 87 | gaussian_smooth: 88 | k_size: 5 89 | c_sigma: 1.0 90 | voxel_size: *voxel_size 91 | lidar_range: *cav_lidar 92 | anchor_number: *achor_num 93 | max_cav: *max_cav 94 | compression: 0 # compression rate 95 | backbone_fix: false 96 | 97 | pillar_vfe: 98 | use_norm: true 99 | with_distance: false 100 | use_absolute_xyz: true 101 | num_filters: [64] 102 | point_pillar_scatter: 103 | num_features: 64 104 | 105 | base_bev_backbone: 106 | resnet: True 107 | layer_nums: &layer_nums [3, 4, 5] 108 | layer_strides: [2, 2, 2] 109 | num_filters: &num_filters [64, 128, 256] 110 | upsample_strides: [1, 2, 4] 111 | num_upsample_filter: [128, 128, 128] 112 | compression: 0 113 | voxel_size: *voxel_size 114 | shrink_header: 115 | kernal_size: [ 3 ] 116 | stride: [ 1 ] 117 | padding: [ 1 ] 118 | dim: [ 256 ] 119 | input_dim: 384 # 128 * 3 120 | 121 | # dcn: 122 | # in_channels: [384] 123 | # out_channels: [256] 124 | # stride: [1] 125 | # padding: [1] 126 | # kernel_size : [3] 127 | # n_blocks: 1 128 | 129 | fusion_args: 130 | voxel_size: *voxel_size 131 | downsample_rate: 1 132 | in_channels: 256 133 | n_head: 8 134 | dropout_rate: 0 135 | only_attention: true 136 | agg_operator: 137 | # mode: 'SF' 138 | mode: 'ATTEN' 139 | # mode: 'Transformer' 140 | feature_dim: 256 141 | n_head: 8 142 | with_spe: false 143 | with_scm: false 144 | multi_scale: true 145 | layer_nums: *layer_nums 146 | num_filters: *num_filters 147 | batch_size: *batch_size 148 | 149 | loss: 150 | core_method: point_pillar_loss 151 | args: 152 | cls_weight: 1.0 153 | reg: 2.0 154 | 155 | optimizer: 156 | core_method: Adam 157 | lr: 0.001 158 | args: 159 | eps: 1e-10 160 | weight_decay: 1e-4 161 | 162 | lr_scheduler: 163 | core_method: multistep #step, multistep and Exponential support 164 | gamma: 0.1 165 | step_size: [10, 20] 166 | 167 | -------------------------------------------------------------------------------- /v2xvit/models/point_pillar_single.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | 3 | from v2xvit.models.sub_modules.pillar_vfe import PillarVFE 4 | from v2xvit.models.sub_modules.point_pillar_scatter import PointPillarScatter 5 | from v2xvit.models.sub_modules.base_bev_backbone import BaseBEVBackbone 6 | from v2xvit.models.sub_modules.downsample_conv import DownsampleConv 7 | from v2xvit.models.sub_modules.naive_compress import NaiveCompressor 8 | from v2xvit.models.sub_modules.f_cooper_fuse import SpatialFusion 9 | import torch 10 | 11 | 12 | 13 | class PointPillarSingle(nn.Module): 14 | def __init__(self, args): 15 | super(PointPillarSingle, self).__init__() 16 | 17 | self.max_cav = args['max_cav'] 18 | # PIllar VFE 19 | self.pillar_vfe = PillarVFE(args['pillar_vfe'], 20 | num_point_features=4, 21 | voxel_size=args['voxel_size'], 22 | point_cloud_range=args['lidar_range']) 23 | self.scatter = PointPillarScatter(args['point_pillar_scatter']) 24 | self.backbone = BaseBEVBackbone(args['base_bev_backbone'], 64) 25 | # used to downsample the feature map for efficient computation 26 | self.shrink_flag = False 27 | if 'shrink_header' in args: 28 | self.shrink_flag = True 29 | self.shrink_conv = DownsampleConv(args['shrink_header']) 30 | self.compression = False 31 | 32 | if args['compression'] > 0: 33 | self.compression = 
True 34 | self.naive_compressor = NaiveCompressor(256, args['compression']) 35 | 36 | self.fusion_net = SpatialFusion() 37 | 38 | self.cls_head = nn.Conv2d(128 * 2, args['anchor_number'], 39 | kernel_size=1) 40 | self.reg_head = nn.Conv2d(128 * 2, 7 * args['anchor_number'], 41 | kernel_size=1) 42 | 43 | if args['backbone_fix']: 44 | self.backbone_fix() 45 | 46 | def backbone_fix(self): 47 | """ 48 | Fix the parameters of backbone during finetune on timedelay。 49 | """ 50 | for p in self.pillar_vfe.parameters(): 51 | p.requires_grad = False 52 | 53 | for p in self.scatter.parameters(): 54 | p.requires_grad = False 55 | 56 | for p in self.backbone.parameters(): 57 | p.requires_grad = False 58 | 59 | if self.compression: 60 | for p in self.naive_compressor.parameters(): 61 | p.requires_grad = False 62 | if self.shrink_flag: 63 | for p in self.shrink_conv.parameters(): 64 | p.requires_grad = False 65 | 66 | for p in self.cls_head.parameters(): 67 | p.requires_grad = False 68 | for p in self.reg_head.parameters(): 69 | p.requires_grad = False 70 | 71 | def regroup(self, x, record_len): 72 | cum_sum_len = torch.cumsum(record_len, dim=0) 73 | split_x = torch.tensor_split(x, cum_sum_len[:-1].cpu()) 74 | return split_x 75 | 76 | def forward(self, data_dict): 77 | voxel_features = data_dict['processed_lidar']['voxel_features'] 78 | voxel_coords = data_dict['processed_lidar']['voxel_coords'] 79 | voxel_num_points = data_dict['processed_lidar']['voxel_num_points'] 80 | record_len = data_dict['record_len'] 81 | spatial_correction_matrix = data_dict['spatial_correction_matrix'] 82 | 83 | batch_dict = {'voxel_features': voxel_features, 84 | 'voxel_coords': voxel_coords, 85 | 'voxel_num_points': voxel_num_points, 86 | 'record_len': record_len} 87 | # n, 4 -> n, c 88 | batch_dict = self.pillar_vfe(batch_dict) 89 | # n, c -> N, C, H, W 90 | batch_dict = self.scatter(batch_dict) 91 | batch_dict = self.backbone(batch_dict) 92 | 93 | spatial_features_2d = batch_dict['spatial_features_2d'] 94 | # print(spatial_features_2d.shape) 95 | # downsample feature to reduce memory 96 | if self.shrink_flag: 97 | spatial_features_2d = self.shrink_conv(spatial_features_2d) 98 | # # compressor 99 | if self.compression: 100 | spatial_features_2d = self.naive_compressor(spatial_features_2d) 101 | 102 | 103 | split_x = self.regroup(spatial_features_2d, record_len) 104 | out = [] 105 | 106 | for xx in split_x: 107 | out.append(xx[0].unsqueeze(0)) 108 | fused_feature = torch.cat(out, dim=0) 109 | # print(fused_feature.shape) 110 | # exit() 111 | psm = self.cls_head(fused_feature) 112 | rm = self.reg_head(fused_feature) 113 | 114 | output_dict = {'psm': psm, 115 | 'rm': rm} 116 | 117 | return output_dict 118 | -------------------------------------------------------------------------------- /v2xvit/utils/eval_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import numpy as np 4 | import torch 5 | 6 | from v2xvit.utils import common_utils 7 | from v2xvit.hypes_yaml import yaml_utils 8 | 9 | 10 | def voc_ap(rec, prec): 11 | """ 12 | VOC 2010 Average Precision. 
13 | """ 14 | rec.insert(0, 0.0) 15 | rec.append(1.0) 16 | mrec = rec[:] 17 | 18 | prec.insert(0, 0.0) 19 | prec.append(0.0) 20 | mpre = prec[:] 21 | 22 | for i in range(len(mpre) - 2, -1, -1): 23 | mpre[i] = max(mpre[i], mpre[i + 1]) 24 | 25 | i_list = [] 26 | for i in range(1, len(mrec)): 27 | if mrec[i] != mrec[i - 1]: 28 | i_list.append(i) 29 | 30 | ap = 0.0 31 | for i in i_list: 32 | ap += ((mrec[i] - mrec[i - 1]) * mpre[i]) 33 | return ap, mrec, mpre 34 | 35 | 36 | def caluclate_tp_fp(det_boxes, det_score, gt_boxes, result_stat, iou_thresh): 37 | """ 38 | Calculate the true positive and false positive numbers of the current 39 | frames. 40 | 41 | Parameters 42 | ---------- 43 | det_boxes : torch.Tensor 44 | The detection bounding box, shape (N, 8, 3) or (N, 4, 2). 45 | det_score :torch.Tensor 46 | The confidence score for each preditect bounding box. 47 | gt_boxes : torch.Tensor 48 | The groundtruth bounding box. 49 | result_stat: dict 50 | A dictionary contains fp, tp and gt number. 51 | iou_thresh : float 52 | The iou thresh. 53 | """ 54 | # fp, tp and gt in the current frame 55 | fp = [] 56 | tp = [] 57 | gt = gt_boxes.shape[0] 58 | if det_boxes is not None: 59 | # convert bounding boxes to numpy array 60 | det_boxes = common_utils.torch_tensor_to_numpy(det_boxes) 61 | det_score = common_utils.torch_tensor_to_numpy(det_score) 62 | gt_boxes = common_utils.torch_tensor_to_numpy(gt_boxes) 63 | 64 | # sort the prediction bounding box by score 65 | score_order_descend = np.argsort(-det_score) 66 | det_polygon_list = list(common_utils.convert_format(det_boxes)) 67 | gt_polygon_list = list(common_utils.convert_format(gt_boxes)) 68 | 69 | # match prediction and gt bounding box 70 | for i in range(score_order_descend.shape[0]): 71 | det_polygon = det_polygon_list[score_order_descend[i]] 72 | ious = common_utils.compute_iou(det_polygon, gt_polygon_list) 73 | 74 | if len(gt_polygon_list) == 0 or np.max(ious) < iou_thresh: 75 | fp.append(1) 76 | tp.append(0) 77 | continue 78 | 79 | fp.append(0) 80 | tp.append(1) 81 | 82 | gt_index = np.argmax(ious) 83 | gt_polygon_list.pop(gt_index) 84 | 85 | result_stat[iou_thresh]['fp'] += fp 86 | result_stat[iou_thresh]['tp'] += tp 87 | result_stat[iou_thresh]['gt'] += gt 88 | 89 | 90 | def calculate_ap(result_stat, iou): 91 | """ 92 | Calculate the average precision and recall, and save them into a txt. 93 | 94 | Parameters 95 | ---------- 96 | result_stat : dict 97 | A dictionary contains fp, tp and gt number. 
98 | iou : float 99 | """ 100 | iou_5 = result_stat[iou] 101 | 102 | fp = iou_5['fp'] 103 | tp = iou_5['tp'] 104 | assert len(fp) == len(tp) 105 | 106 | gt_total = iou_5['gt'] 107 | 108 | cumsum = 0 109 | for idx, val in enumerate(fp): 110 | fp[idx] += cumsum 111 | cumsum += val 112 | 113 | cumsum = 0 114 | for idx, val in enumerate(tp): 115 | tp[idx] += cumsum 116 | cumsum += val 117 | 118 | rec = tp[:] 119 | for idx, val in enumerate(tp): 120 | rec[idx] = float(tp[idx]) / gt_total 121 | 122 | prec = tp[:] 123 | for idx, val in enumerate(tp): 124 | prec[idx] = float(tp[idx]) / (fp[idx] + tp[idx]) 125 | 126 | ap, mrec, mprec = voc_ap(rec[:], prec[:]) 127 | 128 | return ap, mrec, mprec 129 | 130 | 131 | def eval_final_results(result_stat, save_path): 132 | dump_dict = {} 133 | 134 | ap_30, mrec_30, mpre_30 = calculate_ap(result_stat, 0.30) 135 | ap_50, mrec_50, mpre_50 = calculate_ap(result_stat, 0.50) 136 | ap_70, mrec_70, mpre_70 = calculate_ap(result_stat, 0.70) 137 | 138 | dump_dict.update({'ap30': ap_30, 139 | 'ap_50': ap_50, 140 | 'ap_70': ap_70, 141 | 'mpre_50': mpre_50, 142 | 'mrec_50': mrec_50, 143 | 'mpre_70': mpre_70, 144 | 'mrec_70': mrec_70, 145 | }) 146 | yaml_utils.save_yaml(dump_dict, os.path.join(save_path, 'eval.yaml')) 147 | 148 | print('The Average Precision at IOU 0.3 is %.2f, ' 149 | 'The Average Precision at IOU 0.5 is %.2f, ' 150 | 'The Average Precision at IOU 0.7 is %.2f' % (ap_30, ap_50, ap_70)) 151 | return ap_30, ap_50, ap_70 152 | -------------------------------------------------------------------------------- /v2xvit/models/sub_modules/mswin.py: -------------------------------------------------------------------------------- 1 | """ 2 | Multi-scale window transformer 3 | """ 4 | import torch 5 | import torch.nn as nn 6 | import numpy as np 7 | 8 | from einops import rearrange 9 | from v2xvit.models.sub_modules.split_attn import SplitAttn 10 | 11 | 12 | def get_relative_distances(window_size): 13 | indices = torch.tensor(np.array( 14 | [[x, y] for x in range(window_size) for y in range(window_size)])) 15 | distances = indices[None, :, :] - indices[:, None, :] 16 | return distances 17 | 18 | 19 | class BaseWindowAttention(nn.Module): 20 | def __init__(self, dim, heads, dim_head, drop_out, window_size, 21 | relative_pos_embedding): 22 | super().__init__() 23 | inner_dim = dim_head * heads 24 | 25 | self.heads = heads 26 | self.scale = dim_head ** -0.5 27 | self.window_size = window_size 28 | self.relative_pos_embedding = relative_pos_embedding 29 | 30 | self.to_qkv = nn.Linear(dim, inner_dim * 3, bias=False) 31 | 32 | if self.relative_pos_embedding: 33 | self.relative_indices = get_relative_distances(window_size) + \ 34 | window_size - 1 35 | self.pos_embedding = nn.Parameter(torch.randn(2 * window_size - 1, 36 | 2 * window_size - 1)) 37 | else: 38 | self.pos_embedding = nn.Parameter(torch.randn(window_size ** 2, 39 | window_size ** 2)) 40 | 41 | self.to_out = nn.Sequential( 42 | nn.Linear(inner_dim, dim), 43 | nn.Dropout(drop_out) 44 | ) 45 | 46 | def forward(self, x): 47 | b, l, h, w, c, m = *x.shape, self.heads 48 | 49 | qkv = self.to_qkv(x).chunk(3, dim=-1) 50 | new_h = h // self.window_size 51 | new_w = w // self.window_size 52 | 53 | # q : (b, l, m, new_h*new_w, window_size^2, c_head) 54 | q, k, v = map( 55 | lambda t: rearrange(t, 56 | 'b l (new_h w_h) (new_w w_w) (m c) -> b l m (new_h new_w) (w_h w_w) c', 57 | m=m, w_h=self.window_size, 58 | w_w=self.window_size), qkv) 59 | # b l m h window_size window_size 60 | dots = torch.einsum('b l m h i c, b l m h j 
c -> b l m h i j', 61 | q, k, ) * self.scale 62 | # consider prior knowledge of the local window 63 | if self.relative_pos_embedding: 64 | dots += self.pos_embedding[self.relative_indices[:, :, 0], 65 | self.relative_indices[:, :, 1]] 66 | else: 67 | dots += self.pos_embedding 68 | 69 | attn = dots.softmax(dim=-1) 70 | 71 | out = torch.einsum('b l m h i j, b l m h j c -> b l m h i c', attn, v) 72 | # b l h w c 73 | out = rearrange(out, 74 | 'b l m (new_h new_w) (w_h w_w) c -> b l (new_h w_h) (new_w w_w) (m c)', 75 | m=self.heads, w_h=self.window_size, 76 | w_w=self.window_size, 77 | new_w=new_w, new_h=new_h) 78 | out = self.to_out(out) 79 | 80 | return out 81 | 82 | 83 | class PyramidWindowAttention(nn.Module): 84 | def __init__(self, dim, heads, dim_heads, drop_out, window_size, 85 | relative_pos_embedding, fuse_method='naive'): 86 | super().__init__() 87 | 88 | assert isinstance(window_size, list) 89 | assert isinstance(heads, list) 90 | assert isinstance(dim_heads, list) 91 | assert len(dim_heads) == len(heads) 92 | 93 | self.pwmsa = nn.ModuleList([]) 94 | 95 | for (head, dim_head, ws) in zip(heads, dim_heads, window_size): 96 | self.pwmsa.append(BaseWindowAttention(dim, 97 | head, 98 | dim_head, 99 | drop_out, 100 | ws, 101 | relative_pos_embedding)) 102 | self.fuse_mehod = fuse_method 103 | if fuse_method == 'split_attn': 104 | self.split_attn = SplitAttn(256) 105 | 106 | def forward(self, x): 107 | output = None 108 | # naive fusion will just sum up all window attention output and do a 109 | # mean 110 | if self.fuse_mehod == 'naive': 111 | for wmsa in self.pwmsa: 112 | output = wmsa(x) if output is None else output + wmsa(x) 113 | return output / len(self.pwmsa) 114 | 115 | elif self.fuse_mehod == 'split_attn': 116 | window_list = [] 117 | for wmsa in self.pwmsa: 118 | window_list.append(wmsa(x)) 119 | return self.split_attn(window_list) -------------------------------------------------------------------------------- /v2xvit/models/fuse_modules/mswin.py: -------------------------------------------------------------------------------- 1 | """ 2 | Multi-scale window transformer 3 | """ 4 | import torch 5 | import torch.nn as nn 6 | import numpy as np 7 | 8 | from einops import rearrange 9 | from opencood.models.sub_modules.split_attn import SplitAttn 10 | 11 | 12 | def get_relative_distances(window_size): 13 | indices = torch.tensor(np.array( 14 | [[x, y] for x in range(window_size) for y in range(window_size)])) 15 | distances = indices[None, :, :] - indices[:, None, :] 16 | return distances 17 | 18 | 19 | class BaseWindowAttention(nn.Module): 20 | def __init__(self, dim, heads, dim_head, drop_out, window_size, 21 | relative_pos_embedding): 22 | super().__init__() 23 | inner_dim = dim_head * heads 24 | 25 | self.heads = heads 26 | self.scale = dim_head ** -0.5 27 | self.window_size = window_size 28 | self.relative_pos_embedding = relative_pos_embedding 29 | 30 | self.to_qkv = nn.Linear(dim, inner_dim * 3, bias=False) 31 | 32 | if self.relative_pos_embedding: 33 | self.relative_indices = get_relative_distances(window_size) + \ 34 | window_size - 1 35 | self.pos_embedding = nn.Parameter(torch.randn(2 * window_size - 1, 36 | 2 * window_size - 1)) 37 | else: 38 | self.pos_embedding = nn.Parameter(torch.randn(window_size ** 2, 39 | window_size ** 2)) 40 | 41 | self.to_out = nn.Sequential( 42 | nn.Linear(inner_dim, dim), 43 | nn.Dropout(drop_out) 44 | ) 45 | 46 | def forward(self, x): 47 | b, l, h, w, c, m = *x.shape, self.heads 48 | 49 | qkv = self.to_qkv(x).chunk(3, dim=-1) 50 | new_h = 
h // self.window_size 51 | new_w = w // self.window_size 52 | 53 | # q : (b, l, m, new_h*new_w, window_size^2, c_head) 54 | q, k, v = map( 55 | lambda t: rearrange(t, 56 | 'b l (new_h w_h) (new_w w_w) (m c) -> b l m (new_h new_w) (w_h w_w) c', 57 | m=m, w_h=self.window_size, 58 | w_w=self.window_size), qkv) 59 | # b l m h window_size window_size 60 | dots = torch.einsum('b l m h i c, b l m h j c -> b l m h i j', 61 | q, k, ) * self.scale 62 | # consider prior knowledge of the local window 63 | if self.relative_pos_embedding: 64 | dots += self.pos_embedding[self.relative_indices[:, :, 0], 65 | self.relative_indices[:, :, 1]] 66 | else: 67 | dots += self.pos_embedding 68 | 69 | attn = dots.softmax(dim=-1) 70 | 71 | out = torch.einsum('b l m h i j, b l m h j c -> b l m h i c', attn, v) 72 | # b l h w c 73 | out = rearrange(out, 74 | 'b l m (new_h new_w) (w_h w_w) c -> b l (new_h w_h) (new_w w_w) (m c)', 75 | m=self.heads, w_h=self.window_size, 76 | w_w=self.window_size, 77 | new_w=new_w, new_h=new_h) 78 | out = self.to_out(out) 79 | 80 | return out 81 | 82 | 83 | class PyramidWindowAttention(nn.Module): 84 | def __init__(self, dim, heads, dim_heads, drop_out, window_size, 85 | relative_pos_embedding, fuse_method='naive'): 86 | super().__init__() 87 | 88 | assert isinstance(window_size, list) 89 | assert isinstance(heads, list) 90 | assert isinstance(dim_heads, list) 91 | assert len(dim_heads) == len(heads) 92 | 93 | self.pwmsa = nn.ModuleList([]) 94 | 95 | for (head, dim_head, ws) in zip(heads, dim_heads, window_size): 96 | self.pwmsa.append(BaseWindowAttention(dim, 97 | head, 98 | dim_head, 99 | drop_out, 100 | ws, 101 | relative_pos_embedding)) 102 | self.fuse_mehod = fuse_method 103 | if fuse_method == 'split_attn': 104 | self.split_attn = SplitAttn(256) 105 | 106 | def forward(self, x): 107 | output = None 108 | # naive fusion will just sum up all window attention output and do a 109 | # mean 110 | if self.fuse_mehod == 'naive': 111 | for wmsa in self.pwmsa: 112 | output = wmsa(x) if output is None else output + wmsa(x) 113 | return output / len(self.pwmsa) 114 | 115 | elif self.fuse_mehod == 'split_attn': 116 | window_list = [] 117 | for wmsa in self.pwmsa: 118 | window_list.append(wmsa(x)) 119 | return self.split_attn(window_list) -------------------------------------------------------------------------------- /v2xvit/hypes_yaml/point_pillar_v2xvit.yaml: -------------------------------------------------------------------------------- 1 | name: point_pillar_v2xvit 2 | root_dir: '/data/opv2v/train' 3 | validate_dir: '/data/opv2v/validate' 4 | 5 | wild_setting: 6 | async: false 7 | async_mode: 'sim' 8 | async_overhead: 100 9 | seed: 25 10 | loc_err: false 11 | xyz_std: 0.2 12 | ryp_std: 0.2 13 | data_size: 1.06 # Mb!! 14 | transmission_speed: 27 # Mbps!! 15 | backbone_delay: 10 # ms 16 | 17 | yaml_parser: "load_point_pillar_params" 18 | train_params: 19 | batch_size: &batch_size 2 20 | epoches: 60 21 | eval_freq: 1 22 | save_freq: 1 23 | max_cav: &max_cav 5 24 | 25 | fusion: 26 | core_method: 'IntermediateFusionDataset' # LateFusionDataset, EarlyFusionDataset, IntermediateFusionDataset supported 27 | args: 28 | cur_ego_pose_flag: False 29 | # when the cur_ego_pose_flag is set to True, there is no time gap 30 | # between the time when the LiDAR data is captured by connected 31 | # agents and when the extracted features are received by 32 | # the ego vehicle, which is equal to implement STCM. When set to False, 33 | # STCM has to be used. 
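# Illustrative sketch (an assumption, not part of the shipped config): the time gap and
# pose error that STCM compensates for are typically injected through the wild_setting
# block at the top of this file. A robustness-evaluation variant might look like the
# commented example below, assuming the async/loc_err switches behave as their names
# suggest.
# wild_setting:
#   async: true            # simulate feature transmission delay
#   async_mode: 'sim'
#   async_overhead: 100    # extra delay in ms
#   loc_err: true          # add localization noise to non-ego CAVs
#   xyz_std: 0.2           # position noise std, meters
#   ryp_std: 0.2           # rotation noise std, degrees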
34 | 35 | # preprocess-related 36 | preprocess: 37 | # options: BasePreprocessor, VoxelPreprocessor, BevPreprocessor 38 | core_method: 'SpVoxelPreprocessor' 39 | args: 40 | voxel_size: &voxel_size [0.4, 0.4, 4] 41 | max_points_per_voxel: 32 42 | max_voxel_train: 32000 43 | max_voxel_test: 70000 44 | # lidar range for each individual cav. 45 | cav_lidar_range: &cav_lidar [-140.8, -38.4, -3, 140.8, 38.4, 1] 46 | 47 | data_augment: 48 | - NAME: random_world_flip 49 | ALONG_AXIS_LIST: [ 'x' ] 50 | 51 | - NAME: random_world_rotation 52 | WORLD_ROT_ANGLE: [ -0.78539816, 0.78539816 ] 53 | 54 | - NAME: random_world_scaling 55 | WORLD_SCALE_RANGE: [ 0.95, 1.05 ] 56 | 57 | # anchor box related 58 | postprocess: 59 | core_method: 'VoxelPostprocessor' # VoxelPostprocessor, BevPostprocessor supported 60 | anchor_args: 61 | cav_lidar_range: *cav_lidar 62 | l: 3.9 63 | w: 1.6 64 | h: 1.56 65 | r: [0, 90] 66 | feature_stride: 4 67 | num: &achor_num 2 68 | target_args: 69 | pos_threshold: 0.6 70 | neg_threshold: 0.45 71 | score_threshold: 0.27 72 | order: 'hwl' # hwl or lwh 73 | max_num: 100 # maximum number of objects in a single frame. use this number to make sure different frames has the same dimension in the same batch 74 | nms_thresh: 0.15 75 | 76 | # model related 77 | model: 78 | core_method: point_pillar_transformer 79 | args: 80 | voxel_size: *voxel_size 81 | lidar_range: *cav_lidar 82 | anchor_number: *achor_num 83 | max_cav: *max_cav 84 | compression: 0 # compression rate 85 | backbone_fix: false 86 | 87 | pillar_vfe: 88 | use_norm: true 89 | with_distance: false 90 | use_absolute_xyz: true 91 | num_filters: [64] 92 | point_pillar_scatter: 93 | num_features: 64 94 | 95 | base_bev_backbone: 96 | layer_nums: [3, 5, 8] 97 | layer_strides: [2, 2, 2] 98 | num_filters: [64, 128, 256] 99 | upsample_strides: [1, 2, 4] 100 | num_upsample_filter: [128, 128, 128] 101 | shrink_header: 102 | kernal_size: [3] 103 | stride: [2] 104 | padding: [1] 105 | dim: [256] 106 | input_dim: 384 # 128 * 3 107 | 108 | transformer: 109 | encoder: &encoder 110 | # number of fusion blocks per encoder layer 111 | num_blocks: 1 112 | # number of encoder layers 113 | depth: 3 114 | use_roi_mask: true 115 | use_RTE: &use_RTE true 116 | RTE_ratio: &RTE_ratio 2 # 2 means the dt has 100ms interval while 1 means 50 ms interval 117 | # agent-wise attention 118 | cav_att_config: &cav_att_config 119 | dim: 256 120 | use_hetero: true 121 | use_RTE: *use_RTE 122 | RTE_ratio: *RTE_ratio 123 | heads: 8 124 | dim_head: 32 125 | dropout: 0.3 126 | # spatial-wise attention 127 | pwindow_att_config: &pwindow_att_config 128 | dim: 256 129 | heads: [16, 8, 4] 130 | dim_head: [16, 32, 64] 131 | dropout: 0.3 132 | window_size: [4, 8, 16] 133 | relative_pos_embedding: true 134 | fusion_method: 'split_attn' 135 | # feedforward condition 136 | feed_forward: &feed_forward 137 | mlp_dim: 256 138 | dropout: 0.3 139 | sttf: &sttf 140 | voxel_size: *voxel_size 141 | downsample_rate: 4 142 | 143 | # add decoder later 144 | 145 | loss: 146 | core_method: point_pillar_loss 147 | args: 148 | cls_weight: 1.0 149 | reg: 2.0 150 | 151 | optimizer: 152 | core_method: Adam 153 | lr: 0.001 154 | args: 155 | eps: 1e-10 156 | weight_decay: 1e-4 157 | 158 | lr_scheduler: 159 | core_method: multistep #step, multistep and Exponential support 160 | gamma: 0.1 161 | step_size: [15, 50] 162 | 163 | -------------------------------------------------------------------------------- /v2xvit/utils/common_utils.py: 
-------------------------------------------------------------------------------- 1 | """ 2 | Common utilities 3 | """ 4 | 5 | import numpy as np 6 | import torch 7 | from shapely.geometry import Polygon 8 | 9 | 10 | def check_numpy_to_torch(x): 11 | if isinstance(x, np.ndarray): 12 | return torch.from_numpy(x).float(), True 13 | return x, False 14 | 15 | 16 | def check_contain_nan(x): 17 | if isinstance(x, dict): 18 | return any(check_contain_nan(v) for k, v in x.items()) 19 | if isinstance(x, list): 20 | return any(check_contain_nan(itm) for itm in x) 21 | if isinstance(x, int) or isinstance(x, float): 22 | return False 23 | if isinstance(x, np.ndarray): 24 | return np.any(np.isnan(x)) 25 | return torch.any(x.isnan()).detach().cpu().item() 26 | 27 | 28 | def rotate_points_along_z(points, angle): 29 | """ 30 | Args: 31 | points: (B, N, 3 + C) 32 | angle: (B), radians, angle along z-axis, angle increases x ==> y 33 | Returns: 34 | 35 | """ 36 | points, is_numpy = check_numpy_to_torch(points) 37 | angle, _ = check_numpy_to_torch(angle) 38 | 39 | cosa = torch.cos(angle) 40 | sina = torch.sin(angle) 41 | zeros = angle.new_zeros(points.shape[0]) 42 | ones = angle.new_ones(points.shape[0]) 43 | rot_matrix = torch.stack(( 44 | cosa, sina, zeros, 45 | -sina, cosa, zeros, 46 | zeros, zeros, ones 47 | ), dim=1).view(-1, 3, 3).float() 48 | points_rot = torch.matmul(points[:, :, 0:3].float(), rot_matrix) 49 | points_rot = torch.cat((points_rot, points[:, :, 3:]), dim=-1) 50 | return points_rot.numpy() if is_numpy else points_rot 51 | 52 | 53 | def rotate_points_along_z_2d(points, angle): 54 | """ 55 | Rorate the points along z-axis. 56 | Parameters 57 | ---------- 58 | points : torch.Tensor / np.ndarray 59 | (N, 2). 60 | angle : torch.Tensor / np.ndarray 61 | (N,) 62 | 63 | Returns 64 | ------- 65 | points_rot : torch.Tensor / np.ndarray 66 | Rorated points with shape (N, 2) 67 | 68 | """ 69 | points, is_numpy = check_numpy_to_torch(points) 70 | angle, _ = check_numpy_to_torch(angle) 71 | cosa = torch.cos(angle) 72 | sina = torch.sin(angle) 73 | # (N, 2, 2) 74 | rot_matrix = torch.stack((cosa, sina, -sina, cosa), dim=1).view(-1, 2, 75 | 2).float() 76 | points_rot = torch.einsum("ik, ikj->ij", points.float(), rot_matrix) 77 | return points_rot.numpy() if is_numpy else points_rot 78 | 79 | 80 | def remove_ego_from_objects(objects, ego_id): 81 | """ 82 | Avoid adding ego vehicle to the object dictionary. 83 | 84 | Parameters 85 | ---------- 86 | objects : dict 87 | The dictionary contained all objects. 88 | 89 | ego_id : int 90 | Ego id. 91 | """ 92 | if ego_id in objects: 93 | del objects[ego_id] 94 | 95 | 96 | def retrieve_ego_id(base_data_dict): 97 | """ 98 | Retrieve the ego vehicle id from sample(origin format). 99 | 100 | Parameters 101 | ---------- 102 | base_data_dict : dict 103 | Data sample in origin format. 104 | 105 | Returns 106 | ------- 107 | ego_id : str 108 | The id of ego vehicle. 109 | """ 110 | ego_id = None 111 | 112 | for cav_id, cav_content in base_data_dict.items(): 113 | if cav_content['ego']: 114 | ego_id = cav_id 115 | break 116 | return ego_id 117 | 118 | 119 | def compute_iou(box, boxes): 120 | """ 121 | Compute iou between box and boxes list 122 | Parameters 123 | ---------- 124 | box : shapely.geometry.Polygon 125 | Bounding box Polygon. 126 | 127 | boxes : list 128 | List of shapely.geometry.Polygon. 129 | 130 | Returns 131 | ------- 132 | iou : np.ndarray 133 | Array of iou between box and boxes. 
134 | 135 | """ 136 | # Calculate intersection areas 137 | iou = [box.intersection(b).area / box.union(b).area for b in boxes] 138 | 139 | return np.array(iou, dtype=np.float32) 140 | 141 | 142 | def convert_format(boxes_array): 143 | """ 144 | Convert boxes array to shapely.geometry.Polygon format. 145 | Parameters 146 | ---------- 147 | boxes_array : np.ndarray 148 | (N, 4, 2) or (N, 8, 3). 149 | 150 | Returns 151 | ------- 152 | list of converted shapely.geometry.Polygon object. 153 | 154 | """ 155 | polygons = [Polygon([(box[i, 0], box[i, 1]) for i in range(4)]) for box in 156 | boxes_array] 157 | return np.array(polygons) 158 | 159 | 160 | def torch_tensor_to_numpy(torch_tensor): 161 | """ 162 | Convert a torch tensor to numpy. 163 | 164 | Parameters 165 | ---------- 166 | torch_tensor : torch.Tensor 167 | 168 | Returns 169 | ------- 170 | A numpy array. 171 | """ 172 | return torch_tensor.numpy() if not torch_tensor.is_cuda else \ 173 | torch_tensor.cpu().detach().numpy() 174 | -------------------------------------------------------------------------------- /v2xvit/models/point_pillar_cobevt.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from einops import rearrange, repeat 4 | 5 | from v2xvit.models.sub_modules.pillar_vfe import PillarVFE 6 | from v2xvit.models.sub_modules.point_pillar_scatter import PointPillarScatter 7 | from v2xvit.models.sub_modules.base_bev_backbone import BaseBEVBackbone 8 | from v2xvit.models.sub_modules.downsample_conv import DownsampleConv 9 | from v2xvit.models.sub_modules.naive_compress import NaiveCompressor 10 | from v2xvit.models.fuse_modules.fuse_utils import regroup 11 | from v2xvit.models.fuse_modules.swap_fusion_modules import \ 12 | SwapFusionEncoder 13 | 14 | class PointPillarCoBEVT(nn.Module): 15 | def __init__(self, args): 16 | super(PointPillarCoBEVT, self).__init__() 17 | 18 | self.max_cav = args['max_cav'] 19 | # PIllar VFE 20 | self.pillar_vfe = PillarVFE(args['pillar_vfe'], 21 | num_point_features=4, 22 | voxel_size=args['voxel_size'], 23 | point_cloud_range=args['lidar_range']) 24 | self.scatter = PointPillarScatter(args['point_pillar_scatter']) 25 | self.backbone = BaseBEVBackbone(args['base_bev_backbone'], 64) 26 | # used to downsample the feature map for efficient computation 27 | self.shrink_flag = False 28 | if 'shrink_header' in args: 29 | self.shrink_flag = True 30 | self.shrink_conv = DownsampleConv(args['shrink_header']) 31 | self.compression = False 32 | 33 | if args['compression'] > 0: 34 | self.compression = True 35 | self.naive_compressor = NaiveCompressor(256, args['compression']) 36 | 37 | self.fusion_net = SwapFusionEncoder(args['fax_fusion']) 38 | 39 | self.cls_head = nn.Conv2d(128 * 2, args['anchor_number'], 40 | kernel_size=1) 41 | self.reg_head = nn.Conv2d(128 * 2, 7 * args['anchor_number'], 42 | kernel_size=1) 43 | 44 | if args['backbone_fix']: 45 | self.backbone_fix() 46 | 47 | def backbone_fix(self): 48 | """ 49 | Fix the parameters of backbone during finetune on timedelay。 50 | """ 51 | for p in self.pillar_vfe.parameters(): 52 | p.requires_grad = False 53 | 54 | for p in self.scatter.parameters(): 55 | p.requires_grad = False 56 | 57 | for p in self.backbone.parameters(): 58 | p.requires_grad = False 59 | 60 | if self.compression: 61 | for p in self.naive_compressor.parameters(): 62 | p.requires_grad = False 63 | if self.shrink_flag: 64 | for p in self.shrink_conv.parameters(): 65 | p.requires_grad = False 66 | 67 | for p in 
self.cls_head.parameters(): 68 | p.requires_grad = False 69 | for p in self.reg_head.parameters(): 70 | p.requires_grad = False 71 | 72 | def forward(self, data_dict): 73 | voxel_features = data_dict['processed_lidar']['voxel_features'] 74 | voxel_coords = data_dict['processed_lidar']['voxel_coords'] 75 | voxel_num_points = data_dict['processed_lidar']['voxel_num_points'] 76 | record_len = data_dict['record_len'] 77 | spatial_correction_matrix = data_dict['spatial_correction_matrix'] 78 | 79 | batch_dict = {'voxel_features': voxel_features, 80 | 'voxel_coords': voxel_coords, 81 | 'voxel_num_points': voxel_num_points, 82 | 'record_len': record_len} 83 | # n, 4 -> n, c 84 | batch_dict = self.pillar_vfe(batch_dict) 85 | # n, c -> N, C, H, W 86 | batch_dict = self.scatter(batch_dict) 87 | batch_dict = self.backbone(batch_dict) 88 | 89 | spatial_features_2d = batch_dict['spatial_features_2d'] 90 | # downsample feature to reduce memory 91 | if self.shrink_flag: 92 | spatial_features_2d = self.shrink_conv(spatial_features_2d) 93 | # compressor 94 | if self.compression: 95 | spatial_features_2d = self.naive_compressor(spatial_features_2d) 96 | 97 | # N, C, H, W -> B, L, C, H, W 98 | regroup_feature, mask = regroup(spatial_features_2d, 99 | record_len, 100 | self.max_cav) 101 | com_mask = mask.unsqueeze(1).unsqueeze(2).unsqueeze(3) 102 | com_mask = repeat(com_mask, 103 | 'b h w c l -> b (h new_h) (w new_w) c l', 104 | new_h=regroup_feature.shape[3], 105 | new_w=regroup_feature.shape[4]) 106 | 107 | fused_feature = self.fusion_net(regroup_feature, com_mask) 108 | 109 | psm = self.cls_head(fused_feature) 110 | rm = self.reg_head(fused_feature) 111 | 112 | output_dict = {'psm': psm, 113 | 'rm': rm} 114 | 115 | return output_dict 116 | -------------------------------------------------------------------------------- /v2xvit/models/point_pillar_v2vnet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | from v2xvit.models.sub_modules.pillar_vfe import PillarVFE 5 | from v2xvit.models.sub_modules.point_pillar_scatter import PointPillarScatter 6 | from v2xvit.models.sub_modules.base_bev_backbone import BaseBEVBackbone 7 | from v2xvit.models.sub_modules.downsample_conv import DownsampleConv 8 | from v2xvit.models.sub_modules.naive_compress import NaiveCompressor 9 | from v2xvit.models.sub_modules.v2v_fuse import V2VNetFusion 10 | 11 | 12 | class PointPillarV2VNet(nn.Module): 13 | def __init__(self, args): 14 | super(PointPillarV2VNet, self).__init__() 15 | 16 | self.max_cav = args['max_cav'] 17 | # PIllar VFE 18 | self.pillar_vfe = PillarVFE(args['pillar_vfe'], 19 | num_point_features=4, 20 | voxel_size=args['voxel_size'], 21 | point_cloud_range=args['lidar_range']) 22 | self.scatter = PointPillarScatter(args['point_pillar_scatter']) 23 | self.backbone = BaseBEVBackbone(args['base_bev_backbone'], 64) 24 | # used to downsample the feature map for efficient computation 25 | self.shrink_flag = False 26 | if 'shrink_header' in args: 27 | self.shrink_flag = True 28 | self.shrink_conv = DownsampleConv(args['shrink_header']) 29 | self.compression = False 30 | 31 | if args['compression'] > 0: 32 | self.compression = True 33 | self.naive_compressor = NaiveCompressor(256, args['compression']) 34 | 35 | self.fusion_net = V2VNetFusion(args['v2vfusion']) 36 | 37 | self.cls_head = nn.Conv2d(128 * 2, args['anchor_number'], 38 | kernel_size=1) 39 | self.reg_head = nn.Conv2d(128 * 2, 7 * args['anchor_number'], 40 | kernel_size=1) 41 | 42 | if 
args['backbone_fix']: 43 | self.backbone_fix() 44 | 45 | def backbone_fix(self): 46 | """ 47 | Fix the parameters of backbone during finetune on timedelay。 48 | """ 49 | for p in self.pillar_vfe.parameters(): 50 | p.requires_grad = False 51 | 52 | for p in self.scatter.parameters(): 53 | p.requires_grad = False 54 | 55 | for p in self.backbone.parameters(): 56 | p.requires_grad = False 57 | 58 | if self.compression: 59 | for p in self.naive_compressor.parameters(): 60 | p.requires_grad = False 61 | if self.shrink_flag: 62 | for p in self.shrink_conv.parameters(): 63 | p.requires_grad = False 64 | 65 | for p in self.cls_head.parameters(): 66 | p.requires_grad = False 67 | for p in self.reg_head.parameters(): 68 | p.requires_grad = False 69 | 70 | def unpad_prior_encoding(self, x, record_len): 71 | # remove padded zeros to form tensor with shape (N, 3) 72 | # x: (B, L, 3); record_len: (B) 73 | B = x.shape[0] 74 | out = [] 75 | for i in range(B): 76 | # (valid_len, 3) 77 | out.append(x[i, :record_len[i], :]) 78 | out = torch.cat(out, dim=0) 79 | # (N, 3) 80 | return out 81 | 82 | def forward(self, data_dict): 83 | voxel_features = data_dict['processed_lidar']['voxel_features'] 84 | voxel_coords = data_dict['processed_lidar']['voxel_coords'] 85 | voxel_num_points = data_dict['processed_lidar']['voxel_num_points'] 86 | record_len = data_dict['record_len'] 87 | spatial_correction_matrix = data_dict['spatial_correction_matrix'] 88 | pairwise_t_matrix = data_dict['pairwise_t_matrix'] 89 | prior_encoding = data_dict['prior_encoding'] 90 | prior_encoding = self.unpad_prior_encoding(prior_encoding, record_len) 91 | 92 | batch_dict = {'voxel_features': voxel_features, 93 | 'voxel_coords': voxel_coords, 94 | 'voxel_num_points': voxel_num_points, 95 | 'record_len': record_len} 96 | # n, 4 -> n, c 97 | batch_dict = self.pillar_vfe(batch_dict) 98 | # n, c -> N, C, H, W 99 | batch_dict = self.scatter(batch_dict) 100 | batch_dict = self.backbone(batch_dict) 101 | 102 | spatial_features_2d = batch_dict['spatial_features_2d'] 103 | # downsample feature to reduce memory 104 | if self.shrink_flag: 105 | spatial_features_2d = self.shrink_conv(spatial_features_2d) 106 | # compressor 107 | if self.compression: 108 | spatial_features_2d = self.naive_compressor(spatial_features_2d) 109 | fused_feature = self.fusion_net(spatial_features_2d, 110 | record_len, 111 | pairwise_t_matrix, 112 | prior_encoding) 113 | 114 | psm = self.cls_head(fused_feature) 115 | rm = self.reg_head(fused_feature) 116 | 117 | output_dict = {'psm': psm, 118 | 'rm': rm} 119 | 120 | return output_dict 121 | -------------------------------------------------------------------------------- /v2xvit/utils/box_overlaps.pyx: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Sergey Karayev 6 | # -------------------------------------------------------- 7 | 8 | import numpy as np 9 | cimport numpy as np 10 | from cython.parallel import prange, parallel 11 | 12 | 13 | DTYPE = np.float32 14 | ctypedef float DTYPE_t 15 | 16 | 17 | def bbox_overlaps( 18 | np.ndarray[DTYPE_t, ndim=2] boxes, 19 | np.ndarray[DTYPE_t, ndim=2] query_boxes): 20 | """ 21 | Parameters 22 | ---------- 23 | boxes: (N, 4) ndarray of float 24 | query_boxes: (K, 4) ndarray of float 25 | Returns 26 | ------- 27 | overlaps: (N, K) ndarray of overlap between boxes and 
query_boxes 28 | """ 29 | cdef unsigned int N = boxes.shape[0] 30 | cdef unsigned int K = query_boxes.shape[0] 31 | cdef np.ndarray[DTYPE_t, ndim=2] overlaps = np.zeros((N, K), dtype=DTYPE) 32 | cdef DTYPE_t iw, ih, box_area 33 | cdef DTYPE_t ua 34 | cdef unsigned int k, n 35 | for k in range(K): 36 | box_area = ( 37 | (query_boxes[k, 2] - query_boxes[k, 0] + 1) * 38 | (query_boxes[k, 3] - query_boxes[k, 1] + 1) 39 | ) 40 | for n in range(N): 41 | iw = ( 42 | min(boxes[n, 2], query_boxes[k, 2]) - 43 | max(boxes[n, 0], query_boxes[k, 0]) + 1 44 | ) 45 | if iw > 0: 46 | ih = ( 47 | min(boxes[n, 3], query_boxes[k, 3]) - 48 | max(boxes[n, 1], query_boxes[k, 1]) + 1 49 | ) 50 | if ih > 0: 51 | ua = float( 52 | (boxes[n, 2] - boxes[n, 0] + 1) * 53 | (boxes[n, 3] - boxes[n, 1] + 1) + 54 | box_area - iw * ih 55 | ) 56 | overlaps[n, k] = iw * ih / ua 57 | return overlaps 58 | 59 | def bbox_intersections( 60 | np.ndarray[DTYPE_t, ndim=2] boxes, 61 | np.ndarray[DTYPE_t, ndim=2] query_boxes): 62 | """ 63 | For each query box compute the intersection ratio covered by boxes 64 | ---------- 65 | Parameters 66 | ---------- 67 | boxes: (N, 4) ndarray of float 68 | query_boxes: (K, 4) ndarray of float 69 | Returns 70 | ------- 71 | overlaps: (N, K) ndarray of intersec between boxes and query_boxes 72 | """ 73 | cdef unsigned int N = boxes.shape[0] 74 | cdef unsigned int K = query_boxes.shape[0] 75 | cdef np.ndarray[DTYPE_t, ndim=2] intersec = np.zeros((N, K), dtype=DTYPE) 76 | cdef DTYPE_t iw, ih, box_area 77 | cdef DTYPE_t ua 78 | cdef unsigned int k, n 79 | for k in range(K): 80 | box_area = ( 81 | (query_boxes[k, 2] - query_boxes[k, 0] + 1) * 82 | (query_boxes[k, 3] - query_boxes[k, 1] + 1) 83 | ) 84 | for n in range(N): 85 | iw = ( 86 | min(boxes[n, 2], query_boxes[k, 2]) - 87 | max(boxes[n, 0], query_boxes[k, 0]) + 1 88 | ) 89 | if iw > 0: 90 | ih = ( 91 | min(boxes[n, 3], query_boxes[k, 3]) - 92 | max(boxes[n, 1], query_boxes[k, 1]) + 1 93 | ) 94 | if ih > 0: 95 | intersec[n, k] = iw * ih / box_area 96 | return intersec 97 | 98 | # Compute bounding box voting 99 | def box_vote( 100 | np.ndarray[float, ndim=2] dets_NMS, 101 | np.ndarray[float, ndim=2] dets_all): 102 | cdef np.ndarray[float, ndim=2] dets_voted = np.zeros((dets_NMS.shape[0], dets_NMS.shape[1]), dtype=np.float32) 103 | cdef unsigned int N = dets_NMS.shape[0] 104 | cdef unsigned int M = dets_all.shape[0] 105 | 106 | cdef np.ndarray[float, ndim=1] det 107 | cdef np.ndarray[float, ndim=1] acc_box 108 | cdef float acc_score 109 | 110 | cdef np.ndarray[float, ndim=1] det2 111 | cdef float bi0, bi1, bit2, bi3 112 | cdef float iw, ih, ua 113 | 114 | cdef float thresh=0.5 115 | 116 | for i in range(N): 117 | det = dets_NMS[i, :] 118 | acc_box = np.zeros((4), dtype=np.float32) 119 | acc_score = 0.0 120 | 121 | for m in range(M): 122 | det2 = dets_all[m, :] 123 | 124 | bi0 = max(det[0], det2[0]) 125 | bi1 = max(det[1], det2[1]) 126 | bi2 = min(det[2], det2[2]) 127 | bi3 = min(det[3], det2[3]) 128 | 129 | iw = bi2 - bi0 + 1 130 | ih = bi3 - bi1 + 1 131 | 132 | if not (iw > 0 and ih > 0): 133 | continue 134 | 135 | ua = (det[2] - det[0] + 1) * (det[3] - det[1] + 1) + (det2[2] - det2[0] + 1) * (det2[3] - det2[1] + 1) - iw * ih 136 | ov = iw * ih / ua 137 | 138 | if (ov < thresh): 139 | continue 140 | 141 | acc_box += det2[4] * det2[0:4] 142 | acc_score += det2[4] 143 | 144 | dets_voted[i][0:4] = acc_box / acc_score 145 | dets_voted[i][4] = det[4] # Keep the original score 146 | 147 | return dets_voted 148 | 
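The two overlap kernels above operate on axis-aligned boxes in `(x1, y1, x2, y2)` form and are compiled through `v2xvit/utils/setup.py`. As a rough sanity check, the sketch below is a pure-NumPy restatement of `bbox_overlaps` (IoU); the helper name `bbox_overlaps_ref` and the tiny test boxes are made up for illustration and are not part of the repository.

```python
import numpy as np

def bbox_overlaps_ref(boxes, query_boxes):
    """Pure-NumPy reference for the Cython bbox_overlaps kernel.

    boxes: (N, 4) float array, query_boxes: (K, 4) float array,
    both in (x1, y1, x2, y2) pixel coordinates (inclusive, hence the +1).
    Returns an (N, K) IoU matrix.
    """
    N, K = boxes.shape[0], query_boxes.shape[0]
    overlaps = np.zeros((N, K), dtype=np.float32)
    for k in range(K):
        q = query_boxes[k]
        q_area = (q[2] - q[0] + 1) * (q[3] - q[1] + 1)
        for n in range(N):
            b = boxes[n]
            iw = min(b[2], q[2]) - max(b[0], q[0]) + 1
            ih = min(b[3], q[3]) - max(b[1], q[1]) + 1
            if iw > 0 and ih > 0:
                b_area = (b[2] - b[0] + 1) * (b[3] - b[1] + 1)
                overlaps[n, k] = iw * ih / (b_area + q_area - iw * ih)
    return overlaps

# Illustrative comparison against the compiled extension (if built in place):
# from v2xvit.utils.box_overlaps import bbox_overlaps
boxes = np.array([[0, 0, 9, 9], [5, 5, 14, 14]], dtype=np.float32)
queries = np.array([[0, 0, 9, 9]], dtype=np.float32)
print(bbox_overlaps_ref(boxes, queries))  # identical boxes give IoU 1.0
```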
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # FeaCo 2 | The official implementation of the ACM MM 2023 paper "FeaCo: Reaching Robust Feature-Level Consensus in Noisy Pose Conditions". 3 | ![FeaCo_Overview](./images/Overview.png) 4 | 5 | > [**FeaCo: Reaching Robust Feature-Level Consensus in Noisy Pose Conditions**](https://doi.org/10.1145/3581783.3611880), 6 | > Jiaming Gu\*, Jingyu Zhang\*, Muyang Zhang, Weiliang Meng, Shibiao Xu, Jiguang Zhang, Xiaopeng Zhang
7 | > *Accepted by ACM MM 2023* 8 | 9 | # Abstract 10 | Collaborative perception offers a promising solution to challenges such as occlusion and long-range data processing. However, limited sensor accuracy leads to noisy poses that misalign observations among vehicles. To address this problem, we propose FeaCo, which achieves robust Feature-level Consensus among collaborating agents under noisy pose conditions without additional training. We design an efficient Pose-error Rectification Module (PRM) to align the feature maps derived from different vehicles, reducing the adverse effect of noisy poses as well as bandwidth requirements. We also provide an effective multi-scale Cross-level Attention Module (CAM) to enhance information aggregation and interaction across scales. Our FeaCo outperforms all other localization rectification methods, as validated on both the collaborative perception simulation dataset OPV2V and the real-world dataset V2V4Real, reducing heading error and enhancing localization accuracy across various error levels. 11 | 12 | 14 | 15 | ## Installation 16 | ```bash 17 | # Set up the conda environment 18 | conda env create -f Env.yaml 19 | 20 | conda activate opencood 21 | 22 | # Install spconv 2.0; choose the correct CUDA version for your system 23 | pip install spconv-cu113 24 | 25 | # Install dependencies 26 | pip install -r requirements.txt 27 | # Build the Cython bounding-box overlap extension used for NMS 28 | python v2xvit/utils/setup.py build_ext --inplace 29 | 30 | # Install v2xvit into the environment 31 | python setup.py develop 32 | ``` 33 | 34 | ## Data Downloading 35 | All the data can be downloaded from [google drive](https://drive.google.com/drive/folders/1dkDeHlwOVbmgXcDazZvO6TFEZ6V_7WUu). If you have a good internet connection, you can directly 36 | download the complete large zip files such as `train.zip`. If downloading large files is a problem, we also split each dataset into small chunks, which can be found 37 | in the directories ending with `_chunks`, such as `train_chunks`. After downloading, run the following commands for each set to merge the chunks: 38 | ```bash 39 | cat train.zip.part* > train.zip 40 | unzip train.zip 41 | ``` 42 | 43 | ## Getting Started 44 | 45 | ### Note: 46 | 47 | - Models should be trained under the perfect (noise-free) setting and tested under the noisy setting. 48 | 49 | ### Test with pretrained model 50 | To test the pretrained FeaCo model, first download the model file from [google url](https://drive.google.com/drive/folders/1reQ7I3jNWRosjpEhVGSSKE2JoLwHIHa4?usp=sharing) and 51 | then put it under `v2xvit/logs/opv2v_feaco`. Change the `validate_path` in `v2xvit/logs/opv2v_feaco/config.yaml` to `/data/opv2v/test`. 52 | 53 | To test under the perfect setting, set `add_noise` to `false` in `v2xvit/logs/opv2v_feaco/config.yaml`. 54 | 55 | To test under the noisy setting used in our paper, change the `noise_setting` block as follows: 56 | ```yaml 57 | noise_setting: 58 | add_noise: True 59 | args: 60 | pos_std: 1 61 | rot_std: 1 62 | pos_mean: 0 63 | rot_mean: 0 64 | ``` 65 | Finally, run the following command to perform the test: 66 | ```bash 67 | python v2xvit/tools/inference.py --model_dir ${CHECKPOINT_FOLDER} 68 | ``` 69 | Arguments Explanation: 70 | - `model_dir`: the path of the checkpoint folder, e.g. `v2xvit/logs/opv2v_feaco` for FeaCo testing. 71 | 72 | ### Train your model 73 | FeaCo uses YAML files to configure all training parameters.
To train your own model 74 | from scratch or a continued checkpoint, run the following commands: 75 | 76 | ```python 77 | python v2xvit/tools/train.py --hypes_yaml ${CONFIG_FILE} [--model_dir ${CHECKPOINT_FOLDER} --half] 78 | ``` 79 | Arguments Explanation: 80 | - `hypes_yaml`: the path of the training configuration file, e.g. `v2xvit/hypes_yaml/where2comm_transformer_multiscale_resnet.yaml` for FeaCo training. 81 | - `model_dir` (optional) : the path of the checkpoints. This is used to fine-tune the trained models. When the `model_dir` is 82 | given, the trainer will discard the `hypes_yaml` and load the `config.yaml` in the checkpoint folder. 83 | - `half`(optional): if specified, hybrid-precision training will be used to save memory occupation. 84 | 85 | ## Citation 86 | If you are using our FeaCo for your research, please cite the following paper: 87 | ```bibtex 88 | @inproceedings{gu2023feaco, 89 | title={FeaCo: Reaching Robust Feature-Level Consensus in Noisy Pose Conditions}, 90 | author={Gu, Jiaming and Zhang, Jingyu and Zhang, Muyang and Meng, Weiliang and Xu, Shibiao and Zhang, Jiguang and Zhang, Xiaopeng}, 91 | booktitle={Proceedings of the 31st ACM International Conference on Multimedia}, 92 | pages={3628--3636}, 93 | year={2023} 94 | } 95 | ``` 96 | 97 | ## Acknowledgment 98 | FeaCo is built upon [OpenCOOD](https://github.com/DerrickXuNu/OpenCOOD) and [V2X-ViT](https://github.com/DerrickXuNu/v2x-vit). 99 | -------------------------------------------------------------------------------- /v2xvit/models/point_pillar_when2com.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Author: Yue Hu <18671129361@sjtu.edu.cn> 3 | # License: TDG-Attribution-NonCommercial-NoDistrib 4 | 5 | 6 | import torch.nn as nn 7 | import torch 8 | 9 | from v2xvit.models.sub_modules.pillar_vfe import PillarVFE 10 | from v2xvit.models.sub_modules.point_pillar_scatter import PointPillarScatter 11 | from v2xvit.models.sub_modules.base_bev_backbone import BaseBEVBackbone 12 | from v2xvit.models.sub_modules.downsample_conv import DownsampleConv 13 | from v2xvit.models.sub_modules.naive_compress import NaiveCompressor 14 | from v2xvit.models.sub_modules.when2com import When2comFusion 15 | 16 | 17 | DEBUG = False 18 | 19 | class PointPillarWhen2com(nn.Module): 20 | def __init__(self, args): 21 | super(PointPillarWhen2com, self).__init__() 22 | 23 | self.max_cav = args['max_cav'] 24 | # PIllar VFE 25 | self.pillar_vfe = PillarVFE(args['pillar_vfe'], 26 | num_point_features=4, 27 | voxel_size=args['voxel_size'], 28 | point_cloud_range=args['lidar_range']) 29 | self.scatter = PointPillarScatter(args['point_pillar_scatter']) 30 | self.backbone = BaseBEVBackbone(args['base_bev_backbone'], 64) 31 | 32 | # used to downsample the feature map for efficient computation 33 | self.shrink_flag = False 34 | if 'shrink_header' in args: 35 | self.shrink_flag = True 36 | self.shrink_conv = DownsampleConv(args['shrink_header']) 37 | self.compression = False 38 | 39 | if args['compression'] > 0: 40 | self.compression = True 41 | self.naive_compressor = NaiveCompressor(256, args['compression']) 42 | 43 | self.fusion_net = When2comFusion(args['v2vfusion']) 44 | 45 | self.cls_head = nn.Conv2d(128 * 2, args['anchor_number'], 46 | kernel_size=1) 47 | self.reg_head = nn.Conv2d(128 * 2, 7 * args['anchor_number'], 48 | kernel_size=1) 49 | if args['backbone_fix']: 50 | self.backbone_fix() 51 | 52 | def backbone_fix(self): 53 | """ 54 | Fix the parameters of backbone during 
finetune on timedelay。 55 | """ 56 | for p in self.pillar_vfe.parameters(): 57 | p.requires_grad = False 58 | 59 | for p in self.scatter.parameters(): 60 | p.requires_grad = False 61 | 62 | for p in self.backbone.parameters(): 63 | p.requires_grad = False 64 | 65 | if self.compression: 66 | for p in self.naive_compressor.parameters(): 67 | p.requires_grad = False 68 | if self.shrink_flag: 69 | for p in self.shrink_conv.parameters(): 70 | p.requires_grad = False 71 | 72 | for p in self.cls_head.parameters(): 73 | p.requires_grad = False 74 | for p in self.reg_head.parameters(): 75 | p.requires_grad = False 76 | 77 | def forward(self, data_dict): 78 | voxel_features = data_dict['processed_lidar']['voxel_features'] 79 | voxel_coords = data_dict['processed_lidar']['voxel_coords'] 80 | voxel_num_points = data_dict['processed_lidar']['voxel_num_points'] 81 | record_len = data_dict['record_len'] 82 | # lidar_pose = data_dict['lidar_pose'] # [sum(cav), 6] 83 | 84 | pairwise_t_matrix = data_dict['pairwise_t_matrix'] 85 | 86 | batch_dict = {'voxel_features': voxel_features, 87 | 'voxel_coords': voxel_coords, 88 | 'voxel_num_points': voxel_num_points, 89 | 'record_len': record_len} 90 | 91 | 92 | # n, 4 -> n, c 93 | batch_dict = self.pillar_vfe(batch_dict) 94 | # n, c -> N, C, H, W 95 | batch_dict = self.scatter(batch_dict) 96 | if DEBUG: 97 | origin_feature = torch.clone(batch_dict['spatial_features']) 98 | 99 | batch_dict = self.backbone(batch_dict) 100 | # N, C, H', W'. [N, 256, 50, 176] 101 | spatial_features_2d = batch_dict['spatial_features_2d'] 102 | 103 | # downsample feature to reduce memory 104 | if self.shrink_flag: 105 | spatial_features_2d = self.shrink_conv(spatial_features_2d) 106 | # compressor 107 | if self.compression: 108 | spatial_features_2d = self.naive_compressor(spatial_features_2d) 109 | 110 | # spatial_features_2d is [sum(cav_num), 256, 50, 176] 111 | # output only contains ego 112 | # [B, 256, 50, 176] 113 | if DEBUG: 114 | self.fusion_net.forward_debug(spatial_features_2d, origin_feature,record_len, pairwise_t_matrix) 115 | raise 116 | 117 | 118 | fused_feature = self.fusion_net(spatial_features_2d, 119 | record_len, 120 | pairwise_t_matrix) 121 | 122 | psm = self.cls_head(fused_feature) 123 | rm = self.reg_head(fused_feature) 124 | 125 | output_dict = {'psm': psm, 126 | 'rm': rm} 127 | 128 | return output_dict 129 | -------------------------------------------------------------------------------- /v2xvit/data_utils/pre_processor/voxel_preprocessor.py: -------------------------------------------------------------------------------- 1 | """ 2 | Convert lidar to voxel 3 | """ 4 | import sys 5 | 6 | import numpy as np 7 | import torch 8 | 9 | from v2xvit.data_utils.pre_processor.base_preprocessor import \ 10 | BasePreprocessor 11 | 12 | 13 | class VoxelPreprocessor(BasePreprocessor): 14 | def __init__(self, preprocess_params, train): 15 | super(VoxelPreprocessor, self).__init__(preprocess_params, train) 16 | self.lidar_range = self.params['cav_lidar_range'] 17 | 18 | self.vw = self.params['args']['vw'] 19 | self.vh = self.params['args']['vh'] 20 | self.vd = self.params['args']['vd'] 21 | self.T = self.params['args']['T'] 22 | 23 | def preprocess(self, pcd_np): 24 | """ 25 | Preprocess the lidar points by voxelization. 26 | 27 | Parameters 28 | ---------- 29 | pcd_np : np.ndarray 30 | The raw lidar. 31 | 32 | Returns 33 | ------- 34 | data_dict : the structured output dictionary. 
35 | """ 36 | data_dict = {} 37 | 38 | # calculate the voxel coordinates 39 | voxel_coords = ((pcd_np[:, :3] - 40 | np.floor(np.array([self.lidar_range[0], 41 | self.lidar_range[1], 42 | self.lidar_range[2]])) / ( 43 | self.vw, self.vh, self.vd))).astype(np.int32) 44 | 45 | # convert to (D, H, W) as the paper 46 | voxel_coords = voxel_coords[:, [2, 1, 0]] 47 | voxel_coords, inv_ind, voxel_counts = np.unique(voxel_coords, axis=0, 48 | return_inverse=True, 49 | return_counts=True) 50 | 51 | voxel_features = [] 52 | 53 | for i in range(len(voxel_coords)): 54 | voxel = np.zeros((self.T, 7), dtype=np.float32) 55 | pts = pcd_np[inv_ind == i] 56 | if voxel_counts[i] > self.T: 57 | pts = pts[:self.T, :] 58 | voxel_counts[i] = self.T 59 | 60 | # augment the points 61 | voxel[:pts.shape[0], :] = np.concatenate((pts, pts[:, :3] - 62 | np.mean(pts[:, :3], 0)), 63 | axis=1) 64 | voxel_features.append(voxel) 65 | 66 | data_dict['voxel_features'] = np.array(voxel_features) 67 | data_dict['voxel_coords'] = voxel_coords 68 | 69 | return data_dict 70 | 71 | def collate_batch(self, batch): 72 | """ 73 | Customized pytorch data loader collate function. 74 | 75 | Parameters 76 | ---------- 77 | batch : list or dict 78 | List or dictionary. 79 | 80 | Returns 81 | ------- 82 | processed_batch : dict 83 | Updated lidar batch. 84 | """ 85 | 86 | if isinstance(batch, list): 87 | return self.collate_batch_list(batch) 88 | elif isinstance(batch, dict): 89 | return self.collate_batch_dict(batch) 90 | else: 91 | sys.exit('Batch has too be a list or a dictionarn') 92 | 93 | @staticmethod 94 | def collate_batch_list(batch): 95 | """ 96 | Customized pytorch data loader collate function. 97 | 98 | Parameters 99 | ---------- 100 | batch : list 101 | List of dictionary. Each dictionary represent a single frame. 102 | 103 | Returns 104 | ------- 105 | processed_batch : dict 106 | Updated lidar batch. 107 | """ 108 | voxel_features = [] 109 | voxel_coords = [] 110 | 111 | for i in range(len(batch)): 112 | voxel_features.append(batch[i]['voxel_features']) 113 | coords = batch[i]['voxel_coords'] 114 | voxel_coords.append( 115 | np.pad(coords, ((0, 0), (1, 0)), 116 | mode='constant', constant_values=i)) 117 | 118 | voxel_features = torch.from_numpy(np.concatenate(voxel_features)) 119 | voxel_coords = torch.from_numpy(np.concatenate(voxel_coords)) 120 | 121 | return {'voxel_features': voxel_features, 122 | 'voxel_coords': voxel_coords} 123 | 124 | @staticmethod 125 | def collate_batch_dict(batch: dict): 126 | """ 127 | Collate batch if the batch is a dictionary, 128 | eg: {'voxel_features': [feature1, feature2...., feature n]} 129 | 130 | Parameters 131 | ---------- 132 | batch : dict 133 | 134 | Returns 135 | ------- 136 | processed_batch : dict 137 | Updated lidar batch. 
138 | """ 139 | voxel_features = \ 140 | torch.from_numpy(np.concatenate(batch['voxel_features'])) 141 | coords = batch['voxel_coords'] 142 | voxel_coords = [] 143 | 144 | for i in range(len(coords)): 145 | voxel_coords.append( 146 | np.pad(coords[i], ((0, 0), (1, 0)), 147 | mode='constant', constant_values=i)) 148 | voxel_coords = torch.from_numpy(np.concatenate(voxel_coords)) 149 | 150 | return {'voxel_features': voxel_features, 151 | 'voxel_coords': voxel_coords} 152 | -------------------------------------------------------------------------------- /v2xvit/visualization/simple_vis.py: -------------------------------------------------------------------------------- 1 | from matplotlib import pyplot as plt 2 | import numpy as np 3 | 4 | import v2xvit.visualization.simple_plot3d.canvas_3d as canvas_3d 5 | import v2xvit.visualization.simple_plot3d.canvas_bev as canvas_bev 6 | 7 | def visualize(pred_box_tensor, gt_tensor, pcd, pc_range, save_path, method='3d', vis_gt_box=True, vis_pred_box=True, left_hand=False, uncertainty=None): 8 | """ 9 | Visualize the prediction, ground truth with point cloud together. 10 | They may be flipped in y axis. Since carla is left hand coordinate, while kitti is right hand. 11 | 12 | Parameters 13 | ---------- 14 | pred_box_tensor : torch.Tensor 15 | (N, 8, 3) prediction. 16 | 17 | gt_tensor : torch.Tensor 18 | (N, 8, 3) groundtruth bbx 19 | 20 | pcd : torch.Tensor 21 | PointCloud, (N, 4). 22 | 23 | pc_range : list 24 | [xmin, ymin, zmin, xmax, ymax, zmax] 25 | 26 | save_path : str 27 | Save the visualization results to given path. 28 | 29 | dataset : BaseDataset 30 | opencood dataset object. 31 | 32 | method: str, 'bev' or '3d' 33 | 34 | """ 35 | 36 | pc_range = [int(i) for i in pc_range] 37 | if isinstance(pcd, list): 38 | pcd_np = [x.cpu().numpy() for x in pcd] 39 | else: 40 | pcd_np = pcd.cpu().numpy() 41 | 42 | if pred_box_tensor == None: 43 | vis_pred_box = False 44 | # vis_gt_box = False 45 | 46 | if vis_pred_box: 47 | pred_box_np = pred_box_tensor.cpu().numpy() 48 | # pred_name = ['pred'] * pred_box_np.shape[0] 49 | pred_name = [''] * pred_box_np.shape[0] 50 | if uncertainty is not None: 51 | uncertainty_np = uncertainty.cpu().numpy() 52 | uncertainty_np = np.exp(uncertainty_np) 53 | d_a_square = 1.6**2 + 3.9**2 54 | 55 | if uncertainty_np.shape[1] == 3: 56 | uncertainty_np[:,:2] *= d_a_square 57 | uncertainty_np = np.sqrt(uncertainty_np) 58 | # yaw angle is in radian, it's the same in g2o SE2's setting. 
59 | 60 | pred_name = [f'x_u:{uncertainty_np[i,0]:.3f} y_u:{uncertainty_np[i,1]:.3f} a_u:{uncertainty_np[i,2]:.3f}' \ 61 | for i in range(uncertainty_np.shape[0])] 62 | 63 | elif uncertainty_np.shape[1] == 2: 64 | uncertainty_np[:,:2] *= d_a_square 65 | uncertainty_np = np.sqrt(uncertainty_np) # yaw angle is in radian 66 | 67 | pred_name = [f'x_u:{uncertainty_np[i,0]:.3f} y_u:{uncertainty_np[i,1]:3f}' \ 68 | for i in range(uncertainty_np.shape[0])] 69 | 70 | elif uncertainty_np.shape[1] == 7: 71 | uncertainty_np[:,:2] *= d_a_square 72 | uncertainty_np = np.sqrt(uncertainty_np) # yaw angle is in radian 73 | 74 | pred_name = [f'x_u:{uncertainty_np[i,0]:.3f} y_u:{uncertainty_np[i,1]:3f} a_u:{uncertainty_np[i,6]:3f}' \ 75 | for i in range(uncertainty_np.shape[0])] 76 | 77 | if vis_gt_box: 78 | gt_box_np = gt_tensor.cpu().numpy() 79 | # gt_name = ['gt'] * gt_box_np.shape[0] 80 | gt_name = [''] * gt_box_np.shape[0] 81 | 82 | if method == 'bev': 83 | canvas = canvas_bev.Canvas_BEV_heading_right(canvas_shape=((pc_range[4]-pc_range[1])*10, (pc_range[3]-pc_range[0])*10), 84 | canvas_x_range=(pc_range[0], pc_range[3]), 85 | canvas_y_range=(pc_range[1], pc_range[4]), 86 | left_hand=left_hand 87 | ) 88 | 89 | canvas_xy, valid_mask = canvas.get_canvas_coords(pcd_np) # Get Canvas Coords 90 | canvas.draw_canvas_points(canvas_xy[valid_mask]) 91 | # color_list = [(0, 206, 209),(255, 215,0)] 92 | # for i, pcd_np_t in enumerate(pcd_np[1:2]): 93 | # canvas_xy, valid_mask = canvas.get_canvas_coords(pcd_np_t) # Get Canvas Coords 94 | # canvas.draw_canvas_points(canvas_xy[valid_mask], colors=color_list[i-1]) # Only draw valid points 95 | box_line_thickness = 5 96 | if vis_gt_box: 97 | # canvas.draw_boxes(gt_box_np,colors=(0,255,0), texts=gt_name) 98 | canvas.draw_boxes(gt_box_np,colors=(0,255,0), texts=gt_name, box_line_thickness=box_line_thickness) 99 | 100 | if vis_pred_box: 101 | canvas.draw_boxes(pred_box_np, colors=(255,0,0), texts=pred_name, box_line_thickness=box_line_thickness) 102 | 103 | elif method == '3d': 104 | canvas = canvas_3d.Canvas_3D(left_hand=left_hand) 105 | canvas_xy, valid_mask = canvas.get_canvas_coords(pcd_np) 106 | canvas.draw_canvas_points(canvas_xy[valid_mask]) 107 | 108 | if vis_gt_box: 109 | canvas.draw_boxes(gt_box_np,colors=(0,255,0), texts=gt_name) 110 | if vis_pred_box: 111 | canvas.draw_boxes(pred_box_np, colors=(255,0,0), texts=pred_name) 112 | else: 113 | raise(f"Not Completed for f{method} visualization.") 114 | 115 | plt.axis("off") 116 | 117 | plt.imshow(canvas.canvas) 118 | 119 | plt.tight_layout() 120 | plt.savefig(save_path, transparent=False, dpi=400, pad_inches=0.0) 121 | plt.clf() 122 | # print(save_path) 123 | -------------------------------------------------------------------------------- /v2xvit/models/point_pillar_transformer.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | from v2xvit.models.sub_modules.pillar_vfe import PillarVFE 5 | from v2xvit.models.sub_modules.point_pillar_scatter import PointPillarScatter 6 | from v2xvit.models.sub_modules.base_bev_backbone import BaseBEVBackbone 7 | from v2xvit.models.sub_modules.fuse_utils import regroup 8 | from v2xvit.models.sub_modules.downsample_conv import DownsampleConv 9 | from v2xvit.models.sub_modules.naive_compress import NaiveCompressor 10 | from v2xvit.models.sub_modules.v2xvit_basic import V2XTransformer 11 | 12 | 13 | class PointPillarTransformer(nn.Module): 14 | def __init__(self, args): 15 | super(PointPillarTransformer, 
self).__init__() 16 | 17 | self.max_cav = args['max_cav'] 18 | # PIllar VFE 19 | self.pillar_vfe = PillarVFE(args['pillar_vfe'], 20 | num_point_features=4, 21 | voxel_size=args['voxel_size'], 22 | point_cloud_range=args['lidar_range']) 23 | self.scatter = PointPillarScatter(args['point_pillar_scatter']) 24 | self.backbone = BaseBEVBackbone(args['base_bev_backbone'], 64) 25 | # used to downsample the feature map for efficient computation 26 | self.shrink_flag = False 27 | if 'shrink_header' in args: 28 | self.shrink_flag = True 29 | self.shrink_conv = DownsampleConv(args['shrink_header']) 30 | self.compression = False 31 | 32 | if args['compression'] > 0: 33 | self.compression = True 34 | self.naive_compressor = NaiveCompressor(256, args['compression']) 35 | 36 | self.fusion_net = V2XTransformer(args['transformer']) 37 | 38 | self.cls_head = nn.Conv2d(128 * 2, args['anchor_number'], 39 | kernel_size=1) 40 | self.reg_head = nn.Conv2d(128 * 2, 7 * args['anchor_number'], 41 | kernel_size=1) 42 | 43 | if args['backbone_fix']: 44 | self.backbone_fix() 45 | 46 | def backbone_fix(self): 47 | """ 48 | Fix the parameters of backbone during finetune on timedelay。 49 | """ 50 | for p in self.pillar_vfe.parameters(): 51 | p.requires_grad = False 52 | 53 | for p in self.scatter.parameters(): 54 | p.requires_grad = False 55 | 56 | for p in self.backbone.parameters(): 57 | p.requires_grad = False 58 | 59 | if self.compression: 60 | for p in self.naive_compressor.parameters(): 61 | p.requires_grad = False 62 | if self.shrink_flag: 63 | for p in self.shrink_conv.parameters(): 64 | p.requires_grad = False 65 | 66 | for p in self.cls_head.parameters(): 67 | p.requires_grad = False 68 | for p in self.reg_head.parameters(): 69 | p.requires_grad = False 70 | 71 | def forward(self, data_dict): 72 | voxel_features = data_dict['processed_lidar']['voxel_features'] 73 | voxel_coords = data_dict['processed_lidar']['voxel_coords'] 74 | voxel_num_points = data_dict['processed_lidar']['voxel_num_points'] 75 | record_len = data_dict['record_len'] 76 | spatial_correction_matrix = data_dict['spatial_correction_matrix'] 77 | 78 | # B, max_cav, 3(dt dv infra), 1, 1 79 | prior_encoding =\ 80 | data_dict['prior_encoding'].unsqueeze(-1).unsqueeze(-1) 81 | 82 | batch_dict = {'voxel_features': voxel_features, 83 | 'voxel_coords': voxel_coords, 84 | 'voxel_num_points': voxel_num_points, 85 | 'record_len': record_len} 86 | # n, 4 -> n, c 87 | batch_dict = self.pillar_vfe(batch_dict) 88 | # n, c -> N, C, H, W 89 | batch_dict = self.scatter(batch_dict) 90 | batch_dict = self.backbone(batch_dict) 91 | 92 | spatial_features_2d = batch_dict['spatial_features_2d'] 93 | # downsample feature to reduce memory 94 | if self.shrink_flag: 95 | spatial_features_2d = self.shrink_conv(spatial_features_2d) 96 | # compressor 97 | if self.compression: 98 | spatial_features_2d = self.naive_compressor(spatial_features_2d) 99 | # N, C, H, W -> B, L, C, H, W 100 | regroup_feature, mask = regroup(spatial_features_2d, 101 | record_len, 102 | self.max_cav) 103 | # prior encoding added 104 | prior_encoding = prior_encoding.repeat(1, 1, 1, 105 | regroup_feature.shape[3], 106 | regroup_feature.shape[4]) 107 | regroup_feature = torch.cat([regroup_feature, prior_encoding], dim=2) 108 | 109 | # b l c h w -> b l h w c 110 | regroup_feature = regroup_feature.permute(0, 1, 3, 4, 2) 111 | # transformer fusion 112 | fused_feature = self.fusion_net(regroup_feature, mask, spatial_correction_matrix) 113 | # b h w c -> b c h w 114 | fused_feature = fused_feature.permute(0, 
3, 1, 2) 115 | 116 | psm = self.cls_head(fused_feature) 117 | rm = self.reg_head(fused_feature) 118 | 119 | output_dict = {'psm': psm, 120 | 'rm': rm} 121 | 122 | return output_dict 123 | -------------------------------------------------------------------------------- /v2xvit/data_utils/pre_processor/sp_voxel_preprocessor.py: -------------------------------------------------------------------------------- 1 | """ 2 | Transform points to voxels using sparse conv library 3 | """ 4 | import sys 5 | 6 | import numpy as np 7 | import torch 8 | from cumm import tensorview as tv 9 | from spconv.utils import Point2VoxelCPU3d 10 | 11 | from v2xvit.data_utils.pre_processor.base_preprocessor import \ 12 | BasePreprocessor 13 | 14 | 15 | class SpVoxelPreprocessor(BasePreprocessor): 16 | def __init__(self, preprocess_params, train): 17 | super(SpVoxelPreprocessor, self).__init__(preprocess_params, 18 | train) 19 | 20 | self.lidar_range = self.params['cav_lidar_range'] 21 | self.voxel_size = self.params['args']['voxel_size'] 22 | self.max_points_per_voxel = self.params['args']['max_points_per_voxel'] 23 | 24 | if train: 25 | self.max_voxels = self.params['args']['max_voxel_train'] 26 | else: 27 | self.max_voxels = self.params['args']['max_voxel_test'] 28 | 29 | grid_size = (np.array(self.lidar_range[3:6]) - 30 | np.array(self.lidar_range[0:3])) / np.array(self.voxel_size) 31 | self.grid_size = np.round(grid_size).astype(np.int64) 32 | 33 | # use sparse conv library to generate voxel 34 | self.voxel_generator = Point2VoxelCPU3d( 35 | vsize_xyz=self.voxel_size, 36 | coors_range_xyz=self.lidar_range, 37 | max_num_points_per_voxel=self.max_points_per_voxel, 38 | num_point_features=4, 39 | max_num_voxels=self.max_voxels 40 | ) 41 | 42 | def preprocess(self, pcd_np): 43 | data_dict = {} 44 | pcd_tv = tv.from_numpy(pcd_np) 45 | voxel_output = self.voxel_generator.point_to_voxel(pcd_tv) 46 | if isinstance(voxel_output, dict): 47 | voxels, coordinates, num_points = \ 48 | voxel_output['voxels'], voxel_output['coordinates'], \ 49 | voxel_output['num_points_per_voxel'] 50 | else: 51 | voxels, coordinates, num_points = voxel_output 52 | 53 | data_dict['voxel_features'] = voxels.numpy() 54 | data_dict['voxel_coords'] = coordinates.numpy() 55 | data_dict['voxel_num_points'] = num_points.numpy() 56 | 57 | return data_dict 58 | 59 | def collate_batch(self, batch): 60 | """ 61 | Customized pytorch data loader collate function. 62 | 63 | Parameters 64 | ---------- 65 | batch : list or dict 66 | List or dictionary. 67 | 68 | Returns 69 | ------- 70 | processed_batch : dict 71 | Updated lidar batch. 72 | """ 73 | 74 | if isinstance(batch, list): 75 | return self.collate_batch_list(batch) 76 | elif isinstance(batch, dict): 77 | return self.collate_batch_dict(batch) 78 | else: 79 | sys.exit('Batch has too be a list or a dictionarn') 80 | 81 | @staticmethod 82 | def collate_batch_list(batch): 83 | """ 84 | Customized pytorch data loader collate function. 85 | 86 | Parameters 87 | ---------- 88 | batch : list 89 | List of dictionary. Each dictionary represent a single frame. 90 | 91 | Returns 92 | ------- 93 | processed_batch : dict 94 | Updated lidar batch. 
95 | """ 96 | voxel_features = [] 97 | voxel_num_points = [] 98 | voxel_coords = [] 99 | 100 | for i in range(len(batch)): 101 | voxel_features.append(batch[i]['voxel_features']) 102 | voxel_num_points.append(batch[i]['voxel_num_points']) 103 | coords = batch[i]['voxel_coords'] 104 | voxel_coords.append( 105 | np.pad(coords, ((0, 0), (1, 0)), 106 | mode='constant', constant_values=i)) 107 | 108 | voxel_num_points = torch.from_numpy(np.concatenate(voxel_num_points)) 109 | voxel_features = torch.from_numpy(np.concatenate(voxel_features)) 110 | voxel_coords = torch.from_numpy(np.concatenate(voxel_coords)) 111 | 112 | return {'voxel_features': voxel_features, 113 | 'voxel_coords': voxel_coords, 114 | 'voxel_num_points': voxel_num_points} 115 | 116 | @staticmethod 117 | def collate_batch_dict(batch: dict): 118 | """ 119 | Collate batch if the batch is a dictionary, 120 | eg: {'voxel_features': [feature1, feature2...., feature n]} 121 | 122 | Parameters 123 | ---------- 124 | batch : dict 125 | 126 | Returns 127 | ------- 128 | processed_batch : dict 129 | Updated lidar batch. 130 | """ 131 | voxel_features = \ 132 | torch.from_numpy(np.concatenate(batch['voxel_features'])) 133 | voxel_num_points = \ 134 | torch.from_numpy(np.concatenate(batch['voxel_num_points'])) 135 | coords = batch['voxel_coords'] 136 | voxel_coords = [] 137 | 138 | for i in range(len(coords)): 139 | voxel_coords.append( 140 | np.pad(coords[i], ((0, 0), (1, 0)), 141 | mode='constant', constant_values=i)) 142 | voxel_coords = torch.from_numpy(np.concatenate(voxel_coords)) 143 | 144 | return {'voxel_features': voxel_features, 145 | 'voxel_coords': voxel_coords, 146 | 'voxel_num_points': voxel_num_points} 147 | --------------------------------------------------------------------------------