├── .gitignore ├── .gitkeep ├── .isort.cfg ├── .pylintrc ├── .style.yapf ├── LICENSE.md ├── Makefile ├── README.md ├── configs ├── common │ ├── augmentation.yaml │ ├── optimizer.yaml │ ├── test.yaml │ ├── test_dataloader.yaml │ └── train_dataloader.yaml ├── defaults.yaml ├── evaluators │ ├── kitti_3d.yaml │ └── nuscenes.yaml ├── experiments │ ├── dd3d_kitti_dla34.yaml │ ├── dd3d_kitti_dla34_overfit.yaml │ ├── dd3d_kitti_omninets.yaml │ ├── dd3d_kitti_regnety_006_bifpn.yaml │ ├── dd3d_kitti_v99.yaml │ ├── dd3d_nusc_dla34.yaml │ └── dd3d_nusc_v99.yaml ├── feature_extractors │ ├── d2_fpn.yaml │ ├── dla34_fpn.yaml │ ├── omninet_big.yaml │ ├── omninet_small.yaml │ └── v2_99_fpn.yaml ├── meta_arch │ └── dd3d.yaml ├── models │ ├── dd3d.yaml │ └── depth_head.yaml ├── test_datasets │ ├── base_test_dataset.yaml │ ├── kitti_3d.yaml │ └── nuscenes.yaml ├── train_datasets │ ├── base_train_dataset.yaml │ ├── kitti_3d.yaml │ └── nuscenes.yaml ├── visualize_dataloader.yaml └── visualizers │ ├── base_visualizer.yaml │ ├── box3d.yaml │ ├── common.yaml │ └── d2.yaml ├── docker ├── Dockerfile └── Dockerfile-cu111 ├── media └── figs │ ├── demo_dd3d_kitti_val_short.gif │ └── tri-logo.png ├── scripts ├── train.py └── visualize_dataloader.py └── tridet ├── __init__.py ├── data ├── __init__.py ├── augmentations │ ├── __init__.py │ ├── build.py │ ├── color_transform.py │ ├── crop_transform.py │ ├── flip_transform.py │ └── resize_transform.py ├── build.py ├── dataset_mappers │ ├── __init__.py │ ├── dataset_mapper.py │ └── nuscenes_mapper.py ├── datasets │ ├── __init__.py │ ├── kitti_3d │ │ ├── __init__.py │ │ └── build.py │ └── nuscenes │ │ ├── __init__.py │ │ └── build.py ├── samplers │ ├── __init__.py │ └── group_sampler.py └── transform_utils.py ├── evaluators ├── __init__.py ├── kitti_3d_evaluator.py ├── nuscenes_evaluator.py └── rotate_iou.py ├── layers ├── __init__.py ├── bev_nms.py ├── conv_bn_fpn_layers.py ├── iou_loss.py ├── normalization.py ├── separable_conv2d.py └── smooth_l1_loss.py ├── modeling ├── __init__.py ├── backbone │ └── omni_scripts │ │ ├── __init__.py │ │ ├── act.py │ │ ├── backbone_with_fpn.py │ │ ├── fpn.py │ │ ├── fused_mb_nets.py │ │ ├── norm.py │ │ ├── omninet_w1.0.py │ │ ├── omninet_w1.3.py │ │ ├── ops.py │ │ └── utils.py ├── dd3d │ ├── __init__.py │ ├── core.py │ ├── dense_depth.py │ ├── dense_depth_loss.py │ ├── depth.py │ ├── disentangled_box3d_loss.py │ ├── fcos2d.py │ ├── fcos3d.py │ ├── nuscenes_dd3d.py │ ├── nuscenes_dd3d_tta.py │ ├── postprocessing.py │ ├── prepare_targets.py │ ├── test_time_augmentation.py │ └── utils.py └── feature_extractor │ ├── __init__.py │ ├── dla.py │ └── vovnet.py ├── structures ├── __init__.py ├── boxes3d.py ├── image_list.py └── pose.py ├── utils ├── coco.py ├── comm.py ├── events.py ├── geometry.py ├── hydra │ └── callbacks.py ├── s3.py ├── setup.py ├── tasks.py ├── tensor2d.py ├── train.py ├── visualization.py └── wandb.py └── visualizers ├── __init__.py ├── bev.py ├── box3d_visualizer.py └── d2_visualizer.py /.gitignore: -------------------------------------------------------------------------------- 1 | # output dir 2 | outputs/ 3 | wandb/ 4 | 5 | # cluster hostfiles 6 | hostfiles/ 7 | 8 | # Raw files 9 | *.jpg 10 | *.png 11 | *.txt 12 | 13 | # compilation and distribution 14 | __pycache__ 15 | _ext 16 | *.pyc 17 | *.so 18 | build/ 19 | dist/ 20 | 21 | # pytorch/python/numpy formats 22 | *.pth 23 | *.pkl 24 | *.npy 25 | 26 | # ipython/jupyter notebooks 27 | *.ipynb 28 | **/.ipynb_checkpoints/ 29 | 30 | # Editor temporaries 31 | *.swn 32 | *.swo 33 
| *.swp 34 | *~ 35 | -------------------------------------------------------------------------------- /.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TRI-ML/dd3d/56fad8ec9eb7fbd373953f49f9372120b4cd740c/.gitkeep -------------------------------------------------------------------------------- /.isort.cfg: -------------------------------------------------------------------------------- 1 | [settings] 2 | multi_line_output=6 3 | line_length=120 4 | 5 | sections=FUTURE,STDLIB,THIRDPARTY,FIRSTPARTY,LOCALFOLDER,myself 6 | known_third_party=torch, pandas, numpy, matplotlib, cv2, mpi4py, tqdm, pyquaternion, click, scipy, hydra, fvcore, seaborn, pycocotools, diskcache, xarray, pytorch3d, nuscenes, pyquaternion, iopath, wandb 7 | 8 | known_myself=tridet 9 | -------------------------------------------------------------------------------- /.pylintrc: -------------------------------------------------------------------------------- 1 | # Copyright 2016-2018 Toyota Research Institute. All rights reserved. 2 | # pylintrc config file based on driving/src/utils/pylintrc 3 | [MASTER] 4 | accept-no-param-doc=no 5 | accept-no-return-doc=yes 6 | accept-no-yields-doc=yes 7 | 8 | [REPORTS] 9 | reports=no 10 | # Make errors emacs-compatible 11 | msg-template='{path}:{line}: [{msg_id}({symbol}), {obj}] {msg}' 12 | 13 | [TYPECHECK] 14 | ignored-classes= 15 | PurePath 16 | 17 | [MESSAGES CONTROL] 18 | disable= 19 | # We do not want lazy logging, as error during lazy logging are caught and 20 | # ignored. Not a good recipe for reliable logging. 21 | logging-not-lazy, 22 | # also allow .format in logging calls 23 | logging-format-interpolation, 24 | # Do not enforce "refactor" rules 25 | R, 26 | C, duplicate-code, 27 | # Temporary disable complexity checks. 28 | too-many-instance-attributes, too-many-branches, too-many-statements, 29 | too-many-arguments, too-many-locals, 30 | # Do not complain on fixme/TODO's 31 | fixme, 32 | # Do not complain if we locally disabled a rule 33 | locally-disabled, 34 | # We do not care if we have too few public methods. 35 | too-few-public-methods, 36 | # We do not care if overridden methods use different arguments. 37 | arguments-differ, 38 | # Since this doesn't check control-flow, it has lots of false positives. 39 | invalid-unary-operand-type, 40 | not-callable, 41 | no-member, 42 | protected-access, 43 | attribute-defined-outside-init, 44 | global-statement, 45 | W0123, 46 | W1401, 47 | # allow multiple arguments for string formatting 48 | E1305, 49 | # bypass import error in setup.py 50 | E0401, 51 | E0611, 52 | # suppress Lambda warnings 53 | W0108, 54 | # allow un-implemented abstract method in sub-classes 55 | W0223, 56 | # allow explicit return in __init__ 57 | E0101, 58 | # (dennis.park) Use f-strings in logging. 
59 | W1203 60 | -------------------------------------------------------------------------------- /.style.yapf: -------------------------------------------------------------------------------- 1 | [style] 2 | based_on_style = pep8 3 | indent_width = 4 4 | column_limit = 120 5 | arithmetic_precedence_indication = false 6 | spaces_before_comment = 2 7 | split_complex_comprehension = true 8 | split_penalty_comprehension = 2100 9 | blank_line_before_nested_class_or_def = false 10 | align_closing_bracket_with_visual_indent = true 11 | dedent_closing_brackets = true 12 | coalesce_brackets = true 13 | join_multiple_lines = false -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Toyota Research Institute (TRI) 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | PROJECT = dd3d 2 | WORKSPACE = /workspace/$(PROJECT) 3 | DOCKER_IMAGE = $(PROJECT):latest 4 | DOCKERFILE ?= Dockerfile 5 | 6 | DOCKER_OPTS = \ 7 | -it \ 8 | --rm \ 9 | -e DISPLAY=${DISPLAY} \ 10 | -v /data:/data \ 11 | -v /tmp:/tmp \ 12 | -v /tmp/.X11-unix:/tmp/.X11-unix \ 13 | -v /mnt/fsx:/mnt/fsx \ 14 | -v /root/.ssh:/root/.ssh \ 15 | -v ~/.aws:/root/.aws \ 16 | --shm-size=1G \ 17 | --ipc=host \ 18 | --network=host \ 19 | --privileged 20 | 21 | DOCKER_BUILD_ARGS = \ 22 | --build-arg WORKSPACE=$(WORKSPACE) \ 23 | --build-arg AWS_ACCESS_KEY_ID \ 24 | --build-arg AWS_SECRET_ACCESS_KEY \ 25 | --build-arg AWS_DEFAULT_REGION \ 26 | --build-arg WANDB_ENTITY \ 27 | --build-arg WANDB_API_KEY \ 28 | 29 | NGPUS ?= $(shell nvidia-smi -L | wc -l) 30 | MASTER_ADDR ?= 127.0.0.1 31 | MPI_HOSTS ?= localhost:${NGPUS} 32 | MPI_CMD=mpirun \ 33 | -x LD_LIBRARY_PATH \ 34 | -x PYTHONPATH \ 35 | -x MASTER_ADDR=${MASTER_ADDR} \ 36 | -x NCCL_LL_THRESHOLD=0 \ 37 | -x AWS_ACCESS_KEY_ID \ 38 | -x AWS_SECRET_ACCESS_KEY \ 39 | -x WANDB_ENTITY \ 40 | -x WANDB_API_KEY \ 41 | -np ${NGPUS} \ 42 | -H ${MPI_HOSTS} \ 43 | -x NCCL_SOCKET_IFNAME=^docker0,lo \ 44 | --mca btl_tcp_if_exclude docker0,lo \ 45 | -mca plm_rsh_args 'p 12345' \ 46 | --allow-run-as-root 47 | 48 | docker-build: 49 | docker build \ 50 | $(DOCKER_BUILD_ARGS) \ 51 | -f ./docker/$(DOCKERFILE) \ 52 | -t $(DOCKER_IMAGE) . 
53 | 54 | docker-dev: 55 | nvidia-docker run --name $(PROJECT) \ 56 | $(DOCKER_OPTS) \ 57 | -v $(PWD):$(WORKSPACE) \ 58 | $(DOCKER_IMAGE) bash 59 | 60 | dist-run: 61 | nvidia-docker run --name $(PROJECT) --rm \ 62 | -e DISPLAY=${DISPLAY} \ 63 | -v ~/.torch:/root/.torch \ 64 | ${DOCKER_OPTS} \ 65 | -v $(PWD):$(WORKSPACE) \ 66 | ${DOCKER_IMAGE} \ 67 | ${COMMAND} 68 | 69 | docker-run: docker-build 70 | nvidia-docker run --name $(PROJECT) --rm \ 71 | ${DOCKER_OPTS} \ 72 | ${DOCKER_IMAGE} \ 73 | ${COMMAND} 74 | 75 | docker-run-mpi: docker-build 76 | nvidia-docker run ${DOCKER_OPTS} -v $(PWD)/outputs:$(WORKSPACE)/outputs ${DOCKER_IMAGE} \ 77 | bash -c "${MPI_CMD} ${COMMAND}" 78 | 79 | clean: 80 | find . -name '"*.pyc' | xargs sudo rm -f && \ 81 | find . -name '__pycache__' | xargs sudo rm -rf 82 | -------------------------------------------------------------------------------- /configs/common/augmentation.yaml: -------------------------------------------------------------------------------- 1 | # If `True`, then selectively enable data augmentation. 2 | # If `False`, then disable the entire data augmentation. 3 | AUG_ENABLED: True 4 | 5 | # 1) Resize 6 | RESIZE: 7 | ENABLED: True 8 | # Size of the smallest side of the image during training 9 | MIN_SIZE_TRAIN: ??? 10 | # Sample size of smallest side by choice or random selection from range give by 11 | MIN_SIZE_TRAIN_SAMPLING: "choice" 12 | # Maximum size of the side of the image during training 13 | MAX_SIZE_TRAIN: ??? 14 | # Size of the smallest side of the image during testing. Set to zero to disable resize in testing. 15 | MIN_SIZE_TEST: ??? 16 | # Maximum size of the side of the image during testing 17 | MAX_SIZE_TEST: ??? 18 | 19 | # 2) Crop 20 | CROP: 21 | # `True` if cropping is used for data augmentation during training 22 | ENABLED: False 23 | # Cropping type: 24 | # - "relative" crop (H * CROP.SIZE[0], W * CROP.SIZE[1]) part of an input of size (H, W) 25 | # - "relative_range" uniformly sample relative crop size from between [CROP.SIZE[0], [CROP.SIZE[1]]. 26 | # and [1, 1] and use it as in "relative" scenario. 27 | # - "absolute" crop part of an input with absolute size: (CROP.SIZE[0], CROP.SIZE[1]). 28 | # - "absolute_range", for an input of size (H, W), uniformly sample H_crop in 29 | # [CROP.SIZE[0], min(H, CROP.SIZE[1])] and W_crop in [CROP.SIZE[0], min(W, CROP.SIZE[1])] 30 | TYPE: "relative_range" 31 | # Size of crop in range (0, 1] if CROP.TYPE is "relative" or "relative_range" and in number of 32 | # pixels if CROP.TYPE is "absolute" 33 | SIZE: [0.9, 0.9] 34 | 35 | # 3) Flip. 36 | RANDOM_FLIP: 37 | # NOTE: Unlike d2, RandomFlip is configurable 38 | ENABLED: True 39 | HORIZONTAL: True 40 | VERTICAL: False 41 | 42 | # 4) Color jittering 43 | COLOR_JITTER: 44 | ENABLED: True 45 | BRIGHTNESS: [0.2, 0.2] 46 | SATURATION: [0.2, 0.2] 47 | CONTRAST: [0.2, 0.2] 48 | -------------------------------------------------------------------------------- /configs/common/optimizer.yaml: -------------------------------------------------------------------------------- 1 | # Number of images per batch across all machines. 2 | # If we have 16 GPUs and IMS_PER_BATCH = 32, 3 | # each GPU will see 2 images per batch. 4 | # May be adjusted automatically if REFERENCE_WORLD_SIZE is set. 
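(Aside on configs/common/augmentation.yaml above.) The CROP block documents several crop types; a minimal Python sketch of that documented behavior, assuming a KITTI-sized input of (H, W) = (370, 1224) and CROP.SIZE = [0.9, 0.9], is below. This is illustrative only, not the detectron2 implementation.

import random

def sample_crop_hw(h, w, crop_type="relative_range", size=(0.9, 0.9)):
    if crop_type == "relative":
        return int(h * size[0]), int(w * size[1])
    if crop_type == "relative_range":
        # uniformly sample a relative size between CROP.SIZE and (1, 1), then use it as in "relative"
        rh = random.uniform(size[0], 1.0)
        rw = random.uniform(size[1], 1.0)
        return int(h * rh), int(w * rw)
    if crop_type == "absolute":
        return size[0], size[1]   # interpreted directly as pixels
    raise ValueError(crop_type)

sample_crop_hw(370, 1224)   # e.g. (351, 1162): each side is kept between 90% and 100% of the original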
5 | IMS_PER_BATCH: 16 6 | 7 | # Update scheme of torch.optim.SGD: 8 | # https://github.com/pytorch/pytorch/blob/master/torch/optim/sgd.py#L34 9 | BASE_LR: 0.001 10 | MOMENTUM: 0.9 11 | 12 | NESTEROV: False 13 | 14 | WEIGHT_DECAY: 0.0001 15 | # The weight decay that's applied to parameters of normalization layers 16 | # (typically the affine transformation) 17 | WEIGHT_DECAY_NORM: 0.0 18 | 19 | # Detectron v1 (and previous detection code) used a 2x higher LR and 0 WD for 20 | # biases. This is not useful (at least for recent models). You should avoid 21 | # changing these and they exist only to reproduce Detectron v1 training if 22 | # desired. 23 | BIAS_LR_FACTOR: 1.0 24 | WEIGHT_DECAY_BIAS: ${.WEIGHT_DECAY} 25 | 26 | GAMMA: 0.1 27 | 28 | # See detectron2/solver/build.py for LR scheduler options 29 | LR_SCHEDULER_NAME: WarmupMultiStepLR 30 | # The iteration number to decrease learning rate by GAMMA. 31 | STEPS: [30000] 32 | 33 | WARMUP_FACTOR: 0.0001 34 | WARMUP_ITERS: 2000 35 | WARMUP_METHOD: "linear" 36 | 37 | # Gradient clipping 38 | CLIP_GRADIENTS: 39 | ENABLED: False 40 | # Type of gradient clipping, currently 2 values are supported: 41 | # - "value": the absolute values of elements of each gradients are clipped 42 | # - "norm": the norm of the gradient for each parameter is clipped thus 43 | # affecting all elements in the parameter 44 | CLIP_TYPE: "value" 45 | # Maximum absolute value used for clipping gradients 46 | CLIP_VALUE: 1.0 47 | # Floating point number p for L-p norm to be used with the "norm" 48 | # gradient clipping type; for L-inf, please specify .inf 49 | NORM_TYPE: 2.0 50 | 51 | # Save a checkpoint after every this number of iterations 52 | CHECKPOINT_PERIOD: 5000 53 | 54 | # Support mixed precision training. 55 | MIXED_PRECISION_ENABLED: False 56 | 57 | # If any parameters might not be used in forward pass, turn on this to avoid error in DDP. 58 | # See "Internal Design" -> "Forward Pass": https://pytorch.org/docs/stable/notes/ddp.html 59 | DDP_FIND_UNUSED_PARAMETERS: False 60 | 61 | # Run multiple batches of size IMS_PER_BATCH before doing a backward pass. 62 | # The effective batch size: IMS_PER_BATCH x ACCUMULATE_GRAD_BATCHES 63 | ACCUMULATE_GRAD_BATCHES: 1 64 | 65 | # If True, then SyncBN use only workers of the same machine to compute batch stats used in batchnorm. 66 | # If False, then SyncBN uses all workers across all machines. 67 | SYNCBN_USE_LOCAL_WORKERS: False 68 | -------------------------------------------------------------------------------- /configs/common/test.yaml: -------------------------------------------------------------------------------- 1 | ENABLED: True 2 | 3 | # The period (in terms of steps) to evaluate the model during training. 4 | EVAL_PERIOD: 1000 5 | EVAL_ON_START: False 6 | ADDITIONAL_EVAL_STEPS: [] 7 | 8 | # (dennis.park) detectron2 hardcodes # ims per gpu to 1. 9 | IMS_PER_BATCH: 16 10 | -------------------------------------------------------------------------------- /configs/common/test_dataloader.yaml: -------------------------------------------------------------------------------- 1 | # Number of data loading threads 2 | NUM_WORKERS: 4 3 | 4 | # (dennis.park) Options: InferenceSampler, InferenceGroupSampler 5 | # If using `InferenceGroupSampler`, the user must specify `NUM_IMAGES_PER_GROUP` somewhere else. 
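(Aside on configs/common/optimizer.yaml above.) The batch-size knobs interact as described in the comments; a small worked example of the arithmetic, with a hypothetical 8-GPU machine, is below.

num_gpus = 8                   # hypothetical world size
ims_per_batch = 16             # SOLVER.IMS_PER_BATCH, split across all GPUs
accumulate = 1                 # SOLVER.ACCUMULATE_GRAD_BATCHES

images_per_gpu = ims_per_batch // num_gpus       # 2 images on each GPU per forward pass
effective_batch = ims_per_batch * accumulate     # 16 images contribute to each optimizer update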
6 | SAMPLER: "InferenceSampler" 7 | -------------------------------------------------------------------------------- /configs/common/train_dataloader.yaml: -------------------------------------------------------------------------------- 1 | # Number of data loading threads 2 | NUM_WORKERS: 4 3 | 4 | FILTER_EMPTY_ANNOTATIONS: True 5 | 6 | # Options: TrainingSampler, RepeatFactorTrainingSampler 7 | SAMPLER: TrainingSampler 8 | # Repeat threshold for RepeatFactorTrainingSampler 9 | REPEAT_THRESHOLD: 0.4 10 | 11 | # If True, each batch should contain only images for which the aspect ratio 12 | # is compatible. This groups portrait images together, and landscape images 13 | # are not batched with portrait images. 14 | # NOTE (dennis.park): This is set to True in detectron2. 15 | ASPECT_RATIO_GROUPING: False 16 | -------------------------------------------------------------------------------- /configs/defaults.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - _self_ 3 | - train_datasets@DATASETS.TRAIN: 4 | - test_datasets@DATASETS.TEST: 5 | - feature_extractors@FE: 6 | - meta_arch@: 7 | - common/train_dataloader@DATALOADER.TRAIN 8 | - common/test_dataloader@DATALOADER.TEST 9 | - common/augmentation@INPUT 10 | - common/optimizer@SOLVER 11 | - common/test@TEST 12 | 13 | WANDB: 14 | ENABLED: False 15 | # If True, then it will not upload to the W&B server. 16 | DRYRUN: False 17 | PROJECT: dd3d 18 | GROUP: 19 | TAGS: [] 20 | 21 | EVAL_ONLY: False 22 | EVAL_ON_START: False 23 | 24 | ONLY_REGISTER_DATASETS: False 25 | 26 | OUTPUT_ROOT: './outputs' 27 | 28 | SYNC_OUTPUT_DIR_S3: 29 | ENABLED: False 30 | # The root path in S3 to cache working directories. Must start with 's3://' 31 | ROOT_IN_S3: ??? 32 | # How frequently (in training steps) to sync the working directory. 
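(Aside on configs/common/train_dataloader.yaml above.) REPEAT_THRESHOLD only matters when SAMPLER is RepeatFactorTrainingSampler; a minimal sketch of its effect, assuming detectron2's repeat-factor semantics (an image is repeated according to its rarest category), is below.

import math

def repeat_factor(category_frequency: float, repeat_threshold: float = 0.4) -> float:
    # images containing a category with frequency f < t are repeated ~sqrt(t / f) times per epoch
    return max(1.0, math.sqrt(repeat_threshold / max(category_frequency, 1e-12)))

repeat_factor(0.9)   # 1.0  -> common categories are not oversampled
repeat_factor(0.1)   # 2.0  -> images with rare categories are sampled about twice as often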
33 | PERIOD: 1000 34 | 35 | DATASET_ROOT: /data/datasets/ 36 | TMP_DIR: /tmp/ 37 | 38 | hydra: 39 | callbacks: 40 | distributed_callback: 41 | _target_: tridet.utils.hydra.callbacks.SetupDistributedCallback 42 | wandb_callback: 43 | _target_: tridet.utils.hydra.callbacks.WandbInitCallback 44 | output_dir_callback: 45 | _target_: tridet.utils.hydra.callbacks.SyncOutputDirCallback 46 | d2_logger_callback: 47 | _target_: tridet.utils.hydra.callbacks.D2LoggerCallback 48 | ckpt_path_callback: 49 | _target_: tridet.utils.hydra.callbacks.CkptPathResolverCallback 50 | sync_output_s3_end_callback: 51 | _target_: tridet.utils.hydra.callbacks.SyncOutputS3BeforeEnd 52 | verbose: False 53 | -------------------------------------------------------------------------------- /configs/evaluators/kitti_3d.yaml: -------------------------------------------------------------------------------- 1 | # ----------------------------------- 2 | # KITTI3D evaluator (3D bounding box) 3 | # ----------------------------------- 4 | IOU_THRESHOLDS: [0.5, 0.7] 5 | ONLY_PREPARE_SUBMISSION: False 6 | -------------------------------------------------------------------------------- /configs/evaluators/nuscenes.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TRI-ML/dd3d/56fad8ec9eb7fbd373953f49f9372120b4cd740c/configs/evaluators/nuscenes.yaml -------------------------------------------------------------------------------- /configs/experiments/dd3d_kitti_dla34.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | defaults: 3 | - /evaluators/kitti_3d@EVALUATORS.KITTI3D 4 | - override /meta_arch: dd3d 5 | - override /train_datasets@DATASETS.TRAIN: kitti_3d 6 | - override /test_datasets@DATASETS.TEST: kitti_3d 7 | - override /feature_extractors@FE: dla34_fpn 8 | 9 | MODEL: 10 | # from-coco, IODA-pretrained. 11 | CKPT: https://tri-ml-public.s3.amazonaws.com/github/dd3d/pretrained/depth_pretrained_dla34-y1urdmir-20210422_165446-model_final-remapped.pth 12 | 13 | FE: 14 | BACKBONE: 15 | NORM: FrozenBN 16 | FPN: 17 | NORM: FrozenBN 18 | OUT_FEATURES: ${.FPN.OUT_FEATURES} 19 | 20 | DD3D: 21 | FCOS2D: 22 | NORM: BN 23 | INFERENCE: 24 | NMS_THRESH: 0.75 25 | 26 | FCOS3D: 27 | NORM: FrozenBN 28 | 29 | INPUT: 30 | RESIZE: 31 | # KITTI images are (370, 1224) 32 | MIN_SIZE_TRAIN: [288, 304, 320, 336, 352, 368, 384, 400, 416, 448, 480, 512, 544, 576] 33 | MAX_SIZE_TRAIN: 10000 34 | MIN_SIZE_TEST: 384 35 | MAX_SIZE_TEST: 100000 36 | 37 | SOLVER: 38 | IMS_PER_BATCH: 64 # need at least 128 GPU mem (with fp16). 
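(Aside on configs/defaults.yaml above.) The defaults list uses Hydra package overrides: an entry such as `feature_extractors@FE:` places the chosen group config at cfg.FE, and experiment files (marked `@package _global_`) override those choices. A minimal sketch using the Hydra 1.1 compose API is below; the `+experiments=<name>` override is an assumption about how experiments are selected, and the path passed to initialize() is hypothetical.

from hydra import compose, initialize

with initialize(config_path="configs"):   # relative path, assuming the caller sits at the repo root
    cfg = compose(config_name="defaults", overrides=["+experiments=dd3d_kitti_dla34"])

print(cfg.FE.BACKBONE.NAME)       # "DLA-34", placed at cfg.FE by `feature_extractors@FE: dla34_fpn`
print(cfg.DATASETS.TRAIN.NAME)    # "kitti_3d_train", from `train_datasets@DATASETS.TRAIN: kitti_3d`
print(cfg.SOLVER.IMS_PER_BATCH)   # 64, set by the experiment file on top of configs/common/optimizer.yaml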
39 | BASE_LR: 0.002 40 | MAX_ITER: 25000 41 | STEPS: [21500, 24000] 42 | WARMUP_ITERS: 2000 43 | MIXED_PRECISION_ENABLED: True 44 | CHECKPOINT_PERIOD: 2000 45 | 46 | TEST: 47 | IMS_PER_BATCH: 80 48 | EVAL_PERIOD: 2000 49 | AUG: 50 | ENABLED: True 51 | MIN_SIZES: [320, 384, 448, 512, 576] 52 | MAX_SIZE: 100000 53 | FLIP: True 54 | 55 | DATALOADER: 56 | TRAIN: 57 | NUM_WORKERS: 12 58 | SAMPLER: RepeatFactorTrainingSampler 59 | REPEAT_THRESHOLD: 0.4 60 | 61 | WANDB: 62 | TAGS: [kitti-val, dla34, bn] 63 | -------------------------------------------------------------------------------- /configs/experiments/dd3d_kitti_dla34_overfit.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | defaults: 3 | - dd3d_kitti_dla34 4 | 5 | DATASETS: 6 | TRAIN: 7 | NAME: kitti_3d_overfit 8 | TEST: 9 | NAME: kitti_3d_overfit 10 | 11 | INPUT: 12 | AUG_ENABLED: False 13 | 14 | SOLVER: 15 | IMS_PER_BATCH: 8 16 | BASE_LR: 0.0001 17 | MAX_ITER: 1500 18 | STEPS: [1200] 19 | WARMUP_ITERS: 100 20 | 21 | TEST: 22 | EVAL_PERIOD: 500 23 | AUG: 24 | ENABLED: False 25 | -------------------------------------------------------------------------------- /configs/experiments/dd3d_kitti_omninets.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | defaults: 3 | - /evaluators/kitti_3d@EVALUATORS.KITTI3D 4 | - override /meta_arch: dd3d 5 | - override /train_datasets@DATASETS.TRAIN: kitti_3d 6 | - override /test_datasets@DATASETS.TEST: kitti_3d 7 | - override /feature_extractors@FE: dla34_fpn 8 | 9 | MODEL: 10 | # from-coco, IODA-pretrained. 11 | backbone_with_fpn: 12 | width_mult: 1.0 13 | depth_mult: 1.0 14 | 15 | CKPT: https://tri-ml-public.s3.amazonaws.com/github/dd3d/pretrained/depth_pretrained_omninet-small-3nxjur71.pth 16 | 17 | FE: 18 | BACKBONE: 19 | NORM: FrozenBN 20 | FPN: 21 | NORM: FrozenBN 22 | OUT_FEATURES: ${.FPN.OUT_FEATURES} 23 | 24 | DD3D: 25 | FCOS2D: 26 | NORM: BN 27 | INFERENCE: 28 | NMS_THRESH: 0.75 29 | 30 | FCOS3D: 31 | NORM: FrozenBN 32 | 33 | INPUT: 34 | RESIZE: 35 | # KITTI images are (370, 1224) 36 | MIN_SIZE_TRAIN: [288, 304, 320, 336, 352, 368, 384, 400, 416, 448, 480, 512, 544, 576] 37 | MAX_SIZE_TRAIN: 10000 38 | MIN_SIZE_TEST: 384 39 | MAX_SIZE_TEST: 100000 40 | 41 | SOLVER: 42 | IMS_PER_BATCH: 64 # need at least 128 GPU mem (with fp16). 43 | BASE_LR: 0.002 44 | MAX_ITER: 25000 45 | STEPS: [21500, 24000] 46 | WARMUP_ITERS: 2000 47 | MIXED_PRECISION_ENABLED: True 48 | CHECKPOINT_PERIOD: 2000 49 | 50 | TEST: 51 | IMS_PER_BATCH: 80 52 | EVAL_PERIOD: 2000 53 | AUG: 54 | ENABLED: True 55 | MIN_SIZES: [320, 384, 448, 512, 576] 56 | MAX_SIZE: 100000 57 | FLIP: True 58 | 59 | DATALOADER: 60 | TRAIN: 61 | NUM_WORKERS: 12 62 | SAMPLER: RepeatFactorTrainingSampler 63 | REPEAT_THRESHOLD: 0.4 64 | 65 | WANDB: 66 | TAGS: [kitti-val, dla34, bn] 67 | -------------------------------------------------------------------------------- /configs/experiments/dd3d_kitti_regnety_006_bifpn.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | defaults: 3 | - /evaluators/kitti_3d@EVALUATORS.KITTI3D 4 | - override /meta_arch: dd3d 5 | - override /train_datasets@DATASETS.TRAIN: kitti_3d 6 | - override /test_datasets@DATASETS.TEST: kitti_3d 7 | - override /feature_extractors@FE: regnety_006_bifpn 8 | 9 | MODEL: 10 | # from-coco, IODA-pretrained. 
11 | CKPT: https://tri-ml-public.s3.amazonaws.com/github/dd3d/pretrained/depth_pretrained_dla34-y1urdmir-20210422_165446-model_final-remapped.pth 12 | 13 | FE: 14 | NORM: FrozenBN 15 | FPN: 16 | NORM: FrozenBN 17 | 18 | DD3D: 19 | FCOS2D: 20 | NORM: BN 21 | INFERENCE: 22 | NMS_THRESH: 0.75 23 | 24 | FCOS3D: 25 | NORM: FrozenBN 26 | 27 | INPUT: 28 | RESIZE: 29 | # KITTI images are (370, 1224) 30 | MIN_SIZE_TRAIN: [288, 304, 320, 336, 352, 368, 384, 400, 416, 448, 480, 512, 544, 576] 31 | MAX_SIZE_TRAIN: 10000 32 | MIN_SIZE_TEST: 384 33 | MAX_SIZE_TEST: 100000 34 | 35 | SOLVER: 36 | IMS_PER_BATCH: 64 37 | BASE_LR: 0.002 38 | MAX_ITER: 25000 39 | STEPS: [21500, 24000] 40 | WARMUP_ITERS: 2000 41 | MIXED_PRECISION_ENABLED: True 42 | CHECKPOINT_PERIOD: 2000 43 | 44 | TEST: 45 | IMS_PER_BATCH: 80 46 | EVAL_PERIOD: 2000 47 | AUG: 48 | ENABLED: True 49 | MIN_SIZES: [320, 384, 448, 512, 576] 50 | MAX_SIZE: 100000 51 | FLIP: True 52 | 53 | DATALOADER: 54 | TRAIN: 55 | NUM_WORKERS: 12 56 | SAMPLER: RepeatFactorTrainingSampler 57 | REPEAT_THRESHOLD: 0.4 58 | 59 | WANDB: 60 | TAGS: [kitti-val, dla34, bn] 61 | -------------------------------------------------------------------------------- /configs/experiments/dd3d_kitti_v99.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | defaults: 3 | - /evaluators/kitti_3d@EVALUATORS.KITTI3D 4 | - override /meta_arch: dd3d 5 | - override /train_datasets@DATASETS.TRAIN: kitti_3d 6 | - override /test_datasets@DATASETS.TEST: kitti_3d 7 | - override /feature_extractors@FE: v2_99_fpn 8 | 9 | MODEL: 10 | # from-coco, IODA-pretrained. 11 | CKPT: https://tri-ml-public.s3.amazonaws.com/github/dd3d/pretrained/depth_pretrained_v99-3jlw0p36-20210423_010520-model_final-remapped.pth 12 | 13 | FE: 14 | BACKBONE: 15 | NORM: FrozenBN 16 | FPN: 17 | NORM: FrozenBN 18 | OUT_FEATURES: ${.FPN.OUT_FEATURES} 19 | 20 | DD3D: 21 | FCOS2D: 22 | NORM: BN 23 | INFERENCE: 24 | NMS_THRESH: 0.75 25 | 26 | FCOS3D: 27 | NORM: FrozenBN 28 | 29 | INPUT: 30 | RESIZE: 31 | # KITTI images are (370, 1224) 32 | MIN_SIZE_TRAIN: [288, 304, 320, 336, 352, 368, 384, 400, 416, 448, 480, 512, 544, 576] 33 | MAX_SIZE_TRAIN: 10000 34 | MIN_SIZE_TEST: 384 35 | MAX_SIZE_TEST: 100000 36 | 37 | SOLVER: 38 | IMS_PER_BATCH: 64 # need at least 256 GPU mem (with fp16). 39 | BASE_LR: 0.002 40 | MAX_ITER: 25000 41 | STEPS: [21500, 24000] 42 | WARMUP_ITERS: 2000 # ~35 epochs 43 | MIXED_PRECISION_ENABLED: True 44 | CHECKPOINT_PERIOD: 2000 45 | 46 | TEST: 47 | IMS_PER_BATCH: 80 48 | EVAL_PERIOD: 2000 49 | AUG: 50 | ENABLED: True 51 | MIN_SIZES: [320, 384, 448, 512, 576] 52 | MAX_SIZE: 100000 53 | FLIP: True 54 | 55 | DATALOADER: 56 | TRAIN: 57 | NUM_WORKERS: 12 58 | SAMPLER: RepeatFactorTrainingSampler 59 | REPEAT_THRESHOLD: 0.4 60 | 61 | WANDB: 62 | TAGS: [kitti-val, v2-99, bn] 63 | -------------------------------------------------------------------------------- /configs/experiments/dd3d_nusc_dla34.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | defaults: 3 | - /evaluators/nuscenes@EVALUATORS.NUSCENES 4 | - override /meta_arch: dd3d 5 | - override /train_datasets@DATASETS.TRAIN: nuscenes 6 | - override /test_datasets@DATASETS.TEST: nuscenes 7 | - override /feature_extractors@FE: dla34_fpn 8 | 9 | MODEL: 10 | META_ARCHITECTURE: NuscenesDD3D 11 | # from-coco, IODA-pretrained. 
12 | CKPT: https://tri-ml-public.s3.amazonaws.com/github/dd3d/pretrained/depth_pretrained_dla34-y1urdmir-20210422_165446-model_final-remapped.pth 13 | 14 | FE: 15 | BACKBONE: 16 | NORM: FrozenBN 17 | FPN: 18 | NORM: FrozenBN 19 | OUT_FEATURES: ${.FPN.OUT_FEATURES} 20 | 21 | DD3D: 22 | FCOS2D: 23 | NORM: BN 24 | INFERENCE: 25 | NMS_THRESH: 0.75 26 | 27 | FCOS3D: 28 | NORM: FrozenBN 29 | 30 | NUSC: 31 | LOSS: 32 | WEIGHT_ATTR: 0.2 33 | WEIGHT_SPEED: 0.2 34 | INFERENCE: 35 | NUM_IMAGES_PER_SAMPLE: ${DATASETS.TEST.NUM_IMAGES_PER_SAMPLE} 36 | MAX_NUM_DETS_PER_SAMPLE: 500 37 | 38 | INPUT: 39 | RESIZE: 40 | # Nuscens images are (900, 1600) 41 | MIN_SIZE_TRAIN: [640, 672, 704, 736, 768, 800, 832, 864, 896, 928, 960, 992, 1024, 1056, 1088, 1120, 1152] 42 | MAX_SIZE_TRAIN: 10000 43 | MIN_SIZE_TEST: 896 44 | MAX_SIZE_TEST: 100000 45 | 46 | SOLVER: 47 | IMS_PER_BATCH: 64 # need at least 128 GPU mem (with fp16). 48 | BASE_LR: 0.002 49 | MAX_ITER: 120000 50 | STEPS: [100000, 115000] 51 | WARMUP_ITERS: 2000 52 | MIXED_PRECISION_ENABLED: True 53 | CHECKPOINT_PERIOD: 2000 54 | 55 | TEST: 56 | IMS_PER_BATCH: 96 # 6 * 16 (must be multiple of 6 x #GPUs.) 57 | EVAL_PERIOD: 2000 58 | AUG: 59 | ENABLED: True 60 | MIN_SIZES: [640, 768, 896, 1024, 1152] 61 | MAX_SIZE: 100000 62 | FLIP: True 63 | 64 | DATALOADER: 65 | TRAIN: 66 | NUM_WORKERS: 12 67 | SAMPLER: RepeatFactorTrainingSampler 68 | REPEAT_THRESHOLD: 0.8 69 | TEST: 70 | SAMPLER: InferenceGroupSampler 71 | NUM_IMAGES_PER_GROUP: ${DATASETS.TEST.NUM_IMAGES_PER_SAMPLE} 72 | 73 | WANDB: 74 | TAGS: [nusc-val, v2-99, bn] 75 | -------------------------------------------------------------------------------- /configs/experiments/dd3d_nusc_v99.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | defaults: 3 | - /evaluators/nuscenes@EVALUATORS.NUSCENES 4 | - override /meta_arch: dd3d 5 | - override /train_datasets@DATASETS.TRAIN: nuscenes 6 | - override /test_datasets@DATASETS.TEST: nuscenes 7 | - override /feature_extractors@FE: v2_99_fpn 8 | 9 | MODEL: 10 | META_ARCHITECTURE: NuscenesDD3D 11 | # from-coco, IODA-pretrained. 12 | CKPT: https://tri-ml-public.s3.amazonaws.com/github/dd3d/pretrained/depth_pretrained_v99-3jlw0p36-20210423_010520-model_final-remapped.pth 13 | 14 | FE: 15 | BACKBONE: 16 | NORM: FrozenBN 17 | FPN: 18 | NORM: FrozenBN 19 | OUT_FEATURES: ${.FPN.OUT_FEATURES} 20 | 21 | DD3D: 22 | FCOS2D: 23 | NORM: BN 24 | INFERENCE: 25 | NMS_THRESH: 0.75 26 | 27 | FCOS3D: 28 | NORM: FrozenBN 29 | 30 | NUSC: 31 | LOSS: 32 | WEIGHT_ATTR: 0.2 33 | WEIGHT_SPEED: 0.2 34 | INFERENCE: 35 | NUM_IMAGES_PER_SAMPLE: ${DATASETS.TEST.NUM_IMAGES_PER_SAMPLE} 36 | MAX_NUM_DETS_PER_SAMPLE: 500 37 | 38 | INPUT: 39 | RESIZE: 40 | # Nuscens images are (900, 1600) 41 | MIN_SIZE_TRAIN: [640, 672, 704, 736, 768, 800, 832, 864, 896, 928, 960, 992, 1024, 1056, 1088, 1120, 1152] 42 | MAX_SIZE_TRAIN: 10000 43 | MIN_SIZE_TEST: 896 44 | MAX_SIZE_TEST: 100000 45 | 46 | SOLVER: 47 | IMS_PER_BATCH: 64 # need at least 400 GPU mem (with fp16). 48 | BASE_LR: 0.002 49 | MAX_ITER: 120000 50 | STEPS: [100000, 115000] 51 | WARMUP_ITERS: 2000 52 | MIXED_PRECISION_ENABLED: True 53 | CHECKPOINT_PERIOD: 2000 54 | 55 | TEST: 56 | IMS_PER_BATCH: 192 # 6 * 32 (must be multiple of 6 x #GPUs.) 
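(Aside on the nuScenes TEST.IMS_PER_BATCH comment above.) With InferenceGroupSampler, every nuScenes sample contributes NUM_IMAGES_PER_SAMPLE = 6 camera images that must stay on the same worker, hence the "multiple of 6 x #GPUs" constraint. A small illustrative sanity check:

num_gpus = 8                     # hypothetical world size
ims_per_batch = 192              # TEST.IMS_PER_BATCH
images_per_sample = 6            # DATASETS.TEST.NUM_IMAGES_PER_SAMPLE (surround-view cameras)

assert ims_per_batch % (images_per_sample * num_gpus) == 0, \
    "each GPU must receive whole samples, i.e. a multiple of 6 images"
samples_per_gpu = ims_per_batch // (images_per_sample * num_gpus)   # 4 full samples per GPU per batch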
57 | EVAL_PERIOD: 2000 58 | AUG: 59 | ENABLED: True 60 | MIN_SIZES: [640, 768, 896, 1024, 1152] 61 | MAX_SIZE: 100000 62 | FLIP: True 63 | 64 | DATALOADER: 65 | TRAIN: 66 | NUM_WORKERS: 12 67 | SAMPLER: RepeatFactorTrainingSampler 68 | REPEAT_THRESHOLD: 0.8 69 | TEST: 70 | SAMPLER: InferenceGroupSampler 71 | NUM_IMAGES_PER_GROUP: ${DATASETS.TEST.NUM_IMAGES_PER_SAMPLE} 72 | 73 | WANDB: 74 | TAGS: [nusc-val, v2-99, bn] 75 | -------------------------------------------------------------------------------- /configs/feature_extractors/d2_fpn.yaml: -------------------------------------------------------------------------------- 1 | IN_FEATURES: ${..BACKBONE.OUT_FEATURES} 2 | # By default ('None'), returns all features. 3 | OUT_FEATURES: 4 | 5 | OUT_CHANNELS: 256 6 | NORM: BN 7 | 8 | # Types for fusing the FPN top-down and lateral features. Can be either "sum" or "avg" 9 | FUSE_TYPE: sum 10 | -------------------------------------------------------------------------------- /configs/feature_extractors/dla34_fpn.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - d2_fpn@FPN 3 | 4 | BUILDER: build_fcos_dla_fpn_backbone_p67 5 | 6 | BACKBONE: 7 | NAME: DLA-34 8 | OUT_FEATURES: [level3, level4, level5] 9 | NORM: BN 10 | -------------------------------------------------------------------------------- /configs/feature_extractors/omninet_big.yaml: -------------------------------------------------------------------------------- 1 | _target_: tridet.modeling.backbone.omni_scripts.backbone_with_fpn.build_feature_extractor_all_fuse 2 | 3 | return_list: True 4 | width_mult: 1.3 5 | depth_mult: 1.0 6 | -------------------------------------------------------------------------------- /configs/feature_extractors/omninet_small.yaml: -------------------------------------------------------------------------------- 1 | _target_: tridet.modeling.backbone.omni_scripts.backbone_with_fpn.build_feature_extractor_all_fuse 2 | 3 | return_list: True 4 | width_mult: 1.0 5 | depth_mult: 1.0 6 | -------------------------------------------------------------------------------- /configs/feature_extractors/v2_99_fpn.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - d2_fpn@FPN 3 | 4 | BUILDER: build_fcos_vovnet_fpn_backbone_p6 5 | 6 | BACKBONE: 7 | NAME: V-99-eSE 8 | OUT_FEATURES: [stage2, stage3, stage4, stage5] 9 | NORM: BN 10 | -------------------------------------------------------------------------------- /configs/meta_arch/dd3d.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - /models/dd3d@DD3D 3 | - /visualizers/common@VIS 4 | - /visualizers/d2@VIS.D2 5 | - /visualizers/box3d@VIS.BOX3D 6 | 7 | INPUT: 8 | FORMAT: BGR 9 | 10 | MODEL: 11 | DEVICE: cuda 12 | META_ARCHITECTURE: DD3D 13 | 14 | # Values to be used for image normalization (BGR order, since INPUT.FORMAT defaults to BGR). 15 | # To train on images of different number of channels, just set different mean & std. 16 | # Default values are the mean pixel value from ImageNet: [103.53, 116.28, 123.675] 17 | PIXEL_MEAN: [103.530, 116.280, 123.675] 18 | # NOTE (dennis.park): This is set to [1.0, 1.0, 1.0] in detectron2. 19 | PIXEL_STD: [57.375, 57.120, 58.395] 20 | 21 | # Path (a file path, or S3 URL like s3://... ) to a checkpoint file 22 | # to be loaded to the model. 
23 | CKPT: "" 24 | 25 | BOX2D_ON: True 26 | BOX3D_ON: True 27 | DEPTH_ON: False 28 | -------------------------------------------------------------------------------- /configs/models/dd3d.yaml: -------------------------------------------------------------------------------- 1 | IN_FEATURES: ${FE.OUT_FEATURES} 2 | 3 | NUM_CLASSES: ${DATASETS.TRAIN.NUM_CLASSES} 4 | 5 | # If None, then the feature location starts from (0, 0) 6 | # If "half", then it starts from the (stride / 2, stride / 2) 7 | FEATURE_LOCATIONS_OFFSET: none # "none" or "half" 8 | 9 | # Range of sizes that each FPN level is responsible for. 10 | SIZES_OF_INTEREST: [64, 128, 256, 512] 11 | 12 | INFERENCE: 13 | DO_NMS: True # 2D NMS 14 | DO_POSTPROCESS: True # Resize instances according to the original image size. 15 | DO_BEV_NMS: False # NMS in BEV space. 16 | BEV_NMS_IOU_THRESH: 0.3 17 | NUSC_SAMPLE_AGGREGATE: False 18 | 19 | FCOS2D: 20 | _VERSION: v2 21 | NORM: BN 22 | NUM_CLS_CONVS: 4 23 | NUM_BOX_CONVS: 4 24 | USE_DEFORMABLE: False 25 | USE_SCALE: True 26 | BOX2D_SCALE_INIT_FACTOR: 1.0 27 | 28 | LOSS: 29 | ALPHA: 0.25 30 | GAMMA: 2.0 31 | LOC_LOSS_TYPE: giou 32 | 33 | INFERENCE: 34 | THRESH_WITH_CTR: True 35 | PRE_NMS_THRESH: 0.05 36 | PRE_NMS_TOPK: 1000 37 | POST_NMS_TOPK: 100 38 | NMS_THRESH: 0.6 39 | 40 | FCOS3D: 41 | NORM: BN 42 | NUM_CONVS: 4 43 | USE_DEFORMABLE: False 44 | USE_SCALE: True 45 | DEPTH_SCALE_INIT_FACTOR: 0.3 46 | PROJ_CTR_SCALE_INIT_FACTOR: 1.0 47 | PER_LEVEL_PREDICTORS: False 48 | 49 | # If True, then the depth prediction is scaled using focal lengths; this enables camera-awareness. 50 | SCALE_DEPTH_BY_FOCAL_LENGTHS: True 51 | SCALE_DEPTH_BY_FOCAL_LENGTHS_FACTOR: 500. 52 | 53 | MEAN_DEPTH_PER_LEVEL: ${DATASETS.TRAIN.MEAN_DEPTH_PER_LEVEL} 54 | STD_DEPTH_PER_LEVEL: ${DATASETS.TRAIN.STD_DEPTH_PER_LEVEL} 55 | 56 | MIN_DEPTH: 0.1 57 | MAX_DEPTH: 80.0 58 | 59 | CANONICAL_BOX3D_SIZES: ${DATASETS.TRAIN.CANONICAL_BOX3D_SIZES} 60 | CLASS_AGNOSTIC_BOX3D: False 61 | 62 | # If True, then the network predicts allocentric (local) orientation. 63 | PREDICT_ALLOCENTRIC_ROT: True 64 | # If True, then the network predicts L2 distance between camera and box center; if False, then it predicts the z-value. 65 | PREDICT_DISTANCE: False 66 | 67 | LOSS: 68 | SMOOTH_L1_BETA: 0.05 69 | MAX_LOSS_PER_GROUP_DISENT: 20.0 70 | CONF_3D_TEMPERATURE: 1.0 71 | 72 | WEIGHT_BOX3D: 2.0 73 | WEIGHT_CONF3D: 1.0 74 | 75 | PREPARE_TARGET: 76 | CENTER_SAMPLE: True 77 | POS_RADIUS: 1.5 78 | -------------------------------------------------------------------------------- /configs/models/depth_head.yaml: -------------------------------------------------------------------------------- 1 | _target_: tridet.modeling.dd3d.depth.PacknetDepthHead 2 | _partial_: True # must provide 'input_shape 3 | 4 | net: 5 | _target_: tridet.layers.ConvBnFpnLayers 6 | _partial_: True # must provide 'input_shape'. 7 | 8 | num_layers: 4 9 | norm_kwargs: 10 | kernel_size: 3 11 | activation: 'gelu' 12 | groups: 1 13 | 14 | min_depth: 1.0 15 | max_depth: 80.0 16 | scale_depth_by_focal_length: 900.0 17 | -------------------------------------------------------------------------------- /configs/test_datasets/base_test_dataset.yaml: -------------------------------------------------------------------------------- 1 | NAME: ??? 
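(Aside on the FCOS3D depth options in configs/models/dd3d.yaml above.) A minimal sketch of how those options could combine, written as an assumption about the intent rather than the actual DD3D code: the per-level prediction is un-normalized with MEAN/STD_DEPTH_PER_LEVEL and, when SCALE_DEPTH_BY_FOCAL_LENGTHS is enabled, scaled by focal_length / SCALE_DEPTH_BY_FOCAL_LENGTHS_FACTOR, which makes the same network output map to a larger metric depth under a longer focal length (camera-awareness). Function and argument names are illustrative.

import torch

def decode_depth(x, level, focal_length,
                 mean=(32.594, 15.178, 8.424, 5.004, 4.662),   # KITTI MEAN_DEPTH_PER_LEVEL
                 std=(14.682, 7.139, 4.345, 2.399, 2.587),     # KITTI STD_DEPTH_PER_LEVEL
                 factor=500.0, min_depth=0.1, max_depth=80.0):
    d = mean[level] + std[level] * x        # un-normalize using per-FPN-level statistics
    d = d * focal_length / factor           # camera-aware scaling (if SCALE_DEPTH_BY_FOCAL_LENGTHS)
    return torch.clamp(d, min_depth, max_depth)

decode_depth(torch.zeros(1), level=0, focal_length=721.5)   # ~47 m for a zero prediction at the finest level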
2 | 3 | NUSC_SAMPLE_AGGREGATE_IN_INFERENCE: False 4 | 5 | DATASET_MAPPER: "default" 6 | -------------------------------------------------------------------------------- /configs/test_datasets/kitti_3d.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - base_test_dataset 3 | 4 | NAME: kitti_3d_val 5 | -------------------------------------------------------------------------------- /configs/test_datasets/nuscenes.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - base_test_dataset 3 | 4 | NAME: nusc_val-subsample-8 5 | 6 | NUSC_SAMPLE_AGGREGATE_IN_INFERENCE: True 7 | NUM_IMAGES_PER_SAMPLE: 6 8 | -------------------------------------------------------------------------------- /configs/train_datasets/base_train_dataset.yaml: -------------------------------------------------------------------------------- 1 | NAME: ??? 2 | 3 | CANONICAL_BOX3D_SIZES: ??? 4 | 5 | DATASET_MAPPER: "default" 6 | -------------------------------------------------------------------------------- /configs/train_datasets/kitti_3d.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - base_train_dataset 3 | 4 | NAME: kitti_3d_train 5 | 6 | CANONICAL_BOX3D_SIZES: [ 7 | # (width, length, height) 8 | [1.61876949, 3.89154523, 1.52969237], # Car 9 | [0.62806586, 0.82038497, 1.76784787], # Pedestrian 10 | [0.56898187, 1.77149234, 1.7237099], # Cyclist 11 | [1.9134491 , 5.15499603, 2.18998422], # Van 12 | [2.61168401, 9.22692319, 3.36492722], # Truck 13 | [0.5390196 , 1.08098042, 1.28392158], # Person_sitting 14 | [2.36044838, 15.56991038, 3.5289238], # Tram 15 | [1.24489164, 2.51495357, 1.61402478], # Misc 16 | ] 17 | 18 | NUM_CLASSES: 5 19 | 20 | MEAN_DEPTH_PER_LEVEL: [32.594, 15.178, 8.424, 5.004, 4.662] 21 | STD_DEPTH_PER_LEVEL: [14.682, 7.139, 4.345, 2.399, 2.587] 22 | -------------------------------------------------------------------------------- /configs/train_datasets/nuscenes.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - base_train_dataset 3 | 4 | NAME: nusc_train 5 | 6 | CANONICAL_BOX3D_SIZES: [ 7 | # (width, length, height) 8 | [2.3524184, 0.5062202, 1.0413622], # barrier 9 | [0.61416006, 1.7016163, 1.3054738], # bicycle 10 | [2.9139307, 10.725025, 3.2832346], # bus 11 | [1.9751819, 4.641267, 1.74352], # car 12 | [2.772134, 6.565072, 3.2474296], # construction vehicle 13 | [0.7800532, 2.138673, 1.4437162], # motorcycle 14 | [0.6667362, 0.7181772, 1.7616143], # pedestrian 15 | [0.40246472, 0.4027083, 1.0084083], # traffic cone 16 | [3.0059454, 12.8197, 4.1213827], # trailer 17 | [2.4986045, 6.9310856, 2.8382742] # truck 18 | ] 19 | 20 | NUM_CLASSES: 10 21 | 22 | MEAN_DEPTH_PER_LEVEL: [44.921, 20.252, 11.712, 7.166, 8.548] 23 | STD_DEPTH_PER_LEVEL: [24.331, 9.833, 6.223, 4.611, 8.275] 24 | 25 | DATASET_MAPPER: "nuscenes" 26 | 27 | MIN_NUM_LIDAR_PTS: 3 28 | MIN_BOX_VISIBILITY: 0.2 29 | -------------------------------------------------------------------------------- /configs/visualize_dataloader.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - defaults 3 | 4 | USE_TEST: False 5 | 6 | MODEL: 7 | CHECKPOINT: '' 8 | 9 | WANDB: 10 | ENABLED: False 11 | 12 | SOLVER: 13 | IMS_PER_BATCH: 4 14 | TEST: 15 | IMS_PER_BATCH: 4 16 | -------------------------------------------------------------------------------- /configs/visualizers/base_visualizer.yaml: 
-------------------------------------------------------------------------------- 1 | DATALOADER: 2 | ENABLED: True 3 | 4 | PREDICTIONS: 5 | ENABLED: True 6 | -------------------------------------------------------------------------------- /configs/visualizers/box3d.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - base_visualizer 3 | 4 | DATALOADER: 5 | SCALE: 1.0 6 | RENDER_LABELS: True 7 | 8 | PREDICTIONS: 9 | SCALE: 1.0 10 | RENDER_LABELS: True 11 | THRESHOLD: 0.5 12 | MIN_DEPTH_CENTER: 0. 13 | -------------------------------------------------------------------------------- /configs/visualizers/common.yaml: -------------------------------------------------------------------------------- 1 | DATALOADER_ENABLED: True 2 | DATALOADER_PERIOD: 1000 3 | DATALOADER_MAX_NUM_SAMPLES: 10 4 | 5 | PREDICTIONS_ENABLED: True 6 | PREDICTIONS_MAX_NUM_SAMPLES: 20 7 | -------------------------------------------------------------------------------- /configs/visualizers/d2.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - base_visualizer 3 | 4 | DATALOADER: 5 | SCALE: 1.0 6 | COLOR_MODE: "image" 7 | 8 | PREDICTIONS: 9 | SCALE: 1.0 10 | COLOR_MODE: "image" 11 | THRESHOLD: 0.4 12 | -------------------------------------------------------------------------------- /docker/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:10.2-cudnn8-devel-ubuntu18.04 2 | 3 | ENV PYTHON_VERSION=3.8 4 | 5 | # ------------------------- 6 | # Optional: AWS credentials 7 | # ------------------------- 8 | ARG AWS_SECRET_ACCESS_KEY 9 | ENV AWS_SECRET_ACCESS_KEY=${AWS_SECRET_ACCESS_KEY} 10 | 11 | ARG AWS_ACCESS_KEY_ID 12 | ENV AWS_ACCESS_KEY_ID=${AWS_ACCESS_KEY_ID} 13 | 14 | ARG AWS_DEFAULT_REGION 15 | ENV AWS_DEFAULT_REGION=${AWS_DEFAULT_REGION} 16 | 17 | # ------------------------- 18 | # Optional: W&B credentials 19 | # ------------------------- 20 | ARG WANDB_ENTITY 21 | ENV WANDB_ENTITY=${WANDB_ENTITY} 22 | 23 | ARG WANDB_API_KEY 24 | ENV WANDB_API_KEY=${WANDB_API_KEY} 25 | 26 | # ------------------------- 27 | # Install core APT packages. 28 | # ------------------------- 29 | ENV DEBIAN_FRONTEND=noninteractive 30 | RUN apt-get update && apt-get install -y \ 31 | # essential 32 | build-essential \ 33 | cmake \ 34 | ffmpeg \ 35 | g++-4.8 \ 36 | git \ 37 | curl \ 38 | docker.io \ 39 | vim \ 40 | wget \ 41 | unzip \ 42 | htop \ 43 | libjpeg-dev \ 44 | libpng-dev \ 45 | libavdevice-dev \ 46 | pkg-config \ 47 | # python 48 | python${PYTHON_VERSION} \ 49 | python${PYTHON_VERSION}-dev \ 50 | python3-tk \ 51 | python${PYTHON_VERSION}-distutils \ 52 | # opencv 53 | python3-opencv \ 54 | # set python 55 | && ln -sf /usr/bin/python${PYTHON_VERSION} /usr/bin/python \ 56 | && ln -sf /usr/bin/python${PYTHON_VERSION} /usr/bin/python3 \ 57 | && rm -rf /var/lib/apt/lists/* 58 | 59 | # -------------------------------------------------- 60 | # We use 'mpirun' for launching distributed training. 
61 | # -------------------------------------------------- 62 | RUN mkdir /tmp/openmpi && \ 63 | cd /tmp/openmpi && \ 64 | wget https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-4.1.1.tar.gz && \ 65 | tar zxf openmpi-4.1.1.tar.gz && \ 66 | cd openmpi-4.1.1 && \ 67 | ./configure --enable-orterun-prefix-by-default && \ 68 | make -j $(nproc) all && \ 69 | make install && \ 70 | ldconfig && \ 71 | rm -rf /tmp/openmpi 72 | 73 | # Install OpenSSH for MPI to communicate between containers 74 | RUN apt-get update && apt-get install -y --no-install-recommends openssh-client openssh-server && \ 75 | mkdir -p /var/run/sshd 76 | 77 | # Allow OpenSSH to talk to containers without asking for confirmation 78 | RUN cat /etc/ssh/ssh_config | grep -v StrictHostKeyChecking > /etc/ssh/ssh_config.new && \ 79 | echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new && \ 80 | mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config 81 | 82 | # ------------------------- 83 | # Install core PIP packages. 84 | # ------------------------- 85 | # Upgrade pip. 86 | RUN curl -O https://bootstrap.pypa.io/get-pip.py && \ 87 | python get-pip.py && \ 88 | rm get-pip.py 89 | 90 | # Core tools. 91 | RUN pip install \ 92 | awscli==1.20.27 \ 93 | boto3==1.18.27 \ 94 | coloredlogs==15.0.1 \ 95 | hydra-core==1.1.1 \ 96 | matplotlib==3.4.3 \ 97 | mpi4py==3.1.1 \ 98 | numpy==1.20.3 \ 99 | pandas==1.3.2 \ 100 | requests==2.26.0 \ 101 | scikit-image==0.18.2 \ 102 | scipy==1.7.1 \ 103 | seaborn==0.11.2 \ 104 | tenacity==8.0.1 \ 105 | tqdm==4.62.2 \ 106 | wandb==0.12.0 107 | 108 | RUN pip install numba==0.54.0 Cython==0.29.24 pycocotools==2.0.2 nuscenes-devkit==1.1.7 109 | 110 | # Install pytorch 1.9+cu102 111 | RUN pip install torch==1.9.0+cu102 torchvision==0.10.0+cu102 -f https://download.pytorch.org/whl/torch_stable.html 112 | 113 | # Install fvcore and detectron2. 114 | ENV FVCORE_CACHE="/tmp" 115 | RUN pip install -U 'git+https://github.com/facebookresearch/fvcore' 116 | RUN python -m pip install detectron2 -f https://dl.fbaipublicfiles.com/detectron2/wheels/cu102/torch1.9/index.html 117 | 118 | # Pre-built pytorch3d 119 | RUN pip install pytorch3d -f https://dl.fbaipublicfiles.com/pytorch3d/packaging/wheels/py38_cu102_pyt190/download.html 120 | 121 | #----------------------- 122 | # Copy working directory 123 | #----------------------- 124 | ARG WORKSPACE 125 | COPY . ${WORKSPACE} 126 | 127 | ENV PYTHONPATH "${PYTHONPATH}:${WORKSPACE}/tridet/" 128 | 129 | WORKDIR ${WORKSPACE} 130 | -------------------------------------------------------------------------------- /docker/Dockerfile-cu111: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:11.1.1-cudnn8-devel-ubuntu18.04 2 | 3 | ENV PYTHON_VERSION=3.8 4 | 5 | # ------------------------- 6 | # Optional: AWS credentials 7 | # ------------------------- 8 | ARG AWS_SECRET_ACCESS_KEY 9 | ENV AWS_SECRET_ACCESS_KEY=${AWS_SECRET_ACCESS_KEY} 10 | 11 | ARG AWS_ACCESS_KEY_ID 12 | ENV AWS_ACCESS_KEY_ID=${AWS_ACCESS_KEY_ID} 13 | 14 | ARG AWS_DEFAULT_REGION 15 | ENV AWS_DEFAULT_REGION=${AWS_DEFAULT_REGION} 16 | 17 | # ------------------------- 18 | # Optional: W&B credentials 19 | # ------------------------- 20 | ARG WANDB_ENTITY 21 | ENV WANDB_ENTITY=${WANDB_ENTITY} 22 | 23 | ARG WANDB_API_KEY 24 | ENV WANDB_API_KEY=${WANDB_API_KEY} 25 | 26 | # ------------------------- 27 | # Install core APT packages. 
28 | # ------------------------- 29 | ENV DEBIAN_FRONTEND=noninteractive 30 | RUN apt-get update && apt-get install -y \ 31 | # essential 32 | build-essential \ 33 | cmake \ 34 | ffmpeg \ 35 | g++-4.8 \ 36 | git \ 37 | curl \ 38 | docker.io \ 39 | vim \ 40 | wget \ 41 | unzip \ 42 | htop \ 43 | libjpeg-dev \ 44 | libpng-dev \ 45 | libavdevice-dev \ 46 | pkg-config \ 47 | # python 48 | python${PYTHON_VERSION} \ 49 | python${PYTHON_VERSION}-dev \ 50 | python3-tk \ 51 | python${PYTHON_VERSION}-distutils \ 52 | # opencv 53 | python3-opencv \ 54 | # set python 55 | && ln -sf /usr/bin/python${PYTHON_VERSION} /usr/bin/python \ 56 | && ln -sf /usr/bin/python${PYTHON_VERSION} /usr/bin/python3 \ 57 | && rm -rf /var/lib/apt/lists/* 58 | 59 | # -------------------------------------------------- 60 | # We use 'mpirun' for launching distributed training. 61 | # -------------------------------------------------- 62 | RUN mkdir /tmp/openmpi && \ 63 | cd /tmp/openmpi && \ 64 | wget https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-4.1.1.tar.gz && \ 65 | tar zxf openmpi-4.1.1.tar.gz && \ 66 | cd openmpi-4.1.1 && \ 67 | ./configure --enable-orterun-prefix-by-default && \ 68 | make -j $(nproc) all && \ 69 | make install && \ 70 | ldconfig && \ 71 | rm -rf /tmp/openmpi 72 | 73 | # Install OpenSSH for MPI to communicate between containers 74 | RUN apt-get update && apt-get install -y --no-install-recommends openssh-client openssh-server && \ 75 | mkdir -p /var/run/sshd 76 | 77 | # Allow OpenSSH to talk to containers without asking for confirmation 78 | RUN cat /etc/ssh/ssh_config | grep -v StrictHostKeyChecking > /etc/ssh/ssh_config.new && \ 79 | echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new && \ 80 | mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config 81 | 82 | # ------------------------- 83 | # Install core PIP packages. 84 | # ------------------------- 85 | # Upgrade pip. 86 | RUN curl -O https://bootstrap.pypa.io/get-pip.py && \ 87 | python get-pip.py && \ 88 | rm get-pip.py 89 | 90 | # Core tools. 91 | RUN pip install \ 92 | awscli==1.20.27 \ 93 | boto3==1.18.27 \ 94 | coloredlogs==15.0.1 \ 95 | hydra-core==1.1.1 \ 96 | matplotlib==3.4.3 \ 97 | mpi4py==3.1.1 \ 98 | numpy==1.20.3 \ 99 | pandas==1.3.2 \ 100 | requests==2.26.0 \ 101 | scikit-image==0.18.2 \ 102 | scipy==1.7.1 \ 103 | seaborn==0.11.2 \ 104 | tenacity==8.0.1 \ 105 | tqdm==4.62.2 \ 106 | wandb==0.12.0 107 | 108 | RUN pip install numba==0.54.0 Cython==0.29.24 pycocotools==2.0.2 nuscenes-devkit==1.1.7 109 | 110 | # Install pytorch 1.9+cu111 111 | RUN pip install torch==1.9.0+cu111 torchvision==0.10.0+cu111 -f https://download.pytorch.org/whl/torch_stable.html 112 | 113 | # Install fvcore and detectron2. 114 | ENV FVCORE_CACHE="/tmp" 115 | RUN pip install -U 'git+https://github.com/facebookresearch/fvcore' 116 | RUN python -m pip install detectron2 -f https://dl.fbaipublicfiles.com/detectron2/wheels/cu111/torch1.9/index.html 117 | 118 | # Pre-built pytorch3d 119 | RUN pip install pytorch3d -f https://dl.fbaipublicfiles.com/pytorch3d/packaging/wheels/py38_cu111_pyt190/download.html 120 | 121 | #----------------------- 122 | # Copy working directory 123 | #----------------------- 124 | ARG WORKSPACE 125 | COPY . 
${WORKSPACE} 126 | 127 | ENV PYTHONPATH "${PYTHONPATH}:${WORKSPACE}/tridet/" 128 | 129 | WORKDIR ${WORKSPACE} 130 | -------------------------------------------------------------------------------- /media/figs/demo_dd3d_kitti_val_short.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TRI-ML/dd3d/56fad8ec9eb7fbd373953f49f9372120b4cd740c/media/figs/demo_dd3d_kitti_val_short.gif -------------------------------------------------------------------------------- /media/figs/tri-logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TRI-ML/dd3d/56fad8ec9eb7fbd373953f49f9372120b4cd740c/media/figs/tri-logo.png -------------------------------------------------------------------------------- /scripts/visualize_dataloader.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright 2021 Toyota Research Institute. All rights reserved. 3 | import logging 4 | import sys 5 | from collections import defaultdict 6 | 7 | import cv2 8 | import hydra 9 | from tqdm import tqdm 10 | 11 | from detectron2.data import MetadataCatalog 12 | 13 | from tridet.data import build_test_dataloader, build_train_dataloader 14 | from tridet.data.dataset_mappers import get_dataset_mapper 15 | from tridet.data.datasets import register_datasets 16 | from tridet.utils.setup import setup 17 | from tridet.utils.visualization import mosaic 18 | from tridet.visualizers import get_dataloader_visualizer 19 | 20 | LOG = logging.getLogger('tridet') 21 | 22 | 23 | @hydra.main(config_path="../configs/", config_name="visualize_dataloader") 24 | def main(cfg): 25 | setup(cfg) 26 | dataset_names = register_datasets(cfg) 27 | if cfg.ONLY_REGISTER_DATASETS: 28 | return {}, cfg 29 | LOG.info(f"Registered {len(dataset_names)} datasets:" + '\n\t' + '\n\t'.join(dataset_names)) 30 | 31 | if cfg.USE_TEST: 32 | dataset_name = cfg.DATASETS.TEST.NAME 33 | mapper = get_dataset_mapper(cfg, is_train=False) 34 | dataloader, _ = build_test_dataloader(cfg, dataset_name, mapper=mapper) 35 | else: 36 | mapper = get_dataset_mapper(cfg, is_train=True) 37 | dataloader, _ = build_train_dataloader(cfg, mapper=mapper) 38 | 39 | visualizer_names = MetadataCatalog.get(cfg.DATASETS.TRAIN.NAME).loader_visualizers 40 | for batch_idx, batch in tqdm(enumerate(dataloader)): 41 | viz_images = defaultdict(dict) 42 | LOG.info("Press any key to continue, press 'q' to quit.") 43 | for viz_name in visualizer_names: 44 | viz = get_dataloader_visualizer(cfg, viz_name, cfg.DATASETS.TRAIN.NAME) 45 | for idx, x in enumerate(batch): 46 | viz_images[idx].update(viz.visualize(x)) 47 | 48 | for k in range(len(batch)): 49 | gt_viz = mosaic(list(viz_images[k].values())) 50 | cv2.imshow("dataloader", gt_viz[:, :, ::-1]) 51 | 52 | if cv2.waitKey(0) & 0xFF == ord('q'): 53 | sys.exit() 54 | 55 | 56 | if __name__ == '__main__': 57 | main() # pylint: disable=no-value-for-parameter 58 | -------------------------------------------------------------------------------- /tridet/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Toyota Research Institute. All rights reserved. 2 | -------------------------------------------------------------------------------- /tridet/data/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Toyota Research Institute. All rights reserved. 
2 | from tridet.data.build import build_test_dataloader, build_train_dataloader, collect_dataset_dicts 3 | -------------------------------------------------------------------------------- /tridet/data/augmentations/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Toyota Research Institute. All rights reserved. 2 | from tridet.data.augmentations.build import build_augmentation 3 | -------------------------------------------------------------------------------- /tridet/data/augmentations/build.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 2 | # Copyright 2021 Toyota Research Institute. All rights reserved. 3 | # Adapted from detectron2: 4 | # https://github.com/facebookresearch/detectron2/blob/master/detectron2/data/detection_utils.py 5 | import logging 6 | 7 | from tridet.data.augmentations.color_transform import RandomBrightness, RandomContrast, RandomSaturation 8 | from tridet.data.augmentations.crop_transform import RandomCrop 9 | from tridet.data.augmentations.flip_transform import RandomFlip 10 | from tridet.data.augmentations.resize_transform import ResizeShortestEdge 11 | 12 | LOG = logging.getLogger(__name__) 13 | 14 | 15 | def build_augmentation(cfg, is_train): 16 | """ 17 | Changes from the original function: 18 | - Move `RandomCrop` augmentation here; it's originally in dataset_mapper 19 | https://github.com/facebookresearch/detectron2/blob/master/detectron2/data/dataset_mapper.py#L89 20 | - `RandomFlip()` is configurable. This is mostly unused for now. 21 | - `RandomCrop` uses expanded version of `CropTransform`, which handles depth, intrinsics. 22 | - `ResizeShortestEdge` uses expanded version of `ResizeTransform`, which handles depth, intrinsics. 23 | """ 24 | if not cfg.INPUT.AUG_ENABLED: 25 | return [] 26 | augmentation = [] 27 | if cfg.INPUT.CROP.ENABLED and is_train: 28 | augmentation.append(RandomCrop(cfg.INPUT.CROP.TYPE, cfg.INPUT.CROP.SIZE)) 29 | 30 | # Resize augmentation. 31 | if is_train: 32 | min_size = cfg.INPUT.RESIZE.MIN_SIZE_TRAIN 33 | max_size = cfg.INPUT.RESIZE.MAX_SIZE_TRAIN 34 | sample_style = cfg.INPUT.RESIZE.MIN_SIZE_TRAIN_SAMPLING 35 | else: 36 | min_size = cfg.INPUT.RESIZE.MIN_SIZE_TEST 37 | max_size = cfg.INPUT.RESIZE.MAX_SIZE_TEST 38 | sample_style = "choice" 39 | if min_size: 40 | augmentation.append(ResizeShortestEdge(min_size, max_size, sample_style)) 41 | 42 | if cfg.INPUT.RANDOM_FLIP.ENABLED and is_train: 43 | augmentation.append(RandomFlip()) 44 | 45 | if cfg.INPUT.COLOR_JITTER.ENABLED and is_train: 46 | brightness_lower, brightness_upper = cfg.INPUT.COLOR_JITTER.BRIGHTNESS 47 | brightness_min, brightness_max = 1. - brightness_lower, 1. + brightness_upper 48 | augmentation.append(RandomBrightness(brightness_min, brightness_max)) 49 | 50 | saturation_lower, saturation_upper = cfg.INPUT.COLOR_JITTER.SATURATION 51 | saturation_min, saturation_max = 1. - saturation_lower, 1. + saturation_upper 52 | augmentation.append(RandomSaturation(saturation_min, saturation_max)) 53 | 54 | contrast_lower, contrast_upper = cfg.INPUT.COLOR_JITTER.CONTRAST 55 | contrast_min, contrast_max = 1. - contrast_lower, 1. 
+ contrast_upper 56 | augmentation.append(RandomContrast(contrast_min, contrast_max)) 57 | 58 | if not augmentation: 59 | LOG.warning("No Augmentation!") 60 | return augmentation 61 | -------------------------------------------------------------------------------- /tridet/data/augmentations/color_transform.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Toyota Research Institute. All rights reserved. 2 | # pylint: disable=unused-argument 3 | from fvcore.transforms.transform import BlendTransform 4 | 5 | from detectron2.data.transforms import RandomBrightness as _RandomBrightness 6 | from detectron2.data.transforms import RandomContrast as _RandomContrast 7 | from detectron2.data.transforms import RandomSaturation as _RandomSaturation 8 | 9 | 10 | def apply_no_op_intrinsics(blend_tfm, intrinsics): 11 | return intrinsics 12 | 13 | 14 | def apply_no_op_depth(blend_tfm, depth): 15 | return depth 16 | 17 | 18 | def apply_no_op_box3d(blend_tfm, box3d): 19 | return box3d 20 | 21 | 22 | # (dennis.park) Augment ResizeTransform to handle intrinsics, depth 23 | BlendTransform.register_type("intrinsics", apply_no_op_intrinsics) 24 | BlendTransform.register_type("depth", apply_no_op_depth) 25 | BlendTransform.register_type("box3d", apply_no_op_box3d) 26 | 27 | 28 | class RandomContrast(_RandomContrast): 29 | def get_transform(self, image): 30 | tfm = super().get_transform(image) 31 | return BlendTransform(tfm.src_image, tfm.src_weight, tfm.dst_weight) 32 | 33 | 34 | class RandomBrightness(_RandomBrightness): 35 | def get_transform(self, image): 36 | tfm = super().get_transform(image) 37 | return BlendTransform(tfm.src_image, tfm.src_weight, tfm.dst_weight) 38 | 39 | 40 | class RandomSaturation(_RandomSaturation): 41 | def get_transform(self, image): 42 | tfm = super().get_transform(image) 43 | return BlendTransform(tfm.src_image, tfm.src_weight, tfm.dst_weight) 44 | -------------------------------------------------------------------------------- /tridet/data/augmentations/crop_transform.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Toyota Research Institute. All rights reserved. 
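# Extends detectron2's RandomCrop so that camera intrinsics and depth maps are cropped consistently with the image (see the CropTransform.register_type calls below).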
2 | import numpy as np 3 | from fvcore.transforms.transform import CropTransform 4 | 5 | from detectron2.data.transforms import RandomCrop as _RandomCrop 6 | 7 | 8 | def apply_imcrop_intrinsics(crop_tfm, intrinsics): 9 | assert intrinsics.shape == (3, 3) 10 | assert intrinsics[0, 1] == 0 # undistorted 11 | assert np.allclose(intrinsics, np.triu(intrinsics)) # check if upper triangular 12 | 13 | x0, y0 = crop_tfm.x0, crop_tfm.y0 14 | new_intrinsics = intrinsics.copy() 15 | new_intrinsics[0, 2] -= x0 16 | new_intrinsics[1, 2] -= y0 17 | 18 | return new_intrinsics 19 | 20 | 21 | def apply_imcrop_depth(crop_tfm, depth): 22 | assert len(depth.shape) == 2 23 | x0, y0, w, h = crop_tfm.x0, crop_tfm.y0, crop_tfm.w, crop_tfm.h 24 | return depth[y0:y0 + h, x0:x0 + w] 25 | 26 | 27 | def apply_imcrop_box3d(crop_tfm, box3d): # pylint: disable=unused-argument 28 | return box3d 29 | 30 | 31 | # (dennis.park) Augment ResizeTransform to handle intrinsics, depth 32 | CropTransform.register_type("intrinsics", apply_imcrop_intrinsics) 33 | CropTransform.register_type("depth", apply_imcrop_depth) 34 | CropTransform.register_type("box3d", apply_imcrop_box3d) 35 | 36 | 37 | class RandomCrop(_RandomCrop): 38 | def get_transform(self, image): 39 | tfm = super().get_transform(image) 40 | return CropTransform(tfm.x0, tfm.y0, tfm.w, tfm.h) 41 | -------------------------------------------------------------------------------- /tridet/data/augmentations/flip_transform.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Toyota Research Institute. All rights reserved. 2 | import numpy as np 3 | from fvcore.transforms.transform import HFlipTransform, NoOpTransform, VFlipTransform 4 | 5 | from detectron2.data.transforms import RandomFlip as _RandomFlip 6 | 7 | 8 | def apply_hflip_intrinsics(hflip_tfm, intrinsics): 9 | intrinsics[0, 2] = hflip_tfm.width - intrinsics[0, 2] 10 | return intrinsics 11 | 12 | 13 | def apply_vflip_intrinsics(vflip_tfm, intrinsics): 14 | intrinsics[1, 2] = vflip_tfm.height - intrinsics[1, 2] 15 | return intrinsics 16 | 17 | 18 | def apply_hflip_depth(hflip_tfm, depth): # pylint: disable=unused-argument 19 | assert depth.ndim == 2 20 | return np.flip(depth, axis=1).copy() 21 | 22 | 23 | def apply_vflip_depth(vflip_tfm, depth): # pylint: disable=unused-argument 24 | assert depth.ndim == 2 25 | return np.flip(depth, axis=0).copy() 26 | 27 | 28 | def apply_hflip_box3d(hflip_tfm, box3d): # pylint: disable=unused-argument 29 | """Horizontally flip 3D box. 30 | 31 | CAVEAT: This function makes assumption about the object symmetry wrt *y=0* plane. 32 | 33 | new quaternion: [quat.z, -quat.y, -quat.x, quat.w] 34 | https://stackoverflow.com/questions/32438252/efficient-way-to-apply-mirror-effect-on-quaternion-rotation 35 | 36 | Parameters 37 | ---------- 38 | hflip_tfm: HFlipTransform 39 | 40 | box3d: np.array 41 | 10D representation of 3D box. quaternion (4) + location (3) + dimension (3) 42 | 43 | Returns 44 | ------- 45 | np.array 46 | 10D representation of flipped 3D box. 
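    Notes
    -----
    Only the x-component of the translation is negated and the box dimensions are left unchanged; the quaternion is remapped according to the formula above.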
47 | """ 48 | quat, tvec, dims = box3d[:4], box3d[4:7], box3d[7:] 49 | 50 | quat_new = np.float32([quat[3], -quat[2], -quat[1], quat[0]]) 51 | tvec_new = tvec.copy() 52 | tvec_new[0] = -tvec_new[0] 53 | dims_new = dims.copy() 54 | return np.concatenate([quat_new, tvec_new, dims_new]) 55 | 56 | 57 | def apply_vflip_box3d(vflip_tfm, box3d): # pylint: disable=unused-argument 58 | # TODO 59 | raise NotImplementedError() 60 | 61 | 62 | HFlipTransform.register_type("intrinsics", apply_hflip_intrinsics) 63 | HFlipTransform.register_type("depth", apply_hflip_depth) 64 | HFlipTransform.register_type("box3d", apply_hflip_box3d) 65 | 66 | VFlipTransform.register_type("intrinsics", apply_vflip_intrinsics) 67 | VFlipTransform.register_type("depth", apply_vflip_depth) 68 | VFlipTransform.register_type("box3d", apply_vflip_box3d) 69 | 70 | 71 | class RandomFlip(_RandomFlip): 72 | def get_transform(self, image): 73 | tfm = super().get_transform(image) 74 | if isinstance(tfm, NoOpTransform): 75 | return tfm 76 | elif isinstance(tfm, HFlipTransform): 77 | return HFlipTransform(tfm.width) 78 | else: 79 | assert isinstance(tfm, VFlipTransform) 80 | return VFlipTransform(tfm.height) 81 | -------------------------------------------------------------------------------- /tridet/data/augmentations/resize_transform.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Toyota Research Institute. All rights reserved. 2 | # pylint: disable=unused-argument 3 | import cv2 4 | import numpy as np 5 | 6 | from detectron2.data.transforms import ResizeShortestEdge as _ResizeShortestEdge 7 | from detectron2.data.transforms import ResizeTransform 8 | 9 | CV2_INTERPOLATION_MODES = {"nearest": cv2.INTER_NEAREST, "linear": cv2.INTER_LINEAR, "cubic": cv2.INTER_CUBIC} 10 | DEFAULT_DEPTH_INTERPOLOATION_MODE = "nearest" 11 | 12 | 13 | def apply_imresize_intrinsics(resize_tfm, intrinsics): 14 | assert intrinsics.shape == (3, 3) 15 | assert intrinsics[0, 1] == 0 # undistorted 16 | assert np.allclose(intrinsics, np.triu(intrinsics)) # check if upper triangular 17 | 18 | factor_x = resize_tfm.new_w / resize_tfm.w 19 | factor_y = resize_tfm.new_h / resize_tfm.h 20 | new_intrinsics = intrinsics * np.float32([factor_x, factor_y, 1]).reshape(3, 1) # pylint: disable=too-many-function-args 21 | return new_intrinsics 22 | 23 | 24 | def apply_imresize_depth(resize_tfm, depth): 25 | assert depth.shape == (resize_tfm.h, resize_tfm.w) 26 | interp = CV2_INTERPOLATION_MODES[DEFAULT_DEPTH_INTERPOLOATION_MODE] 27 | resized_depth = cv2.resize(depth, (resize_tfm.new_w, resize_tfm.new_h), interpolation=interp) 28 | return resized_depth 29 | 30 | 31 | def resize_depth_preserve(resize_tfm, depth): 32 | """ 33 | Adapted from: 34 | https://github.com/TRI-ML/packnet-sfm_internal/blob/919ab604ae2319e4554d3b588877acfddf877f9c/packnet_sfm/datasets/augmentations.py#L93 35 | 36 | ------------------------------------------------------------------------------------------------------------------- 37 | 38 | Resizes depth map preserving all valid depth pixels 39 | Multiple downsampled points can be assigned to the same pixel. 
40 | Parameters 41 | ---------- 42 | depth : np.array [h,w] 43 | Depth map 44 | shape : tuple (H,W) 45 | Output shape 46 | Returns 47 | ------- 48 | depth : np.array [H,W,1] 49 | Resized depth map 50 | """ 51 | assert depth.shape == (resize_tfm.h, resize_tfm.w) 52 | 53 | new_shape = (resize_tfm.new_h, resize_tfm.new_w) 54 | 55 | h, w = depth.shape 56 | x = depth.reshape(-1) 57 | # Create coordinate grid 58 | uv = np.mgrid[:h, :w].transpose(1, 2, 0).reshape(-1, 2) 59 | # Filters valid points 60 | idx = x > 0 61 | crd, val = uv[idx], x[idx] 62 | # Downsamples coordinates 63 | crd[:, 0] = (crd[:, 0] * (new_shape[0] / h)).astype(np.int32) 64 | crd[:, 1] = (crd[:, 1] * (new_shape[1] / w)).astype(np.int32) 65 | # Filters points inside image 66 | idx = (crd[:, 0] < new_shape[0]) & (crd[:, 1] < new_shape[1]) 67 | crd, val = crd[idx], val[idx] 68 | # Creates downsampled depth image and assigns points 69 | resized_depth = np.zeros(new_shape) 70 | resized_depth[crd[:, 0], crd[:, 1]] = val 71 | return resized_depth 72 | 73 | 74 | def apply_imresize_box3d(resize_tfm, box3d): 75 | return box3d 76 | 77 | 78 | # (dennis.park) Augment ResizeTransform to handle intrinsics, depth 79 | ResizeTransform.register_type("intrinsics", apply_imresize_intrinsics) 80 | # ResizeTransform.register_type("depth", apply_imresize_depth) 81 | ResizeTransform.register_type("depth", resize_depth_preserve) 82 | ResizeTransform.register_type("box3d", apply_imresize_box3d) 83 | 84 | 85 | class ResizeShortestEdge(_ResizeShortestEdge): 86 | def get_transform(self, image): 87 | tfm = super().get_transform(image) 88 | return ResizeTransform(tfm.h, tfm.w, tfm.new_h, tfm.new_w) 89 | -------------------------------------------------------------------------------- /tridet/data/dataset_mappers/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Toyota Research Institute. All rights reserved. 2 | # pylint: disable=no-value-for-parameter, redundant-keyword-arg 3 | from tridet.data.dataset_mappers.dataset_mapper import DefaultDatasetMapper 4 | from tridet.data.dataset_mappers.nuscenes_mapper import NuscenesDatasetMapper 5 | 6 | 7 | def get_dataset_mapper(cfg, is_train=True): 8 | if is_train: 9 | dataset_mapper_name = cfg.DATASETS.TRAIN.DATASET_MAPPER 10 | else: 11 | dataset_mapper_name = cfg.DATASETS.TEST.DATASET_MAPPER 12 | 13 | if dataset_mapper_name == "default": 14 | return DefaultDatasetMapper(cfg, is_train=is_train) 15 | elif dataset_mapper_name == "nuscenes": 16 | return NuscenesDatasetMapper(cfg, is_train=is_train) 17 | else: 18 | raise ValueError(f"Invalid dataset mapper: {dataset_mapper_name}") 19 | -------------------------------------------------------------------------------- /tridet/data/dataset_mappers/nuscenes_mapper.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Toyota Research Institute. All rights reserved. 2 | import torch 3 | 4 | from tridet.data.dataset_mappers import DefaultDatasetMapper 5 | 6 | 7 | class NuscenesDatasetMapper(DefaultDatasetMapper): 8 | """ 9 | In addition to 2D / 3D boxes, each instance also has attribute and speed. 10 | 11 | Assumption: image transformation does not change attributes and speed. 
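    A minimal usage sketch of the extra fields added below (the content of 'dataset_dict' is assumed to follow the default mapper's output format):

        mapper = NuscenesDatasetMapper(cfg, is_train=True)
        out = mapper(dataset_dict)
        out['instances'].gt_attributes  # (N,) int64 tensor of per-instance attribute ids
        out['instances'].gt_speeds      # (N,) float32 tensor of per-instance speed magnitudes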
12 | """ 13 | def __call__(self, dataset_dict): 14 | dataset_dict = super().__call__(dataset_dict) 15 | 16 | annos = dataset_dict['annotations'] 17 | 18 | # NuScenes attributes 19 | attributes = [obj["attribute_id"] for obj in annos] 20 | attributes = torch.tensor(attributes, dtype=torch.int64) 21 | dataset_dict['instances'].gt_attributes = attributes 22 | 23 | # Speed (magnitude of velocity) 24 | speeds = [obj["speed"] for obj in annos] 25 | speeds = torch.tensor(speeds, dtype=torch.float32) 26 | dataset_dict['instances'].gt_speeds = speeds 27 | return dataset_dict 28 | -------------------------------------------------------------------------------- /tridet/data/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Toyota Research Institute. All rights reserved. 2 | import logging 3 | import random 4 | from functools import partial 5 | 6 | from detectron2.data import DatasetCatalog 7 | 8 | from tridet.data.datasets.kitti_3d import register_kitti_3d_datasets 9 | from tridet.data.datasets.nuscenes import register_nuscenes_datasets 10 | 11 | 12 | def register_datasets(cfg): 13 | train_dataset_name = cfg.DATASETS.TRAIN.NAME 14 | test_dataset_name = cfg.DATASETS.TEST.NAME 15 | 16 | required_datasets = [train_dataset_name, test_dataset_name] 17 | 18 | dataset_names = [] 19 | dataset_names.extend(register_kitti_3d_datasets(required_datasets, cfg)) 20 | dataset_names.extend(register_nuscenes_datasets(required_datasets, cfg)) 21 | if cfg.ONLY_REGISTER_DATASETS: 22 | for name in dataset_names: 23 | DatasetCatalog.get(name) 24 | return dataset_names 25 | 26 | 27 | def random_sample_dataset_dicts(dataset_name, num_samples=10): 28 | dataset_dicts = DatasetCatalog.get(dataset_name) 29 | num_samples = min(num_samples, len(dataset_dicts)) 30 | random.seed(42) 31 | if num_samples > 0: 32 | inds = random.sample(range(len(dataset_dicts)), k=num_samples) 33 | else: 34 | # Use all dataset items. 35 | inds = list(range(len(dataset_dicts))) 36 | samples = [dataset_dicts[i] for i in inds] 37 | return samples, inds 38 | -------------------------------------------------------------------------------- /tridet/data/datasets/kitti_3d/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Toyota Research Institute. All rights reserved. 
2 | import logging 3 | import os 4 | from functools import partial 5 | 6 | from detectron2.data import DatasetCatalog 7 | 8 | from tridet.data.datasets.kitti_3d.build import build_monocular_kitti3d_dataset, register_kitti_3d_metadata 9 | 10 | LOG = logging.getLogger(__name__) 11 | 12 | KITTI_ROOT = 'KITTI3D' 13 | 14 | DATASET_DICTS_BUILDER = { 15 | # Monocular datasets 16 | "kitti_3d_train": (build_monocular_kitti3d_dataset, dict(mv3d_split='train')), 17 | "kitti_3d_train_project_box3d": (build_monocular_kitti3d_dataset, dict(mv3d_split='train', box2d_from_box3d=True)), 18 | "kitti_3d_train_right_cam": (build_monocular_kitti3d_dataset, dict(mv3d_split='train', sensors=('camera_3', ))), 19 | "kitti_3d_train_both_cams": 20 | (build_monocular_kitti3d_dataset, dict(mv3d_split='train', sensors=('camera_2', 'camera_3'))), 21 | "kitti_3d_val": (build_monocular_kitti3d_dataset, dict(mv3d_split='val')), 22 | "kitti_3d_trainval": (build_monocular_kitti3d_dataset, dict(mv3d_split='trainval')), 23 | "kitti_3d_test": (build_monocular_kitti3d_dataset, dict(mv3d_split='test')), 24 | "kitti_3d_overfit": (build_monocular_kitti3d_dataset, dict(mv3d_split='train', max_num_items=32)), 25 | } 26 | 27 | METADATA_BUILDER = {name: (register_kitti_3d_metadata, {}) for name in DATASET_DICTS_BUILDER.keys()} 28 | 29 | 30 | def register_kitti_3d_datasets(required_datasets, cfg): 31 | kitti_3d_datasets = sorted(list(set(required_datasets).intersection(DATASET_DICTS_BUILDER.keys()))) 32 | if kitti_3d_datasets: 33 | LOG.info(f"KITTI-3D dataset(s): {', '.join(kitti_3d_datasets)} ") 34 | for name in kitti_3d_datasets: 35 | fn, kwargs = DATASET_DICTS_BUILDER[name] 36 | kwargs.update({'root_dir': os.path.join(cfg.DATASET_ROOT, KITTI_ROOT)}) 37 | DatasetCatalog.register(name, partial(fn, **kwargs)) 38 | 39 | fn, kwargs = METADATA_BUILDER[name] 40 | kwargs.update({'coco_cache_dir': cfg.TMP_DIR}) 41 | fn(name, **kwargs) 42 | return kitti_3d_datasets 43 | -------------------------------------------------------------------------------- /tridet/data/datasets/nuscenes/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Toyota Research Institute. All rights reserved. 2 | import logging 3 | import os 4 | from functools import partial 5 | 6 | from detectron2.data import DatasetCatalog 7 | from detectron2.utils.comm import get_world_size 8 | 9 | from tridet.data.datasets.nuscenes.build import build_nuscenes_dataset, register_nuscenes_metadata 10 | 11 | LOG = logging.getLogger(__name__) 12 | 13 | NUSCENES_ROOT = "nuScenes" 14 | 15 | NUSC_DATASET_NAMES = [ 16 | "nusc_train", 17 | "nusc_val", 18 | "nusc_val-subsample-8", 19 | "nusc_trainval", 20 | "nusc_test", 21 | "nusc_mini_train", 22 | "nusc_mini_val", 23 | ] 24 | 25 | DATASET_DICTS_BUILDER = {name: (build_nuscenes_dataset, dict(name=name)) for name in NUSC_DATASET_NAMES} 26 | 27 | METADATA_BUILDER = {name: (register_nuscenes_metadata, {}) for name in DATASET_DICTS_BUILDER.keys()} 28 | 29 | 30 | def register_nuscenes_datasets(required_datasets, cfg): 31 | if cfg.DATASETS.TEST.NAME in ("nusc_train", "nusc_val", "nusc_trainval", "nusc_test") and \ 32 | get_world_size() > 1: 33 | LOG.warning("The distributed evaluation does not work well with large test set for now. 
" \ 34 | f"If program hangs, consider using non-distributed evaluation: {cfg.DATASETS.TEST.NAME}") 35 | 36 | nusc_datasets = sorted(list(set(required_datasets).intersection(DATASET_DICTS_BUILDER.keys()))) 37 | if nusc_datasets: 38 | LOG.info(f"nuScenes-3D dataset(s): {', '.join(nusc_datasets)} ") 39 | for name in nusc_datasets: 40 | fn, kwargs = DATASET_DICTS_BUILDER[name] 41 | kwargs.update({ 42 | 'root_dir': os.path.join(cfg.DATASET_ROOT, NUSCENES_ROOT), 43 | 'min_num_lidar_points': cfg.DATASETS.TRAIN.MIN_NUM_LIDAR_PTS, 44 | 'min_box_visibility': cfg.DATASETS.TRAIN.MIN_BOX_VISIBILITY 45 | }) 46 | DatasetCatalog.register(name, partial(fn, **kwargs)) 47 | 48 | fn, kwargs = METADATA_BUILDER[name] 49 | fn(name, **kwargs) 50 | return nusc_datasets 51 | -------------------------------------------------------------------------------- /tridet/data/samplers/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Toyota Research Institute. All rights reserved. 2 | from tridet.data.samplers.group_sampler import InferenceGroupSampler 3 | -------------------------------------------------------------------------------- /tridet/data/samplers/group_sampler.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Toyota Research Institute. All rights reserved. 2 | from torch.utils.data.sampler import Sampler 3 | 4 | from detectron2.utils import comm 5 | 6 | 7 | class InferenceGroupSampler(Sampler): 8 | """ 9 | Assumptions: 10 | 1) The dataset consists of in-order groups, i.e. [*group-1-items, *group-2-items, ...] 11 | 2) In the dataloader, per-gpu batch size (i.e. total_batch_size / world_size) must be 12 | a multiple of the group size. CAVEAT: this may cause CUDA OOM. 13 | """ 14 | def __init__(self, total_size, group_size): 15 | """ 16 | Args: 17 | size (int): the total number of data of the underlying dataset to sample from 18 | """ 19 | assert total_size > 0 and group_size > 0 20 | assert total_size % group_size == 0, \ 21 | f"The total size must be divisible by group size: total size={total_size}, group size={group_size}" 22 | 23 | self._total_size = total_size 24 | self._group_size = group_size 25 | self._rank = comm.get_rank() 26 | self._world_size = comm.get_world_size() 27 | 28 | self._num_groups = total_size // group_size 29 | 30 | shard_size = ((self._num_groups - 1) // self._world_size + 1) * self._group_size 31 | 32 | # shard_size = (self._total_size - 1) // self._world_size + 1 33 | begin = shard_size * self._rank 34 | end = min(shard_size * (self._rank + 1), self._total_size) 35 | self._local_indices = range(begin, end) 36 | 37 | def __iter__(self): 38 | yield from self._local_indices 39 | 40 | def __len__(self): 41 | return len(self._local_indices) 42 | -------------------------------------------------------------------------------- /tridet/data/transform_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | # Copyright 2021 Toyota Research Institute. All rights reserved. 
3 | # Adapted from detectron2: 4 | # https://github.com/facebookresearch/detectron2/blob/master/detectron2/data/detection_utils.py 5 | import numpy as np 6 | import torch 7 | 8 | from detectron2.data import transforms as T 9 | from detectron2.structures import Boxes, BoxMode, Instances 10 | 11 | from tridet.structures.boxes3d import Boxes3D 12 | 13 | __all__ = ["transform_instance_annotations", "annotations_to_instances"] 14 | 15 | 16 | def transform_instance_annotations( 17 | annotation, 18 | transforms, 19 | image_size, 20 | ): 21 | """Adapted from: 22 | https://github.com/facebookresearch/detectron2/blob/master/detectron2/data/detection_utils.py#L254 23 | 24 | The changes from original: 25 | - The presence of 2D bounding box (i.e. "bbox" field) is assumed by default in d2; here it's optional. 26 | - Add optional 3D bounding box support. 27 | - If the instance mask annotation is in RLE, then it's decoded into polygons, not bitmask, to save memory. 28 | 29 | =============================================================================================================== 30 | 31 | Apply transforms to box, segmentation and keypoints annotations of a single instance. 32 | 33 | It will use `transforms.apply_box` for the box, and 34 | `transforms.apply_coords` for segmentation polygons & keypoints. 35 | If you need anything more specially designed for each data structure, 36 | you'll need to implement your own version of this function or the transforms. 37 | 38 | Args: 39 | annotation (dict): dict of instance annotations for a single instance. 40 | It will be modified in-place. 41 | transforms (TransformList or list[Transform]): 42 | image_size (tuple): the height, width of the transformed image 43 | keypoint_hflip_indices (ndarray[int]): see `create_keypoint_hflip_indices`. 44 | 45 | Returns: 46 | dict: 47 | the same input dict with fields "bbox", "segmentation", "keypoints" 48 | transformed according to `transforms`. 49 | The "bbox_mode" field will be set to XYXY_ABS. 50 | """ 51 | if isinstance(transforms, (tuple, list)): 52 | transforms = T.TransformList(transforms) 53 | # (dennis.park) Here 2D bounding box is optional. 54 | if "bbox" in annotation: 55 | assert "bbox_mode" in annotation, "'bbox' is present, but 'bbox_mode' is not." 56 | # bbox is 1d (per-instance bounding box) 57 | bbox = BoxMode.convert(annotation["bbox"], annotation["bbox_mode"], BoxMode.XYXY_ABS) 58 | bbox = transforms.apply_box(np.array([bbox]))[0] 59 | # clip transformed bbox to image size 60 | bbox = bbox.clip(min=0) 61 | bbox = np.minimum(bbox, list(image_size + image_size)[::-1]) 62 | annotation["bbox"] = bbox 63 | annotation["bbox_mode"] = BoxMode.XYXY_ABS 64 | 65 | # Vertical flipping is not implemented (`flip_transform.py`). TODO: implement if needed. 66 | if "bbox3d" in annotation: 67 | bbox3d = np.array(annotation["bbox3d"]) 68 | annotation['bbox3d'] = transforms.apply_box3d(bbox3d) 69 | 70 | return annotation 71 | 72 | 73 | def _create_empty_instances(image_size): 74 | target = Instances(image_size) 75 | 76 | target.gt_boxes = Boxes([]) 77 | target.gt_classes = torch.tensor([], dtype=torch.int64) 78 | target.gt_boxes3d = Boxes3D.from_vectors([], torch.eye(3, dtype=torch.float32)) 79 | 80 | return target 81 | 82 | 83 | def annotations_to_instances( 84 | annos, 85 | image_size, 86 | intrinsics=None, 87 | ): 88 | """ 89 | Create an :class:`Instances` object used by the models, 90 | from instance annotations in the dataset dict. 
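    Compared to the detectron2 original, this version also builds "gt_boxes3d" from the optional "bbox3d" annotation field, which requires 'intrinsics' to be provided.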
91 | 92 | Args: 93 | annos (list[dict]): a list of instance annotations in one image, each 94 | element for one instance. 95 | image_size (tuple): height, width 96 | 97 | Returns: 98 | Instances: 99 | It will contain fields "gt_boxes", "gt_classes", 100 | "gt_masks", "gt_keypoints", if they can be obtained from `annos`. 101 | This is the format that builtin models expect. 102 | """ 103 | if len(annos) == 0: 104 | return _create_empty_instances(image_size) 105 | 106 | boxes = [BoxMode.convert(obj["bbox"], obj["bbox_mode"], BoxMode.XYXY_ABS) for obj in annos] 107 | target = Instances(image_size) 108 | target.gt_boxes = Boxes(boxes) 109 | 110 | classes = [obj["category_id"] for obj in annos] 111 | classes = torch.tensor(classes, dtype=torch.int64) 112 | target.gt_classes = classes 113 | 114 | if len(annos) and "bbox3d" in annos[0]: 115 | assert intrinsics is not None 116 | target.gt_boxes3d = Boxes3D.from_vectors([anno['bbox3d'] for anno in annos], intrinsics) 117 | if len(target.gt_boxes3d) != target.gt_boxes.tensor.shape[0]: 118 | raise ValueError( 119 | f"The sizes of `gt_boxes3d` and `gt_boxes` do not match: a={len(target.gt_boxes3d)}, b={target.gt_boxes.tensor.shape[0]}." 120 | ) 121 | 122 | return target 123 | -------------------------------------------------------------------------------- /tridet/evaluators/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Toyota Research Institute. All rights reserved. 2 | import inspect 3 | import logging 4 | import os 5 | 6 | from detectron2.evaluation import COCOEvaluator, SemSegEvaluator 7 | 8 | from tridet.data.datasets.nuscenes import NUSCENES_ROOT 9 | from tridet.evaluators.kitti_3d_evaluator import KITTI3DEvaluator 10 | from tridet.evaluators.nuscenes_evaluator import NuscenesEvaluator 11 | from tridet.utils.comm import is_distributed 12 | 13 | LOG = logging.getLogger('tridet') 14 | 15 | AVAILABLE_EVALUATORS = ["coco_evaluator", "kitti3d_evaluator", "nuscenes_evaluator"] 16 | 17 | 18 | def get_evaluator(cfg, dataset_name, evaluator_name, output_dir): 19 | assert evaluator_name in AVAILABLE_EVALUATORS, f"Invalid evaluator name: {evaluator_name}." 20 | 21 | distributed = is_distributed() 22 | 23 | if evaluator_name == "coco_evaluator": 24 | tasks = [] 25 | assert cfg.MODEL.BOX2D_ON 26 | tasks.append('bbox') 27 | return COCOEvaluator(dataset_name, tuple(tasks), distributed=distributed, output_dir=output_dir) 28 | elif evaluator_name == "kitti3d_evaluator": 29 | return KITTI3DEvaluator( 30 | dataset_name=dataset_name, 31 | iou_thresholds=cfg.EVALUATORS.KITTI3D.IOU_THRESHOLDS, 32 | only_prepare_submission=cfg.EVALUATORS.KITTI3D.ONLY_PREPARE_SUBMISSION, 33 | distributed=distributed, 34 | output_dir=output_dir, 35 | ) 36 | elif evaluator_name == "nuscenes_evaluator": 37 | nusc_root = os.path.join(cfg.DATASET_ROOT, NUSCENES_ROOT) 38 | return NuscenesEvaluator(nusc_root=nusc_root, dataset_name=dataset_name, output_dir=output_dir) 39 | -------------------------------------------------------------------------------- /tridet/layers/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Toyota Research Institute. All rights reserved. 
2 | from tridet.layers.bev_nms import bev_nms 3 | from tridet.layers.iou_loss import IOULoss 4 | from tridet.layers.smooth_l1_loss import smooth_l1_loss 5 | -------------------------------------------------------------------------------- /tridet/layers/bev_nms.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Toyota Research Institute. All rights reserved. 2 | import logging 3 | 4 | import numpy as np 5 | import torch 6 | from pytorch3d.transforms import transform3d as t3d 7 | 8 | from detectron2.layers.nms import batched_nms_rotated 9 | from detectron2.structures import RotatedBoxes 10 | 11 | from tridet.structures.pose import Pose 12 | 13 | LOG = logging.getLogger(__name__) 14 | 15 | # yapf: disable 16 | # ------------------------------- 17 | # Convention of reference frames. 18 | # ------------------------------- 19 | # Rotation from "camera" frame to "vehicle" frame. 20 | # |------------------|--------------------------------| 21 | # | Camera | Vehicle | Interpretation in Vehicle frame| 22 | # |------------------|--------------------------------| 23 | # | z | x | forward | 24 | # | x | -y | right | 25 | # | y | -z | down | 26 | # |------------------|--------------------------------| 27 | CAMERA_TO_VEHICLE_ROTATION = Pose.from_matrix(np.float32([ 28 | [ 0, 0, 1, 0], 29 | [-1, 0, 0, 0], 30 | [ 0, -1, 0, 0], 31 | [ 0, 0, 0, 1] 32 | ])) 33 | 34 | # Rotation from "vehicle" frame to "bev" frame. 35 | # |------------------|---------------------------------| 36 | # | Vehicle | BEV | Interpretation in Vehicle frame | 37 | # |------------------|---------------------------------| 38 | # | x | -y | forward | 39 | # | y | -x | left | 40 | # | z | -z | up | 41 | # |------------------|---------------------------------| 42 | VEHICLE_TO_BEV_ROTATION = Pose.from_matrix(np.float32([ 43 | [ 0, -1, 0, 0], 44 | [-1, 0, 0, 0], 45 | [ 0, 0, -1, 0], 46 | [ 0, 0, 0, 1] 47 | ])) 48 | # yapf: enable 49 | 50 | 51 | def boxes3d_to_rotated_boxes( 52 | boxes3d, pose_cam_global=CAMERA_TO_VEHICLE_ROTATION, pose_global_bev=VEHICLE_TO_BEV_ROTATION, use_top_surface=True 53 | ): 54 | """ 55 | 56 | Parameters 57 | ---------- 58 | boxes3d: Boxes3D 59 | 3D boxes in camera frame. 60 | pose_cam_global: Pose 61 | Transformation from sensor (camera) frame to global frame. Depending on the context, global frame can be 62 | "vehicle" frame which moves along with the vehicle, or "world" frame which is fixed in the world. 63 | By default, it is an axis-swapping rotation that convert pinhole camera frame to Vehicle frame, i.e. 64 | x: forward, y: left, z: up (see above for detail.) 65 | with no translation (i.e. moves along with camera). 66 | pose_global_bev: Pose 67 | Transformation from global frame to bird-eye-view frame. By default, "forward" matches with "up" of BEV image, 68 | By default, it is an axis-swapping rotation that converts Vehicle frame to BEV frame (see above for detail.) 69 | with no translation. 70 | """ 71 | if use_top_surface: 72 | vertice_inds = [0, 1, 5, 4] # (front-left, front-right, back-right, back-left) of top surface. 73 | else: 74 | # use bottom surface. 75 | vertice_inds = [3, 2, 6, 7] # (front-left, front-right, back-right, back-left) of bottom surface. 76 | 77 | surface = boxes3d.corners[:, vertice_inds, :] 78 | pose_cam_bev = pose_global_bev * pose_cam_global 79 | cam_to_bev = t3d.Transform3d(matrix=surface.new_tensor(pose_cam_bev.matrix.T)) # Need to transpose! 80 | # Assumpiton: this is close to rectangles. TODO: assert it? 
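    # Project the selected box corners into the BEV plane and keep only (x, y); the BEV z-coordinate is dropped.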
81 | rot_boxes_bev = cam_to_bev.transform_points(surface)[:, :, :2] 82 | 83 | # length/width of objects are equivalent to "height"/width of RotatedBoxes 84 | length = torch.norm(rot_boxes_bev[:, 0, :] - rot_boxes_bev[:, 3, :], dim=1).abs() 85 | width = torch.norm(rot_boxes_bev[:, 0, :] - rot_boxes_bev[:, 1, :], dim=1).abs() 86 | 87 | center = torch.mean(rot_boxes_bev[:, [0, 2], :], dim=1) 88 | center_x, center_y = center[:, 0], center[:, 1] 89 | 90 | forward = rot_boxes_bev[:, 0, :] - rot_boxes_bev[:, 3, :] 91 | # CCW-angle, i.e. rotation wrt -z (or "up") in BEV frame. 92 | angle = torch.atan2(forward[:, 0], forward[:, 1]) 93 | angle = 180. / np.pi * angle 94 | 95 | rot_boxes = RotatedBoxes(torch.stack([center_x, center_y, width, length, angle], dim=1)) 96 | return rot_boxes 97 | 98 | 99 | def bev_nms( 100 | boxes3d, scores, iou_threshold, pose_cam_global=CAMERA_TO_VEHICLE_ROTATION, class_idxs=None, class_agnostic=False 101 | ): 102 | """ 103 | 104 | Parameters 105 | ---------- 106 | boxes3d: Boxes3D 107 | 3D boxes in camera frame. 108 | 109 | scores: Tensor 110 | 1D score vector. Must be of same size 'boxes3d' 111 | 112 | iou_threshold: float 113 | Two rotated boxes in BEV frame cannot overlap (according to IoU) more than this threshold. 114 | 115 | class_idxs: Tensor or None 116 | If not None, 1D integer vector. Must be of same size 'boxes3d' 117 | 118 | class_agnostic: bool 119 | If True, then category ID is not considered in NMS. 120 | If False, then NMS is performed per-cateogry ('class_idxs' must not be None.) 121 | 122 | Returns 123 | ------- 124 | keep: Tensor 125 | 1D integer vector that contains filtered indices to 'boxes3d' to keep after NMS. 126 | """ 127 | rot_boxes = boxes3d_to_rotated_boxes(boxes3d, pose_cam_global=pose_cam_global) 128 | if class_agnostic: 129 | class_idxs = torch.zeros_like(scores, dtype=torch.int64) 130 | else: 131 | assert class_idxs is not None 132 | keep = batched_nms_rotated(rot_boxes.tensor, scores, class_idxs, iou_threshold) 133 | return keep 134 | -------------------------------------------------------------------------------- /tridet/layers/conv_bn_fpn_layers.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | import torch 4 | from detectron2.layers import Conv2d, ShapeSpec 5 | from torch import nn 6 | 7 | from tridet.layers.normalization import get_norm 8 | from tridet.layers.separable_conv2d import ACTIVATIONS 9 | from tridet.modeling.dd3d.utils import get_fpn_out_channels 10 | 11 | 12 | class ConvBnFpnLayers(nn.Module): 13 | """ 14 | """ 15 | def __init__( 16 | self, 17 | num_layers, 18 | input_shape, 19 | norm_kwargs={}, 20 | kernel_size=3, 21 | activation='gelu', 22 | groups=1, 23 | extra_input_dim=0, 24 | use_input_dim=True, 25 | output_dim=None, 26 | ): 27 | super().__init__() 28 | assert kernel_size % 2 == 1, "'kernel_size' must be odd." 29 | self._input_shape = input_shape 30 | self._extra_input_dim = extra_input_dim 31 | num_levels = len(input_shape) 32 | channels = get_fpn_out_channels(input_shape) 33 | 34 | if not use_input_dim: 35 | assert output_dim is not None, "'output_dim' must be given, if 'use_input_dim=False'." 
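        # 'extra_input_dim' widens only the first conv layer's input; the remaining layers take 'out_channels' channels (see the loop below).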
36 | input_dim = channels + extra_input_dim 37 | out_channels = input_dim if use_input_dim else output_dim 38 | self._out_channels = out_channels 39 | 40 | conv_layers = [] 41 | for l in range(num_layers): 42 | in_channels = input_dim if l == 0 else out_channels 43 | # Build convolution layers 44 | conv_kwargs = dict( 45 | in_channels=in_channels, 46 | out_channels=out_channels, 47 | kernel_size=kernel_size, 48 | stride=1, 49 | padding=kernel_size // 2, 50 | bias=False, # BN is applied manually in forward() 51 | norm=None, 52 | activation=None, # activation is applied manually in forward() 53 | groups=groups 54 | ) 55 | conv_layers.append(Conv2d(**conv_kwargs)) 56 | self.conv_layers = nn.ModuleList(conv_layers) 57 | 58 | # Define a BN layer per each (level, layer). 59 | self.bn_layers = nn.ModuleList() 60 | norm_kwargs = norm_kwargs or {} 61 | for _ in range(num_levels): 62 | self.bn_layers.append(nn.ModuleList([get_norm('BN', out_channels, norm_kwargs) for _ in range(num_layers)])) 63 | 64 | # Activation 65 | self.act = ACTIVATIONS[activation] 66 | 67 | self.init_weights() 68 | 69 | def output_shape(self): 70 | return [ 71 | ShapeSpec(channels=self._out_channels, height=x.height, width=x.width, stride=x.stride) 72 | for x in self._input_shape 73 | ] 74 | 75 | def init_weights(self): 76 | for conv in self.conv_layers: 77 | nn.init.kaiming_normal_(conv.weight) # mode = 'fan_in' 78 | 79 | def forward(self, x: List[torch.Tensor]) -> List[torch.Tensor]: 80 | out = [] 81 | for level, _bn_layers in enumerate(self.bn_layers): # iterating over first bn dim first makes TS happy 82 | x_level = x[level] 83 | for conv, bn in zip(self.conv_layers, _bn_layers): 84 | x_level = conv(x_level) 85 | x_level = bn(x_level) 86 | x_level = self.act(x_level) 87 | out.append(x_level) 88 | return out 89 | -------------------------------------------------------------------------------- /tridet/layers/iou_loss.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Toyota Research Institute. All rights reserved. 
2 | # Adapted from AdelaiDet: 3 | # https://github.com/aim-uofa/AdelaiDet/blob/master/adet/layers/iou_loss.py 4 | import torch 5 | from torch import nn 6 | 7 | 8 | class IOULoss(nn.Module): 9 | """ 10 | Intersetion Over Union (IoU) loss which supports three 11 | different IoU computations: 12 | 13 | * IoU 14 | * Linear IoU 15 | * gIoU 16 | """ 17 | def __init__(self, loc_loss_type='iou'): 18 | super(IOULoss, self).__init__() 19 | self.loc_loss_type = loc_loss_type 20 | 21 | def forward(self, pred, target, weight=None): 22 | """ 23 | Args: 24 | pred: Nx4 predicted bounding boxes 25 | target: Nx4 target bounding boxes 26 | weight: N loss weight for each instance 27 | """ 28 | pred_left = pred[:, 0] 29 | pred_top = pred[:, 1] 30 | pred_right = pred[:, 2] 31 | pred_bottom = pred[:, 3] 32 | 33 | target_left = target[:, 0] 34 | target_top = target[:, 1] 35 | target_right = target[:, 2] 36 | target_bottom = target[:, 3] 37 | 38 | target_aera = (target_left + target_right) * \ 39 | (target_top + target_bottom) 40 | pred_aera = (pred_left + pred_right) * \ 41 | (pred_top + pred_bottom) 42 | 43 | w_intersect = torch.min(pred_left, target_left) + \ 44 | torch.min(pred_right, target_right) 45 | h_intersect = torch.min(pred_bottom, target_bottom) + \ 46 | torch.min(pred_top, target_top) 47 | 48 | g_w_intersect = torch.max(pred_left, target_left) + \ 49 | torch.max(pred_right, target_right) 50 | g_h_intersect = torch.max(pred_bottom, target_bottom) + \ 51 | torch.max(pred_top, target_top) 52 | ac_uion = g_w_intersect * g_h_intersect 53 | 54 | area_intersect = w_intersect * h_intersect 55 | area_union = target_aera + pred_aera - area_intersect 56 | 57 | ious = (area_intersect + 1.0) / (area_union + 1.0) 58 | gious = ious - (ac_uion - area_union) / ac_uion 59 | if self.loc_loss_type == 'iou': 60 | losses = -torch.log(ious) 61 | elif self.loc_loss_type == 'linear_iou': 62 | losses = 1 - ious 63 | elif self.loc_loss_type == 'giou': 64 | losses = 1 - gious 65 | else: 66 | raise NotImplementedError 67 | 68 | if weight is not None: 69 | return (losses * weight).sum() 70 | else: 71 | return losses.sum() 72 | -------------------------------------------------------------------------------- /tridet/layers/normalization.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Toyota Research Institute. All rights reserved. 2 | # Adapted from AdelaiDet 3 | # https://github.com/aim-uofa/AdelaiDet/ 4 | import logging 5 | from functools import partial 6 | 7 | import torch 8 | from torch import nn 9 | 10 | LOG = logging.getLogger(__name__) 11 | 12 | 13 | class Scale(nn.Module): 14 | def __init__(self, init_value=1.0): 15 | super(Scale, self).__init__() 16 | self.scale = nn.Parameter(torch.FloatTensor([init_value])) 17 | 18 | def forward(self, input): 19 | return input * self.scale 20 | 21 | 22 | class Offset(nn.Module): 23 | def __init__(self, init_value=0.): 24 | super(Offset, self).__init__() 25 | self.bias = nn.Parameter(torch.FloatTensor([init_value])) 26 | 27 | def forward(self, input): 28 | return input + self.bias 29 | 30 | 31 | class ModuleListDial(nn.ModuleList): 32 | def __init__(self, modules=None): 33 | super(ModuleListDial, self).__init__(modules) 34 | self.cur_position = 0 35 | 36 | def forward(self, x): 37 | result = self[self.cur_position](x) 38 | self.cur_position += 1 39 | if self.cur_position >= len(self): 40 | self.cur_position = 0 41 | return result 42 | 43 | class DialableModules(nn.ModuleList): 44 | """ 45 | Dialable modules. 
Typically used with hierarchical output from FPN feature extractors. 46 | Separate modules are applied to each FPN layer. 47 | """ 48 | def __init__(self, modules=None): 49 | super(DialableModules, self).__init__(modules) 50 | self.cur_position = 0 51 | 52 | def forward(self, x): 53 | result = self[self.cur_position](x) 54 | self.cur_position += 1 55 | if self.cur_position >= len(self): 56 | self.cur_position = 0 57 | return result 58 | 59 | 60 | class DialableBN(DialableModules): 61 | """ 62 | Dialable batch-norm layers. Typical use case: all FPN layers shares a 2D convolutional decoder, but 63 | the batch-norm layers are not shared. That is, each FPN layers has its own shift and scale parameters, and keeps 64 | its own batch statistics (mean, scale). 65 | """ 66 | def __init__(self, out_channels, num_bn_modules, **bn_kwargs): 67 | LOG.info(f"Initializing DialableBN with `num_bn_modules`={num_bn_modules}") 68 | bn_modules = [nn.BatchNorm2d(out_channels, **bn_kwargs) for _ in range(num_bn_modules)] 69 | super().__init__(bn_modules) 70 | 71 | def get_norm(norm, out_channels, norm_kwargs={}): 72 | if not norm: 73 | return None 74 | 75 | norm_mapping = { 76 | "BN": nn.BatchNorm2d, 77 | "DialableBN": DialableBN, 78 | "GN": nn.GroupNorm, 79 | } 80 | 81 | norm_fn = partial(norm_mapping[norm], **norm_kwargs) 82 | if norm == "BN": 83 | return norm_fn(num_features=out_channels) 84 | elif norm == "DialableBN": 85 | return norm_fn(out_channels=out_channels) 86 | elif norm == "GN": 87 | return norm_fn(num_channels=out_channels) 88 | -------------------------------------------------------------------------------- /tridet/layers/separable_conv2d.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import torch 4 | from detectron2.layers import Conv2d 5 | from torch import nn 6 | from torch.nn import functional as F 7 | from torch.nn.init import _calculate_correct_fan, calculate_gain 8 | 9 | from tridet.layers.normalization import get_norm 10 | 11 | ACTIVATIONS = { 12 | 'relu': F.relu, 13 | 'gelu': F.gelu, 14 | } 15 | 16 | 17 | def kaiming_uniform_groups_(tensor, a=0, mode='fan_in', nonlinearity='leaky_relu', groups=1): 18 | """'torch.nn.init.kaiming_uniform_()' with 'groups'. 19 | 20 | If 'mode=="fan_out"', fan is divided by 'groups', yielding larger std of weights. 21 | """ 22 | if 0 in tensor.shape: 23 | return tensor 24 | fan = _calculate_correct_fan(tensor, mode) 25 | if mode == 'fan_out': 26 | fan //= groups 27 | gain = calculate_gain(nonlinearity, a) 28 | std = gain / math.sqrt(fan) 29 | bound = math.sqrt(3.0) * std # Calculate uniform bounds from standard deviation 30 | with torch.no_grad(): 31 | return tensor.uniform_(-bound, bound) 32 | 33 | 34 | def kaiming_normal_groups_(tensor, a=0, mode='fan_in', nonlinearity='leaky_relu', groups=1): 35 | """'torch.nn.init.kaiming_normal_()' with 'groups'. 36 | 37 | If 'mode=="fan_out"', fan is divided by 'groups', yielding larger std of weights. 
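    With grouped convolution, each input channel feeds only 'out_channels / groups' filters, so PyTorch's default fan-out overestimates the effective fan by a factor of 'groups'; dividing by 'groups' compensates for this.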
38 | """ 39 | if 0 in tensor.shape: 40 | return tensor 41 | fan = _calculate_correct_fan(tensor, mode) 42 | if mode == 'fan_out': 43 | fan //= groups 44 | gain = calculate_gain(nonlinearity, a) 45 | std = gain / math.sqrt(fan) 46 | with torch.no_grad(): 47 | return tensor.normal_(0, std) 48 | 49 | 50 | class SeparableConv2d(nn.Module): 51 | """ Separable Conv 52 | """ 53 | def __init__( 54 | self, 55 | in_channels, 56 | out_channels, 57 | kernel_size=3, 58 | stride=1, 59 | dilation=1, 60 | bias=None, 61 | channel_multiplier=1.0, 62 | num_in_channels_per_group=1, # depth-separable conv. 63 | norm='BN', 64 | norm_kwargs={}, 65 | activation=None, 66 | ): 67 | super().__init__() 68 | assert kernel_size % 2 == 1, "kernel_size must be odd." 69 | assert in_channels % num_in_channels_per_group == 0, "'in_channels' must be divisible by 'num_in_channels_per_group'" 70 | hidden_channels = int(in_channels * channel_multiplier) 71 | groups = in_channels // num_in_channels_per_group 72 | self.conv_dw = Conv2d( 73 | in_channels, 74 | hidden_channels, 75 | kernel_size=kernel_size, 76 | stride=stride, 77 | padding=kernel_size // 2, 78 | dilation=dilation, 79 | bias=False, 80 | norm=None, 81 | activation=None, 82 | groups=groups 83 | ) 84 | 85 | norm_kwargs = norm_kwargs or {} 86 | norm_layer = get_norm(norm, hidden_channels, norm_kwargs=norm_kwargs) if isinstance(norm, str) else norm 87 | if bias is None: 88 | bias = norm_layer is None 89 | act = ACTIVATIONS[activation] if isinstance(activation, str) else activation 90 | self.conv_pw = Conv2d( 91 | hidden_channels, out_channels, kernel_size=1, stride=1, bias=bias, norm=norm_layer, activation=act 92 | ) 93 | 94 | self.groups = in_channels 95 | self.init_weights() 96 | 97 | def init_weights(self): 98 | # This seems important to make the network output roughly zero-mean, unit-std. 99 | kaiming_normal_groups_(self.conv_dw.weight, mode='fan_out', nonlinearity='linear', groups=self.groups) 100 | kaiming_normal_groups_(self.conv_pw.weight, mode='fan_out', nonlinearity='relu', groups=1) 101 | if self.conv_pw.bias is not None: 102 | nn.init.constant_(self.conv_pw.bias, 0) 103 | 104 | def forward(self, x): 105 | return self.conv_pw(self.conv_dw(x)) 106 | -------------------------------------------------------------------------------- /tridet/layers/smooth_l1_loss.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | # Copyright 2021 Toyota Research Institute. All rights reserved. 3 | # Adapted from fvcore: 4 | # https://github.com/facebookresearch/fvcore/blob/master/fvcore/nn/smooth_l1_loss.py 5 | 6 | import torch 7 | 8 | 9 | def smooth_l1_loss(input: torch.Tensor, target: torch.Tensor, beta: float, reduction: str = "none") -> torch.Tensor: 10 | """ 11 | Smooth L1 loss defined in the Fast R-CNN paper as: 12 | 13 | | 0.5 * x ** 2 / beta if abs(x) < beta 14 | smoothl1(x) = | 15 | | abs(x) - 0.5 * beta otherwise, 16 | 17 | where x = input - target. 18 | 19 | Smooth L1 loss is related to Huber loss, which is defined as: 20 | 21 | | 0.5 * x ** 2 if abs(x) < beta 22 | huber(x) = | 23 | | beta * (abs(x) - 0.5 * beta) otherwise 24 | 25 | Smooth L1 loss is equal to huber(x) / beta. This leads to the following 26 | differences: 27 | 28 | - As beta -> 0, Smooth L1 loss converges to L1 loss, while Huber loss 29 | converges to a constant 0 loss. 30 | - As beta -> +inf, Smooth L1 converges to a constant 0 loss, while Huber loss 31 | converges to L2 loss. 
32 | - For Smooth L1 loss, as beta varies, the L1 segment of the loss has a constant 33 | slope of 1. For Huber loss, the slope of the L1 segment is beta. 34 | 35 | Smooth L1 loss can be seen as exactly L1 loss, but with the abs(x) < beta 36 | portion replaced with a quadratic function such that at abs(x) = beta, its 37 | slope is 1. The quadratic segment smooths the L1 loss near x = 0. 38 | 39 | Args: 40 | input (Tensor): input tensor of any shape 41 | target (Tensor): target value tensor with the same shape as input 42 | beta (float): L1 to L2 change point. 43 | For beta values < 1e-5, L1 loss is computed. 44 | reduction: 'none' | 'mean' | 'sum' 45 | 'none': No reduction will be applied to the output. 46 | 'mean': The output will be averaged. 47 | 'sum': The output will be summed. 48 | 49 | Returns: 50 | The loss with the reduction option applied. 51 | 52 | Note: 53 | PyTorch's builtin "Smooth L1 loss" implementation does not actually 54 | implement Smooth L1 loss, nor does it implement Huber loss. It implements 55 | the special case of both in which they are equal (beta=1). 56 | See: https://pytorch.org/docs/stable/nn.html#torch.nn.SmoothL1Loss. 57 | """ 58 | # (dennis.park) Make it work with mixed precision training. 59 | beta = torch.as_tensor(beta).to(input.dtype) 60 | if beta < 1e-5: 61 | # if beta == 0, then torch.where will result in nan gradients when 62 | # the chain rule is applied due to pytorch implementation details 63 | # (the False branch "0.5 * n ** 2 / 0" has an incoming gradient of 64 | # zeros, rather than "no gradient"). To avoid this issue, we define 65 | # small values of beta to be exactly l1 loss. 66 | loss = torch.abs(input - target) 67 | else: 68 | n = torch.abs(input - target) 69 | cond = n < beta 70 | a = 0.5 * n**2 71 | b = n - 0.5 * beta 72 | a, b = a.to(input.dtype), b.to(input.dtype) 73 | loss = torch.where(cond, a, b) 74 | # loss = torch.where(cond, 0.5 * n ** 2 / beta, n - 0.5 * beta) 75 | 76 | if reduction == "mean": 77 | loss = loss.mean() 78 | elif reduction == "sum": 79 | loss = loss.sum() 80 | return loss 81 | -------------------------------------------------------------------------------- /tridet/modeling/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Toyota Research Institute. All rights reserved. 2 | import tridet.modeling.dd3d 3 | from tridet.modeling import feature_extractor 4 | from tridet.modeling.dd3d import DD3DWithTTA, NuscenesDD3DWithTTA 5 | 6 | TTA_MODELS = { 7 | "DD3D": DD3DWithTTA, 8 | "NuscenesDD3D": NuscenesDD3DWithTTA, 9 | } 10 | 11 | 12 | def build_tta_model(cfg, model): 13 | meta_arch = cfg.MODEL.META_ARCHITECTURE 14 | assert meta_arch in TTA_MODELS, f"Test-time augmentation model is not available: {meta_arch}" 15 | return TTA_MODELS[meta_arch](cfg, model) 16 | -------------------------------------------------------------------------------- /tridet/modeling/backbone/omni_scripts/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Toyota Research Institute. All rights reserved. 
2 | -------------------------------------------------------------------------------- /tridet/modeling/backbone/omni_scripts/act.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Dict, Optional, Tuple, Type, Union 2 | 3 | from torch import nn 4 | 5 | __all__ = ["build_activation"] 6 | 7 | # register activation function here 8 | # name: module, kwargs with default values 9 | REGISTERED_ACT_DICT: Dict[str, Tuple[Type, Dict[str, Any]]] = { 10 | "relu": (nn.ReLU, {"inplace": True}), 11 | "relu6": (nn.ReLU6, {"inplace": True}), 12 | "leaky_relu": (nn.LeakyReLU, {"inplace": True, "negative_slope": 0.1}), 13 | "h_swish": (nn.Hardswish, {"inplace": True}), 14 | "h_sigmoid": (nn.Hardsigmoid, {"inplace": True}), 15 | "swish": (nn.SiLU, {"inplace": True}), 16 | "silu": (nn.SiLU, {"inplace": True}), 17 | "tanh": (nn.Tanh, {}), 18 | "sigmoid": (nn.Sigmoid, {}), 19 | "gelu": (nn.GELU, {}), 20 | "mish": (nn.Mish, {"inplace": True}), 21 | } 22 | 23 | 24 | def build_activation(act_func_name: Union[str, nn.Module], **kwargs) -> Optional[nn.Module]: 25 | if isinstance(act_func_name, nn.Module): 26 | return act_func_name 27 | if act_func_name in REGISTERED_ACT_DICT: 28 | act_module, default_args = REGISTERED_ACT_DICT[act_func_name] 29 | for key in default_args: 30 | if key in kwargs: 31 | default_args[key] = kwargs[key] 32 | return act_module(**default_args) 33 | elif act_func_name is None or act_func_name.lower() == "none": 34 | return None 35 | else: 36 | raise ValueError("do not support: %s" % act_func_name) 37 | -------------------------------------------------------------------------------- /tridet/modeling/backbone/omni_scripts/backbone_with_fpn.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | from detectron2.layers import ShapeSpec 3 | 4 | from tridet.modeling.backbone.omni_scripts.utils import make_divisible 5 | from tridet.modeling.backbone.omni_scripts.fused_mb_nets import MixFusedMobileNetV2 6 | from tridet.modeling.backbone.omni_scripts.fpn import FPN 7 | from tridet.modeling.backbone.omni_scripts.ops import ConvLayer 8 | 9 | __all__ = ["BackboneFPN", "build_feature_extractor_all_fuse"] 10 | 11 | 12 | def build_feature_extractor_all_fuse(return_list=False, width_mult=1.0, depth_mult=1.0): 13 | 14 | stage_width_list = [32, 16, 32, 56, 104, 120, 320] 15 | depth_list = [2, 2, 5, 4, 3] 16 | for i, width in enumerate(stage_width_list): 17 | stage_width_list[i] = make_divisible(width * width_mult, 8) 18 | for i, depth in enumerate(depth_list): 19 | depth_list[i] = int(depth * depth_mult) 20 | backbone = MixFusedMobileNetV2( 21 | width_mult=1.0, 22 | ks=[3, 3, 3, 3, 3], 23 | expand_ratio=[4, 4, 4, 4, 4], 24 | depth=depth_list, # 2, 3, 5, 5, 5 25 | block_type_list=["fmb", "fmb", "fmb", "fmb", "fmb", "fmb"], 26 | stage_width_list=stage_width_list, 27 | channel_att_list=[None, None, None, None, None], 28 | act_func="relu", 29 | ) 30 | fpn_width_mult = 0.7 31 | output_width = make_divisible(128 * width_mult, 8) 32 | 33 | fpn = FPN( 34 | inputs=[ 35 | ("ex_stage2", stage_width_list[-1], make_divisible(stage_width_list[-1] * fpn_width_mult, 8), 128), 36 | ("ex_stage1", stage_width_list[-1], make_divisible(stage_width_list[-1] * fpn_width_mult, 8), 64), 37 | ("stage5", stage_width_list[-1], make_divisible(stage_width_list[-1] * fpn_width_mult, 8), 32), 38 | ("stage4", stage_width_list[-2], make_divisible(stage_width_list[-2] * fpn_width_mult, 8), 16), 39 | ("stage2", 
stage_width_list[-4], make_divisible(stage_width_list[-4] * fpn_width_mult, 8), 8), 40 | ], 41 | input_mode="cat_conv", 42 | middle_config={ 43 | "all": ["fmb_e@4_k@3", "fmb_e@4_k@3"], 44 | 8: ["fmb_e@4_k@3"], 45 | }, 46 | channel_att=None, 47 | prefix="fpn", 48 | act_func="relu", 49 | spp_size=[3, 5, 7], 50 | use_pan=True, 51 | output_width=output_width, 52 | ) 53 | model = BackboneFPN( 54 | backbone, fpn, 55 | n_extra_stage=2, last_channels=stage_width_list[-1], act_func="relu", return_list=return_list, 56 | ) 57 | return model 58 | 59 | 60 | class BackboneFPN(nn.Module): 61 | def __init__( 62 | self, backbone: nn.Module, fpn: FPN, last_channels: int, act_func="relu", n_extra_stage=0, 63 | return_list=False, 64 | ): 65 | super(BackboneFPN, self).__init__() 66 | self.backbone = backbone 67 | self.fpn = fpn 68 | self.extra_stage = nn.ModuleList([ 69 | # PoolingLayer("avg", kernel_size=2, stride=2) 70 | ConvLayer(last_channels, last_channels, 3, 2, act_func=act_func) 71 | # FusedMBV2Block(last_channels, last_channels, 3, 2, expand_ratio=4, act_func=(act_func, None)) 72 | for _ in range(n_extra_stage) 73 | ]) 74 | self.return_list = return_list 75 | 76 | @property 77 | def n_extra_stage(self): 78 | return len(self.extra_stage) 79 | 80 | @property 81 | def size_divisibility(self): 82 | return 32 * (2 ** self.n_extra_stage) 83 | 84 | def output_shape(self): 85 | out_list = [] 86 | for i, (key, in_channel, mid_channel, stride) in enumerate(self.fpn.inputs): 87 | channels = self.fpn.output_width or mid_channel 88 | out_list.append((f"{self.fpn.prefix}_out{i + 1}", ShapeSpec(channels=channels, stride=stride))) 89 | out_list = out_list[::-1] 90 | out_dict = {} 91 | for key, shape in out_list: 92 | out_dict[key] = shape 93 | return out_dict 94 | 95 | def forward(self, x): 96 | feed_dict = self.backbone(x) 97 | x = feed_dict["output"] 98 | for i, extra_stage in enumerate(self.extra_stage): 99 | feed_dict[f"ex_stage{i + 1}"] = x = extra_stage(x) 100 | feed_dict = self.fpn(feed_dict) 101 | if self.return_list: 102 | out_list = [feed_dict[key] for key in self.output_shape()] 103 | return out_list 104 | else: 105 | return feed_dict 106 | -------------------------------------------------------------------------------- /tridet/modeling/backbone/omni_scripts/fpn.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, List, Optional, Tuple 2 | 3 | import torch 4 | import torch.nn as nn 5 | from tridet.modeling.backbone.omni_scripts.utils import make_divisible 6 | from tridet.modeling.backbone.omni_scripts.ops import ( 7 | ConvLayer, PoolingLayer, SPPBlock, UpSampleLayer, MBV2Block, FusedMBV2Block, ResidualBlock, DAGOp 8 | ) 9 | 10 | __all__ = ["FPN"] 11 | 12 | 13 | def build_block( 14 | block_str: str, in_channels: int, out_channels: int, channel_att: Optional[str], act_func: str 15 | ) -> nn.Module: 16 | block_config = {"e": 4, "k": 3} 17 | block_str = block_str.split("_") 18 | block_config["name"] = block_str[0] 19 | for hparam in block_str[1:]: 20 | if hparam.startswith("e@"): 21 | block_config["e"] = float(hparam[2:]) 22 | elif hparam.startswith("k@"): 23 | block_config["k"] = int(hparam[2:]) 24 | 25 | mid_channels = make_divisible(in_channels * block_config["e"], 8) 26 | if channel_att is not None: 27 | raise NotImplementedError 28 | else: 29 | channel_att = None 30 | 31 | if block_config["name"] == "mb": 32 | block = MBV2Block( 33 | in_channels, 34 | out_channels, 35 | block_config["k"], 36 | expand_ratio=block_config["e"], 37 | 
act_func=(act_func, act_func, None), 38 | ) 39 | if channel_att is not None: 40 | block = nn.Sequential( 41 | block.inverted_conv, 42 | block.depth_conv, 43 | channel_att, 44 | block.point_conv 45 | ) 46 | elif block_config["name"] == "fmb": 47 | block = FusedMBV2Block( 48 | in_channels, 49 | out_channels, 50 | block_config["k"], 51 | expand_ratio=block_config["e"], 52 | act_func=(act_func, None), 53 | ) 54 | if channel_att is not None: 55 | block = nn.Sequential( 56 | block.spatial_conv, 57 | channel_att, 58 | block.point_conv, 59 | ) 60 | else: 61 | raise NotImplementedError 62 | 63 | if in_channels == out_channels: 64 | block = ResidualBlock( 65 | block, 66 | nn.Identity(), 67 | ) 68 | return block 69 | 70 | 71 | class FPN(nn.Module): 72 | """Vanilla FPN and PAN""" 73 | 74 | def __init__( 75 | self, 76 | # inputs 77 | inputs: List[Tuple[str, int, int, int]], 78 | input_mode="cat_conv", 79 | # middle 80 | middle_config: Optional[Dict] = None, 81 | channel_att: Optional[str] = None, 82 | # general 83 | prefix="fpn", 84 | act_func="relu", 85 | spp_size: Optional[List] = None, 86 | use_pan=True, 87 | output_width: Optional[int] = None, 88 | ): 89 | super(FPN, self).__init__() 90 | middle_config = middle_config or {} 91 | if "all" not in middle_config: 92 | middle_config["all"] = ["mbv2_e@4_k@5", "mbv2_e@4_k@5"] 93 | 94 | # sort inputs by stride 95 | inputs = sorted(inputs, key=lambda tup: tup[-1], reverse=True) 96 | self.inputs = inputs 97 | self.prefix = prefix 98 | self.output_width = output_width 99 | 100 | blocks = [] 101 | extra_input = [] 102 | for idx, (feature_id, in_channels, mid_channels, stride) in enumerate(inputs): 103 | # inputs 104 | dag_inputs, dag_merge_mode, dag_post_input_op = self.build_input( 105 | feature_id, 106 | in_channels, 107 | extra_input, 108 | input_mode, 109 | mid_channels, 110 | act_func, 111 | ) 112 | # middle 113 | dag_middle_blocks = [] 114 | if idx == 0 and spp_size is not None: 115 | spp_block = ResidualBlock( 116 | SPPBlock( 117 | mid_channels, 118 | pool_size=spp_size, 119 | pool_type="avg", 120 | act_func=act_func, 121 | ), 122 | nn.Identity(), 123 | ) 124 | dag_middle_blocks.append(spp_block) 125 | for block_str in middle_config.get(stride, middle_config["all"]): 126 | dag_middle_blocks.append( 127 | build_block( 128 | block_str, 129 | mid_channels, 130 | mid_channels, 131 | channel_att, 132 | act_func, 133 | ) 134 | ) 135 | # output 136 | if use_pan or self.output_width is None: 137 | output_module = nn.Identity() 138 | else: 139 | output_module = ConvLayer( 140 | mid_channels, 141 | self.output_width, 142 | 1, 143 | act_func=act_func, 144 | ) 145 | dag_outputs = { 146 | f"{prefix}_{'inner' if use_pan else 'out'}{idx + 1}": output_module 147 | } 148 | if idx < len(inputs) - 1: 149 | up_factor = stride // inputs[idx + 1][3] 150 | dag_outputs[f"{prefix}_up{idx + 1}"] = nn.Sequential( 151 | ConvLayer( 152 | mid_channels, inputs[idx + 1][2], 1, act_func=act_func 153 | ), 154 | UpSampleLayer( 155 | factor=up_factor, 156 | mode="bilinear", 157 | align_corners=False, 158 | ) 159 | if up_factor > 1 160 | else None, 161 | ) 162 | extra_input = [(f"{prefix}_up{idx + 1}", inputs[idx + 1][2])] 163 | 164 | blocks.append( 165 | DAGOp( 166 | inputs=dag_inputs, 167 | merge_mode=dag_merge_mode, 168 | post_input_op=dag_post_input_op, 169 | middle=nn.Sequential(*dag_middle_blocks), 170 | outputs=dag_outputs, 171 | ) 172 | ) 173 | if use_pan: 174 | for idx in range(len(inputs) - 1, -1, -1): 175 | _, _, mid_channels, stride = inputs[idx] 176 | if idx < len(inputs) - 1: 
177 | extra_input = [(f"{prefix}_down{idx + 1}", mid_channels)] 178 | else: 179 | extra_input = [] 180 | dag_inputs, dag_merge_mode, dag_post_input_op = self.build_input( 181 | f"{prefix}_inner{idx + 1}", 182 | mid_channels, 183 | extra_input, 184 | input_mode, 185 | mid_channels, 186 | act_func, 187 | ) 188 | # middle 189 | dag_middle_blocks = [] 190 | for block_str in middle_config.get(stride, middle_config["all"]): 191 | dag_middle_blocks.append( 192 | build_block( 193 | block_str, 194 | mid_channels, 195 | mid_channels, 196 | channel_att, 197 | act_func, 198 | ) 199 | ) 200 | # output 201 | if self.output_width is None: 202 | output_module = nn.Identity() 203 | else: 204 | output_module = ConvLayer( 205 | mid_channels, 206 | self.output_width, 207 | 1, 208 | act_func=act_func, 209 | ) 210 | dag_outputs = {f"{prefix}_out{idx + 1}": output_module} 211 | if idx != 0: 212 | down_factor = inputs[idx - 1][3] // stride 213 | downsample = PoolingLayer( 214 | pool_type="avg", 215 | kernel_size=down_factor, 216 | stride=down_factor, 217 | ) 218 | dag_outputs[f"{prefix}_down{idx}"] = nn.Sequential( 219 | downsample, 220 | ConvLayer( 221 | mid_channels, inputs[idx - 1][2], 1, act_func=act_func, 222 | ), 223 | ) 224 | blocks.append( 225 | DAGOp( 226 | inputs=dag_inputs, 227 | merge_mode=dag_merge_mode, 228 | post_input_op=dag_post_input_op, 229 | middle=nn.Sequential(*dag_middle_blocks), 230 | outputs=dag_outputs, 231 | ) 232 | ) 233 | 234 | self.blocks = nn.ModuleList(blocks) 235 | 236 | @staticmethod 237 | def build_input( 238 | feature_id: str, 239 | in_channels: int, 240 | extra_input: List[Tuple[str, int]], 241 | input_mode: str, 242 | mid_channels: int, 243 | act_func: str, 244 | ) -> Tuple[Dict[str, nn.Module], str, Optional[nn.Module]]: 245 | if input_mode == "cat_conv": 246 | merge_mode = "cat" 247 | inputs = {feature_id: nn.Identity()} 248 | for extra_id, extra_in_channels in extra_input: 249 | inputs[extra_id] = nn.Identity() 250 | post_input_op = ConvLayer( 251 | in_channels=sum([in_channels] + [extra_c for _, extra_c in extra_input]), 252 | out_channels=mid_channels, 253 | kernel_size=1, 254 | act_func=act_func, 255 | ) 256 | elif input_mode == "add": 257 | merge_mode = "add" 258 | inputs = {feature_id: nn.Identity()} 259 | for extra_id, extra_in_channels in extra_input: 260 | inputs[extra_id] = nn.Identity() 261 | post_input_op = None 262 | else: 263 | raise NotImplementedError 264 | return inputs, merge_mode, post_input_op 265 | 266 | def forward(self, feature_dict: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]: 267 | for block in self.blocks: 268 | feature_dict = block(feature_dict) 269 | return feature_dict 270 | -------------------------------------------------------------------------------- /tridet/modeling/backbone/omni_scripts/fused_mb_nets.py: -------------------------------------------------------------------------------- 1 | from typing import List, Optional, Union 2 | import torch.nn as nn 3 | 4 | from tridet.modeling.backbone.omni_scripts.utils import make_divisible, val2list 5 | from tridet.modeling.backbone.omni_scripts.ops import ConvLayer, MBV1Block, MBV2Block, FusedMBV2Block, ResidualBlock, SeqBackbone 6 | 7 | __all__ = ["MixFusedMobileNetV2"] 8 | 9 | 10 | class MixFusedMobileNetV2(SeqBackbone): 11 | def __init__( 12 | self, 13 | width_mult=1.0, 14 | channel_divisor=8, 15 | ks: Union[int, List[int], None] = None, 16 | expand_ratio: Union[int, List[int], None] = None, 17 | depth: Union[int, List[int], None] = None, 18 | stage_width_list: Optional[List[int]] = 
None, 19 | act_func=None, 20 | block_type_list: Optional[List[str]] = None, 21 | channel_att_list: Union[None, str, List[Optional[str]]] = None, 22 | ): 23 | 24 | ks = val2list(ks or 3, 5) 25 | expand_ratio = val2list(expand_ratio or 6, 5) 26 | depth = val2list(depth, 5) 27 | act_func = act_func or "relu" 28 | block_type_list = block_type_list or ["fmb", "fmb", "fmb", "mb", "mb", "mb"] 29 | channel_att_list = val2list(channel_att_list, 5) 30 | 31 | block_configs = [ 32 | # t, n, s 33 | [expand_ratio[0], depth[0] or 2, ks[0], 2], 34 | [expand_ratio[1], depth[1] or 3, ks[1], 2], 35 | [expand_ratio[2], depth[2] or 4, ks[2], 2], 36 | [expand_ratio[3], depth[3] or 3, ks[3], 1], 37 | [expand_ratio[4], depth[4] or 3, ks[4], 2], 38 | ] 39 | 40 | stage_width_list = stage_width_list or [32, 16, 24, 32, 64, 96, 160] 41 | for i, w in enumerate(stage_width_list): 42 | stage_width_list[i] = make_divisible(w * width_mult, channel_divisor) 43 | 44 | # input stem 45 | input_stem = nn.Sequential( 46 | ConvLayer(3, stage_width_list[0], 3, 2, act_func=act_func, first_layer=True), 47 | (FusedMBV2Block if block_type_list[0] == "fmb" else MBV1Block)( 48 | stage_width_list[0], 49 | stage_width_list[1], 50 | kernel_size=3, 51 | stride=1, 52 | act_func=(act_func, None), 53 | **({"expand_ratio": 1} if block_type_list[0] == "fmb" else {}), 54 | ), 55 | ) 56 | 57 | # stages 58 | stages = [] 59 | in_channels = stage_width_list[1] 60 | for (t, n, k, s), c, block_type, channel_att_type in zip( 61 | block_configs, stage_width_list[2:], block_type_list[1:], channel_att_list, 62 | ): 63 | blocks = [] 64 | for i in range(n): 65 | stride = s if i == 0 else 1 66 | mb_conv = (FusedMBV2Block if block_type == "fmb" else MBV2Block)( 67 | in_channels, 68 | c, 69 | k, 70 | stride, 71 | expand_ratio=t, 72 | act_func=(act_func, None) if block_type == "fmb" else (act_func, act_func, None), 73 | ) 74 | if channel_att_type is None: 75 | channel_att = None 76 | elif channel_att_type.startswith("se"): 77 | raise NotImplementedError 78 | elif channel_att_type.startswith("ca"): 79 | raise NotImplementedError 80 | else: 81 | channel_att = None 82 | if channel_att is not None: 83 | if isinstance(mb_conv, FusedMBV2Block): 84 | mb_conv = nn.Sequential( 85 | mb_conv.spatial_conv, 86 | channel_att, 87 | mb_conv.point_conv, 88 | ) 89 | else: 90 | mb_conv = nn.Sequential( 91 | mb_conv.inverted_conv, 92 | mb_conv.depth_conv, 93 | channel_att, 94 | mb_conv.point_conv 95 | ) 96 | if i != 0: 97 | mb_conv = ResidualBlock( 98 | mb_conv, 99 | nn.Identity(), 100 | ) 101 | blocks.append(mb_conv) 102 | in_channels = c 103 | stages.append(nn.Sequential(*blocks)) 104 | super(MixFusedMobileNetV2, self).__init__(input_stem, stages) 105 | -------------------------------------------------------------------------------- /tridet/modeling/backbone/omni_scripts/norm.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Dict, Optional, Tuple, Type 2 | 3 | import torch.nn as nn 4 | 5 | __all__ = ["REGISTERED_NORMALIZATION_DICT", "build_norm"] 6 | 7 | # register normalization function here 8 | # name: module, kwargs with default values 9 | REGISTERED_NORMALIZATION_DICT: Dict[str, Tuple[Type, Dict[str, Any]]] = { 10 | "bn_3d": (nn.BatchNorm3d, {"num_features": None, "eps": 1e-5, "momentum": 0.1}), 11 | "bn_2d": (nn.BatchNorm2d, {"num_features": None, "eps": 1e-5, "momentum": 0.1}), 12 | "bn_1d": (nn.BatchNorm1d, {"num_features": None, "eps": 1e-5, "momentum": 0.1}), 13 | "sync_bn": (nn.SyncBatchNorm, 
{"num_features": None, "eps": 1e-5, "momentum": 0.1}), 14 | "gn": (nn.GroupNorm, {"num_groups": None, "num_channels": None, "eps": 1e-5}), 15 | "ln": (nn.LayerNorm, {"normalized_shape": None, "eps": 1e-5}), 16 | } 17 | 18 | 19 | def build_norm(norm_name="bn_2d", num_features=None, **kwargs) -> Optional[nn.Module]: 20 | if norm_name == "gn": 21 | kwargs["num_channels"] = num_features 22 | elif norm_name == "ln": 23 | kwargs["normalized_shape"] = num_features 24 | else: 25 | kwargs["num_features"] = num_features 26 | if norm_name in REGISTERED_NORMALIZATION_DICT: 27 | norm_module, default_args = REGISTERED_NORMALIZATION_DICT[norm_name] 28 | for key in default_args: 29 | if key in kwargs: 30 | default_args[key] = kwargs[key] 31 | return norm_module(**default_args) 32 | elif norm_name is None or norm_name.lower() == "none": 33 | return None 34 | else: 35 | raise ValueError("do not support: %s" % norm_name) 36 | -------------------------------------------------------------------------------- /tridet/modeling/backbone/omni_scripts/omninet_w1.0.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from tridet.modeling.backbone.omni_scripts.backbone_with_fpn import build_feature_extractor_all_fuse 3 | 4 | omninet_w10 = build_feature_extractor_all_fuse( 5 | return_list=False, width_mult=1.0, depth_mult=1.0, 6 | ) 7 | 8 | checkpoint = torch.load( 9 | "omninet-small", 10 | map_location="cpu" 11 | ) 12 | checkpoint = checkpoint["state_dict"] 13 | omninet_w10.load_state_dict(checkpoint) 14 | print(omninet_w10) 15 | -------------------------------------------------------------------------------- /tridet/modeling/backbone/omni_scripts/omninet_w1.3.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from tridet.modeling.backbone.omni_scripts.backbone_with_fpn import build_feature_extractor_all_fuse 3 | 4 | omninet_w13 = build_feature_extractor_all_fuse( 5 | return_list=False, width_mult=1.3, depth_mult=1.0, 6 | ) 7 | 8 | checkpoint = torch.load( 9 | "omninet-big", 10 | map_location="cpu" 11 | ) 12 | checkpoint = checkpoint["state_dict"] 13 | omninet_w13.load_state_dict(checkpoint) 14 | print(omninet_w13) 15 | -------------------------------------------------------------------------------- /tridet/modeling/backbone/omni_scripts/utils.py: -------------------------------------------------------------------------------- 1 | from typing import Union, List, Tuple, Any, Optional 2 | 3 | __all__ = ["list_sum", "val2list", "squeeze_list", "make_divisible", "get_same_padding"] 4 | 5 | 6 | def list_sum(x: List) -> Any: 7 | """Return the sum of a list of objects. 8 | 9 | can be int, float, torch.Tensor, np.ndarray, etc 10 | can be used for adding losses 11 | """ 12 | return x[0] if len(x) == 1 else x[0] + list_sum(x[1:]) 13 | 14 | 15 | def val2list(val: Union[List, Tuple, Any], repeat_time=1) -> List: 16 | """Repeat `val` for `repeat_time` times and return the list or val if list/tuple.""" 17 | if isinstance(val, (list, tuple)): 18 | return list(val) 19 | return [val for _ in range(repeat_time)] 20 | 21 | 22 | def squeeze_list(src_list: Optional[List]) -> Union[List, Any]: 23 | """Return the first item of the given list if the list only contains one item. 
24 | 25 | usually used in args parsing 26 | """ 27 | if src_list is not None and len(src_list) == 1: 28 | return src_list[0] 29 | else: 30 | return src_list 31 | 32 | 33 | def make_divisible(v: Union[int, float], divisor: Optional[int], min_val=None) -> Union[int, float]: 34 | """This function is taken from the original tf repo. 35 | 36 | It ensures that all layers have a channel number that is divisible by 8 37 | It can be seen here: 38 | https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py 39 | :param v: 40 | :param divisor: 41 | :param min_val: 42 | :return: 43 | """ 44 | if divisor is None: 45 | return v 46 | 47 | if min_val is None: 48 | min_val = divisor 49 | new_v = max(min_val, int(v + divisor / 2) // divisor * divisor) 50 | # Make sure that round down does not go down by more than 10%. 51 | if new_v < 0.9 * v: 52 | new_v += divisor 53 | return new_v 54 | 55 | 56 | def get_same_padding(kernel_size: Union[int, Tuple[int, int]]) -> Union[int, tuple]: 57 | if isinstance(kernel_size, tuple): 58 | assert len(kernel_size) == 2, f"invalid kernel size: {kernel_size}" 59 | p1 = get_same_padding(kernel_size[0]) 60 | p2 = get_same_padding(kernel_size[1]) 61 | return p1, p2 62 | else: 63 | assert isinstance(kernel_size, int), "kernel size should be either `int` or `tuple`" 64 | assert kernel_size % 2 > 0, "kernel size should be odd number" 65 | return kernel_size // 2 66 | -------------------------------------------------------------------------------- /tridet/modeling/dd3d/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Toyota Research Institute. All rights reserved. 2 | from tridet.modeling.dd3d.core import DD3D 3 | from tridet.modeling.dd3d.nuscenes_dd3d import NuscenesDD3D 4 | from tridet.modeling.dd3d.nuscenes_dd3d_tta import NuscenesDD3DWithTTA 5 | from tridet.modeling.dd3d.test_time_augmentation import DD3DWithTTA 6 | -------------------------------------------------------------------------------- /tridet/modeling/dd3d/core.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Toyota Research Institute. All rights reserved. 
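# DD3D meta-architecture: a single-stage, FCOS-style monocular 3D detector. The backbone/FPN
# produces multi-scale features; FCOS2DHead predicts class logits, 2D boxes and centerness per
# location, and FCOS3DHead adds the 3D box parameters (orientation quaternion, projected center,
# depth, size, confidence) plus an optional dense depth map. forward() assembles the training
# losses or, at inference time, runs NMS / optional BEV NMS and resizes instances to the input
# resolution.
# A rough usage sketch (assuming a Detectron2-style config object; the actual cfg values come
# from the Hydra configs under configs/):
#   model = META_ARCH_REGISTRY.get("DD3D")(cfg)
#   losses = model(batched_inputs)    # training mode: dict of scalar loss tensors
#   results = model(batched_inputs)   # eval mode: list of {"instances": Instances}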
2 | import torch 3 | from torch import nn 4 | 5 | from detectron2.modeling.meta_arch.build import META_ARCH_REGISTRY 6 | from detectron2.modeling.postprocessing import detector_postprocess as resize_instances 7 | from detectron2.structures import Instances 8 | 9 | from tridet.modeling.dd3d.fcos2d import FCOS2DHead, FCOS2DInference, FCOS2DLoss 10 | from tridet.modeling.dd3d.fcos3d import FCOS3DHead, FCOS3DInference, FCOS3DLoss 11 | from tridet.modeling.dd3d.postprocessing import nuscenes_sample_aggregate 12 | from tridet.modeling.dd3d.prepare_targets import DD3DTargetPreparer 13 | from tridet.modeling.feature_extractor import build_feature_extractor 14 | from tridet.structures.image_list import ImageList 15 | from tridet.utils.tensor2d import compute_features_locations as compute_locations_per_level 16 | from tridet.modeling.backbone.omni_scripts.backbone_with_fpn import build_feature_extractor_all_fuse 17 | 18 | 19 | @META_ARCH_REGISTRY.register() 20 | class DD3D(nn.Module): 21 | def __init__(self, cfg): 22 | super().__init__() 23 | if "backbone_with_fpn" in cfg.MODEL: 24 | self.backbone = build_feature_extractor_all_fuse( 25 | width_mult=cfg.MODEL.width_mult, depth_mult=cfg.MODEL.depth_mult, 26 | ) 27 | else: 28 | self.backbone = build_feature_extractor(cfg) 29 | 30 | backbone_output_shape = self.backbone.output_shape() 31 | self.in_features = cfg.DD3D.IN_FEATURES or list(backbone_output_shape.keys()) 32 | self.backbone_output_shape = [backbone_output_shape[f] for f in self.in_features] 33 | 34 | self.feature_locations_offset = cfg.DD3D.FEATURE_LOCATIONS_OFFSET 35 | 36 | self.fcos2d_head = FCOS2DHead(cfg, self.backbone_output_shape) 37 | self.fcos2d_loss = FCOS2DLoss(cfg) 38 | self.fcos2d_inference = FCOS2DInference(cfg) 39 | 40 | if cfg.MODEL.BOX3D_ON: 41 | self.fcos3d_head = FCOS3DHead(cfg, self.backbone_output_shape) 42 | self.fcos3d_loss = FCOS3DLoss(cfg) 43 | self.fcos3d_inference = FCOS3DInference(cfg) 44 | self.only_box2d = False 45 | else: 46 | self.only_box2d = True 47 | 48 | self.prepare_targets = DD3DTargetPreparer(cfg, self.backbone_output_shape) 49 | 50 | self.postprocess_in_inference = cfg.DD3D.INFERENCE.DO_POSTPROCESS 51 | 52 | self.do_nms = cfg.DD3D.INFERENCE.DO_NMS 53 | self.do_bev_nms = cfg.DD3D.INFERENCE.DO_BEV_NMS 54 | self.bev_nms_iou_thresh = cfg.DD3D.INFERENCE.BEV_NMS_IOU_THRESH 55 | 56 | # nuScenes inference aggregates detections over all 6 cameras. 
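# Grouping detections by sample and the cross-camera BEV NMS are implemented in
# tridet/modeling/dd3d/postprocessing.py (nuscenes_sample_aggregate).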
57 | self.nusc_sample_aggregate_in_inference = cfg.DD3D.INFERENCE.NUSC_SAMPLE_AGGREGATE 58 | self.num_classes = cfg.DD3D.NUM_CLASSES 59 | 60 | self.register_buffer("pixel_mean", torch.Tensor(cfg.MODEL.PIXEL_MEAN).view(-1, 1, 1)) 61 | self.register_buffer("pixel_std", torch.Tensor(cfg.MODEL.PIXEL_STD).view(-1, 1, 1)) 62 | 63 | @property 64 | def device(self): 65 | return self.pixel_mean.device 66 | 67 | def preprocess_image(self, x): 68 | return (x - self.pixel_mean) / self.pixel_std 69 | 70 | def forward(self, batched_inputs): 71 | images = [x["image"].to(self.device) for x in batched_inputs] 72 | images = [self.preprocess_image(x) for x in images] 73 | 74 | if 'intrinsics' in batched_inputs[0]: 75 | intrinsics = [x['intrinsics'].to(self.device) for x in batched_inputs] 76 | else: 77 | intrinsics = None 78 | images = ImageList.from_tensors(images, self.backbone.size_divisibility, intrinsics=intrinsics) 79 | 80 | gt_dense_depth = None 81 | if 'depth' in batched_inputs[0]: 82 | gt_dense_depth = [x["depth"].to(self.device) for x in batched_inputs] 83 | gt_dense_depth = ImageList.from_tensors( 84 | gt_dense_depth, self.backbone.size_divisibility, intrinsics=intrinsics 85 | ) 86 | 87 | features = self.backbone(images.tensor) 88 | features = [features[f] for f in self.in_features] 89 | 90 | if "instances" in batched_inputs[0]: 91 | gt_instances = [x["instances"].to(self.device) for x in batched_inputs] 92 | else: 93 | gt_instances = None 94 | 95 | locations = self.compute_locations(features) 96 | logits, box2d_reg, centerness, _ = self.fcos2d_head(features) 97 | if not self.only_box2d: 98 | box3d_quat, box3d_ctr, box3d_depth, box3d_size, box3d_conf, dense_depth = self.fcos3d_head(features) 99 | inv_intrinsics = images.intrinsics.inverse() if images.intrinsics is not None else None 100 | 101 | if self.training: 102 | assert gt_instances is not None 103 | feature_shapes = [x.shape[-2:] for x in features] 104 | training_targets = self.prepare_targets(locations, gt_instances, feature_shapes) 105 | if gt_dense_depth is not None: 106 | training_targets.update({"dense_depth": gt_dense_depth}) 107 | 108 | losses = {} 109 | fcos2d_loss, fcos2d_info = self.fcos2d_loss(logits, box2d_reg, centerness, training_targets) 110 | losses.update(fcos2d_loss) 111 | 112 | if not self.only_box2d: 113 | fcos3d_loss = self.fcos3d_loss( 114 | box3d_quat, box3d_ctr, box3d_depth, box3d_size, box3d_conf, dense_depth, inv_intrinsics, 115 | fcos2d_info, training_targets 116 | ) 117 | losses.update(fcos3d_loss) 118 | return losses 119 | else: 120 | pred_instances, fcos2d_info = self.fcos2d_inference( 121 | logits, box2d_reg, centerness, locations, images.image_sizes 122 | ) 123 | if not self.only_box2d: 124 | # This adds 'pred_boxes3d' and 'scores_3d' to Instances in 'pred_instances' in place. 125 | self.fcos3d_inference( 126 | box3d_quat, box3d_ctr, box3d_depth, box3d_size, box3d_conf, inv_intrinsics, pred_instances, 127 | fcos2d_info 128 | ) 129 | 130 | # 3D score == 2D score x confidence. 131 | score_key = "scores_3d" 132 | else: 133 | score_key = "scores" 134 | 135 | # Transpose to "image-first", i.e. (B, L) 136 | pred_instances = list(zip(*pred_instances)) 137 | pred_instances = [Instances.cat(instances) for instances in pred_instances] 138 | 139 | # 2D NMS and pick top-K. 140 | if self.do_nms: 141 | pred_instances = self.fcos2d_inference.nms_and_top_k(pred_instances, score_key) 142 | 143 | if not self.only_box2d and self.do_bev_nms: 144 | # Bird-eye-view NMS. 
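# Each image is treated as its own group here; grouping the cameras of one nuScenes sample
# by `sample_token` is handled by NuscenesDD3D and the TTA wrapper (nuscenes_dd3d_tta.py).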
145 | dummy_group_idxs = {i: [i] for i, _ in enumerate(pred_instances)} 146 | if 'pose' in batched_inputs[0]: 147 | poses = [x['pose'] for x in batched_inputs] 148 | else: 149 | poses = [x['extrinsics'] for x in batched_inputs] 150 | pred_instances = nuscenes_sample_aggregate( 151 | pred_instances, 152 | dummy_group_idxs, 153 | self.num_classes, 154 | poses, 155 | iou_threshold=self.bev_nms_iou_thresh, 156 | include_boxes3d_global=False 157 | ) 158 | 159 | if self.postprocess_in_inference: 160 | processed_results = [] 161 | for results_per_image, input_per_image, image_size in \ 162 | zip(pred_instances, batched_inputs, images.image_sizes): 163 | height = input_per_image.get("height", image_size[0]) 164 | width = input_per_image.get("width", image_size[1]) 165 | r = resize_instances(results_per_image, height, width) 166 | processed_results.append({"instances": r}) 167 | else: 168 | processed_results = [{"instances": x} for x in pred_instances] 169 | 170 | return processed_results 171 | 172 | def compute_locations(self, features): 173 | locations = [] 174 | in_strides = [x.stride for x in self.backbone_output_shape] 175 | for level, feature in enumerate(features): 176 | h, w = feature.size()[-2:] 177 | locations_per_level = compute_locations_per_level( 178 | h, w, in_strides[level], feature.dtype, feature.device, offset=self.feature_locations_offset 179 | ) 180 | locations.append(locations_per_level) 181 | return locations 182 | -------------------------------------------------------------------------------- /tridet/modeling/dd3d/dense_depth.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Toyota Research Institute. All rights reserved. 2 | import numpy as np 3 | import torch 4 | import torch.nn.functional as F 5 | from torch import nn 6 | 7 | from detectron2.layers import Conv2d, get_norm 8 | from detectron2.modeling.meta_arch.build import META_ARCH_REGISTRY 9 | 10 | from tridet.layers.normalization import ModuleListDial, Offset, Scale 11 | from tridet.modeling.dd3d.dense_depth_loss import build_dense_depth_loss 12 | from tridet.modeling.feature_extractor import build_feature_extractor 13 | from tridet.structures.image_list import ImageList 14 | from tridet.utils.tensor2d import aligned_bilinear 15 | 16 | 17 | class DD3DDenseDepthHead(nn.Module): 18 | def __init__(self, cfg, input_shape): 19 | super().__init__() 20 | self.in_strides = [shape.stride for shape in input_shape] 21 | self.num_levels = len(input_shape) 22 | assert self.in_strides == [shape.stride for shape in input_shape] 23 | 24 | self.mean_depth_per_level = torch.FloatTensor(cfg.DD3D.FCOS3D.MEAN_DEPTH_PER_LEVEL) 25 | self.std_depth_per_level = torch.FloatTensor(cfg.DD3D.FCOS3D.STD_DEPTH_PER_LEVEL) 26 | 27 | self.scale_depth_by_focal_lengths_factor = cfg.DD3D.FCOS3D.SCALE_DEPTH_BY_FOCAL_LENGTHS_FACTOR 28 | 29 | self.use_scale = cfg.DD3D.FCOS3D.USE_SCALE 30 | self.depth_scale_init_factor = cfg.DD3D.FCOS3D.DEPTH_SCALE_INIT_FACTOR 31 | 32 | box3d_tower = [] 33 | in_channels = input_shape[0].channels 34 | 35 | num_convs = cfg.DD3D.FCOS3D.NUM_CONVS 36 | use_deformable = cfg.DD3D.FCOS3D.USE_DEFORMABLE 37 | norm = cfg.DD3D.FCOS3D.NORM 38 | 39 | if use_deformable: 40 | raise ValueError("Not supported yet.") 41 | 42 | for i in range(num_convs): 43 | if norm in ("BN", "FrozenBN"): 44 | # Each FPN level has its own batchnorm layer. 
45 | # "BN" is converted to "SyncBN" in distributed training (see train.py) 46 | norm_layer = ModuleListDial([get_norm(norm, in_channels) for _ in range(self.num_levels)]) 47 | else: 48 | norm_layer = get_norm(norm, in_channels) 49 | box3d_tower.append( 50 | Conv2d( 51 | in_channels, 52 | in_channels, 53 | kernel_size=3, 54 | stride=1, 55 | padding=1, 56 | bias=norm_layer is None, 57 | norm=norm_layer, 58 | activation=F.relu 59 | ) 60 | ) 61 | self.add_module('box3d_tower', nn.Sequential(*box3d_tower)) 62 | 63 | # Each FPN level has its own predictor layer. 64 | self.dense_depth = nn.ModuleList([ 65 | Conv2d(in_channels, 1, kernel_size=3, stride=1, padding=1, bias=(not cfg.DD3D.FCOS3D.USE_SCALE)) 66 | for _ in range(self.num_levels) 67 | ]) 68 | 69 | if self.use_scale: 70 | self.scales_depth = nn.ModuleList([ 71 | Scale(init_value=sigma * self.depth_scale_init_factor) for sigma in self.std_depth_per_level 72 | ]) 73 | self.offsets_depth = nn.ModuleList([Offset(init_value=b) for b in self.mean_depth_per_level]) 74 | 75 | self._init_weights() 76 | 77 | def _init_weights(self): 78 | 79 | for l in self.box3d_tower.modules(): 80 | if isinstance(l, nn.Conv2d): 81 | torch.nn.init.kaiming_normal_(l.weight, mode='fan_out', nonlinearity='relu') 82 | if l.bias is not None: 83 | torch.nn.init.constant_(l.bias, 0) 84 | 85 | for l in self.dense_depth.modules(): 86 | if isinstance(l, nn.Conv2d): 87 | torch.nn.init.kaiming_uniform_(l.weight, a=1) 88 | if l.bias is not None: # depth head may not have bias. 89 | torch.nn.init.constant_(l.bias, 0) 90 | 91 | def forward(self, x): 92 | assert len(x) == self.num_levels 93 | dense_depth = [] 94 | for l, features in enumerate(x): 95 | box3d_tower_out = self.box3d_tower(features) 96 | dense_depth_lvl = self.dense_depth[l](box3d_tower_out) 97 | if self.use_scale: 98 | dense_depth_lvl = self.offsets_depth[l](self.scales_depth[l](dense_depth_lvl)) 99 | dense_depth.append(dense_depth_lvl) 100 | return dense_depth 101 | 102 | 103 | @META_ARCH_REGISTRY.register() 104 | class DD3DDenseDepth(nn.Module): 105 | def __init__(self, cfg): 106 | super().__init__() 107 | self.in_features = cfg.DD3D.IN_FEATURES 108 | self.feature_locations_offset = cfg.DD3D.FEATURE_LOCATIONS_OFFSET 109 | 110 | self.backbone = build_feature_extractor(cfg) 111 | backbone_output_shape = self.backbone.output_shape() 112 | backbone_output_shape = [backbone_output_shape[f] for f in self.in_features] 113 | self.in_strides = [shape.stride for shape in backbone_output_shape] 114 | self.fcos3d_head = DD3DDenseDepthHead(cfg, backbone_output_shape) 115 | self.depth_loss = build_dense_depth_loss(cfg) 116 | 117 | self.scale_depth_by_focal_lengths = cfg.DD3D.FCOS3D.SCALE_DEPTH_BY_FOCAL_LENGTHS 118 | self.scale_depth_by_focal_lengths_factor = cfg.DD3D.FCOS3D.SCALE_DEPTH_BY_FOCAL_LENGTHS_FACTOR 119 | 120 | self.register_buffer("pixel_mean", torch.Tensor(cfg.MODEL.PIXEL_MEAN).view(-1, 1, 1)) 121 | self.register_buffer("pixel_std", torch.Tensor(cfg.MODEL.PIXEL_STD).view(-1, 1, 1)) 122 | 123 | @property 124 | def device(self): 125 | return self.pixel_mean.device 126 | 127 | def preprocess_image(self, x): 128 | return (x - self.pixel_mean) / self.pixel_std 129 | 130 | def forward(self, batched_inputs): 131 | images = [x["image"].to(self.device) for x in batched_inputs] 132 | images = [self.preprocess_image(x) for x in images] 133 | 134 | if 'intrinsics' in batched_inputs[0]: 135 | intrinsics = [x['intrinsics'].to(self.device) for x in batched_inputs] 136 | else: 137 | intrinsics = None 138 | images = ImageList.from_tensors(images, self.backbone.size_divisibility,
intrinsics=intrinsics) 139 | 140 | gt_dense_depth = None 141 | if 'depth' in batched_inputs[0]: 142 | gt_dense_depth = [x["depth"].to(self.device) for x in batched_inputs] 143 | gt_dense_depth = ImageList.from_tensors( 144 | gt_dense_depth, self.backbone.size_divisibility, intrinsics=intrinsics 145 | ) 146 | 147 | features = self.backbone(images.tensor) 148 | features = [features[f] for f in self.in_features] 149 | dense_depth = self.fcos3d_head(features) 150 | 151 | inv_intrinsics = images.intrinsics.inverse() if images.intrinsics is not None else None 152 | 153 | # Upsample. 154 | dense_depth = [ 155 | aligned_bilinear(x, factor=stride, offset=self.feature_locations_offset).squeeze(1) 156 | for x, stride in zip(dense_depth, self.in_strides) 157 | ] 158 | 159 | if self.scale_depth_by_focal_lengths: 160 | assert inv_intrinsics is not None 161 | pixel_size = torch.norm(torch.stack([inv_intrinsics[:, 0, 0], inv_intrinsics[:, 1, 1]], dim=-1), dim=-1) 162 | scaled_pixel_size = (pixel_size * self.scale_depth_by_focal_lengths_factor).reshape(-1, 1, 1) 163 | dense_depth = [x / scaled_pixel_size for x in dense_depth] 164 | 165 | if self.training: 166 | losses = {} 167 | for lvl, x in enumerate(dense_depth): 168 | loss_lvl = self.depth_loss(x, gt_dense_depth.tensor)["loss_dense_depth"] 169 | loss_lvl = loss_lvl / (np.sqrt(2)**lvl) # Is sqrt(2) good? 170 | losses.update({f"loss_dense_depth_lvl_{lvl}": loss_lvl}) 171 | return losses 172 | else: 173 | raise NotImplementedError() 174 | -------------------------------------------------------------------------------- /tridet/modeling/dd3d/dense_depth_loss.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Toyota Research Institute. All rights reserved. 2 | import torch 3 | from torch import nn 4 | 5 | from detectron2.config.config import configurable 6 | 7 | from tridet.layers import smooth_l1_loss 8 | 9 | 10 | class DenseDepthL1Loss(nn.Module): 11 | @configurable 12 | def __init__(self, beta, min_depth=0., max_depth=100., loss_weight=1.0): 13 | super().__init__() 14 | self.beta = beta 15 | self.min_depth = min_depth 16 | self.max_depth = max_depth 17 | self.loss_weight = loss_weight 18 | 19 | @classmethod 20 | def from_config(cls, cfg): 21 | return { 22 | "beta": cfg.DD3D.FCOS3D.LOSS.SMOOTH_L1_BETA, 23 | "min_depth": cfg.DD3D.FCOS3D.MIN_DEPTH, 24 | "max_depth": cfg.DD3D.FCOS3D.MAX_DEPTH, 25 | "loss_weight": cfg.DD3D.FCOS3D.DEPTH_HEAD.LOSS_WEIGHT 26 | } 27 | 28 | def forward(self, depth_pred, depth_gt, masks=None): 29 | M = (depth_gt < self.min_depth).to(torch.float32) + (depth_gt > self.max_depth).to(torch.float32) 30 | if masks is not None: 31 | M += (1. - masks).to(torch.float32) 32 | 33 | M = M == 0. 
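# After inversion, M marks pixels with valid supervision: ground-truth depth inside
# [min_depth, max_depth] and, if `masks` is given, inside the mask. The smooth-L1 loss
# below is averaged over those pixels only.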
34 | loss = smooth_l1_loss(depth_pred[M], depth_gt[M], beta=self.beta, reduction='mean') 35 | 36 | return {"loss_dense_depth": self.loss_weight * loss} 37 | 38 | 39 | def build_dense_depth_loss(cfg): 40 | if cfg.DD3D.FCOS3D.DEPTH_HEAD.LOSS_TYPE == "L1": 41 | return DenseDepthL1Loss(cfg) 42 | else: 43 | raise ValueError(f"Unsupported depth loss: {cfg.DD3D.FCOS3D.DEPTH_HEAD.LOSS_TYPE}") 44 | -------------------------------------------------------------------------------- /tridet/modeling/dd3d/depth.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torch.nn.functional as F 3 | 4 | from detectron2.layers import Conv2d 5 | from tridet.utils.geometry import get_pixel_sizes_perspective_cams 6 | from tridet.modeling.dd3d.utils import get_fpn_out_channels 7 | 8 | class PacknetDepthHead(nn.Module): 9 | def __init__( 10 | self, 11 | net, 12 | input_shape, 13 | min_depth, 14 | max_depth, 15 | scale_depth_by_focal_length=None, # NOTE: when used as depth-as-input, disable this and do the scaling online. 16 | ): 17 | super().__init__() 18 | 19 | self.net = net(input_shape=input_shape) 20 | 21 | input_shape = self.net.output_shape() 22 | in_channels = get_fpn_out_channels(input_shape) 23 | 24 | # Predictor 25 | conv_kwargs = dict( 26 | in_channels=in_channels, 27 | out_channels=1, 28 | kernel_size=3, 29 | stride=1, 30 | bias=True, 31 | padding=1, 32 | norm=None, 33 | activation=F.sigmoid 34 | ) 35 | self.predictor = Conv2d(**conv_kwargs) 36 | 37 | self.min_depth = min_depth 38 | self.max_depth = max_depth 39 | self.scale_depth_by_focal_length = scale_depth_by_focal_length 40 | 41 | def forward(self, x, cams): 42 | net_out = self.net(x) 43 | depth = [self.predictor(x) for x in net_out] 44 | 45 | if self.scale_depth_by_focal_length is not None: 46 | pixel_size = get_pixel_sizes_perspective_cams(cams) 47 | depth = [x / (pixel_size * self.scale_depth_by_focal_length).view(-1, 1, 1, 1) for x in depth] 48 | 49 | m, M = self.min_depth, self.max_depth 50 | depth = [(M - m) * x + m for x in depth] 51 | depth = [x.clamp(min=m, max=M) for x in depth] 52 | return {'depth': depth, 'depth_head_net_out': net_out} 53 | -------------------------------------------------------------------------------- /tridet/modeling/dd3d/disentangled_box3d_loss.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Toyota Research Institute. All rights reserved.
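# Disentangled corner loss: for each parameter group (quaternion, projected center, depth,
# size), the predicted values are substituted into a copy of the ground-truth box and the
# smooth-L1 distance between the resulting 3D box corners and the target corners is
# penalized, so each loss term isolates the error of a single group of parameters.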
2 | import logging 3 | 4 | import torch 5 | 6 | from detectron2.config import configurable 7 | 8 | from tridet.layers import smooth_l1_loss 9 | 10 | LOG = logging.getLogger(__name__) 11 | 12 | 13 | class DisentangledBox3DLoss(): 14 | @configurable 15 | def __init__(self, smooth_l1_loss_beta, max_loss_per_group): 16 | self.smooth_l1_loss_beta = smooth_l1_loss_beta 17 | self.max_loss_per_group = max_loss_per_group 18 | 19 | @classmethod 20 | def from_config(cls, cfg): 21 | return { 22 | "smooth_l1_loss_beta": cfg.DD3D.FCOS3D.LOSS.SMOOTH_L1_BETA, 23 | "max_loss_per_group": cfg.DD3D.FCOS3D.LOSS.MAX_LOSS_PER_GROUP_DISENT 24 | } 25 | 26 | def __call__(self, box3d_pred, box3d_targets, locations, weights=None): 27 | 28 | box3d_pred = box3d_pred.to(torch.float32) 29 | box3d_targets = box3d_targets.to(torch.float32) 30 | 31 | target_corners = box3d_targets.corners 32 | 33 | disentangled_losses = {} 34 | for component_key in ["quat", "proj_ctr", "depth", "size"]: 35 | disentangled_boxes = box3d_targets.clone() 36 | setattr(disentangled_boxes, component_key, getattr(box3d_pred, component_key)) 37 | pred_corners = disentangled_boxes.to(torch.float32).corners 38 | 39 | loss = smooth_l1_loss(pred_corners, target_corners, beta=self.smooth_l1_loss_beta) 40 | 41 | # Bound the loss 42 | loss = loss.clamp(max=self.max_loss_per_group) 43 | 44 | if weights is not None: 45 | # loss = torch.sum(loss.reshape(-1, 24) * weights.unsqueeze(-1)) 46 | loss = torch.sum(loss.reshape(-1, 24).mean(dim=1) * weights) 47 | else: 48 | loss = loss.reshape(-1, 24).mean() 49 | 50 | disentangled_losses["loss_box3d_" + component_key] = loss 51 | 52 | entangled_l1_dist = (target_corners - box3d_pred.corners).detach().abs().reshape(-1, 24).mean(dim=1) 53 | 54 | return disentangled_losses, entangled_l1_dist 55 | -------------------------------------------------------------------------------- /tridet/modeling/dd3d/nuscenes_dd3d_tta.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Toyota Research Institute. All rights reserved. 2 | import copy 3 | 4 | import numpy as np 5 | import torch 6 | from torch import nn 7 | from torch.nn.parallel import DistributedDataParallel 8 | 9 | from detectron2.data.detection_utils import read_image 10 | from detectron2.layers import batched_nms 11 | from detectron2.structures import Boxes, Instances 12 | from detectron2.utils.comm import get_world_size 13 | 14 | from tridet.layers import bev_nms 15 | from tridet.modeling.dd3d.nuscenes_dd3d import NuscenesDD3D 16 | from tridet.modeling.dd3d.postprocessing import get_group_idxs, nuscenes_sample_aggregate 17 | from tridet.modeling.dd3d.test_time_augmentation import DatasetMapperTTA 18 | from tridet.structures.boxes3d import Boxes3D 19 | 20 | 21 | class NuscenesDD3DWithTTA(nn.Module): 22 | def __init__(self, cfg, model, tta_mapper=None): 23 | super().__init__() 24 | if isinstance(model, DistributedDataParallel): 25 | model = model.module 26 | assert isinstance(model, NuscenesDD3D), \ 27 | "NuscenesDD3DWithTTA only supports NuscenesDD3D. Got a model of type {}".format(type(model)) 28 | 29 | assert not model.postprocess_in_inference, "To use test-time augmentation, `postprocess_in_inference` must be False."
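# With the wrapped model's post-processing disabled, its predictions stay in the augmented
# input frame, so the inverse transforms in _get_augmented_instances() can map boxes back
# to the original image before merging and running NMS.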
30 | self.cfg = cfg.copy() 31 | 32 | self.model = model 33 | self.nms_thresh = cfg.DD3D.FCOS2D.INFERENCE.NMS_THRESH 34 | 35 | if tta_mapper is None: 36 | tta_mapper = DatasetMapperTTA(cfg) 37 | self.tta_mapper = tta_mapper 38 | self.batch_size = cfg.TEST.IMS_PER_BATCH // get_world_size() 39 | 40 | def __call__(self, batched_inputs): 41 | """ 42 | Same input/output format as :meth:`NuscenesDD3D` 43 | """ 44 | def _maybe_read_image(dataset_dict): 45 | ret = copy.copy(dataset_dict) 46 | if "image" not in ret: 47 | image = read_image(ret.pop("file_name"), self.tta_mapper.image_format) 48 | image = torch.from_numpy(np.ascontiguousarray(image.transpose(2, 0, 1))) # CHW 49 | ret["image"] = image 50 | if "height" not in ret and "width" not in ret: 51 | ret["height"] = image.shape[1] 52 | ret["width"] = image.shape[2] 53 | return ret 54 | 55 | instances_per_image = [self._inference_one_image(_maybe_read_image(x)) for x in batched_inputs] 56 | 57 | # ---------------------------------------------------------- 58 | # NuScenes specific: cross-image (i.e. sample-level) BEV NMS. 59 | # ---------------------------------------------------------- 60 | sample_tokens = [x['sample_token'] for x in batched_inputs] 61 | group_idxs = get_group_idxs(sample_tokens, self.model.num_images_per_sample) 62 | global_poses = [x['pose'] for x in batched_inputs] 63 | 64 | filtered_instances = nuscenes_sample_aggregate( 65 | instances_per_image, 66 | group_idxs, 67 | self.model.num_classes, 68 | global_poses, 69 | self.model.bev_nms_iou_thresh, 70 | max_num_dets_per_sample=self.model.max_num_dets_per_sample 71 | ) 72 | 73 | return [{'instances': instances} for instances in filtered_instances] 74 | 75 | def _inference_one_image(self, x): 76 | """ 77 | Args: 78 | x (dict): one dataset dict with "image" field being a CHW tensor 79 | 80 | Returns: 81 | dict: one output dict 82 | """ 83 | orig_shape = (x["height"], x["width"]) 84 | augmented_inputs, tfms = self._get_augmented_inputs(x) 85 | merged_instances = self._get_augmented_instances(augmented_inputs, tfms, orig_shape) 86 | if len(merged_instances) > 0: 87 | if self.model.do_nms: 88 | # Multiclass NMS. 89 | keep = batched_nms( 90 | merged_instances.pred_boxes.tensor, merged_instances.scores_3d, merged_instances.pred_classes, 91 | self.nms_thresh 92 | ) 93 | merged_instances = merged_instances[keep] 94 | 95 | if not self.model.only_box2d and self.model.do_bev_nms > 0: 96 | # Bird-eye-view NMS. 
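# Duplicates coming from different augmentations of the same object can survive 2D NMS while
# still overlapping on the ground plane; class-aware BEV NMS removes them using the 3D scores.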
97 | keep = bev_nms( 98 | merged_instances.pred_boxes3d, 99 | merged_instances.scores_3d, 100 | self.model.bev_nms_iou_thresh, 101 | class_idxs=merged_instances.pred_classes, 102 | class_agnostic=False 103 | ) 104 | merged_instances = merged_instances[keep] 105 | 106 | return merged_instances 107 | 108 | def _get_augmented_inputs(self, x): 109 | augmented_inputs = self.tta_mapper(x) 110 | tfms = [x.pop("transforms") for x in augmented_inputs] 111 | return augmented_inputs, tfms 112 | 113 | def _get_augmented_instances(self, augmented_inputs, tfms, orig_shape): 114 | # 1: forward with all augmented images 115 | outputs = self._batch_inference(augmented_inputs) 116 | # 2: union the results 117 | all_boxes = [] 118 | all_boxes3d = [] 119 | 120 | for input, output, tfm in zip(augmented_inputs, outputs, tfms): 121 | # Need to inverse the transforms on boxes, to obtain results on original image 122 | inv_tfm = tfm.inverse() 123 | 124 | # 2D boxes 125 | pred_boxes = output.pred_boxes.tensor 126 | orig_pred_boxes = inv_tfm.apply_box(pred_boxes.cpu().numpy()) 127 | orig_pred_boxes = torch.from_numpy(orig_pred_boxes).to(pred_boxes.device) 128 | all_boxes.append(Boxes(orig_pred_boxes)) 129 | 130 | # 3D boxes 131 | pred_boxes_3d = output.pred_boxes3d 132 | vectorized_boxes_3d = pred_boxes_3d.vectorize().cpu().numpy() 133 | orig_vec_pred_boxes_3d = [inv_tfm.apply_box3d(box3d_as_vec) for box3d_as_vec in vectorized_boxes_3d] 134 | 135 | # intrinsics 136 | orig_intrinsics = inv_tfm.apply_intrinsics(input['intrinsics'].cpu().numpy()) 137 | orig_pred_boxes_3d = Boxes3D.from_vectors( 138 | orig_vec_pred_boxes_3d, orig_intrinsics, device=pred_boxes_3d.device 139 | ) 140 | all_boxes3d.append(orig_pred_boxes_3d) 141 | 142 | all_boxes = Boxes.cat(all_boxes) 143 | all_boxes3d = Boxes3D.cat(all_boxes3d) 144 | 145 | all_scores = torch.cat([x.scores for x in outputs]) 146 | all_scores_3d = torch.cat([x.scores_3d for x in outputs]) 147 | all_classes = torch.cat([x.pred_classes for x in outputs]) 148 | 149 | all_attributes = torch.cat([x.pred_attributes for x in outputs]) 150 | all_speeds = torch.cat([x.pred_speeds for x in outputs]) 151 | 152 | return Instances( 153 | image_size=orig_shape, 154 | pred_boxes=all_boxes, 155 | pred_boxes3d=all_boxes3d, 156 | pred_classes=all_classes, 157 | scores=all_scores, 158 | scores_3d=all_scores_3d, 159 | pred_attributes=all_attributes, 160 | pred_speeds=all_speeds 161 | ) 162 | 163 | def _batch_inference(self, batched_inputs): 164 | """ 165 | Execute inference on a list of inputs, 166 | using batch size = self.batch_size, instead of the length of the list. 167 | 168 | Inputs & outputs have the same format as :meth:`GeneralizedRCNN.inference` 169 | """ 170 | outputs = [] 171 | inputs = [] 172 | for idx, x in enumerate(batched_inputs): 173 | inputs.append(x) 174 | if len(inputs) == self.batch_size or idx == len(batched_inputs) - 1: 175 | # This runs NMS (box and optionally bev) per each augmented image. 176 | outputs.extend([res['instances'] for res in self.model(inputs)]) 177 | inputs = [] 178 | return outputs 179 | -------------------------------------------------------------------------------- /tridet/modeling/dd3d/postprocessing.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Toyota Research Institute. All rights reserved. 
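# Sample-level aggregation for nuScenes: per-camera detections are lifted into a shared global
# frame with each camera's pose and de-duplicated with class-aware BEV NMS.
# A minimal sketch of the grouping helper below, with hypothetical sample tokens:
#   get_group_idxs(["t0", "t0", "t1", "t1"], num_images_per_sample=2)
#   # -> {"t0": [0, 1], "t1": [2, 3]}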
2 | from collections import OrderedDict, defaultdict 3 | from pprint import pprint 4 | 5 | import torch 6 | from pytorch3d.transforms import transform3d as t3d 7 | from pytorch3d.transforms.rotation_conversions import matrix_to_quaternion, quaternion_to_matrix 8 | 9 | from detectron2.structures import Instances 10 | 11 | from tridet.layers import bev_nms 12 | from tridet.structures.boxes3d import GenericBoxes3D 13 | from tridet.structures.pose import Pose 14 | 15 | 16 | def _indices_to_mask(indices, size): 17 | mask = indices.new_zeros(size, dtype=torch.bool) 18 | mask[indices] = True 19 | return mask 20 | 21 | 22 | def sample_bev_nms(instances, poses, category_key='pred_classes', iou_threshold=0.3): 23 | boxes3d_global = [] 24 | for _instances, pose in zip(instances, poses): 25 | # pose_SO 26 | box3d_vec = _instances.pred_boxes3d.vectorize() 27 | quat, tvec, wlh = box3d_vec[:, :4], box3d_vec[:, 4:7], box3d_vec[:, 7:10] 28 | R = quaternion_to_matrix(quat) 29 | rotation = t3d.Rotate(R=R.transpose(1, 2), device=quat.device) 30 | translation = t3d.Translate(tvec, device=quat.device) 31 | tfm_SO = rotation.compose(translation) 32 | 33 | # pose_WS 34 | quat, tvec = quat.new(pose.quat.elements), tvec.new(pose.tvec) 35 | R = quaternion_to_matrix(quat) 36 | rotation = t3d.Rotate(R=R.transpose(0, 1), device=quat.device) 37 | translation = t3d.Translate(tvec.unsqueeze(0), device=quat.device) 38 | tfm_WS = rotation.compose(translation) 39 | 40 | # boxes in global frame. 41 | tfm_WO = tfm_SO.compose(tfm_WS) 42 | pose_WO = tfm_WO.get_matrix().transpose(1, 2) 43 | rotation = pose_WO[:, :3, :3] 44 | quat = matrix_to_quaternion(rotation) 45 | tvec = pose_WO[:, :3, -1] 46 | 47 | boxes3d_global.append(torch.hstack([quat, tvec, wlh])) 48 | 49 | boxes3d_global = torch.vstack(boxes3d_global) 50 | boxes3d_global = GenericBoxes3D(boxes3d_global[:, :4], boxes3d_global[:, 4:7], boxes3d_global[:, 7:]) 51 | 52 | _ids = torch.cat([x.get(category_key) for x in instances]) 53 | scores = torch.cat([x.scores_3d for x in instances]) 54 | keep = bev_nms(boxes3d_global, scores, iou_threshold, pose_cam_global=Pose(), class_idxs=_ids) 55 | return keep, boxes3d_global 56 | 57 | 58 | def nuscenes_sample_aggregate( 59 | instances, 60 | group_idxs, 61 | num_classes, 62 | global_poses, 63 | iou_threshold, 64 | include_boxes3d_global=True, 65 | max_num_dets_per_sample=None 66 | ): 67 | """ 68 | Parameters 69 | ---------- 70 | instances: List[Instances] 71 | Predicted instances. 72 | 73 | group_idxs: dict 74 | Mapping from nuScene's `sample_token` to a list of indices of `instances.` 75 | 76 | num_classes: int 77 | Number of classes. 78 | 79 | pose_global: List[Pose] 80 | List of global poses for each image (or Instances) 81 | """ 82 | num_images = len(instances) 83 | for group_idx, (_, idxs) in enumerate(group_idxs.items()): 84 | group_id = group_idx * num_classes 85 | for idx in idxs: 86 | instances[idx].image_id = torch.ones_like(instances[idx].pred_classes) * idx 87 | instances[idx].sample_category_id = instances[idx].pred_classes + group_id 88 | keep, boxes3d_global = sample_bev_nms( 89 | instances, global_poses, category_key='sample_category_id', iou_threshold=iou_threshold 90 | ) 91 | 92 | # NOTE: NuScenes allow max. 
500 detections per sample 93 | if max_num_dets_per_sample: 94 | keep = keep[:max_num_dets_per_sample] 95 | 96 | instances = Instances.cat(instances) 97 | if include_boxes3d_global: 98 | instances.pred_boxes3d_global = boxes3d_global 99 | instances.remove('sample_category_id') 100 | 101 | mask = _indices_to_mask(keep, len(instances)) 102 | _filtered_instances = instances[mask] 103 | filtered_instances = [] 104 | for image_id in range(num_images): 105 | _instances = _filtered_instances[_filtered_instances.image_id == image_id] 106 | _instances.remove('image_id') 107 | filtered_instances.append(_instances) 108 | return filtered_instances 109 | 110 | 111 | def get_group_idxs(sample_tokens, num_images_per_sample, inverse=False): 112 | grouped_idxs = defaultdict(list) 113 | for idx, token in enumerate(sample_tokens): 114 | grouped_idxs[token].append(idx) 115 | group_sizes = {token: len(idxs) for token, idxs in grouped_idxs.items()} 116 | 117 | if not all([siz == num_images_per_sample for siz in group_sizes.values()]): 118 | pprint(group_sizes) 119 | raise ValueError("Group sizes does not match with 'num_images_per_sample'.") 120 | 121 | token_to_idxs = OrderedDict(grouped_idxs) 122 | if not inverse: 123 | return token_to_idxs 124 | else: 125 | idx_to_token = OrderedDict() 126 | for token, idxs in token_to_idxs.items(): 127 | for idx in idxs: 128 | idx_to_token[idx] = token 129 | return idx_to_token 130 | -------------------------------------------------------------------------------- /tridet/modeling/dd3d/utils.py: -------------------------------------------------------------------------------- 1 | 2 | def get_fpn_out_channels(output_shape): 3 | out_channels = [] 4 | if isinstance(output_shape, list): 5 | out_channels = [x.channels for x in output_shape] 6 | elif isinstance(output_shape, dict): 7 | out_channels = [x.channels for x in output_shape.values()] 8 | assert len(set(out_channels)) == 1, "The feature extractor must produce same channels of features for all levels." 9 | out_channels = out_channels[0] 10 | return out_channels 11 | -------------------------------------------------------------------------------- /tridet/modeling/feature_extractor/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Toyota Research Institute. All rights reserved. 2 | from detectron2.layers import ShapeSpec 3 | from detectron2.modeling.backbone import BACKBONE_REGISTRY, Backbone 4 | 5 | from tridet.modeling.feature_extractor.dla import ( 6 | build_dla_backbone, build_dla_fpn_backbone, build_fcos_dla_fpn_backbone_p6, build_fcos_dla_fpn_backbone_p67 7 | ) 8 | from tridet.modeling.feature_extractor.vovnet import ( 9 | build_fcos_vovnet_fpn_backbone_p6, build_vovnet_backbone, build_vovnet_fpn_backbone 10 | ) 11 | 12 | 13 | def build_feature_extractor(cfg, input_shape=None): 14 | """ 15 | Build a backbone from `cfg.FE.BUILDER` 16 | 17 | Returns: 18 | an instance of :class:`Backbone` 19 | """ 20 | if input_shape is None: 21 | input_shape = ShapeSpec(channels=len(cfg.MODEL.PIXEL_MEAN)) 22 | 23 | builder_name = cfg.FE.BUILDER 24 | feature_extractor = BACKBONE_REGISTRY.get(builder_name)(cfg, input_shape) 25 | assert isinstance(feature_extractor, Backbone) 26 | return feature_extractor 27 | -------------------------------------------------------------------------------- /tridet/structures/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Toyota Research Institute. All rights reserved. 
2 | from tridet.structures.image_list import ImageList 3 | -------------------------------------------------------------------------------- /tridet/structures/image_list.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | # Copyright 2021 Toyota Research Institute. All rights reserved. 3 | from __future__ import division 4 | 5 | from typing import Any, List, Sequence, Tuple 6 | 7 | import torch 8 | from torch import device 9 | from torch.nn import functional as F 10 | 11 | from detectron2.utils.env import TORCH_VERSION 12 | 13 | 14 | def _as_tensor(x: Tuple[int, int]) -> torch.Tensor: 15 | """ 16 | An equivalent of `torch.as_tensor`, but works under tracing if input 17 | is a list of tensor. `torch.as_tensor` will record a constant in tracing, 18 | but this function will use `torch.stack` instead. 19 | """ 20 | if torch.jit.is_scripting(): 21 | return torch.as_tensor(x) 22 | if isinstance(x, (list, tuple)) and all([isinstance(t, torch.Tensor) for t in x]): 23 | return torch.stack(x) 24 | return torch.as_tensor(x) 25 | 26 | 27 | class ImageList(object): 28 | """ 29 | Adapted from detectron2: 30 | https://github.com/facebookresearch/detectron2/blob/master/detectron2/structures/image_list.py) 31 | 32 | Key differences: 33 | - add optional intrinsics 34 | - add optional image path (useful for debugging) 35 | ================================================================================================================== 36 | 37 | Structure that holds a list of images (of possibly 38 | varying sizes) as a single tensor. 39 | This works by padding the images to the same size, 40 | and storing in a field the original sizes of each image 41 | 42 | Attributes: 43 | image_sizes (list[tuple[int, int]]): each tuple is (h, w) 44 | """ 45 | def __init__(self, tensor: torch.Tensor, image_sizes: List[Tuple[int, int]], intrinsics=None, image_paths=None): 46 | """ 47 | Arguments: 48 | tensor (Tensor): of shape (N, H, W) or (N, C_1, ..., C_K, H, W) where K >= 1 49 | image_sizes (list[tuple[int, int]]): Each tuple is (h, w). It can 50 | be smaller than (H, W) due to padding. 51 | """ 52 | self.tensor = tensor 53 | self.image_sizes = image_sizes 54 | self._intrinsics = intrinsics 55 | self._image_paths = image_paths 56 | 57 | @property 58 | def intrinsics(self): 59 | if torch.allclose(self._intrinsics[0], torch.eye(3, device=self._intrinsics.device)): 60 | # TODO: torch.inverse(images.intrinsics) often return identity, when it shouldn't. Is it pytorch bug? 61 | raise ValueError("Intrinsics is Identity.") 62 | return self._intrinsics 63 | 64 | @property 65 | def image_paths(self): 66 | return self._image_paths 67 | 68 | def __len__(self) -> int: 69 | return len(self.image_sizes) 70 | 71 | def __getitem__(self, idx) -> torch.Tensor: 72 | """ 73 | Access the individual image in its original size. 
74 | 75 | Args: 76 | idx: int or slice 77 | 78 | Returns: 79 | Tensor: an image of shape (H, W) or (C_1, ..., C_K, H, W) where K >= 1 80 | """ 81 | size = self.image_sizes[idx] 82 | return self.tensor[idx, ..., :size[0], :size[1]] 83 | 84 | @torch.jit.unused 85 | def to(self, *args: Any, **kwargs: Any) -> "ImageList": 86 | cast_tensor = self.tensor.to(*args, **kwargs) 87 | return ImageList(cast_tensor, self.image_sizes, intrinsics=self.intrinsics) 88 | 89 | @property 90 | def device(self) -> device: 91 | return self.tensor.device 92 | 93 | @staticmethod 94 | def from_tensors( 95 | tensors: List[torch.Tensor], 96 | size_divisibility: int = 0, 97 | pad_value: float = 0.0, 98 | intrinsics=None, 99 | image_paths=None 100 | ) -> "ImageList": 101 | """ 102 | Args: 103 | tensors: a tuple or list of `torch.Tensor`, each of shape (Hi, Wi) or 104 | (C_1, ..., C_K, Hi, Wi) where K >= 1. The Tensors will be padded 105 | to the same shape with `pad_value`. 106 | size_divisibility (int): If `size_divisibility > 0`, add padding to ensure 107 | the common height and width is divisible by `size_divisibility`. 108 | This depends on the model and many models need a divisibility of 32. 109 | pad_value (float): value to pad 110 | 111 | Returns: 112 | an `ImageList`. 113 | """ 114 | assert len(tensors) > 0 115 | assert isinstance(tensors, (tuple, list)) 116 | for t in tensors: 117 | assert isinstance(t, torch.Tensor), type(t) 118 | assert t.shape[:-2] == tensors[0].shape[:-2], t.shape 119 | 120 | image_sizes = [(im.shape[-2], im.shape[-1]) for im in tensors] 121 | image_sizes_tensor = [_as_tensor(x) for x in image_sizes] 122 | max_size = torch.stack(image_sizes_tensor).max(0).values 123 | 124 | if size_divisibility > 1: 125 | stride = size_divisibility 126 | # the last two dims are H,W, both subject to divisibility requirement 127 | max_size = torch.div(max_size + (stride - 1), stride, rounding_mode='floor') * stride 128 | 129 | # handle weirdness of scripting and tracing ... 130 | if torch.jit.is_scripting(): 131 | max_size: List[int] = max_size.to(dtype=torch.long).tolist() 132 | else: 133 | # https://github.com/pytorch/pytorch/issues/42448 134 | if TORCH_VERSION >= (1, 7) and torch.jit.is_tracing(): 135 | image_sizes = image_sizes_tensor 136 | 137 | if len(tensors) == 1: 138 | # This seems slightly (2%) faster. 
139 | # TODO: check whether it's faster for multiple images as well 140 | image_size = image_sizes[0] 141 | padding_size = [0, max_size[-1] - image_size[1], 0, max_size[-2] - image_size[0]] 142 | batched_imgs = F.pad(tensors[0], padding_size, value=pad_value).unsqueeze_(0) 143 | else: 144 | # max_size can be a tensor in tracing mode, therefore convert to list 145 | batch_shape = [len(tensors)] + list(tensors[0].shape[:-2]) + list(max_size) 146 | batched_imgs = tensors[0].new_full(batch_shape, pad_value) 147 | for img, pad_img in zip(tensors, batched_imgs): 148 | pad_img[..., :img.shape[-2], :img.shape[-1]].copy_(img) 149 | 150 | if intrinsics is not None: 151 | assert isinstance(intrinsics, (tuple, list)) 152 | assert len(intrinsics) == len(tensors) 153 | intrinsics = torch.stack(intrinsics, dim=0) 154 | 155 | if image_paths is not None: 156 | assert len(image_paths) == len(tensors) 157 | 158 | return ImageList(batched_imgs.contiguous(), image_sizes, intrinsics, image_paths) 159 | -------------------------------------------------------------------------------- /tridet/structures/pose.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Toyota Research Institute. All rights reserved. 2 | import numpy as np 3 | from pyquaternion import Quaternion 4 | 5 | 6 | class Pose: 7 | """SE(3) rigid transform class that allows compounding of 6-DOF poses 8 | and provides common transformations that are commonly seen in geometric problems. 9 | """ 10 | def __init__(self, wxyz=np.float32([1., 0., 0., 0.]), tvec=np.float32([0., 0., 0.])): 11 | """Initialize a Pose with Quaternion and 3D Position 12 | 13 | Parameters 14 | ---------- 15 | wxyz: np.float32 or Quaternion (default: np.float32([1,0,0,0])) 16 | Quaternion/Rotation (wxyz) 17 | 18 | tvec: np.float32 (default: np.float32([0,0,0])) 19 | Translation (xyz) 20 | """ 21 | assert isinstance(wxyz, (np.ndarray, Quaternion)) 22 | assert isinstance(tvec, np.ndarray) 23 | 24 | if isinstance(wxyz, np.ndarray): 25 | assert np.abs(1.0 - np.linalg.norm(wxyz)) < 1.0e-3 26 | 27 | self.quat = Quaternion(wxyz) 28 | self.tvec = tvec 29 | 30 | def __repr__(self): 31 | formatter = {'float_kind': lambda x: '%.2f' % x} 32 | tvec_str = np.array2string(self.tvec, formatter=formatter) 33 | return 'wxyz: {}, tvec: ({})'.format(self.quat, tvec_str) 34 | 35 | def copy(self): 36 | """Return a copy of this pose object. 37 | 38 | Returns 39 | ---------- 40 | result: Pose 41 | Copied pose object. 42 | """ 43 | return self.__class__(Quaternion(self.quat), self.tvec.copy()) 44 | 45 | def __mul__(self, other): 46 | """Left-multiply Pose with another Pose or 3D-Points. 47 | 48 | Parameters 49 | ---------- 50 | other: Pose or np.ndarray 51 | 1. Pose: Identical to oplus operation. 52 | (i.e. self_pose * other_pose) 53 | 2. ndarray: transform [N x 3] point set 54 | (i.e. 
X' = self_pose * X) 55 | 56 | Returns 57 | ---------- 58 | result: Pose or np.ndarray 59 | Transformed pose or point cloud 60 | """ 61 | if isinstance(other, Pose): 62 | assert isinstance(other, self.__class__) 63 | t = self.quat.rotate(other.tvec) + self.tvec 64 | q = self.quat * other.quat 65 | return self.__class__(q, t) 66 | elif isinstance(other, np.ndarray): 67 | assert other.shape[-1] == 3, 'Point cloud is not 3-dimensional' 68 | X = np.hstack([other, np.ones((len(other), 1))]).T 69 | return (np.dot(self.matrix, X).T)[:, :3] 70 | else: 71 | return NotImplemented 72 | 73 | def __rmul__(self, other): 74 | raise NotImplementedError('Right multiply not implemented yet!') 75 | 76 | def inverse(self): 77 | """Returns a new Pose that corresponds to the 78 | inverse of this one. 79 | 80 | Returns 81 | ---------- 82 | result: Pose 83 | Inverted pose 84 | """ 85 | qinv = self.quat.inverse 86 | return self.__class__(qinv, qinv.rotate(-self.tvec)) 87 | 88 | @property 89 | def matrix(self): 90 | """Returns a 4x4 homogeneous matrix of the form [R t; 0 1] 91 | 92 | Returns 93 | ---------- 94 | result: np.ndarray 95 | 4x4 homogeneous matrix 96 | """ 97 | result = self.quat.transformation_matrix 98 | result[:3, 3] = self.tvec 99 | return result 100 | 101 | @property 102 | def rotation_matrix(self): 103 | """Returns the 3x3 rotation matrix (R) 104 | 105 | Returns 106 | ---------- 107 | result: np.ndarray 108 | 3x3 rotation matrix 109 | """ 110 | result = self.quat.transformation_matrix 111 | return result[:3, :3] 112 | 113 | @property 114 | def rotation(self): 115 | """Return the rotation component of the pose as a Quaternion object. 116 | 117 | Returns 118 | ---------- 119 | self.quat: Quaternion 120 | Rotation component of the Pose object. 121 | """ 122 | return self.quat 123 | 124 | @property 125 | def translation(self): 126 | """Return the translation component of the pose as a np.ndarray. 127 | 128 | Returns 129 | ---------- 130 | self.tvec: np.ndarray 131 | Translation component of the Pose object. 132 | """ 133 | return self.tvec 134 | 135 | @classmethod 136 | def from_matrix(cls, transformation_matrix): 137 | """Initialize pose from 4x4 transformation matrix 138 | 139 | Parameters 140 | ---------- 141 | transformation_matrix: np.ndarray 142 | 4x4 containing rotation/translation 143 | 144 | Returns 145 | ------- 146 | Pose 147 | """ 148 | return cls(wxyz=Quaternion(matrix=transformation_matrix[:3, :3]), tvec=np.float32(transformation_matrix[:3, 3])) 149 | 150 | @classmethod 151 | def from_rotation_translation(cls, rotation_matrix, tvec): 152 | """Initialize pose from rotation matrix and translation vector. 153 | 154 | Parameters 155 | ---------- 156 | rotation_matrix : np.ndarray 157 | 3x3 rotation matrix 158 | tvec : np.ndarray 159 | length-3 translation vector 160 | """ 161 | return cls(wxyz=Quaternion(matrix=rotation_matrix), tvec=np.float64(tvec)) 162 | 163 | def __eq__(self, other): 164 | return self.quat == other.quat and (self.tvec == other.tvec).all() 165 | -------------------------------------------------------------------------------- /tridet/utils/comm.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Toyota Research Institute. All rights reserved. 
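# Distributed-training helpers built on detectron2.utils.comm and MPI: decorators that run a
# function only on the master process (optionally broadcasting its result to every worker),
# plus utilities to gather or sum values across workers.
# For example (hypothetical per-rank dict), if every rank calls
#   gather_dict({f"shard_{rank}": metrics})
# rank 0 receives the merged dict and all other ranks get None.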
2 | import logging 3 | from functools import wraps 4 | 5 | import torch.distributed as dist 6 | from mpi4py import MPI # pylint: disable=unused-import 7 | 8 | from detectron2.utils import comm as d2_comm 9 | 10 | LOG = logging.getLogger(__name__) 11 | 12 | _NESTED_BROADCAST_FROM_MASTER = False 13 | 14 | 15 | def is_distributed(): 16 | return d2_comm.get_world_size() > 1 17 | 18 | 19 | def broadcast_from_master(fn): 20 | """If distributed, only the master executes the function and broadcast the results to other workers. 21 | 22 | Usage: 23 | @broadcast_from_master 24 | def foo(a, b): ... 25 | """ 26 | @wraps(fn) 27 | def wrapper(*args, **kwargs): # pylint: disable=unused-argument 28 | global _NESTED_BROADCAST_FROM_MASTER 29 | 30 | if not is_distributed(): 31 | return fn(*args, **kwargs) 32 | 33 | if _NESTED_BROADCAST_FROM_MASTER: 34 | assert d2_comm.is_main_process() 35 | LOG.warning(f"_NESTED_BROADCAST_FROM_MASTER = True, {fn.__name__}") 36 | return fn(*args, **kwargs) 37 | 38 | if d2_comm.is_main_process(): 39 | _NESTED_BROADCAST_FROM_MASTER = True 40 | ret = fn(*args, **kwargs) 41 | _NESTED_BROADCAST_FROM_MASTER = False 42 | else: 43 | ret = None 44 | 45 | ret = MPI.COMM_WORLD.bcast(ret, root=0) 46 | 47 | assert ret is not None 48 | return ret 49 | 50 | return wrapper 51 | 52 | 53 | def master_only(fn): 54 | """If distributed, only the master executes the function. 55 | 56 | Usage: 57 | @master_only 58 | def foo(a, b): ... 59 | """ 60 | @wraps(fn) 61 | def wrapped_fn(*args, **kwargs): 62 | if d2_comm.is_main_process(): 63 | ret = fn(*args, **kwargs) 64 | d2_comm.synchronize() 65 | if d2_comm.is_main_process(): 66 | return ret 67 | 68 | return wrapped_fn 69 | 70 | 71 | def gather_dict(dikt): 72 | """Gather python dictionaries from all workers to the rank=0 worker. 73 | 74 | Assumption: the keys of `dikt` are disjoint across all workers. 75 | 76 | If rank = 0, then returned aggregated dict. 77 | If rank > 0, then return `None`. 78 | """ 79 | dict_lst = d2_comm.gather(dikt, dst=0) 80 | if d2_comm.is_main_process(): 81 | gathered_dict = {} 82 | for dic in dict_lst: 83 | for k in dic.keys(): 84 | assert k not in gathered_dict, f"Dictionary key overlaps: {k}" 85 | gathered_dict.update(dic) 86 | return gathered_dict 87 | else: 88 | return None 89 | 90 | 91 | def reduce_sum(tensor): 92 | """ 93 | Adapted from AdelaiDet: 94 | https://github.com/aim-uofa/AdelaiDet/blob/master/adet/utils/comm.py 95 | """ 96 | if not is_distributed(): 97 | return tensor 98 | tensor = tensor.clone() 99 | dist.all_reduce(tensor, op=dist.ReduceOp.SUM) 100 | return tensor 101 | -------------------------------------------------------------------------------- /tridet/utils/events.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | # Copyright 2021 Toyota Research Institute. All rights reserved. 
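#
# Usage sketch for `WandbEventStorage` (defined below). `setup()` in tridet/utils/setup.py
# pushes an instance onto detectron2's event-storage stack, so callers use the usual
# detectron2 idiom; the metric name and value here are hypothetical.
#
#     from detectron2.utils.events import get_event_storage
#     get_event_storage().put_scalar("train/total_loss", 0.123, wandb_log=True)
#
# On the master process this also mirrors the scalar to W&B when a run is active.
#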
3 | # Adapted from detectron2: 4 | # https://github.com/facebookresearch/detectron2/blob/master/detectron2/utils/events.py 5 | import wandb 6 | from detectron2.utils.events import EventStorage 7 | 8 | from tridet.utils.comm import master_only 9 | 10 | 11 | class WandbEventStorage(EventStorage): 12 | 13 | @master_only 14 | def put_scalar(self, name, value, smoothing_hint=True, wandb_log=True): 15 | super().put_scalar(name, value, smoothing_hint=smoothing_hint) 16 | 17 | # Add W&B logging 18 | name = self._current_prefix + name 19 | value = float(value) 20 | if wandb_log and wandb.run: 21 | wandb.log({name: value}, step=self.iter) 22 | -------------------------------------------------------------------------------- /tridet/utils/geometry.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Toyota Research Institute. All rights reserved. 2 | import logging 3 | 4 | import cv2 5 | import numpy as np 6 | import torch 7 | from pytorch3d.transforms.rotation_conversions import matrix_to_quaternion, quaternion_to_matrix 8 | 9 | LOG = logging.getLogger(__name__) 10 | 11 | PI = 3.14159265358979323846 12 | EPS = 1e-7 13 | 14 | 15 | def allocentric_to_egocentric(quat, proj_ctr, inv_intrinsics): 16 | """ 17 | Parameters 18 | ---------- 19 | quat: Tensor 20 | (N, 4). Batch of (allocentric) quaternions. 21 | 22 | proj_ctr: Tensor 23 | (N, 2). Projected centers. xy coordinates. 24 | 25 | inv_intrinsics: Tensor 26 | (N, 3, 3). Inverted intrinsics. 27 | """ 28 | R_obj_to_local = quaternion_to_matrix(quat) 29 | 30 | # ray == z-axis in local orientation 31 | ray = unproject_points2d(proj_ctr, inv_intrinsics) 32 | z = ray / ray.norm(dim=1, keepdim=True) 33 | 34 | # Gram-Schmidt process: local_y = global_y - (global_y \dot local_z) * local_z 35 | y = z.new_tensor([[0., 1., 0.]]) - z[:, 1:2] * z 36 | y = y / y.norm(dim=1, keepdim=True) 37 | x = torch.cross(y, z, dim=1) 38 | 39 | # local -> global 40 | R_local_to_global = torch.stack([x, y, z], dim=-1) 41 | 42 | # obj -> global 43 | R_obj_to_global = torch.bmm(R_local_to_global, R_obj_to_local) 44 | 45 | egocentric_quat = matrix_to_quaternion(R_obj_to_global) 46 | 47 | # Make sure it's unit norm. 48 | quat_norm = egocentric_quat.norm(dim=1, keepdim=True) 49 | if not torch.allclose(quat_norm, torch.as_tensor(1.), atol=1e-3): 50 | LOG.warning( 51 | f"Some of the input quaternions are not unit norm: min={quat_norm.min()}, max={quat_norm.max()}; therefore normalizing." 52 | ) 53 | egocentric_quat = egocentric_quat / quat_norm.clamp(min=EPS) 54 | 55 | return egocentric_quat 56 | 57 | 58 | def homogenize_points(xy): 59 | """ 60 | Parameters 61 | ---------- 62 | xy: Tensor 63 | xy coordinates. shape=(N, ..., 2) 64 | E.g., (N, 2) or (N, K, 2) or (N, H, W, 2) 65 | 66 | Returns 67 | ------- 68 | Tensor: 69 | 1. is appended to the last dimension. shape=(N, ..., 3) 70 | E.g., (N, 3) or (N, K, 3) or (N, H, W, 3). 71 | """ 72 | # NOTE: this seems to work for an arbitrary number of input dimensions. 73 | pad = torch.nn.ConstantPad1d(padding=(0, 1), value=1.) 74 | return pad(xy) 75 | 76 | 77 | def project_points3d(Xw, K): 78 | _, C = Xw.shape 79 | assert C == 3 80 | uv, _ = cv2.projectPoints( 81 | Xw, np.zeros((3, 1), dtype=np.float32), np.zeros(3, dtype=np.float32), K, np.zeros(5, dtype=np.float32) 82 | ) 83 | return uv.reshape(-1, 2) 84 | 85 | 86 | def unproject_points2d(points2d, inv_K, scale=1.0): 87 | """ 88 | Parameters 89 | ---------- 90 | points2d: Tensor 91 | xy coordinates.
shape=(N, ..., 2) 92 | E.g., (N, 2) or (N, K, 2) or (N, H, W, 2) 93 | 94 | inv_K: Tensor 95 | Inverted intrinsics; shape=(N, 3, 3) 96 | 97 | scale: float, default: 1.0 98 | Scaling factor. 99 | 100 | Returns 101 | ------- 102 | Tensor: 103 | Unprojected 3D point. shape=(N, ..., 3) 104 | E.g., (N, 3) or (N, K, 3) or (N, H, W, 3) 105 | """ 106 | points2d = homogenize_points(points2d) 107 | siz = points2d.size() 108 | points2d = points2d.view(-1, 3).unsqueeze(-1) # (N, 3, 1) 109 | unprojected = torch.matmul(inv_K, points2d) # (N, 3, 3) x (N, 3, 1) -> (N, 3, 1) 110 | unprojected = unprojected.view(siz) 111 | 112 | return unprojected * scale 113 | 114 | 115 | def get_pixel_sizes_perspective_cams(cams): 116 | """Get physical pixel size of pinhole cameras. 117 | 118 | ((1 / fx) ** 2 + (1 / fy) ** 2)).sqrt() 119 | 120 | Parameters 121 | ---------- 122 | cams: PerspectiveCameras 123 | [description] 124 | """ 125 | inv_intrinsics = cams.get_projection_transform().inverse().get_matrix() 126 | return inv_intrinsics.diagonal(dim1=1, dim2=2).norm(dim=1) 127 | -------------------------------------------------------------------------------- /tridet/utils/hydra/callbacks.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | 4 | from hydra.experimental.callback import Callback 5 | from mpi4py import MPI 6 | 7 | from detectron2.utils import comm as d2_comm 8 | from detectron2.utils.logger import setup_logger 9 | 10 | from tridet.utils.s3 import aws_credential_is_available, maybe_download_ckpt_from_url, sync_output_dir_s3 11 | from tridet.utils.setup import setup_distributed 12 | from tridet.utils.wandb import derive_output_dir_from_wandb_id, init_wandb, wandb_credential_is_available 13 | 14 | LOG = logging.getLogger(__name__) 15 | 16 | 17 | class SetupDistributedCallback(Callback): 18 | """ 19 | """ 20 | def on_run_start(self, config, **kwargs): # pylint: disable=unused-argument 21 | world_size = MPI.COMM_WORLD.Get_size() 22 | distributed = world_size > 1 23 | if distributed: 24 | rank = MPI.COMM_WORLD.Get_rank() 25 | setup_distributed(world_size, rank) 26 | 27 | def on_job_start(self, config, **kwargs): # pylint: disable=unused-argument 28 | world_size = d2_comm.get_world_size() 29 | rank = d2_comm.get_rank() 30 | LOG.info("Rank of current process: {}. World size: {}".format(rank, world_size)) 31 | 32 | 33 | class WandbInitCallback(Callback): 34 | """If W&B is enabled, then 35 | 1) initialize W&B, 36 | 2) derive the path of output directory using W&B ID, and 37 | 3) set it as hydra working directory. 38 | """ 39 | def on_run_start(self, config, **kwargs): # pylint: disable=unused-argument 40 | if not config.WANDB.ENABLED: 41 | return 42 | if not wandb_credential_is_available(): 43 | LOG.warning( 44 | "W&B credential must be defined in environment variables." 45 | "Use `WANDB.ENABLED=False` to suppress this warning. " 46 | "Skipping `WandbInitCallback`..." 47 | ) 48 | return 49 | 50 | init_wandb(config) 51 | output_dir = derive_output_dir_from_wandb_id(config) 52 | if output_dir: 53 | config.hydra.run.dir = output_dir 54 | 55 | 56 | class SyncOutputDirCallback(Callback): 57 | def on_run_start(self, config, **kwargs): # pylint: disable=unused-argument 58 | if d2_comm.is_main_process(): 59 | output_dir = config.hydra.run.dir 60 | else: 61 | output_dir = None 62 | output_dir = MPI.COMM_WORLD.bcast(output_dir, root=0) 63 | 64 | if output_dir != config.hydra.run.dir: 65 | LOG.warning("Hydra run dir is not synced. 
Overwriting from rank=0.") 66 | config.hydra.run.dir = output_dir 67 | 68 | 69 | class D2LoggerCallback(Callback): 70 | def on_run_start(self, config, **kwargs): # pylint: disable=unused-argument 71 | rank = d2_comm.get_rank() 72 | log_output_dir = os.path.join(config.hydra.run.dir, 'logs') 73 | setup_logger(log_output_dir, distributed_rank=rank, name="hydra") 74 | setup_logger(log_output_dir, distributed_rank=rank, name="detectron2", abbrev_name="d2") 75 | setup_logger(log_output_dir, distributed_rank=rank, name="tridet") 76 | setup_logger(log_output_dir, distributed_rank=rank, name="fvcore") 77 | 78 | logging.getLogger('numba').setLevel(logging.ERROR) # too much logs 79 | 80 | 81 | class CkptPathResolverCallback(Callback): 82 | """ 83 | If the checkpoint (`config.model.CKPT`) is an S3 path, then downloaded it and replace the path with 84 | local path. 85 | """ 86 | def on_run_start(self, config, **kwargs): # pylint: disable=unused-argument 87 | if config.MODEL.CKPT: 88 | new_ckpt_path = maybe_download_ckpt_from_url(config) 89 | new_ckpt_path = os.path.abspath(new_ckpt_path) 90 | config.MODEL.CKPT = new_ckpt_path 91 | 92 | 93 | class SyncOutputS3BeforeEnd(Callback): 94 | """ 95 | """ 96 | def on_run_start(self, config, **kwargs): # pylint: disable=unused-argument 97 | if config.SYNC_OUTPUT_DIR_S3.ENABLED and not aws_credential_is_available(): 98 | raise ValueError(f"\n\nAWS credential must be set in environment variables (rank={d2_comm.get_rank()}).\n") 99 | 100 | def on_run_end(self, config, **kwargs): # pylint: disable=unused-argument 101 | """ 102 | """ 103 | if config.SYNC_OUTPUT_DIR_S3.ENABLED: 104 | sync_output_dir_s3(config, output_dir=config.hydra.run.dir) 105 | -------------------------------------------------------------------------------- /tridet/utils/s3.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Toyota Research Institute. All rights reserved. 2 | import logging 3 | import os 4 | import subprocess 5 | import tempfile 6 | import time 7 | 8 | import requests 9 | from hydra.utils import to_absolute_path 10 | from tqdm import tqdm 11 | 12 | import wandb 13 | from detectron2.utils import comm 14 | 15 | from tridet.utils.comm import broadcast_from_master 16 | from tridet.utils.wandb import wandb_is_initialized 17 | 18 | LOG = logging.getLogger(__name__) 19 | 20 | 21 | @broadcast_from_master 22 | def maybe_download_ckpt_from_url(cfg): 23 | """If the checkpoint is an S3 or https path, the main process download the weight under, by default, `/tmp/`. 24 | 25 | NOTE: All workers must update `cfg.MODEL.CKPT` to use the new path. 
26 | """ 27 | ckpt_path = cfg.MODEL.CKPT 28 | 29 | if ckpt_path.startswith("s3://") or ckpt_path.startswith("https://"): 30 | os.makedirs(cfg.TMP_DIR, exist_ok=True) 31 | _, ext = os.path.splitext(ckpt_path) 32 | tmp_path = tempfile.NamedTemporaryFile(dir=cfg.TMP_DIR, suffix=ext).name 33 | 34 | LOG.info("Downloading initial weights:") 35 | LOG.info(f" src: {ckpt_path}") 36 | LOG.info(f" dst: {tmp_path}") 37 | 38 | if ckpt_path.startswith("s3://"): 39 | if not aws_credential_is_available(): 40 | raise ValueError('AWS credentials are undefined in environment variables.') 41 | s3_copy(ckpt_path, tmp_path) 42 | else: # https:// 43 | req = requests.get(ckpt_path) 44 | with open(tmp_path, 'wb') as f: 45 | for chunk in tqdm(req.iter_content(100000)): 46 | f.write(chunk) 47 | return tmp_path 48 | 49 | else: 50 | return ckpt_path 51 | 52 | 53 | def aws_credential_is_available(): 54 | AWS_CREDENTIALS = ["AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"] 55 | for x in AWS_CREDENTIALS: 56 | if not os.environ.get(x, None): 57 | return False 58 | return True 59 | 60 | 61 | def s3_copy(source_path, target_path, verbose=True): 62 | """Copy single file from local to s3, s3 to local, or s3 to s3. 63 | 64 | Parameters 65 | ---------- 66 | source_path: str 67 | Path of file to copy 68 | 69 | target_path: str 70 | Path to copy file to 71 | 72 | verbose: bool, default: True 73 | If True print some helpful messages 74 | 75 | Returns 76 | ------- 77 | bool: True if successful 78 | """ 79 | if verbose: 80 | logging.getLogger().setLevel(logging.DEBUG) 81 | 82 | success = False 83 | command_str = "aws s3 cp --acl bucket-owner-full-control {} {}".format(source_path, target_path) 84 | try: 85 | subprocess.check_output(command_str, shell=True) 86 | success = True 87 | except subprocess.CalledProcessError as e: 88 | success = False 89 | LOG.error("{} failed with error code {}".format(command_str, e.returncode)) 90 | LOG.error(e.output) 91 | if verbose: 92 | LOG.info("Done copying file") 93 | 94 | return success 95 | 96 | 97 | def sync_dir(source, target, verbose=True, excludes=None): 98 | """ 99 | Sync a directory from source to target (either local to s3, s3 to s3, s3 to local) 100 | 101 | Parameters 102 | ---------- 103 | source: str 104 | Directory from which we want to sync files 105 | 106 | target: str 107 | Directory to which all files will be synced 108 | 109 | verbose: bool, default: True 110 | If True, log some helpful messages 111 | """ 112 | assert source.startswith('s3://') or target.startswith('s3://') 113 | command_str = "aws s3 sync --quiet --acl bucket-owner-full-control {} {}".format(source, target) 114 | if excludes: 115 | for exclude in excludes: 116 | command_str += f" --exclude '{exclude}'" 117 | if verbose: 118 | LOG.info("Syncing with '{}'".format(command_str)) 119 | try: 120 | subprocess.check_output(command_str, shell=True) 121 | except subprocess.CalledProcessError as e: 122 | LOG.error("{} failed with error code {}".format(command_str, e.returncode)) 123 | LOG.error(e.output) 124 | if verbose: 125 | LOG.info("Done syncing") 126 | 127 | 128 | def sync_output_dir_s3(cfg, output_dir=None): 129 | output_dir = output_dir or os.getcwd() 130 | output_dir = os.path.abspath(os.path.normpath(output_dir)) 131 | output_root = to_absolute_path(cfg.OUTPUT_ROOT) 132 | 133 | assert os.path.commonprefix([output_dir, output_root]) == output_root, f'{output_dir}, {output_root}' 134 | tar_output_dir = os.path.join(cfg.SYNC_OUTPUT_DIR_S3.ROOT_IN_S3, output_dir[len(output_root) + 1:]) 135 | 136 | if 
comm.is_main_process(): 137 | LOG.info(f"Syncing output_dir: {output_dir} -> {tar_output_dir}") 138 | sync_dir(output_dir, tar_output_dir) 139 | 140 | if wandb_is_initialized(): 141 | tar_wandb_run_dir = os.path.join(tar_output_dir, 'wandb') 142 | LOG.info(f"Syncing W&B run dir: {wandb.run.dir} -> {tar_wandb_run_dir}") 143 | sync_dir(wandb.run.dir, tar_wandb_run_dir) 144 | 145 | elif comm.get_local_rank() == 0 and os.path.exists(os.path.join(output_dir, 'logs')): 146 | # local master -- only sync the log files 147 | log_output_dir, log_tar_output_dir = os.path.join(output_dir, 'logs'), os.path.join(tar_output_dir, 'logs') 148 | LOG.info(f"Syncing log output_dir: {log_output_dir} -> {log_tar_output_dir}") 149 | sync_dir(log_output_dir, log_tar_output_dir) 150 | 151 | 152 | def maybe_download_from_s3(src_path): 153 | if not src_path.startswith("s3://"): 154 | return src_path 155 | 156 | extension = os.path.splitext(src_path)[-1] 157 | if not extension: 158 | extension = None 159 | tmp_path = tempfile.NamedTemporaryFile(suffix=extension).name 160 | suceeded = s3_copy(src_path, tmp_path) 161 | if not suceeded: 162 | raise RuntimeError("`s3_copy` failed.") 163 | return tmp_path 164 | 165 | 166 | def maybe_sync_dir_from_s3(src_path, excludes=None): 167 | if not src_path.startswith("s3://"): 168 | return src_path 169 | 170 | tmp_dir = tempfile.NamedTemporaryFile().name 171 | os.makedirs(tmp_dir) 172 | LOG.info(f"Syncing {src_path} to {tmp_dir}") 173 | st = time.time() 174 | sync_dir(src_path, tmp_dir, excludes=excludes) 175 | LOG.info(f"Done. ({time.time() - st}s)") 176 | return tmp_dir 177 | -------------------------------------------------------------------------------- /tridet/utils/setup.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | # Copyright 2021 Toyota Research Institute. All rights reserved. 3 | import json 4 | import logging 5 | import os 6 | import resource 7 | from datetime import datetime 8 | 9 | import torch 10 | import torch.distributed as dist 11 | from omegaconf import OmegaConf 12 | 13 | import detectron2.utils.comm as d2_comm 14 | from detectron2.utils.env import seed_all_rng 15 | from detectron2.utils.events import _CURRENT_STORAGE_STACK 16 | 17 | from tridet.utils.comm import broadcast_from_master 18 | from tridet.utils.events import WandbEventStorage 19 | 20 | LOG = logging.getLogger(__name__) 21 | 22 | 23 | def setup_distributed(world_size, rank): 24 | """ 25 | Adapted from detectron2: 26 | https://github.com/facebookresearch/detectron2/blob/master/detectron2/engine/launch.py#L85 27 | """ 28 | host = os.environ["MASTER_ADDR"] if "MASTER_ADDR" in os.environ else "127.0.0.1" 29 | port = 12345 30 | dist_url = f"tcp://{host}:{port}" 31 | try: 32 | dist.init_process_group(backend='NCCL', init_method=dist_url, world_size=world_size, rank=rank) 33 | except Exception as e: 34 | logging.error("Process group URL: %s", dist_url) 35 | raise e 36 | # synchronize is needed here to prevent a possible timeout after calling init_process_group 37 | # See: https://github.com/facebookresearch/maskrcnn-benchmark/issues/172 38 | d2_comm.synchronize() 39 | 40 | # Assumption: all machines have the same number of GPUs. 
41 | num_gpus_per_machine = torch.cuda.device_count() 42 | machine_rank = rank // num_gpus_per_machine 43 | 44 | # Setup the local process group (which contains ranks within the same machine) 45 | assert d2_comm._LOCAL_PROCESS_GROUP is None 46 | num_machines = world_size // num_gpus_per_machine 47 | for i in range(num_machines): 48 | ranks_on_i = list(range(i * num_gpus_per_machine, (i + 1) * num_gpus_per_machine)) 49 | pg = dist.new_group(ranks_on_i) 50 | if i == machine_rank: 51 | d2_comm._LOCAL_PROCESS_GROUP = pg 52 | 53 | # Declare GPU device. 54 | local_rank = rank % num_gpus_per_machine 55 | torch.cuda.set_device(local_rank) 56 | 57 | # Multi-node training often fails with "received 0 items of ancdata" error. 58 | # https://github.com/fastai/fastai/issues/23#issuecomment-345091054 59 | rlimit = resource.getrlimit(resource.RLIMIT_NOFILE) 60 | resource.setrlimit(resource.RLIMIT_NOFILE, (8192, rlimit[1])) 61 | 62 | 63 | @broadcast_from_master 64 | def get_random_seed(): 65 | """Adapted from d2.utils.env:seed_all_rng()""" 66 | seed = os.getpid() + int(datetime.now().strftime("%S%f")) + int.from_bytes(os.urandom(2), "big") 67 | return seed 68 | 69 | 70 | def setup(cfg): 71 | assert torch.cuda.is_available(), "cuda is not available." 72 | 73 | # Seed random number generators. If distributed, then sync the random seed over all GPUs. 74 | seed = get_random_seed() 75 | seed_all_rng(seed) 76 | 77 | LOG.info("Working Directory: {}".format(os.getcwd())) 78 | LOG.info("Full config:\n{}".format(json.dumps(OmegaConf.to_container(cfg, resolve=True), indent=2))) 79 | 80 | # Set up EventStorage 81 | storage = WandbEventStorage() 82 | _CURRENT_STORAGE_STACK.append(storage) 83 | 84 | # After this, the cfg is immutable. 85 | OmegaConf.set_readonly(cfg, True) 86 | -------------------------------------------------------------------------------- /tridet/utils/tasks.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Toyota Research Institute. All rights reserved. 2 | from collections import OrderedDict 3 | 4 | from detectron2.config import configurable 5 | 6 | 7 | class Task(): 8 | def __init__(self, name, is_detection_task, is_dense_prediction_task): 9 | self.name = name 10 | self.is_detection_task = is_detection_task 11 | self.is_dense_prediction_task = is_dense_prediction_task 12 | 13 | 14 | # yapf: disable 15 | TASKS = [ 16 | Task( 17 | name="box2d", 18 | is_detection_task=True, 19 | is_dense_prediction_task=False, 20 | ), 21 | Task( 22 | name="box3d", 23 | is_detection_task=True, 24 | is_dense_prediction_task=False, 25 | ), 26 | Task( 27 | name="depth", 28 | is_detection_task=False, 29 | is_dense_prediction_task=True, 30 | ) 31 | ] 32 | # yapf: enable 33 | 34 | NAME_TO_TASK = OrderedDict([(task.name, task) for task in TASKS]) 35 | 36 | 37 | class TaskManager(): 38 | @configurable 39 | def __init__(self, box2d_on=False, box3d_on=False, depth_on=False): 40 | """ 41 | configurable is experimental. 
42 | """ 43 | self._box2d_on = self._mask2d_on = self._box3d_on = self._semseg2d_on = self._depth_on = False 44 | tasks = [] 45 | if box2d_on: 46 | tasks.append(NAME_TO_TASK['box2d']) 47 | self._box2d_on = True 48 | if box3d_on: 49 | tasks.append(NAME_TO_TASK['box3d']) 50 | self._box3d_on = True 51 | if depth_on: 52 | tasks.append(NAME_TO_TASK['depth']) 53 | self._depth_on = True 54 | 55 | if not tasks: 56 | raise ValueError("No task specified.") 57 | 58 | self._tasks = tasks 59 | 60 | @property 61 | def tasks(self): 62 | return self._tasks 63 | 64 | @classmethod 65 | def from_config(cls, cfg): 66 | # yapf: disable 67 | return OrderedDict( 68 | box2d_on = cfg.MODEL.BOX2D_ON, 69 | box3d_on = cfg.MODEL.BOX3D_ON, 70 | depth_on = cfg.MODEL.DEPTH_ON, 71 | ) 72 | # yapf: enable 73 | 74 | # Indicators that tells if each task is enabled. 75 | @property 76 | def box2d_on(self): 77 | return self._box2d_on 78 | 79 | @property 80 | def box3d_on(self): 81 | return self._box3d_on 82 | 83 | @property 84 | def depth_on(self): 85 | return self._depth_on 86 | 87 | @property 88 | def has_dense_prediction_task(self): 89 | return any([task.is_dense_prediction_task for task in self.tasks]) 90 | 91 | @property 92 | def has_detection_task(self): 93 | return any([task.is_detection_task for task in self.tasks]) 94 | 95 | @property 96 | def task_names(self): 97 | return [task.name for task in self.tasks] 98 | -------------------------------------------------------------------------------- /tridet/utils/tensor2d.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Toyota Research Institute. All rights reserved. 2 | import torch 3 | import torch.nn.functional as F 4 | 5 | 6 | def compute_features_locations(h, w, stride, dtype=torch.float32, device='cpu', offset="none"): 7 | """Adapted from AdelaiDet: 8 | https://github.com/aim-uofa/AdelaiDet/blob/master/adet/utils/comm.py 9 | 10 | Key differnece: offset is configurable. 11 | """ 12 | shifts_x = torch.arange(0, w * stride, step=stride, dtype=dtype, device=device) 13 | shifts_y = torch.arange(0, h * stride, step=stride, dtype=dtype, device=device) 14 | shift_y, shift_x = torch.meshgrid(shifts_y, shifts_x) 15 | shift_x = shift_x.reshape(-1) 16 | shift_y = shift_y.reshape(-1) 17 | # (dennis.park) 18 | # locations = torch.stack((shift_x, shift_y), dim=1) + stride // 2 19 | locations = torch.stack((shift_x, shift_y), dim=1) 20 | if offset == "half": 21 | locations += stride // 2 22 | else: 23 | assert offset == "none" 24 | 25 | return locations 26 | 27 | 28 | def aligned_bilinear(tensor, factor, offset="none"): 29 | """Adapted from AdelaiDet: 30 | https://github.com/aim-uofa/AdelaiDet/blob/master/adet/utils/comm.py 31 | """ 32 | assert tensor.dim() == 4 33 | assert factor >= 1 34 | assert int(factor) == factor 35 | 36 | if factor == 1: 37 | return tensor 38 | 39 | h, w = tensor.size()[2:] 40 | tensor = F.pad(tensor, pad=(0, 1, 0, 1), mode="replicate") 41 | oh = factor * h + 1 42 | ow = factor * w + 1 43 | tensor = F.interpolate(tensor, size=(oh, ow), mode='bilinear', align_corners=True) 44 | if offset == "half": 45 | tensor = F.pad(tensor, pad=(factor // 2, 0, factor // 2, 0), mode="replicate") 46 | 47 | return tensor[:, :, :oh - 1, :ow - 1] 48 | -------------------------------------------------------------------------------- /tridet/utils/train.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Toyota Research Institute. All rights reserved. 
2 | import logging 3 | import os 4 | 5 | from tabulate import tabulate 6 | from termcolor import colored 7 | 8 | from detectron2.utils.events import get_event_storage 9 | 10 | LOG = logging.getLogger(__name__) 11 | 12 | 13 | def get_inference_output_dir(dataset_name, is_last=False, use_tta=False, root_output_dir=None): 14 | if not root_output_dir: 15 | root_output_dir = os.getcwd() # hydra 16 | step = get_event_storage().iter 17 | if is_last: 18 | result_dirname = "final" 19 | else: 20 | result_dirname = f"step{step:07d}" 21 | if use_tta: 22 | result_dirname += "-tta" 23 | output_dir = os.path.join(root_output_dir, "inference", result_dirname, dataset_name) 24 | return output_dir 25 | 26 | 27 | def print_test_results(test_results): 28 | metric_table = tabulate( 29 | [(k, v) for k, v in test_results.items()], 30 | headers=["metric", "value"], 31 | tablefmt="pipe", 32 | numalign="left", 33 | stralign="left", 34 | ) 35 | LOG.info("Test results:\n" + colored(metric_table, "cyan")) 36 | -------------------------------------------------------------------------------- /tridet/utils/visualization.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | # Copyright 2021 Toyota Research Institute. All rights reserved. 3 | import colorsys 4 | import os 5 | 6 | import cv2 7 | import matplotlib.colors as mplc 8 | import numpy as np 9 | from PIL import Image, ImageDraw 10 | 11 | from tridet.utils.wandb import flatten_dict 12 | 13 | 14 | def fill_color_polygon(image, polygon, color, alpha=0.5): 15 | """Color the interior of a polygon with alpha-blending. This function modifies the input in place. 16 | """ 17 | _mask = Image.new('L', (image.shape[1], image.shape[0]), 0) 18 | ImageDraw.Draw(_mask).polygon(polygon, outline=1, fill=1) 19 | mask = np.array(_mask, dtype=bool) 20 | for c in range(3): 21 | channel = image[:, :, c] 22 | channel[mask] = channel[mask] * (1. - alpha) + color[c] * alpha 23 | 24 | 25 | def save_vis(np_arrays_dict, output_dir, filename, step=None): 26 | np_arrays_dict = flatten_dict(np_arrays_dict) 27 | npz_filename = os.path.join(output_dir, '' if step is None else f"step{step:06d}", filename) 28 | os.makedirs(os.path.dirname(npz_filename), exist_ok=True) 29 | np.savez_compressed(npz_filename, **np_arrays_dict) 30 | 31 | 32 | def change_color_brightness(color, brightness_factor): 33 | """ 34 | Copied from detectron2.utils.visualizer.py 35 | ------------------------------------------- 36 | 37 | Depending on the brightness_factor, gives a lighter or darker color i.e. a color with 38 | less or more saturation than the original color. 39 | 40 | Args: 41 | color: color of the polygon. Refer to `matplotlib.colors` for a full list of 42 | formats that are accepted. 43 | brightness_factor (float): a value in [-1.0, 1.0] range. A lightness factor of 44 | 0 will correspond to no change, a factor in [-1.0, 0) range will result in 45 | a darker color and a factor in (0, 1.0] range will result in a lighter color. 46 | 47 | Returns: 48 | modified_color (tuple[double]): a tuple containing the RGB values of the 49 | modified color. Each value in the tuple is in the [0.0, 1.0] range.
50 | """ 51 | assert brightness_factor >= -1.0 and brightness_factor <= 1.0 52 | color = mplc.to_rgb(color) 53 | polygon_color = colorsys.rgb_to_hls(*mplc.to_rgb(color)) 54 | modified_lightness = polygon_color[1] + (brightness_factor * polygon_color[1]) 55 | modified_lightness = 0.0 if modified_lightness < 0.0 else modified_lightness 56 | modified_lightness = 1.0 if modified_lightness > 1.0 else modified_lightness 57 | modified_color = colorsys.hls_to_rgb(polygon_color[0], modified_lightness, polygon_color[2]) 58 | return modified_color 59 | 60 | 61 | def draw_text(ax, text, position, *, font_size, color="g", horizontal_alignment="center", rotation=0): 62 | """ 63 | Copied from Visualizer.draw_text() 64 | ----------------------------------- 65 | 66 | Args: 67 | text (str): class label 68 | position (tuple): a tuple of the x and y coordinates to place text on image. 69 | font_size (int, optional): font of the text. If not provided, a font size 70 | proportional to the image width is calculated and used. 71 | color: color of the text. Refer to `matplotlib.colors` for full list 72 | of formats that are accepted. 73 | horizontal_alignment (str): see `matplotlib.text.Text` 74 | rotation: rotation angle in degrees CCW 75 | 76 | Returns: 77 | output (VisImage): image object with text drawn. 78 | """ 79 | # since the text background is dark, we don't want the text to be dark 80 | color = np.maximum(list(mplc.to_rgb(color)), 0.2) 81 | color[np.argmax(color)] = max(0.8, np.max(color)) 82 | 83 | x, y = position 84 | ax.text( 85 | x, 86 | y, 87 | text, 88 | size=font_size, 89 | family="sans-serif", 90 | bbox={ 91 | "facecolor": "black", 92 | "alpha": 0.8, 93 | "pad": 0.7, 94 | "edgecolor": "none" 95 | }, 96 | verticalalignment="top", 97 | horizontalalignment=horizontal_alignment, 98 | color=color, 99 | zorder=10, 100 | rotation=rotation, 101 | ) 102 | return ax 103 | 104 | 105 | def float_to_uint8_color(float_clr): 106 | assert all([c >= 0. for c in float_clr]) 107 | assert all([c <= 1. for c in float_clr]) 108 | return [int(c * 255.) for c in float_clr] 109 | 110 | 111 | def mosaic(items, scale=1.0, pad=3, grid_width=None): 112 | """Creates a mosaic from list of images. 113 | 114 | Parameters 115 | ---------- 116 | items: list of np.ndarray 117 | List of images to mosaic. 118 | 119 | scale: float, default=1.0 120 | Scale factor applied to images. scale > 1.0 enlarges images. 121 | 122 | pad: int, default=3 123 | Padding size of the images before mosaic 124 | 125 | grid_width: int, default=None 126 | Mosaic width or grid width of the mosaic 127 | 128 | Returns 129 | ------- 130 | image: np.array of shape (H, W, 3) 131 | Image mosaic 132 | """ 133 | # Determine tile width and height 134 | N = len(items) 135 | assert N > 0, 'No items to mosaic!' 136 | grid_width = grid_width if grid_width else np.ceil(np.sqrt(N)).astype(int) 137 | grid_height = np.ceil(N * 1. 
/ grid_width).astype(int) 138 | input_size = items[0].shape[:2] 139 | target_shape = (int(input_size[1] * scale), int(input_size[0] * scale)) 140 | mosaic_items = [] 141 | for j in range(grid_width * grid_height): 142 | if j < N: 143 | # Resize every tile to the target shape, which is derived from 144 | # the shape of the first item and the `scale` factor. 145 | im = cv2.resize(items[j], dsize=target_shape) 146 | mosaic_items.append(im) 147 | else: 148 | mosaic_items.append(np.zeros_like(mosaic_items[-1])) 149 | 150 | # Stack W tiles horizontally first, then vertically 151 | im_pad = lambda im: cv2.copyMakeBorder(im, pad, pad, pad, pad, cv2.BORDER_CONSTANT, 0) 152 | mosaic_items = [im_pad(im) for im in mosaic_items] 153 | hstack = [np.hstack(mosaic_items[j:j + grid_width]) for j in range(0, len(mosaic_items), grid_width)] 154 | mosaic_viz = np.vstack(hstack) if len(hstack) > 1 \ 155 | else hstack[0] 156 | return mosaic_viz 157 | -------------------------------------------------------------------------------- /tridet/utils/wandb.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Toyota Research Institute. All rights reserved. 2 | import logging 3 | import os 4 | from collections import OrderedDict 5 | from collections.abc import Mapping 6 | from functools import wraps 7 | 8 | import wandb 9 | from detectron2.utils.events import get_event_storage 10 | from omegaconf import OmegaConf 11 | 12 | from tridet.utils.comm import broadcast_from_master, master_only 13 | 14 | LOG = logging.getLogger(__name__) 15 | 16 | 17 | def wandb_credential_is_available(): 18 | if os.environ.get('WANDB_API_KEY', None): 19 | return True 20 | else: 21 | return False 22 | 23 | 24 | @master_only 25 | def init_wandb(cfg): 26 | if not wandb_credential_is_available(): 27 | LOG.warning( 28 | "W&B credential must be defined in environment variables. " 29 | "Use `WANDB.ENABLED=False` to suppress this warning. " 30 | "Skipping `init_wandb`..."
31 | ) 32 | return 33 | 34 | if cfg.WANDB.DRYRUN: 35 | os.environ['WANDB_MODE'] = 'dryrun' 36 | 37 | _cfg = cfg.copy() 38 | del _cfg.hydra 39 | cfg_as_dict = OmegaConf.to_container(_cfg, resolve=True) 40 | wandb.init(project=cfg.WANDB.PROJECT, config=cfg_as_dict, tags=cfg.WANDB.TAGS, group=cfg.WANDB.GROUP) 41 | 42 | 43 | def wandb_is_initialized(): 44 | try: 45 | wandb.run.id # pylint: disable=pointless-statement 46 | initialized = True 47 | except AttributeError: 48 | initialized = False 49 | return initialized 50 | 51 | 52 | def if_wandb_initialized(fn): 53 | @wraps(fn) 54 | def wrapped_fn(*args, **kwargs): 55 | if wandb_is_initialized(): 56 | return fn(*args, **kwargs) 57 | else: 58 | return None 59 | 60 | return wrapped_fn 61 | 62 | 63 | @broadcast_from_master 64 | def derive_output_dir_from_wandb_id(cfg): 65 | assert wandb_is_initialized() 66 | wandb_run_dir = wandb.run.dir 67 | if wandb_run_dir.endswith('/files'): # wandb 0.10.x 68 | wandb_run_dir = wandb_run_dir[:-6] 69 | datetime_str, wandb_run_id = wandb_run_dir.split('-')[-2:] 70 | assert wandb_run_id == wandb.run.id 71 | 72 | output_dir = os.path.join(cfg.OUTPUT_ROOT, '-'.join([wandb_run_id, datetime_str])) 73 | return output_dir 74 | 75 | 76 | @master_only 77 | @if_wandb_initialized 78 | def log_nested_dict(dikt): 79 | storage = get_event_storage() 80 | step = storage.iter 81 | 82 | wandb.log(flatten_dict(dikt), step=step) 83 | 84 | 85 | def flatten_dict(results): 86 | """ 87 | Almost identical to detectron2.evaluation.testing:flatten_result_dict()', but using 'OrderedDict' 88 | -------------------------------------------------------------------------------------------------- 89 | 90 | Expand a hierarchical dict of scalars into a flat dict of scalars. 91 | If results[k1][k2][k3] = v, the returned dict will have the entry 92 | {"k1/k2/k3": v}. 93 | 94 | Args: 95 | results (dict): 96 | """ 97 | r = OrderedDict() 98 | for k, v in results.items(): 99 | k = str(k) 100 | if isinstance(v, Mapping): 101 | v = flatten_dict(v) 102 | for kk, vv in v.items(): 103 | r[k + "/" + kk] = vv 104 | else: 105 | r[k] = v 106 | return r 107 | -------------------------------------------------------------------------------- /tridet/visualizers/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Toyota Research Institute. All rights reserved. 
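#
# Usage sketch for the factory functions defined below; `cfg` is the resolved Hydra config
# and the dataset name is hypothetical. Any visualizer name other than "d2_visualizer" or
# "box3d_visualizer" raises a ValueError.
#
#     viz = get_dataloader_visualizer(cfg, "d2_visualizer", "kitti_3d_train")
#     viz_images = viz.visualize(sample)  # one dataloader sample -> dict of RGB visualizations
#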
2 | from detectron2.data.catalog import DatasetCatalog, MetadataCatalog 3 | 4 | from tridet.visualizers.box3d_visualizer import Box3DDataloaderVisualizer, Box3DPredictionVisualizer 5 | from tridet.visualizers.d2_visualizer import D2DataloaderVisualizer, D2PredictionVisualizer 6 | 7 | 8 | def get_predictions_visualizer(cfg, visualizer_name, dataset_name, inference_output_dir): 9 | if visualizer_name == 'd2_visualizer': 10 | return D2PredictionVisualizer(cfg, dataset_name, inference_output_dir) 11 | elif visualizer_name == "box3d_visualizer": 12 | return Box3DPredictionVisualizer(cfg, dataset_name, inference_output_dir) 13 | else: 14 | raise ValueError(f"Invalid visualizer: {visualizer_name}") 15 | 16 | 17 | def get_dataloader_visualizer(cfg, visualizer_name, dataset_name): 18 | if visualizer_name == 'd2_visualizer': 19 | return D2DataloaderVisualizer(cfg, dataset_name) 20 | elif visualizer_name == "box3d_visualizer": 21 | return Box3DDataloaderVisualizer(cfg, dataset_name) 22 | else: 23 | raise ValueError(f"Invalid visualizer: {visualizer_name}") 24 | -------------------------------------------------------------------------------- /tridet/visualizers/d2_visualizer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | # Copyright 2021 Toyota Research Institute. All rights reserved. 3 | import json 4 | import logging 5 | import os 6 | from collections import OrderedDict, defaultdict 7 | 8 | import numpy as np 9 | import torch 10 | 11 | from detectron2.data import DatasetCatalog, MetadataCatalog 12 | from detectron2.data import detection_utils as d2_utils 13 | from detectron2.structures import Boxes, BoxMode, Instances 14 | from detectron2.utils.visualizer import ColorMode, Visualizer 15 | 16 | DETECTION_RESULT_FILE = "coco_instances_results.json" 17 | SEMSEG_RESULT_FILE = "sem_seg_predictions.json" 18 | 19 | D2_COLORMODE_MAPPING = { 20 | "image": ColorMode.IMAGE, 21 | "segm": ColorMode.SEGMENTATION, 22 | "image_bw": ColorMode.IMAGE_BW, 23 | } 24 | 25 | LOG = logging.getLogger(__name__) 26 | 27 | 28 | def get_tasks_from_cfg(cfg): 29 | tasks = [] 30 | if cfg.MODEL.BOX2D_ON: 31 | tasks.append('bbox2d') 32 | assert len(tasks) > 0, "Empty task." 33 | return tasks 34 | 35 | 36 | def create_instances(predictions, image_size, score_threshold, metadata, score_key="score"): 37 | ret = Instances(image_size) 38 | 39 | # score = np.asarray([x["score"] for x in predictions]) 40 | score = np.asarray([x[score_key] for x in predictions]) 41 | chosen = (score > score_threshold).nonzero()[0] 42 | score = score[chosen] 43 | bbox = np.asarray([predictions[i]["bbox"] for i in chosen]).reshape(-1, 4) 44 | bbox = BoxMode.convert(bbox, BoxMode.XYWH_ABS, BoxMode.XYXY_ABS) 45 | 46 | # dataset_id_map = metadata.thing_dataset_id_to_contiguous_id 47 | if not hasattr(metadata, 'thing_dataset_id_to_contiguous_id'): 48 | # (dennis.park) Assume the `category_id` is already a contiguous IDs starting at 0. 
49 | dataset_id_map = {idx: idx for idx, _ in enumerate(metadata.thing_classes)} 50 | else: 51 | dataset_id_map = metadata.thing_dataset_id_to_contiguous_id 52 | labels = np.asarray([dataset_id_map[predictions[i]["category_id"]] for i in chosen]) 53 | 54 | ret.scores = score 55 | ret.pred_boxes = Boxes(bbox) 56 | ret.pred_classes = labels 57 | 58 | # Add bbox3d 59 | try: 60 | ret.pred_boxes3d = torch.as_tensor([predictions[i]["bbox3d"] for i in chosen]) 61 | except KeyError: 62 | pass 63 | return ret 64 | 65 | 66 | class D2PredictionVisualizer(): 67 | """ 68 | Adapted from detectron2: 69 | detectron2.utils.visualizer 70 | 71 | Key difference: load inference results on disk generated by COCOEvaluator 72 | """ 73 | def __init__(self, cfg, dataset_name, inference_output_dir): 74 | self._metadata = MetadataCatalog.get(dataset_name) 75 | self._input_format = cfg.INPUT.FORMAT 76 | self._scale = cfg.VIS.D2.PREDICTIONS.SCALE 77 | self._d2_viz_color_mode = D2_COLORMODE_MAPPING[cfg.VIS.D2.PREDICTIONS.COLOR_MODE] 78 | 79 | tasks = get_tasks_from_cfg(cfg) 80 | dataset_dicts = DatasetCatalog.get(dataset_name) 81 | 82 | # Per-image predicted instances 83 | self.pred_instances_by_image = None 84 | if "bbox2d" in tasks: 85 | with open(os.path.join(inference_output_dir, DETECTION_RESULT_FILE), 'r') as f: 86 | instance_predictions = json.load(f) 87 | 88 | pred_instances_by_image = defaultdict(list) 89 | for p in instance_predictions: 90 | # 'p' is key'ed by 'image_id'. 91 | image_id = p['image_id'] 92 | pred_instances_by_image[image_id].append(p) 93 | 94 | # det2d_threshold = cfg.VIS.PREDICTIONS.DET2D_THRESHOLD 95 | det2d_threshold = cfg.VIS.D2.PREDICTIONS.THRESHOLD 96 | # This handles images with no predictions. 97 | for dataset_dict in dataset_dicts: 98 | image_id = dataset_dict['image_id'] 99 | img_shape = (dataset_dict['height'], dataset_dict['width']) 100 | pred_instances_by_image[image_id] = create_instances( 101 | pred_instances_by_image[image_id], img_shape, det2d_threshold, self._metadata 102 | ) 103 | 104 | self.pred_instances_by_image = pred_instances_by_image 105 | LOG.info( 106 | f"Found 2D detection predictions (bbox2d and/or mask2d) for {len(pred_instances_by_image)} images." 107 | ) 108 | 109 | def visualize(self, x): 110 | """ 111 | Parameters 112 | ---------- 113 | x: Dict 114 | One 'dataset_dict'. 115 | 116 | Returns 117 | ------- 118 | viz_images: Dict[np.array] 119 | Visualizations as RGB images. 120 | """ 121 | # Load image. 122 | img = d2_utils.read_image(x["file_name"], format=self._input_format) 123 | img = d2_utils.convert_image_to_rgb(img, self._input_format) 124 | 125 | viz_images = OrderedDict() 126 | 127 | # d2 groundtruth instances viz (2D box, mask, keypoints) 128 | if 'annotations' in x: 129 | # Visualizer.draw_datset_dict() renders various types of annotations. 130 | # But here we only use its capability to render *instance() annotations. 
131 | _x = {'annotations': x['annotations']} 132 | viz = Visualizer(img, self._metadata, scale=self._scale, instance_mode=self._d2_viz_color_mode) 133 | viz_image = viz.draw_dataset_dict(_x).get_image() 134 | viz_images["viz_gt_instances_d2"] = viz_image 135 | 136 | # d2 instance predictions viz (2D box, mask, keypoints) 137 | if self.pred_instances_by_image is not None: 138 | pred_instances = self.pred_instances_by_image[x['image_id']] 139 | viz = Visualizer(img, self._metadata, scale=self._scale, instance_mode=self._d2_viz_color_mode) 140 | viz_image = viz.draw_instance_predictions(pred_instances).get_image() 141 | viz_images["viz_pred_instance_d2"] = viz_image 142 | 143 | return viz_images 144 | 145 | 146 | def draw_gt_instances_d2(gt_instances, img, metadata, scale, instance_mode): 147 | """Wrapper of D2's 'Visualizer.draw_instance_predictions()' to render GT instances. 148 | """ 149 | # Rename instance fields to work with Visualizer.draw_instance_predictions() of detectron2. 150 | field_remapping = { 151 | 'gt_boxes': 'pred_boxes', 152 | 'gt_classes': 'pred_classes', 153 | } 154 | fields = {} 155 | for k, v in gt_instances._fields.items(): 156 | new_k = field_remapping.get(k, None) 157 | k = new_k or k 158 | fields[k] = v 159 | 160 | instances = Instances(image_size=gt_instances._image_size, **fields) 161 | viz = Visualizer(img, metadata, scale=scale, instance_mode=instance_mode) 162 | viz_image = viz.draw_instance_predictions(instances).get_image() 163 | return viz_image 164 | 165 | 166 | class D2DataloaderVisualizer(): 167 | def __init__(self, cfg, dataset_name): 168 | self._metadata = MetadataCatalog.get(dataset_name) 169 | self._input_format = cfg.INPUT.FORMAT 170 | self._scale = cfg.VIS.D2.DATALOADER.SCALE 171 | self._d2_viz_color_mode = D2_COLORMODE_MAPPING[cfg.VIS.D2.DATALOADER.COLOR_MODE] 172 | 173 | def visualize(self, x): 174 | # Assumption: dataloader produce CHW images. 175 | img = d2_utils.convert_image_to_rgb(x['image'].permute(1, 2, 0), self._input_format) 176 | 177 | viz_images = OrderedDict() 178 | 179 | # d2 instance viz (2D box, mask, keypoints) 180 | gt_instances = x['instances'] 181 | viz_image = draw_gt_instances_d2(gt_instances, img, self._metadata, self._scale, self._d2_viz_color_mode) 182 | viz_images['viz_gt_instances_d2'] = viz_image 183 | 184 | return viz_images 185 | --------------------------------------------------------------------------------
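A minimal sketch of how `create_instances` from tridet/visualizers/d2_visualizer.py consumes COCO-style prediction dicts, assuming detectron2 and this repository are importable; the class names, boxes, scores, image size, and threshold below are made up for illustration.

    from types import SimpleNamespace
    from tridet.visualizers.d2_visualizer import create_instances

    # Metadata without `thing_dataset_id_to_contiguous_id`, so category ids are used as-is.
    metadata = SimpleNamespace(thing_classes=["Car", "Pedestrian"])
    predictions = [
        {"image_id": 0, "category_id": 0, "bbox": [10.0, 20.0, 50.0, 30.0], "score": 0.9},
        {"image_id": 0, "category_id": 1, "bbox": [5.0, 5.0, 15.0, 40.0], "score": 0.2},
    ]
    instances = create_instances(predictions, (375, 1242), score_threshold=0.5, metadata=metadata)
    # Only the first prediction survives the threshold; its box is converted from
    # XYWH_ABS to XYXY_ABS and stored in `instances.pred_boxes`.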