├── requirements.txt ├── .gitignore ├── models ├── ops │ ├── MultiScaleDeformableAttention.egg-info │ │ ├── dependency_links.txt │ │ ├── top_level.txt │ │ ├── PKG-INFO │ │ └── SOURCES.txt │ ├── build │ │ ├── temp.linux-x86_64-cpython-37 │ │ │ ├── .ninja_deps │ │ │ ├── mnt │ │ │ │ └── dolphinfs │ │ │ │ │ └── hdd_pool │ │ │ │ │ └── docker │ │ │ │ │ └── user │ │ │ │ │ └── hadoop-vacv │ │ │ │ │ └── yanfeng │ │ │ │ │ └── project │ │ │ │ │ └── MOTRv2 │ │ │ │ │ └── MOTRv3 │ │ │ │ │ └── models │ │ │ │ │ └── ops │ │ │ │ │ └── src │ │ │ │ │ ├── vision.o │ │ │ │ │ ├── cpu │ │ │ │ │ └── ms_deform_attn_cpu.o │ │ │ │ │ └── cuda │ │ │ │ │ └── ms_deform_attn_cuda.o │ │ │ ├── .ninja_log │ │ │ └── build.ninja │ │ ├── lib.linux-x86_64-3.8 │ │ │ ├── MultiScaleDeformableAttention.cpython-38-x86_64-linux-gnu.so │ │ │ ├── modules │ │ │ │ ├── __init__.py │ │ │ │ └── ms_deform_attn.py │ │ │ └── functions │ │ │ │ ├── __init__.py │ │ │ │ └── ms_deform_attn_func.py │ │ ├── lib.linux-x86_64-cpython-37 │ │ │ ├── MultiScaleDeformableAttention.cpython-37m-x86_64-linux-gnu.so │ │ │ ├── modules │ │ │ │ └── __init__.py │ │ │ └── functions │ │ │ │ ├── __init__.py │ │ │ │ └── ms_deform_attn_func.py │ │ └── temp.linux-x86_64-3.8 │ │ │ └── mnt │ │ │ └── dolphinfs │ │ │ └── hdd_pool │ │ │ └── docker │ │ │ └── user │ │ │ └── hadoop-vacv │ │ │ └── yanfeng │ │ │ └── project │ │ │ └── MOTRv2 │ │ │ └── CO-MOT │ │ │ └── models │ │ │ └── ops │ │ │ └── src │ │ │ ├── vision.o │ │ │ ├── cpu │ │ │ └── ms_deform_attn_cpu.o │ │ │ └── cuda │ │ │ └── ms_deform_attn_cuda.o │ ├── MultiScaleDeformableAttention.cpython-37m-x86_64-linux-gnu.so │ ├── MultiScaleDeformableAttention.cpython-38-x86_64-linux-gnu.so │ ├── dist │ │ └── MultiScaleDeformableAttention-1.0-py3.7-linux-x86_64.egg │ ├── make.sh │ ├── modules │ │ ├── __init__.py │ │ └── ms_deform_attn.py │ ├── functions │ │ ├── __init__.py │ │ └── ms_deform_attn_func.py │ ├── src │ │ ├── vision.cpp │ │ ├── cuda │ │ │ └── ms_deform_attn_cuda.h │ │ ├── cpu │ │ │ ├── ms_deform_attn_cpu.h │ │ │ └── ms_deform_attn_cpu.cpp │ │ └── ms_deform_attn.h │ ├── setup.py │ └── test.py ├── structures │ └── __init__.py ├── dino │ ├── __init__.py │ ├── ops │ │ ├── modules │ │ │ ├── __init__.py │ │ │ └── ms_deform_attn.py │ │ ├── functions │ │ │ ├── __init__.py │ │ │ └── ms_deform_attn_func.py │ │ ├── make.sh │ │ ├── src │ │ │ ├── vision.cpp │ │ │ ├── cuda │ │ │ │ └── ms_deform_attn_cuda.h │ │ │ ├── cpu │ │ │ │ ├── ms_deform_attn_cpu.h │ │ │ │ └── ms_deform_attn_cpu.cpp │ │ │ └── ms_deform_attn.h │ │ ├── setup.py │ │ └── test.py │ ├── position_encoding.py │ └── utils.py ├── registry.py ├── __init__.py ├── yolo_fpn.py ├── losses.py ├── yolo_pafpn.py ├── position_encoding.py ├── memory_bank.py ├── yolox.py ├── darknet.py └── network_blocks.py ├── tools ├── show_user_using_nvidia.sh ├── copy_back.sh ├── debug.sh ├── simple_inference.sh ├── simplebdd_inference.sh ├── simplemot_inference.sh ├── batch_diff.py ├── merge_dance_tracklets.sh ├── resume.sh ├── run_dist_launch.sh ├── eval_dance.sh ├── run_dist_slurm.sh ├── make_detdb.py ├── coco_evel.py ├── train.sh ├── clip_train.py ├── merge_dance_tracklets.py ├── train_ddp.sh ├── similarity_analysis.py └── visualize_tao.py ├── datasets ├── alignment.txt ├── __init__.py ├── panoptic_eval.py ├── data_prefetcher.py └── samplers.py ├── util ├── json_parser.py ├── __init__.py ├── checkpoint.py ├── box_ops.py ├── tool.py └── plot_utils.py ├── configs └── motrv2ch_uni5cost3ggoon.args └── .vscode └── launch.json /requirements.txt: 
-------------------------------------------------------------------------------- 1 | tqdm 2 | scipy 3 | opencv-python 4 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | tracker*/ 2 | exps 3 | __pycache__ 4 | tmp 5 | checkpoints -------------------------------------------------------------------------------- /models/ops/MultiScaleDeformableAttention.egg-info/dependency_links.txt: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /models/ops/MultiScaleDeformableAttention.egg-info/top_level.txt: -------------------------------------------------------------------------------- 1 | MultiScaleDeformableAttention 2 | functions 3 | modules 4 | -------------------------------------------------------------------------------- /models/ops/build/temp.linux-x86_64-cpython-37/.ninja_deps: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BingfengYan/CO-MOT/HEAD/models/ops/build/temp.linux-x86_64-cpython-37/.ninja_deps -------------------------------------------------------------------------------- /models/ops/MultiScaleDeformableAttention.cpython-37m-x86_64-linux-gnu.so: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BingfengYan/CO-MOT/HEAD/models/ops/MultiScaleDeformableAttention.cpython-37m-x86_64-linux-gnu.so -------------------------------------------------------------------------------- /models/ops/MultiScaleDeformableAttention.cpython-38-x86_64-linux-gnu.so: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BingfengYan/CO-MOT/HEAD/models/ops/MultiScaleDeformableAttention.cpython-38-x86_64-linux-gnu.so -------------------------------------------------------------------------------- /models/ops/dist/MultiScaleDeformableAttention-1.0-py3.7-linux-x86_64.egg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BingfengYan/CO-MOT/HEAD/models/ops/dist/MultiScaleDeformableAttention-1.0-py3.7-linux-x86_64.egg -------------------------------------------------------------------------------- /tools/show_user_using_nvidia.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | pids=$(fuser -v /dev/nvidia* | cut -d' ' -f3- | tr ' ' '\n' | sort -u) 4 | for pid in $pids 5 | do 6 | echo "PID: $pid CWD: $(readlink /proc/$pid/cwd)" 7 | done -------------------------------------------------------------------------------- /models/ops/build/lib.linux-x86_64-3.8/MultiScaleDeformableAttention.cpython-38-x86_64-linux-gnu.so: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BingfengYan/CO-MOT/HEAD/models/ops/build/lib.linux-x86_64-3.8/MultiScaleDeformableAttention.cpython-38-x86_64-linux-gnu.so -------------------------------------------------------------------------------- /models/ops/build/lib.linux-x86_64-cpython-37/MultiScaleDeformableAttention.cpython-37m-x86_64-linux-gnu.so: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/BingfengYan/CO-MOT/HEAD/models/ops/build/lib.linux-x86_64-cpython-37/MultiScaleDeformableAttention.cpython-37m-x86_64-linux-gnu.so -------------------------------------------------------------------------------- /datasets/alignment.txt: -------------------------------------------------------------------------------- 1 | Describe this image in detail. 2 | Take a look at this image and describe what you notice. 3 | Please provide a detailed description of the picture. 4 | Could you describe the contents of this image for me? -------------------------------------------------------------------------------- /util/json_parser.py: -------------------------------------------------------------------------------- 1 | import json 2 | import sys 3 | 4 | def parse(json_str, key): 5 | str_dict = json.loads(json_str) 6 | val = str_dict[key] 7 | if type(val)==list: 8 | return ",".join(val) 9 | else: 10 | return val 11 | 12 | if __name__ == '__main__': 13 | print(parse(sys.argv[1], sys.argv[2])) -------------------------------------------------------------------------------- /models/ops/build/temp.linux-x86_64-3.8/mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/yanfeng/project/MOTRv2/CO-MOT/models/ops/src/vision.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BingfengYan/CO-MOT/HEAD/models/ops/build/temp.linux-x86_64-3.8/mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/yanfeng/project/MOTRv2/CO-MOT/models/ops/src/vision.o -------------------------------------------------------------------------------- /models/ops/build/temp.linux-x86_64-cpython-37/mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/yanfeng/project/MOTRv2/MOTRv3/models/ops/src/vision.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BingfengYan/CO-MOT/HEAD/models/ops/build/temp.linux-x86_64-cpython-37/mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/yanfeng/project/MOTRv2/MOTRv3/models/ops/src/vision.o -------------------------------------------------------------------------------- /tools/copy_back.sh: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Copyright (c) 2022 megvii-research. All Rights Reserved. 3 | # ------------------------------------------------------------------------ 4 | 5 | 6 | set -x 7 | 8 | cp $1/*.py . 
9 | cp $1/models/*.py models 10 | cp $1/datasets/*.py datasets 11 | -------------------------------------------------------------------------------- /models/ops/build/temp.linux-x86_64-3.8/mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/yanfeng/project/MOTRv2/CO-MOT/models/ops/src/cpu/ms_deform_attn_cpu.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BingfengYan/CO-MOT/HEAD/models/ops/build/temp.linux-x86_64-3.8/mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/yanfeng/project/MOTRv2/CO-MOT/models/ops/src/cpu/ms_deform_attn_cpu.o -------------------------------------------------------------------------------- /models/ops/build/temp.linux-x86_64-3.8/mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/yanfeng/project/MOTRv2/CO-MOT/models/ops/src/cuda/ms_deform_attn_cuda.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BingfengYan/CO-MOT/HEAD/models/ops/build/temp.linux-x86_64-3.8/mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/yanfeng/project/MOTRv2/CO-MOT/models/ops/src/cuda/ms_deform_attn_cuda.o -------------------------------------------------------------------------------- /models/ops/build/temp.linux-x86_64-cpython-37/mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/yanfeng/project/MOTRv2/MOTRv3/models/ops/src/cpu/ms_deform_attn_cpu.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BingfengYan/CO-MOT/HEAD/models/ops/build/temp.linux-x86_64-cpython-37/mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/yanfeng/project/MOTRv2/MOTRv3/models/ops/src/cpu/ms_deform_attn_cpu.o -------------------------------------------------------------------------------- /models/ops/build/temp.linux-x86_64-cpython-37/mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/yanfeng/project/MOTRv2/MOTRv3/models/ops/src/cuda/ms_deform_attn_cuda.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BingfengYan/CO-MOT/HEAD/models/ops/build/temp.linux-x86_64-cpython-37/mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/yanfeng/project/MOTRv2/MOTRv3/models/ops/src/cuda/ms_deform_attn_cuda.o -------------------------------------------------------------------------------- /models/ops/MultiScaleDeformableAttention.egg-info/PKG-INFO: -------------------------------------------------------------------------------- 1 | Metadata-Version: 2.1 2 | Name: MultiScaleDeformableAttention 3 | Version: 1.0 4 | Summary: PyTorch Wrapper for CUDA Functions of Multi-Scale Deformable Attention 5 | Home-page: https://github.com/fundamentalvision/Deformable-DETR 6 | Author: Weijie Su 7 | License: UNKNOWN 8 | Platform: UNKNOWN 9 | 10 | UNKNOWN 11 | 12 | -------------------------------------------------------------------------------- /tools/debug.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # ------------------------------------------------------------------------ 3 | # Copyright (c) 2022 megvii-research. All Rights Reserved. 
4 | # ------------------------------------------------------------------------ 5 | 6 | 7 | set -x 8 | 9 | args=$(cat $1) 10 | 11 | export CUDA_LAUNCH_BLOCKING=1 12 | python main.py ${args} --output_dir /tmp/clip_mot_v2 13 | -------------------------------------------------------------------------------- /tools/simple_inference.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # ------------------------------------------------------------------------ 3 | # Copyright (c) 2022 megvii-research. All Rights Reserved. 4 | # ------------------------------------------------------------------------ 5 | 6 | 7 | set -x 8 | set -o pipefail 9 | 10 | # args=$(cat configs/motrv2.args) 11 | args=$(cat $1) 12 | python3 submit_dance.py ${args} --exp_name tracker --resume $2 $3 13 | -------------------------------------------------------------------------------- /tools/simplebdd_inference.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # ------------------------------------------------------------------------ 3 | # Copyright (c) 2022 megvii-research. All Rights Reserved. 4 | # ------------------------------------------------------------------------ 5 | 6 | 7 | set -x 8 | set -o pipefail 9 | 10 | # args=$(cat configs/motrv2.args) 11 | args=$(cat $1) 12 | python3 submit_bdd.py ${args} --exp_name tracker --resume $2 $3 13 | -------------------------------------------------------------------------------- /tools/simplemot_inference.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # ------------------------------------------------------------------------ 3 | # Copyright (c) 2022 megvii-research. All Rights Reserved. 4 | # ------------------------------------------------------------------------ 5 | 6 | 7 | set -x 8 | set -o pipefail 9 | 10 | # args=$(cat configs/motrv2.args) 11 | args=$(cat $1) 12 | python3 submit_mot.py ${args} --exp_name tracker --resume $2 $3 13 | -------------------------------------------------------------------------------- /models/structures/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Modified from Detectron2 (https://github.com/facebookresearch/detectron2) 3 | # Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved 4 | # ------------------------------------------------------------------------ 5 | from .boxes import Boxes, BoxMode, pairwise_iou, pairwise_ioa, matched_boxlist_iou 6 | from .instances import Instances 7 | 8 | __all__ = [k for k in globals().keys() if not k.startswith("_")] -------------------------------------------------------------------------------- /configs/motrv2ch_uni5cost3ggoon.args: -------------------------------------------------------------------------------- 1 | --meta_arch motr_unincost 2 | --dataset_file e2e_dance 3 | --epoch 20 4 | --with_box_refine 5 | --lr_drop 8 6 | --lr 2e-4 7 | --lr_backbone 2e-5 8 | --pretrained xx/checkpoint0019.pth 9 | --batch_size 1 10 | --sample_mode random_interval 11 | --sample_interval 10 12 | --sampler_lengths 5 13 | --merger_dropout 0 14 | --dropout 0 15 | --random_drop 0.1 16 | --fp_ratio 0.3 17 | --query_interaction_layer GQIM 18 | --num_queries 60 19 | --append_crowd 20 | --use_checkpoint 21 | --mot_path xxx/data/ 22 | --match_type gmatch 23 | --g_size 3 24 | -------------------------------------------------------------------------------- /models/dino/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Conditional DETR 3 | # Copyright (c) 2021 Microsoft. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------ 6 | # Copied from DETR (https://github.com/facebookresearch/detr) 7 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 8 | # ------------------------------------------------------------------------ 9 | 10 | from .dino import build_dino 11 | -------------------------------------------------------------------------------- /models/ops/make.sh: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | python setup.py build install 10 | -------------------------------------------------------------------------------- /models/dino/ops/modules/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | from .ms_deform_attn import MSDeformAttn 10 | -------------------------------------------------------------------------------- /models/dino/ops/functions/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | from .ms_deform_attn_func import MSDeformAttnFunction 10 | 11 | -------------------------------------------------------------------------------- /util/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Copyright (c) 2022 megvii-research. All Rights Reserved. 3 | # ------------------------------------------------------------------------ 4 | # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR) 5 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 6 | # ------------------------------------------------------------------------ 7 | # Modified from DETR (https://github.com/facebookresearch/detr) 8 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 9 | # ------------------------------------------------------------------------ 10 | 11 | -------------------------------------------------------------------------------- /tools/batch_diff.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Copyright (c) 2022 megvii-research. All Rights Reserved. 3 | # ------------------------------------------------------------------------ 4 | 5 | 6 | import argparse 7 | from glob import glob 8 | from subprocess import run 9 | 10 | 11 | parser = argparse.ArgumentParser() 12 | parser.add_argument('src') 13 | parser.add_argument('dst') 14 | args = parser.parse_args() 15 | 16 | 17 | for src in glob(args.src+'/*/*.py') + glob(args.src+'/*.py'): 18 | dst = src.replace(args.src, args.dst) 19 | if run(['diff', src, dst]).returncode != 0: 20 | print('code --diff', src, dst) 21 | -------------------------------------------------------------------------------- /models/dino/ops/make.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # ------------------------------------------------------------------------------------------------ 3 | # Deformable DETR 4 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 
5 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | # ------------------------------------------------------------------------------------------------ 7 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | # ------------------------------------------------------------------------------------------------ 9 | 10 | 11 | # TORCH_CUDA_ARCH_LIST="8.0" CUDA_HOME='/path/to/your/cuda/dir' 12 | python setup.py build install 13 | -------------------------------------------------------------------------------- /models/ops/modules/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Copyright (c) 2022 megvii-research. All Rights Reserved. 3 | # ------------------------------------------------------------------------ 4 | # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR) 5 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 6 | # ------------------------------------------------------------------------ 7 | # Modified from DETR (https://github.com/facebookresearch/detr) 8 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 9 | # ------------------------------------------------------------------------ 10 | 11 | 12 | from .ms_deform_attn import MSDeformAttn -------------------------------------------------------------------------------- /models/ops/build/lib.linux-x86_64-3.8/modules/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Copyright (c) 2022 megvii-research. All Rights Reserved. 3 | # ------------------------------------------------------------------------ 4 | # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR) 5 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 6 | # ------------------------------------------------------------------------ 7 | # Modified from DETR (https://github.com/facebookresearch/detr) 8 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 9 | # ------------------------------------------------------------------------ 10 | 11 | 12 | from .ms_deform_attn import MSDeformAttn -------------------------------------------------------------------------------- /tools/merge_dance_tracklets.sh: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Copyright (c) 2022 megvii-research. All Rights Reserved. 
3 | # ------------------------------------------------------------------------ 4 | 5 | python tools/merge_dance_tracklets.py $1 $2 6 | 7 | # python3 ../TrackEval/scripts/run_mot_challenge.py \ 8 | # --SPLIT_TO_EVAL val \ 9 | # --METRICS HOTA \ 10 | # --GT_FOLDER /data/datasets/dancetrack/val \ 11 | # --SEQMAP_FILE seqmap \ 12 | # --SKIP_SPLIT_FOL True \ 13 | # --TRACKER_SUB_FOLDER tracker \ 14 | # --TRACKERS_TO_EVAL $2 \ 15 | # --USE_PARALLEL True \ 16 | # --NUM_PARALLEL_CORES 8 \ 17 | # --PLOT_CURVES False \ 18 | # --TRACKERS_FOLDER '' | tee -a $2/eval.log 19 | -------------------------------------------------------------------------------- /models/ops/build/lib.linux-x86_64-cpython-37/modules/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Copyright (c) 2022 megvii-research. All Rights Reserved. 3 | # ------------------------------------------------------------------------ 4 | # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR) 5 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 6 | # ------------------------------------------------------------------------ 7 | # Modified from DETR (https://github.com/facebookresearch/detr) 8 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 9 | # ------------------------------------------------------------------------ 10 | 11 | 12 | from .ms_deform_attn import MSDeformAttn -------------------------------------------------------------------------------- /models/ops/functions/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Copyright (c) 2022 megvii-research. All Rights Reserved. 3 | # ------------------------------------------------------------------------ 4 | # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR) 5 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 6 | # ------------------------------------------------------------------------ 7 | # Modified from DETR (https://github.com/facebookresearch/detr) 8 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 9 | # ------------------------------------------------------------------------ 10 | 11 | 12 | from .ms_deform_attn_func import MSDeformAttnFunction, ms_deform_attn_core_pytorch 13 | 14 | -------------------------------------------------------------------------------- /models/ops/build/lib.linux-x86_64-3.8/functions/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Copyright (c) 2022 megvii-research. All Rights Reserved. 3 | # ------------------------------------------------------------------------ 4 | # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR) 5 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 6 | # ------------------------------------------------------------------------ 7 | # Modified from DETR (https://github.com/facebookresearch/detr) 8 | # Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved 9 | # ------------------------------------------------------------------------ 10 | 11 | 12 | from .ms_deform_attn_func import MSDeformAttnFunction, ms_deform_attn_core_pytorch 13 | 14 | -------------------------------------------------------------------------------- /models/ops/build/lib.linux-x86_64-cpython-37/functions/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Copyright (c) 2022 megvii-research. All Rights Reserved. 3 | # ------------------------------------------------------------------------ 4 | # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR) 5 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 6 | # ------------------------------------------------------------------------ 7 | # Modified from DETR (https://github.com/facebookresearch/detr) 8 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 9 | # ------------------------------------------------------------------------ 10 | 11 | 12 | from .ms_deform_attn_func import MSDeformAttnFunction, ms_deform_attn_core_pytorch 13 | 14 | -------------------------------------------------------------------------------- /models/ops/src/vision.cpp: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | #include "ms_deform_attn.h" 12 | 13 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 14 | m.def("ms_deform_attn_forward", &ms_deform_attn_forward, "ms_deform_attn_forward"); 15 | m.def("ms_deform_attn_backward", &ms_deform_attn_backward, "ms_deform_attn_backward"); 16 | } 17 | -------------------------------------------------------------------------------- /models/dino/ops/src/vision.cpp: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | #include "ms_deform_attn.h" 12 | 13 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 14 | m.def("ms_deform_attn_forward", &ms_deform_attn_forward, "ms_deform_attn_forward"); 15 | m.def("ms_deform_attn_backward", &ms_deform_attn_backward, "ms_deform_attn_backward"); 16 | } 17 | -------------------------------------------------------------------------------- /models/ops/build/temp.linux-x86_64-cpython-37/.ninja_log: -------------------------------------------------------------------------------- 1 | # ninja log v5 2 | 8 50326 1682736456000000000 /mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/yanfeng/project/MOTRv2/MOTRv3/models/ops/build/temp.linux-x86_64-cpython-37/mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/yanfeng/project/MOTRv2/MOTRv3/models/ops/src/cpu/ms_deform_attn_cpu.o 1a7a04fa8aa332bc 3 | 14 91444 1682736491000000000 /mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/yanfeng/project/MOTRv2/MOTRv3/models/ops/build/temp.linux-x86_64-cpython-37/mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/yanfeng/project/MOTRv2/MOTRv3/models/ops/src/vision.o 781d7dd8aea58757 4 | 3 109768 1682736515000000000 /mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/yanfeng/project/MOTRv2/MOTRv3/models/ops/build/temp.linux-x86_64-cpython-37/mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/yanfeng/project/MOTRv2/MOTRv3/models/ops/src/cuda/ms_deform_attn_cuda.o 67f3872547af6227 5 | -------------------------------------------------------------------------------- /tools/resume.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # ------------------------------------------------------------------------ 3 | # Copyright (c) 2022 megvii-research. All Rights Reserved. 4 | # ------------------------------------------------------------------------ 5 | 6 | 7 | set -x 8 | 9 | set -o pipefail 10 | 11 | OUTPUT_DIR=$1 12 | 13 | # clean up *.pyc files 14 | rmpyc() { 15 | rm -rf $(find -name __pycache__) 16 | rm -rf $(find -name "*.pyc") 17 | } 18 | 19 | # tar src to avoid future editing 20 | cleanup() { 21 | echo "Packing source code" 22 | rmpyc 23 | # tar -zcf models datasets util main.py engine.py eval.py submit.py --remove-files 24 | echo " ...Done" 25 | } 26 | 27 | 28 | pushd $OUTPUT_DIR 29 | trap cleanup EXIT 30 | 31 | args=$(cat *.args) 32 | python -m torch.distributed.launch --nproc_per_node=8 --use_env main.py ${args} --resume checkpoint.pth --output_dir . |& tee -a resume.log 33 | popd 34 | -------------------------------------------------------------------------------- /models/ops/src/cuda/ms_deform_attn_cuda.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | #pragma once 12 | #include 13 | 14 | at::Tensor ms_deform_attn_cuda_forward( 15 | const at::Tensor &value, 16 | const at::Tensor &spatial_shapes, 17 | const at::Tensor &level_start_index, 18 | const at::Tensor &sampling_loc, 19 | const at::Tensor &attn_weight, 20 | const int im2col_step); 21 | 22 | std::vector ms_deform_attn_cuda_backward( 23 | const at::Tensor &value, 24 | const at::Tensor &spatial_shapes, 25 | const at::Tensor &level_start_index, 26 | const at::Tensor &sampling_loc, 27 | const at::Tensor &attn_weight, 28 | const at::Tensor &grad_output, 29 | const int im2col_step); 30 | 31 | -------------------------------------------------------------------------------- /models/dino/ops/src/cuda/ms_deform_attn_cuda.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | #pragma once 12 | #include 13 | 14 | at::Tensor ms_deform_attn_cuda_forward( 15 | const at::Tensor &value, 16 | const at::Tensor &spatial_shapes, 17 | const at::Tensor &level_start_index, 18 | const at::Tensor &sampling_loc, 19 | const at::Tensor &attn_weight, 20 | const int im2col_step); 21 | 22 | std::vector ms_deform_attn_cuda_backward( 23 | const at::Tensor &value, 24 | const at::Tensor &spatial_shapes, 25 | const at::Tensor &level_start_index, 26 | const at::Tensor &sampling_loc, 27 | const at::Tensor &attn_weight, 28 | const at::Tensor &grad_output, 29 | const int im2col_step); 30 | 31 | -------------------------------------------------------------------------------- /tools/run_dist_launch.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # ------------------------------------------------------------------------ 3 | # Copyright (c) 2022 megvii-research. All Rights Reserved. 4 | # ------------------------------------------------------------------------ 5 | # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR) 6 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 7 | # ------------------------------------------------------------------------ 8 | # Modified from DETR (https://github.com/facebookresearch/detr) 9 | # Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved 10 | # ------------------------------------------------------------------------ 11 | 12 | 13 | set -x 14 | 15 | GPUS=$1 16 | RUN_COMMAND=${@:2} 17 | if [ $GPUS -lt 8 ]; then 18 | GPUS_PER_NODE=${GPUS_PER_NODE:-$GPUS} 19 | else 20 | GPUS_PER_NODE=${GPUS_PER_NODE:-8} 21 | fi 22 | MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"} 23 | MASTER_PORT=${MASTER_PORT:-"29500"} 24 | NODE_RANK=${NODE_RANK:-0} 25 | 26 | let "NNODES=GPUS/GPUS_PER_NODE" 27 | 28 | python3 ./tools/launch.py \ 29 | --nnodes ${NNODES} \ 30 | --node_rank ${NODE_RANK} \ 31 | --master_addr ${MASTER_ADDR} \ 32 | --master_port ${MASTER_PORT} \ 33 | --nproc_per_node ${GPUS_PER_NODE} \ 34 | ${RUN_COMMAND} -------------------------------------------------------------------------------- /models/ops/src/cpu/ms_deform_attn_cpu.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | #pragma once 12 | #include 13 | 14 | at::Tensor 15 | ms_deform_attn_cpu_forward( 16 | const at::Tensor &value, 17 | const at::Tensor &spatial_shapes, 18 | const at::Tensor &level_start_index, 19 | const at::Tensor &sampling_loc, 20 | const at::Tensor &attn_weight, 21 | const int im2col_step); 22 | 23 | std::vector 24 | ms_deform_attn_cpu_backward( 25 | const at::Tensor &value, 26 | const at::Tensor &spatial_shapes, 27 | const at::Tensor &level_start_index, 28 | const at::Tensor &sampling_loc, 29 | const at::Tensor &attn_weight, 30 | const at::Tensor &grad_output, 31 | const int im2col_step); 32 | 33 | 34 | -------------------------------------------------------------------------------- /models/dino/ops/src/cpu/ms_deform_attn_cpu.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | #pragma once 12 | #include 13 | 14 | at::Tensor 15 | ms_deform_attn_cpu_forward( 16 | const at::Tensor &value, 17 | const at::Tensor &spatial_shapes, 18 | const at::Tensor &level_start_index, 19 | const at::Tensor &sampling_loc, 20 | const at::Tensor &attn_weight, 21 | const int im2col_step); 22 | 23 | std::vector 24 | ms_deform_attn_cpu_backward( 25 | const at::Tensor &value, 26 | const at::Tensor &spatial_shapes, 27 | const at::Tensor &level_start_index, 28 | const at::Tensor &sampling_loc, 29 | const at::Tensor &attn_weight, 30 | const at::Tensor &grad_output, 31 | const int im2col_step); 32 | 33 | 34 | -------------------------------------------------------------------------------- /tools/eval_dance.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # ------------------------------------------------------------------------ 3 | # Copyright (c) 2022 megvii-research. All Rights Reserved. 4 | # ------------------------------------------------------------------------ 5 | 6 | 7 | set -x 8 | 9 | set -o pipefail 10 | 11 | OUTPUT_DIR=$1 12 | 13 | # clean up *.pyc files 14 | rmpyc() { 15 | rm -rf $(find -name __pycache__) 16 | rm -rf $(find -name "*.pyc") 17 | } 18 | 19 | 20 | cp submit_dance.py $OUTPUT_DIR 21 | 22 | pushd $OUTPUT_DIR 23 | 24 | args=$(cat *.args) 25 | # rlaunch --cpu 8 --gpu 1 --memory 24000 --positive-tags 2080ti -P 13 -- python3 submit_dance.py ${args} --resume checkpoint.pth --exp_name tracker 26 | python3 submit_dance.py ${args} --resume checkpoint.pth --exp_name tracker 27 | 28 | popd 29 | 30 | # python3 ../TrackEval/scripts/run_mot_challenge.py \ 31 | # --SPLIT_TO_EVAL val \ 32 | # --METRICS HOTA CLEAR Identity \ 33 | # --GT_FOLDER /data/datasets/dancetrack/val \ 34 | # --SEQMAP_FILE seqmap \ 35 | # --SKIP_SPLIT_FOL True \ 36 | # --TRACKER_SUB_FOLDER tracker \ 37 | # --TRACKERS_TO_EVAL $OUTPUT_DIR \ 38 | # --USE_PARALLEL True \ 39 | # --NUM_PARALLEL_CORES 8 \ 40 | # --PLOT_CURVES False \ 41 | # --TRACKERS_FOLDER '' | tee -a $OUTPUT_DIR/eval.log 42 | -------------------------------------------------------------------------------- /tools/run_dist_slurm.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # ------------------------------------------------------------------------ 3 | # Copyright (c) 2022 megvii-research. All Rights Reserved. 4 | # ------------------------------------------------------------------------ 5 | # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR) 6 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 7 | # ------------------------------------------------------------------------ 8 | # Modified from DETR (https://github.com/facebookresearch/detr) 9 | # Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved 10 | # ------------------------------------------------------------------------ 11 | 12 | 13 | set -x 14 | 15 | PARTITION=$1 16 | JOB_NAME=$2 17 | GPUS=$3 18 | RUN_COMMAND=${@:4} 19 | if [ $GPUS -lt 8 ]; then 20 | GPUS_PER_NODE=${GPUS_PER_NODE:-$GPUS} 21 | else 22 | GPUS_PER_NODE=${GPUS_PER_NODE:-8} 23 | fi 24 | CPUS_PER_TASK=${CPUS_PER_TASK:-4} 25 | SRUN_ARGS=${SRUN_ARGS:-""} 26 | 27 | srun -p ${PARTITION} \ 28 | --job-name=${JOB_NAME} \ 29 | --gres=gpu:${GPUS_PER_NODE} \ 30 | --ntasks=${GPUS} \ 31 | --ntasks-per-node=${GPUS_PER_NODE} \ 32 | --cpus-per-task=${CPUS_PER_TASK} \ 33 | --kill-on-bad-exit=1 \ 34 | ${SRUN_ARGS} \ 35 | ${RUN_COMMAND} 36 | 37 | -------------------------------------------------------------------------------- /tools/make_detdb.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Copyright (c) 2022 megvii-research. All Rights Reserved. 3 | # ------------------------------------------------------------------------ 4 | 5 | 6 | from glob import glob 7 | import json 8 | from concurrent.futures import ThreadPoolExecutor 9 | from threading import Lock 10 | 11 | from tqdm import tqdm 12 | 13 | det_db = {} 14 | to_cache = [] 15 | 16 | for file in glob("/data/Dataset/mot/crowdhuman/train_image/*.txt"): 17 | to_cache.append(file) 18 | 19 | for file in glob("/data/Dataset/mot/dancetrack/*/*/img1/*.txt"): 20 | to_cache.append(file) 21 | 22 | for file in glob("/data/Dataset/mot/MOT17/images/*/*/img1/*.txt"): 23 | to_cache.append(file) 24 | 25 | for file in glob("/data/Dataset/mot/MOT20/train/*/img1/*.txt"): 26 | to_cache.append(file) 27 | 28 | for file in glob("/data/Dataset/mot/HIE20/train/*/img1/*.txt"): 29 | to_cache.append(file) 30 | 31 | pbar = tqdm(total=len(to_cache)) 32 | 33 | mutex = Lock() 34 | def cache(file): 35 | with open(file) as f: 36 | tmp = [l for l in f] 37 | with mutex: 38 | det_db[file] = tmp 39 | pbar.update() 40 | 41 | with ThreadPoolExecutor(max_workers=48) as exe: 42 | for file in to_cache: 43 | exe.submit(cache, file) 44 | 45 | with open("/data/Dataset/mot/det_db_oc_sort_full.json", 'w') as f: 46 | json.dump(det_db, f) 47 | 48 | -------------------------------------------------------------------------------- /models/ops/MultiScaleDeformableAttention.egg-info/SOURCES.txt: -------------------------------------------------------------------------------- 1 | setup.py 2 | /mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/yanfeng/project/MOTRv2/CO-MOT/models/ops/src/vision.cpp 3 | /mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/yanfeng/project/MOTRv2/CO-MOT/models/ops/src/cpu/ms_deform_attn_cpu.cpp 4 | /mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/yanfeng/project/MOTRv2/CO-MOT/models/ops/src/cuda/ms_deform_attn_cuda.cu 5 | /mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/yanfeng/project/MOTRv2/MOTRv2/models/ops/src/vision.cpp 6 | /mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/yanfeng/project/MOTRv2/MOTRv2/models/ops/src/cpu/ms_deform_attn_cpu.cpp 7 | /mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/yanfeng/project/MOTRv2/MOTRv2/models/ops/src/cuda/ms_deform_attn_cuda.cu 8 | /mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/yanfeng/project/MOTRv2/MOTRv3/models/ops/src/vision.cpp 9 | /mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/yanfeng/project/MOTRv2/MOTRv3/models/ops/src/cpu/ms_deform_attn_cpu.cpp 10 | /mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/yanfeng/project/MOTRv2/MOTRv3/models/ops/src/cuda/ms_deform_attn_cuda.cu 11 | 
MultiScaleDeformableAttention.egg-info/PKG-INFO 12 | MultiScaleDeformableAttention.egg-info/SOURCES.txt 13 | MultiScaleDeformableAttention.egg-info/dependency_links.txt 14 | MultiScaleDeformableAttention.egg-info/top_level.txt 15 | functions/__init__.py 16 | functions/ms_deform_attn_func.py 17 | modules/__init__.py 18 | modules/ms_deform_attn.py -------------------------------------------------------------------------------- /models/ops/src/cpu/ms_deform_attn_cpu.cpp: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | #include 12 | 13 | #include 14 | #include 15 | 16 | 17 | at::Tensor 18 | ms_deform_attn_cpu_forward( 19 | const at::Tensor &value, 20 | const at::Tensor &spatial_shapes, 21 | const at::Tensor &level_start_index, 22 | const at::Tensor &sampling_loc, 23 | const at::Tensor &attn_weight, 24 | const int im2col_step) 25 | { 26 | AT_ERROR("Not implement on cpu"); 27 | } 28 | 29 | std::vector 30 | ms_deform_attn_cpu_backward( 31 | const at::Tensor &value, 32 | const at::Tensor &spatial_shapes, 33 | const at::Tensor &level_start_index, 34 | const at::Tensor &sampling_loc, 35 | const at::Tensor &attn_weight, 36 | const at::Tensor &grad_output, 37 | const int im2col_step) 38 | { 39 | AT_ERROR("Not implement on cpu"); 40 | } 41 | 42 | -------------------------------------------------------------------------------- /models/dino/ops/src/cpu/ms_deform_attn_cpu.cpp: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | #include 12 | 13 | #include 14 | #include 15 | 16 | 17 | at::Tensor 18 | ms_deform_attn_cpu_forward( 19 | const at::Tensor &value, 20 | const at::Tensor &spatial_shapes, 21 | const at::Tensor &level_start_index, 22 | const at::Tensor &sampling_loc, 23 | const at::Tensor &attn_weight, 24 | const int im2col_step) 25 | { 26 | AT_ERROR("Not implement on cpu"); 27 | } 28 | 29 | std::vector 30 | ms_deform_attn_cpu_backward( 31 | const at::Tensor &value, 32 | const at::Tensor &spatial_shapes, 33 | const at::Tensor &level_start_index, 34 | const at::Tensor &sampling_loc, 35 | const at::Tensor &attn_weight, 36 | const at::Tensor &grad_output, 37 | const int im2col_step) 38 | { 39 | AT_ERROR("Not implement on cpu"); 40 | } 41 | 42 | -------------------------------------------------------------------------------- /tools/coco_evel.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import numpy as np 4 | from collections import defaultdict 5 | from pycocotools.coco import COCO 6 | from pycocotools.cocoeval import COCOeval 7 | 8 | 9 | parser = argparse.ArgumentParser('Deformable DETR Detector', add_help=False) 10 | parser.add_argument('--det_root', default='tracker', type=str) 11 | args = parser.parse_args() 12 | 13 | cocoGt = COCO(annotation_file='data/dancetrack/annotations/val.json') 14 | 15 | 16 | det_root = args.det_root 17 | tracklets = defaultdict() 18 | 19 | detRes = [] 20 | for img_id in cocoGt.getImgIds(): 21 | img = cocoGt.loadImgs(img_id) 22 | 23 | vid_name = img[0]['file_name'][:14] 24 | frame_id = img[0]['frame_id'] 25 | 26 | if vid_name not in tracklets: 27 | tracklets[vid_name] = defaultdict(list) 28 | for line in open(os.path.join(det_root, vid_name+'.txt')): 29 | t, id, *xywhs = line.split(',')[:7] 30 | t, id = map(int, (t, id)) 31 | tracklets[vid_name][t].append((id, *map(float, xywhs))) 32 | 33 | labels = tracklets[vid_name][frame_id] 34 | 35 | for l in labels: 36 | ann = defaultdict() 37 | ann['image_id'] = img[0]['id'] 38 | ann['bbox'] = list(l[1:5]) 39 | ann['category_id'] = 1 40 | ann['score'] = l[5] 41 | detRes.append(ann) 42 | 43 | cocoDt = cocoGt.loadRes(detRes) # path and filename of your own generated results, in json format 44 | cocoEval = COCOeval(cocoGt, cocoDt, "bbox") 45 | cocoEval.evaluate() 46 | cocoEval.accumulate() 47 | cocoEval.summarize() -------------------------------------------------------------------------------- /tools/train.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # ------------------------------------------------------------------------ 3 | # Copyright (c) 2022 megvii-research. All Rights Reserved. 
4 | # ------------------------------------------------------------------------ 5 | 6 | # print every command as it is executed 7 | set -x 8 | 9 | PY_ARGS=${@:2} 10 | 11 | # abort with an error if any command in a pipeline fails 12 | set -o pipefail 13 | # sed -e: apply the sed edit expression directly on the command line 14 | OUTPUT_BASE=$(echo $1 | sed -e "s/configs/exps/g" | sed -e "s/.args$//g") 15 | mkdir -p $OUTPUT_BASE 16 | 17 | for RUN in $(seq 100); do 18 | ls $OUTPUT_BASE | grep run$RUN && continue 19 | OUTPUT_DIR=$OUTPUT_BASE/run$RUN 20 | mkdir $OUTPUT_DIR && break 21 | done 22 | 23 | # clean up *.pyc files 24 | rmpyc() { 25 | rm -rf $(find -name __pycache__) 26 | rm -rf $(find -name "*.pyc") 27 | } 28 | 29 | # run backup 30 | echo "Backing up to log dir: $OUTPUT_DIR" 31 | rmpyc && cp -r models datasets util main.py engine.py eval_detr.py seqmap submit_dance.py $1 $OUTPUT_DIR 32 | echo " ...Done" 33 | 34 | # tar src to avoid future editing 35 | cleanup() { 36 | echo "Packing source code" 37 | rmpyc 38 | # tar -zcf models datasets util main.py engine.py eval.py submit.py --remove-files 39 | echo " ...Done" 40 | } 41 | 42 | args=$(cat $1) 43 | 44 | pushd $OUTPUT_DIR 45 | trap cleanup EXIT 46 | 47 | # log git status 48 | echo "Logging git status" 49 | git status > git_status 50 | git rev-parse HEAD > git_tag 51 | git diff > git_diff 52 | echo $PY_ARGS > desc 53 | echo " ...Done" 54 | 55 | python -m torch.distributed.launch --nproc_per_node=4 --master_port 29504 --use_env main.py ${args} --output_dir $OUTPUT_DIR |& tee -a output.log 56 | -------------------------------------------------------------------------------- /tools/clip_train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import cv2 3 | import numpy as np 4 | from collections import defaultdict 5 | 6 | 7 | root_data = 'data/MOT/MOT17_all/train' 8 | vids = os.listdir(root_data) 9 | 10 | for v in vids: 11 | if 'SDP' in v: 12 | labels_full = defaultdict(list) 13 | gt_path = os.path.join(root_data, v, 'gt', 'gt.txt') 14 | for l in open(gt_path): 15 | t, i, *xywh = l.strip().split(',') 16 | labels_full[int(t)].append([i, *xywh]) 17 | imgs_root = os.path.join(root_data, v, 'img1') 18 | imgs_path = sorted(os.listdir(imgs_root)) 19 | 20 | for ith, img_p in enumerate(imgs_path): 21 | if ith < (len(imgs_path)+1)//2: 22 | save_img = os.path.join(imgs_root, img_p).replace('MOT17_all', 'MOT17') 23 | save_label = os.path.join(root_data, v, 'gt', 'gt.txt').replace('MOT17_all', 'MOT17') 24 | print('train: %s' % save_img) 25 | else: 26 | save_img = os.path.join(imgs_root, img_p).replace('MOT17_all', 'MOT17').replace('train', 'val') 27 | save_label = os.path.join(root_data, v, 'gt', 'gt.txt').replace('MOT17_all', 'MOT17').replace('train', 'val') 28 | print('val: %s' % save_img) 29 | os.makedirs(os.path.dirname(save_label), exist_ok=True) 30 | with open(save_label, 'a+') as f: 31 | if ith+1 in labels_full: 32 | for l in labels_full[ith+1]: 33 | f.write('%d,%s,%s,%s,%s,%s,%s,%s,%s\n'%(ith+1, *l)) 34 | img = cv2.imread(os.path.join(imgs_root, img_p)) 35 | os.makedirs(os.path.dirname(save_img), exist_ok=True) 36 | cv2.imwrite(save_img, img) -------------------------------------------------------------------------------- /datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Copyright (c) 2022 megvii-research. All Rights Reserved. 
3 | # ------------------------------------------------------------------------ 4 | # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR) 5 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 6 | # ------------------------------------------------------------------------ 7 | # Modified from DETR (https://github.com/facebookresearch/detr) 8 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 9 | # ------------------------------------------------------------------------ 10 | 11 | from .dance import build as build_e2e_dance 12 | from .dance_test import build as build_e2e_dance_test 13 | from .tao import build as build_e2e_tao 14 | from .joint import build as build_e2e_joint 15 | from .mot import build as build_e2e_mot 16 | from .all import build as build_e2e_all 17 | from .bdd100k import build as build_e2e_bdd 18 | from .bdd100kcoco import build as build_e2e_bddcc 19 | 20 | 21 | def build_dataset(image_set, args): 22 | if args.dataset_file == 'e2e_joint': 23 | return build_e2e_joint(image_set, args) 24 | elif args.dataset_file == 'e2e_dance': 25 | return build_e2e_dance(image_set, args) 26 | elif args.dataset_file == 'e2e_dance_test': 27 | return build_e2e_dance_test(image_set, args) 28 | elif args.dataset_file == 'e2e_all': 29 | return build_e2e_all(image_set, args) 30 | elif args.dataset_file == 'e2e_bdd': 31 | return build_e2e_bdd(image_set, args) 32 | elif args.dataset_file == 'e2e_tao': 33 | return build_e2e_tao(image_set, args) 34 | elif args.dataset_file == 'e2e_bddcc': 35 | return build_e2e_bddcc(image_set, args) 36 | elif args.dataset_file == 'e2e_mot': 37 | return build_e2e_mot(image_set, args) 38 | raise ValueError(f'dataset {args.dataset_file} not supported') 39 | -------------------------------------------------------------------------------- /util/checkpoint.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Copyright (c) 2022 megvii-research. All Rights Reserved. 
3 | # ------------------------------------------------------------------------ 4 | # Modified from pytorch-checkpoint (https://github.com/csrhddlam/pytorch-checkpoint) 5 | # ------------------------------------------------------------------------ 6 | 7 | import torch 8 | 9 | 10 | def check_require_grad(t): 11 | return isinstance(t, torch.Tensor) and t.requires_grad 12 | 13 | 14 | class CheckpointFunction(torch.autograd.Function): 15 | @staticmethod 16 | def forward(ctx, run_function, length, *args): 17 | ctx.run_function = run_function 18 | ctx.input_tensors = list(args[:length]) 19 | ctx.input_params = list(args[length:]) 20 | with torch.no_grad(): 21 | output_tensors = ctx.run_function(*ctx.input_tensors) 22 | return output_tensors 23 | 24 | @staticmethod 25 | def backward(ctx, *output_grads): 26 | for i in range(len(ctx.input_tensors)): 27 | temp = ctx.input_tensors[i] 28 | if check_require_grad(temp): 29 | ctx.input_tensors[i] = temp.detach() 30 | ctx.input_tensors[i].requires_grad = temp.requires_grad 31 | with torch.enable_grad(): 32 | output_tensors = ctx.run_function(*ctx.input_tensors) 33 | to_autograd = list(filter(check_require_grad, ctx.input_tensors)) 34 | output_tensors, output_grads = zip(*filter(lambda t: t[0].requires_grad, zip(output_tensors, output_grads))) 35 | input_grads = torch.autograd.grad(output_tensors, to_autograd + ctx.input_params, output_grads, allow_unused=True) 36 | input_grads = list(input_grads) 37 | for i in range(len(ctx.input_tensors)): 38 | if not check_require_grad(ctx.input_tensors[i]): 39 | input_grads.insert(i, None) 40 | return (None, None) + tuple(input_grads) 41 | -------------------------------------------------------------------------------- /models/registry.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author: Yihao Chen 3 | # @Date: 2021-08-16 16:03:17 4 | # @Last Modified by: Shilong Liu 5 | # @Last Modified time: 2022-01-23 15:26 6 | # modified from mmcv 7 | 8 | import inspect 9 | from functools import partial 10 | 11 | 12 | class Registry(object): 13 | 14 | def __init__(self, name): 15 | self._name = name 16 | self._module_dict = dict() 17 | 18 | def __repr__(self): 19 | format_str = self.__class__.__name__ + '(name={}, items={})'.format( 20 | self._name, list(self._module_dict.keys())) 21 | return format_str 22 | 23 | def __len__(self): 24 | return len(self._module_dict) 25 | 26 | @property 27 | def name(self): 28 | return self._name 29 | 30 | @property 31 | def module_dict(self): 32 | return self._module_dict 33 | 34 | def get(self, key): 35 | return self._module_dict.get(key, None) 36 | 37 | def registe_with_name(self, module_name=None, force=False): 38 | return partial(self.register, module_name=module_name, force=force) 39 | 40 | def register(self, module_build_function, module_name=None, force=False): 41 | """Register a module build function. 42 | Args: 43 | module (:obj:`nn.Module`): Module to be registered. 
44 | """
45 | if not inspect.isfunction(module_build_function):
46 | raise TypeError('module_build_function must be a function, but got {}'.format(
47 | type(module_build_function)))
48 | if module_name is None:
49 | module_name = module_build_function.__name__
50 | if not force and module_name in self._module_dict:
51 | raise KeyError('{} is already registered in {}'.format(
52 | module_name, self.name))
53 | self._module_dict[module_name] = module_build_function
54 |
55 | return module_build_function
56 |
57 | MODULE_BUILD_FUNCS = Registry('model build functions')
58 |
59 |
--------------------------------------------------------------------------------
/models/ops/src/ms_deform_attn.h:
--------------------------------------------------------------------------------
1 | /*!
2 | **************************************************************************************************
3 | * Deformable DETR
4 | * Copyright (c) 2020 SenseTime. All Rights Reserved.
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details]
6 | **************************************************************************************************
7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
8 | **************************************************************************************************
9 | */
10 |
11 | #pragma once
12 |
13 | #include "cpu/ms_deform_attn_cpu.h"
14 |
15 | #ifdef WITH_CUDA
16 | #include "cuda/ms_deform_attn_cuda.h"
17 | #endif
18 |
19 |
20 | at::Tensor
21 | ms_deform_attn_forward(
22 | const at::Tensor &value,
23 | const at::Tensor &spatial_shapes,
24 | const at::Tensor &level_start_index,
25 | const at::Tensor &sampling_loc,
26 | const at::Tensor &attn_weight,
27 | const int im2col_step)
28 | {
29 | if (value.type().is_cuda())
30 | {
31 | #ifdef WITH_CUDA
32 | return ms_deform_attn_cuda_forward(
33 | value, spatial_shapes, level_start_index, sampling_loc, attn_weight, im2col_step);
34 | #else
35 | AT_ERROR("Not compiled with GPU support");
36 | #endif
37 | }
38 | AT_ERROR("Not implemented on the CPU");
39 | }
40 |
41 | std::vector<at::Tensor>
42 | ms_deform_attn_backward(
43 | const at::Tensor &value,
44 | const at::Tensor &spatial_shapes,
45 | const at::Tensor &level_start_index,
46 | const at::Tensor &sampling_loc,
47 | const at::Tensor &attn_weight,
48 | const at::Tensor &grad_output,
49 | const int im2col_step)
50 | {
51 | if (value.type().is_cuda())
52 | {
53 | #ifdef WITH_CUDA
54 | return ms_deform_attn_cuda_backward(
55 | value, spatial_shapes, level_start_index, sampling_loc, attn_weight, grad_output, im2col_step);
56 | #else
57 | AT_ERROR("Not compiled with GPU support");
58 | #endif
59 | }
60 | AT_ERROR("Not implemented on the CPU");
61 | }
62 |
63 |
--------------------------------------------------------------------------------
/models/dino/ops/src/ms_deform_attn.h:
--------------------------------------------------------------------------------
1 | /*!
2 | **************************************************************************************************
3 | * Deformable DETR
4 | * Copyright (c) 2020 SenseTime. All Rights Reserved.
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details]
6 | **************************************************************************************************
7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
8 | **************************************************************************************************
9 | */
10 |
11 | #pragma once
12 |
13 | #include "cpu/ms_deform_attn_cpu.h"
14 |
15 | #ifdef WITH_CUDA
16 | #include "cuda/ms_deform_attn_cuda.h"
17 | #endif
18 |
19 |
20 | at::Tensor
21 | ms_deform_attn_forward(
22 | const at::Tensor &value,
23 | const at::Tensor &spatial_shapes,
24 | const at::Tensor &level_start_index,
25 | const at::Tensor &sampling_loc,
26 | const at::Tensor &attn_weight,
27 | const int im2col_step)
28 | {
29 | if (value.type().is_cuda())
30 | {
31 | #ifdef WITH_CUDA
32 | return ms_deform_attn_cuda_forward(
33 | value, spatial_shapes, level_start_index, sampling_loc, attn_weight, im2col_step);
34 | #else
35 | AT_ERROR("Not compiled with GPU support");
36 | #endif
37 | }
38 | AT_ERROR("Not implemented on the CPU");
39 | }
40 |
41 | std::vector<at::Tensor>
42 | ms_deform_attn_backward(
43 | const at::Tensor &value,
44 | const at::Tensor &spatial_shapes,
45 | const at::Tensor &level_start_index,
46 | const at::Tensor &sampling_loc,
47 | const at::Tensor &attn_weight,
48 | const at::Tensor &grad_output,
49 | const int im2col_step)
50 | {
51 | if (value.type().is_cuda())
52 | {
53 | #ifdef WITH_CUDA
54 | return ms_deform_attn_cuda_backward(
55 | value, spatial_shapes, level_start_index, sampling_loc, attn_weight, grad_output, im2col_step);
56 | #else
57 | AT_ERROR("Not compiled with GPU support");
58 | #endif
59 | }
60 | AT_ERROR("Not implemented on the CPU");
61 | }
62 |
63 |
--------------------------------------------------------------------------------
/datasets/panoptic_eval.py:
--------------------------------------------------------------------------------
1 | # ------------------------------------------------------------------------
2 | # Copyright (c) 2022 megvii-research. All Rights Reserved.
3 | # ------------------------------------------------------------------------
4 | # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR)
5 | # Copyright (c) 2020 SenseTime. All Rights Reserved.
6 | # ------------------------------------------------------------------------
7 | # Modified from DETR (https://github.com/facebookresearch/detr)
8 | # Copyright (c) Facebook, Inc. and its affiliates.
All Rights Reserved 9 | # ------------------------------------------------------------------------ 10 | 11 | 12 | import json 13 | import os 14 | 15 | import util.misc as utils 16 | 17 | try: 18 | from panopticapi.evaluation import pq_compute 19 | except ImportError: 20 | pass 21 | 22 | 23 | class PanopticEvaluator(object): 24 | def __init__(self, ann_file, ann_folder, output_dir="panoptic_eval"): 25 | self.gt_json = ann_file 26 | self.gt_folder = ann_folder 27 | if utils.is_main_process(): 28 | if not os.path.exists(output_dir): 29 | os.mkdir(output_dir) 30 | self.output_dir = output_dir 31 | self.predictions = [] 32 | 33 | def update(self, predictions): 34 | for p in predictions: 35 | with open(os.path.join(self.output_dir, p["file_name"]), "wb") as f: 36 | f.write(p.pop("png_string")) 37 | 38 | self.predictions += predictions 39 | 40 | def synchronize_between_processes(self): 41 | all_predictions = utils.all_gather(self.predictions) 42 | merged_predictions = [] 43 | for p in all_predictions: 44 | merged_predictions += p 45 | self.predictions = merged_predictions 46 | 47 | def summarize(self): 48 | if utils.is_main_process(): 49 | json_data = {"annotations": self.predictions} 50 | predictions_json = os.path.join(self.output_dir, "predictions.json") 51 | with open(predictions_json, "w") as f: 52 | f.write(json.dumps(json_data)) 53 | return pq_compute(self.gt_json, predictions_json, gt_folder=self.gt_folder, pred_folder=self.output_dir) 54 | return None 55 | -------------------------------------------------------------------------------- /tools/merge_dance_tracklets.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Copyright (c) 2022 megvii-research. All Rights Reserved. 
3 | # ------------------------------------------------------------------------ 4 | 5 | 6 | import argparse 7 | from collections import defaultdict 8 | import os 9 | from pathlib import Path 10 | 11 | parser = argparse.ArgumentParser() 12 | parser.add_argument('input_dir', type=Path) 13 | parser.add_argument('output_dir', type=Path) 14 | parser.add_argument('--t_min', default=20) 15 | parser.add_argument('--t_max', default=100) 16 | args = parser.parse_args() 17 | 18 | 19 | class FindUnionSet(dict): 20 | def find(self, src): 21 | if src in self: 22 | return self.find(self[src]) 23 | return src 24 | 25 | def merge(self, dst, src): 26 | self[self.find(src)] = self.find(dst) 27 | 28 | 29 | for seq in os.listdir(args.input_dir): 30 | print(args.input_dir / seq) 31 | with open(args.input_dir / seq) as f: 32 | lines = f.readlines() 33 | instance_timestamps = defaultdict(list) 34 | for line in lines: 35 | f_id, id = map(int, line.split(',')[:2]) 36 | instance_timestamps[id].append(f_id) 37 | instances = list(instance_timestamps.keys()) 38 | fid_map = FindUnionSet() 39 | for i in instances: 40 | for j in instances: 41 | if fid_map.find(i) == fid_map.find(j): 42 | continue 43 | end_t = max(instance_timestamps[i]) 44 | start_t = min(instance_timestamps[j]) 45 | if sum([0 <= start_t - max(pts) < args.t_max for pts in instance_timestamps.values()]) > 1: 46 | continue 47 | if sum([0 <= min(pts) - end_t < args.t_max for pts in instance_timestamps.values()]) > 1: 48 | continue 49 | dt = start_t - end_t 50 | if args.t_min < dt < args.t_max: 51 | print(f"{i}<-{j}", end_t, start_t, start_t - end_t) 52 | fid_map.merge(i, j) 53 | 54 | os.makedirs(args.output_dir / 'tracker', exist_ok=True) 55 | with open(args.output_dir / 'tracker' / seq, 'w') as f: 56 | for line in lines: 57 | f_id, id, *info = line.split(',') 58 | id = str(fid_map.find(int(id))) 59 | f.write(','.join([f_id, id, *info])) 60 | -------------------------------------------------------------------------------- /models/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------ 6 | # Modified from DETR (https://github.com/facebookresearch/detr) 7 | # Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved 8 | # ------------------------------------------------------------------------ 9 | 10 | from .deformable_transformer_plus import DeformableTransformer 11 | from .deformable_transformer_cross import DeformableTransformer as DeformableTransformerCross 12 | from .ftransformer import DetrTransformerDecoder 13 | def build_deforamble_transformer(args): 14 | arch_catalog = { 15 | 'DeformableTransformer': DeformableTransformer, 16 | 'DeformableTransformerCross': DeformableTransformerCross, 17 | } 18 | assert args.trans_mode in arch_catalog, 'invalid arch: {}'.format(args.trans_mode) 19 | build_func = arch_catalog[args.trans_mode] 20 | 21 | return build_func( 22 | d_model=args.hidden_dim, 23 | nhead=args.nheads, 24 | num_encoder_layers=args.enc_layers, 25 | num_decoder_layers=args.dec_layers, 26 | dim_feedforward=args.dim_feedforward, 27 | dropout=args.dropout, 28 | activation="relu", 29 | return_intermediate_dec=True, 30 | num_feature_levels=args.num_feature_levels, 31 | dec_n_points=args.dec_n_points, 32 | enc_n_points=args.enc_n_points, 33 | two_stage=args.two_stage, 34 | two_stage_num_proposals=args.num_queries, 35 | decoder_self_cross=not args.decoder_cross_self, 36 | sigmoid_attn=args.sigmoid_attn, 37 | extra_track_attn=args.extra_track_attn, 38 | memory_bank=args.memory_bank_type == 'MemoryBankFeat' 39 | ) 40 | 41 | 42 | from .motr import build as build_motr 43 | from .motr_uninC import build as build_motr_uninC 44 | from .motr_uninCost import build as build_motr_uninCost 45 | 46 | 47 | from .tmotr_uni import build as build_tmotr_uni 48 | 49 | def build_model(args): 50 | arch_catalog = { 51 | 'motr': build_motr, 52 | 'motr_uninc': build_motr_uninC, 53 | 'motr_unincost': build_motr_uninCost, 54 | 'tmotr_uni': build_tmotr_uni, 55 | } 56 | assert args.meta_arch in arch_catalog, 'invalid arch: {}'.format(args.meta_arch) 57 | build_func = arch_catalog[args.meta_arch] 58 | return build_func(args) 59 | 60 | 61 | -------------------------------------------------------------------------------- /models/yolo_fpn.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # Copyright (c) 2014-2021 Megvii Inc. All rights reserved. 4 | 5 | import torch 6 | import torch.nn as nn 7 | 8 | from .darknet import Darknet 9 | from .network_blocks import BaseConv 10 | 11 | 12 | class YOLOFPN(nn.Module): 13 | """ 14 | YOLOFPN module. Darknet 53 is the default backbone of this model. 
15 | """ 16 | 17 | def __init__( 18 | self, 19 | depth=53, 20 | in_features=["dark3", "dark4", "dark5"], 21 | ): 22 | super().__init__() 23 | 24 | self.backbone = Darknet(depth) 25 | self.in_features = in_features 26 | 27 | # out 1 28 | self.out1_cbl = self._make_cbl(512, 256, 1) 29 | self.out1 = self._make_embedding([256, 512], 512 + 256) 30 | 31 | # out 2 32 | self.out2_cbl = self._make_cbl(256, 128, 1) 33 | self.out2 = self._make_embedding([128, 256], 256 + 128) 34 | 35 | # upsample 36 | self.upsample = nn.Upsample(scale_factor=2, mode="nearest") 37 | 38 | def _make_cbl(self, _in, _out, ks): 39 | return BaseConv(_in, _out, ks, stride=1, act="lrelu") 40 | 41 | def _make_embedding(self, filters_list, in_filters): 42 | m = nn.Sequential( 43 | *[ 44 | self._make_cbl(in_filters, filters_list[0], 1), 45 | self._make_cbl(filters_list[0], filters_list[1], 3), 46 | self._make_cbl(filters_list[1], filters_list[0], 1), 47 | self._make_cbl(filters_list[0], filters_list[1], 3), 48 | self._make_cbl(filters_list[1], filters_list[0], 1), 49 | ] 50 | ) 51 | return m 52 | 53 | def load_pretrained_model(self, filename="./weights/darknet53.mix.pth"): 54 | with open(filename, "rb") as f: 55 | state_dict = torch.load(f, map_location="cpu") 56 | print("loading pretrained weights...") 57 | self.backbone.load_state_dict(state_dict) 58 | 59 | def forward(self, inputs): 60 | """ 61 | Args: 62 | inputs (Tensor): input image. 63 | 64 | Returns: 65 | Tuple[Tensor]: FPN output features.. 66 | """ 67 | # backbone 68 | out_features = self.backbone(inputs) 69 | x2, x1, x0 = [out_features[f] for f in self.in_features] 70 | 71 | # yolo branch 1 72 | x1_in = self.out1_cbl(x0) 73 | x1_in = self.upsample(x1_in) 74 | x1_in = torch.cat([x1_in, x1], 1) 75 | out_dark4 = self.out1(x1_in) 76 | 77 | # yolo branch 2 78 | x2_in = self.out2_cbl(out_dark4) 79 | x2_in = self.upsample(x2_in) 80 | x2_in = torch.cat([x2_in, x2], 1) 81 | out_dark3 = self.out2(x2_in) 82 | 83 | outputs = (out_dark3, out_dark4, x0) 84 | return outputs 85 | -------------------------------------------------------------------------------- /models/ops/setup.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
5 | # ------------------------------------------------------------------------------------------------
6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
7 | # ------------------------------------------------------------------------------------------------
8 |
9 | import os
10 | import glob
11 |
12 | import torch
13 |
14 | from torch.utils.cpp_extension import CUDA_HOME
15 | from torch.utils.cpp_extension import CppExtension
16 | from torch.utils.cpp_extension import CUDAExtension
17 |
18 | from setuptools import find_packages
19 | from setuptools import setup
20 |
21 | requirements = ["torch", "torchvision"]
22 |
23 | def get_extensions():
24 | this_dir = os.path.dirname(os.path.abspath(__file__))
25 | extensions_dir = os.path.join(this_dir, "src")
26 |
27 | main_file = glob.glob(os.path.join(extensions_dir, "*.cpp"))
28 | source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp"))
29 | source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu"))
30 |
31 | sources = main_file + source_cpu
32 | extension = CppExtension
33 | extra_compile_args = {"cxx": []}
34 | define_macros = []
35 |
36 | if torch.cuda.is_available() and CUDA_HOME is not None:
37 | extension = CUDAExtension
38 | sources += source_cuda
39 | define_macros += [("WITH_CUDA", None)]
40 | extra_compile_args["nvcc"] = [
41 | "-DCUDA_HAS_FP16=1",
42 | "-D__CUDA_NO_HALF_OPERATORS__",
43 | "-D__CUDA_NO_HALF_CONVERSIONS__",
44 | "-D__CUDA_NO_HALF2_OPERATORS__",
45 | ]
46 | else:
47 | raise NotImplementedError('CUDA is not available')
48 |
49 | sources = [os.path.join(extensions_dir, s) for s in sources]
50 | include_dirs = [extensions_dir]
51 | ext_modules = [
52 | extension(
53 | "MultiScaleDeformableAttention",
54 | sources,
55 | include_dirs=include_dirs,
56 | define_macros=define_macros,
57 | extra_compile_args=extra_compile_args,
58 | )
59 | ]
60 | return ext_modules
61 |
62 | setup(
63 | name="MultiScaleDeformableAttention",
64 | version="1.0",
65 | author="Weijie Su",
66 | url="https://github.com/fundamentalvision/Deformable-DETR",
67 | description="PyTorch Wrapper for CUDA Functions of Multi-Scale Deformable Attention",
68 | packages=find_packages(exclude=("configs", "tests",)),
69 | ext_modules=get_extensions(),
70 | cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension},
71 | )
72 |
--------------------------------------------------------------------------------
/models/dino/ops/setup.py:
--------------------------------------------------------------------------------
1 | # ------------------------------------------------------------------------------------------------
2 | # Deformable DETR
3 | # Copyright (c) 2020 SenseTime. All Rights Reserved.
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
5 | # ------------------------------------------------------------------------------------------------
6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
7 | # ------------------------------------------------------------------------------------------------
8 |
9 | import os
10 | import glob
11 |
12 | import torch
13 |
14 | from torch.utils.cpp_extension import CUDA_HOME
15 | from torch.utils.cpp_extension import CppExtension
16 | from torch.utils.cpp_extension import CUDAExtension
17 |
18 | from setuptools import find_packages
19 | from setuptools import setup
20 |
21 | requirements = ["torch", "torchvision"]
22 |
23 | def get_extensions():
24 | this_dir = os.path.dirname(os.path.abspath(__file__))
25 | extensions_dir = os.path.join(this_dir, "src")
26 |
27 | main_file = glob.glob(os.path.join(extensions_dir, "*.cpp"))
28 | source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp"))
29 | source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu"))
30 |
31 | sources = main_file + source_cpu
32 | extension = CppExtension
33 | extra_compile_args = {"cxx": []}
34 | define_macros = []
35 |
36 |
37 |
38 | if torch.cuda.is_available() and CUDA_HOME is not None:
39 | extension = CUDAExtension
40 | sources += source_cuda
41 | define_macros += [("WITH_CUDA", None)]
42 | extra_compile_args["nvcc"] = [
43 | "-DCUDA_HAS_FP16=1",
44 | "-D__CUDA_NO_HALF_OPERATORS__",
45 | "-D__CUDA_NO_HALF_CONVERSIONS__",
46 | "-D__CUDA_NO_HALF2_OPERATORS__",
47 | ]
48 | else:
49 | raise NotImplementedError('CUDA is not available')
50 |
51 | sources = [os.path.join(extensions_dir, s) for s in sources]
52 | include_dirs = [extensions_dir]
53 | ext_modules = [
54 | extension(
55 | "MultiScaleDeformableAttention",
56 | sources,
57 | include_dirs=include_dirs,
58 | define_macros=define_macros,
59 | extra_compile_args=extra_compile_args,
60 | )
61 | ]
62 | return ext_modules
63 |
64 | setup(
65 | name="MultiScaleDeformableAttention",
66 | version="1.0",
67 | author="Weijie Su",
68 | url="https://github.com/fundamentalvision/Deformable-DETR",
69 | description="PyTorch Wrapper for CUDA Functions of Multi-Scale Deformable Attention",
70 | packages=find_packages(exclude=("configs", "tests",)),
71 | ext_modules=get_extensions(),
72 | cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension},
73 | )
74 |
--------------------------------------------------------------------------------
/.vscode/launch.json:
--------------------------------------------------------------------------------
1 | {
2 | // Use IntelliSense to learn about possible attributes.
3 | // Hover to view descriptions of existing attributes.
4 | // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
5 | "version": "0.2.0",
6 | "configurations": [
7 | {
8 | "name": "Python: Current File",
9 | "type": "python",
10 | "request": "launch",
11 | "program": "${file}",
12 | "console": "integratedTerminal",
13 | "justMyCode": true,
14 | "env": {"CUDA_VISIBLE_DEVICES":"0", "CUBLAS_WORKSPACE_CONFIG":":4096:8"},
15 | // "args": ["--meta_arch", "motr_unincost", "--dataset_file", "e2e_dance", "--epoch", "20", "--with_box_refine", "--lr_drop", "8", "--lr", "2e-4", "--lr_backbone", "2e-5", "--pretrained", "/home/hadoop-vacv/yanfeng/project/MOTRv2/MOTRv3/exps/motrv2ch_uni5cost6g/run2/checkpoint.pth", "--batch_size", "2", "--sample_mode", "random_interval", "--sample_interval", "10", "--sampler_lengths", "5", "--merger_dropout", "0", "--dropout", "0", "--random_drop", "0.1", "--fp_ratio",
"0.3", "--query_interaction_layer", "GQIM", "--num_queries", "60", "--append_crowd", "--use_checkpoint", "--mot_path", "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-vacv/yanfeng/data/", "--match_type", "gmatch", "--g_size", "3", "--resume", "/home/hadoop-vacv/yanfeng/project/MOTRv2/CO-MOT/exps/motrv2ch_uni5cost3ggoon/run1/checkpoint.pth"] 16 | 17 | // "args": ["--meta_arch", "dino", "--dataset_file", "e2e_dance", "--epoch", "20", "--with_box_refine", "--lr_drop", "8", "--lr", "2e-4", "--lr_backbone", "2e-5", "--pretrained", "/home/hadoop-vacv/yanfeng/project/MOTRv2/MOTRv3/exps/motrv2ch_uni5cost6g/run2/checkpoint.pth", "--batch_size", "2", "--sample_mode", "random_interval", "--sample_interval", "10", "--sampler_lengths", "5", "--merger_dropout", "0", "--dropout", "0", "--random_drop", "0.1", "--fp_ratio", "0.3", "--query_interaction_layer", "GQIM", "--num_queries", "900", "--append_crowd", "--use_checkpoint", "--mot_path", "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-vacv/yanfeng/data/", "--match_type", "HungarianMatcher", "--g_size", "1", "--num_feature_levels", "5", "--dim_feedforward", "2048", "--resume", "/home/hadoop-vacv/yanfeng/project/MOTRv2/CO-MOT/checkpoints/dino_0031_5scale.pth"] 18 | "args": ["--meta_arch", "mot_dino", "--dataset_file", "e2e_dance", "--epoch", "20", "--with_box_refine", "--lr_drop", "8", "--lr", "2e-4", "--lr_backbone", "2e-5", "--pretrained", "/home/hadoop-vacv/yanfeng/project/MOTRv2/CO-MOT/checkpoints/dino_0031_5scale.pth", "--batch_size", "2", "--sample_mode", "random_interval", "--sample_interval", "10", "--sampler_lengths", "5", "--merger_dropout", "0", "--dropout", "0", "--random_drop", "0.1", "--fp_ratio", "0.3", "--query_interaction_layer", "GQIM", "--num_queries", "900", "--append_crowd", "--use_checkpoint", "--mot_path", "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-vacv/yanfeng/data/", "--match_type", "HungarianMatcher", "--g_size", "1", "--num_feature_levels", "5", "--dim_feedforward", "2048"] 19 | } 20 | ] 21 | } -------------------------------------------------------------------------------- /models/losses.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # Copyright (c) 2014-2021 Megvii Inc. All rights reserved. 
4 | 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.functional as F 8 | 9 | 10 | class IOUloss(nn.Module): 11 | def __init__(self, reduction="none", loss_type="iou"): 12 | super(IOUloss, self).__init__() 13 | self.reduction = reduction 14 | self.loss_type = loss_type 15 | 16 | def forward(self, pred, target): 17 | assert pred.shape[0] == target.shape[0] 18 | 19 | pred = pred.view(-1, 4) 20 | target = target.view(-1, 4) 21 | tl = torch.max( 22 | (pred[:, :2] - pred[:, 2:] / 2), (target[:, :2] - target[:, 2:] / 2) 23 | ) 24 | br = torch.min( 25 | (pred[:, :2] + pred[:, 2:] / 2), (target[:, :2] + target[:, 2:] / 2) 26 | ) 27 | 28 | area_p = torch.prod(pred[:, 2:], 1) 29 | area_g = torch.prod(target[:, 2:], 1) 30 | 31 | en = (tl < br).type(tl.type()).prod(dim=1) 32 | area_i = torch.prod(br - tl, 1) * en 33 | iou = (area_i) / (area_p + area_g - area_i + 1e-16) 34 | 35 | if self.loss_type == "iou": 36 | loss = 1 - iou ** 2 37 | elif self.loss_type == "giou": 38 | c_tl = torch.min( 39 | (pred[:, :2] - pred[:, 2:] / 2), (target[:, :2] - target[:, 2:] / 2) 40 | ) 41 | c_br = torch.max( 42 | (pred[:, :2] + pred[:, 2:] / 2), (target[:, :2] + target[:, 2:] / 2) 43 | ) 44 | area_c = torch.prod(c_br - c_tl, 1) 45 | giou = iou - (area_c - area_i) / area_c.clamp(1e-16) 46 | loss = 1 - giou.clamp(min=-1.0, max=1.0) 47 | 48 | if self.reduction == "mean": 49 | loss = loss.mean() 50 | elif self.reduction == "sum": 51 | loss = loss.sum() 52 | 53 | return loss 54 | 55 | 56 | def sigmoid_focal_loss(inputs, targets, num_boxes, alpha: float = 0.25, gamma: float = 2): 57 | """ 58 | Loss used in RetinaNet for dense detection: https://arxiv.org/abs/1708.02002. 59 | Args: 60 | inputs: A float tensor of arbitrary shape. 61 | The predictions for each example. 62 | targets: A float tensor with the same shape as inputs. Stores the binary 63 | classification label for each element in inputs 64 | (0 for the negative class and 1 for the positive class). 65 | alpha: (optional) Weighting factor in range (0,1) to balance 66 | positive vs negative examples. Default = -1 (no weighting). 67 | gamma: Exponent of the modulating factor (1 - p_t) to 68 | balance easy vs hard examples. 69 | Returns: 70 | Loss tensor 71 | """ 72 | prob = inputs.sigmoid() 73 | ce_loss = F.binary_cross_entropy_with_logits(inputs, targets, reduction="none") 74 | p_t = prob * targets + (1 - prob) * (1 - targets) 75 | loss = ce_loss * ((1 - p_t) ** gamma) 76 | 77 | if alpha >= 0: 78 | alpha_t = alpha * targets + (1 - alpha) * (1 - targets) 79 | loss = alpha_t * loss 80 | #return loss.mean(0).sum() / num_boxes 81 | return loss.sum() / num_boxes -------------------------------------------------------------------------------- /util/box_ops.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Copyright (c) 2022 megvii-research. All Rights Reserved. 3 | # ------------------------------------------------------------------------ 4 | # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR) 5 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 6 | # ------------------------------------------------------------------------ 7 | # Modified from DETR (https://github.com/facebookresearch/detr) 8 | # Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved 9 | # ------------------------------------------------------------------------ 10 | 11 | 12 | """ 13 | Utilities for bounding box manipulation and GIoU. 14 | """ 15 | import torch 16 | from torchvision.ops.boxes import box_area 17 | 18 | 19 | def box_cxcywh_to_xyxy(x): 20 | x_c, y_c, w, h = x.unbind(-1) 21 | b = [(x_c - 0.5 * w), (y_c - 0.5 * h), 22 | (x_c + 0.5 * w), (y_c + 0.5 * h)] 23 | return torch.stack(b, dim=-1) 24 | 25 | 26 | def box_xyxy_to_cxcywh(x): 27 | x0, y0, x1, y1 = x.unbind(-1) 28 | b = [(x0 + x1) / 2, (y0 + y1) / 2, 29 | (x1 - x0), (y1 - y0)] 30 | return torch.stack(b, dim=-1) 31 | 32 | 33 | # modified from torchvision to also return the union 34 | def box_iou(boxes1, boxes2): 35 | area1 = box_area(boxes1) 36 | area2 = box_area(boxes2) 37 | 38 | lt = torch.max(boxes1[:, None, :2], boxes2[:, :2]) # [N,M,2] 39 | rb = torch.min(boxes1[:, None, 2:], boxes2[:, 2:]) # [N,M,2] 40 | 41 | wh = (rb - lt).clamp(min=0) # [N,M,2] 42 | inter = wh[:, :, 0] * wh[:, :, 1] # [N,M] 43 | 44 | union = area1[:, None] + area2 - inter 45 | 46 | iou = inter / union 47 | return iou, union 48 | 49 | 50 | def generalized_box_iou(boxes1, boxes2): 51 | """ 52 | Generalized IoU from https://giou.stanford.edu/ 53 | 54 | The boxes should be in [x0, y0, x1, y1] format 55 | 56 | Returns a [N, M] pairwise matrix, where N = len(boxes1) 57 | and M = len(boxes2) 58 | """ 59 | # degenerate boxes gives inf / nan results 60 | # so do an early check 61 | assert (boxes1[:, 2:] >= boxes1[:, :2]).all() 62 | assert (boxes2[:, 2:] >= boxes2[:, :2]).all() 63 | iou, union = box_iou(boxes1, boxes2) 64 | 65 | lt = torch.min(boxes1[:, None, :2], boxes2[:, :2]) 66 | rb = torch.max(boxes1[:, None, 2:], boxes2[:, 2:]) 67 | 68 | wh = (rb - lt).clamp(min=0) # [N,M,2] 69 | area = wh[:, :, 0] * wh[:, :, 1] 70 | 71 | return iou - (area - union) / area 72 | 73 | 74 | def masks_to_boxes(masks): 75 | """Compute the bounding boxes around the provided masks 76 | 77 | The masks should be in format [N, H, W] where N is the number of masks, (H, W) are the spatial dimensions. 78 | 79 | Returns a [N, 4] tensors, with the boxes in xyxy format 80 | """ 81 | if masks.numel() == 0: 82 | return torch.zeros((0, 4), device=masks.device) 83 | 84 | h, w = masks.shape[-2:] 85 | 86 | y = torch.arange(0, h, dtype=torch.float) 87 | x = torch.arange(0, w, dtype=torch.float) 88 | y, x = torch.meshgrid(y, x) 89 | 90 | x_mask = (masks * x.unsqueeze(0)) 91 | x_max = x_mask.flatten(1).max(-1)[0] 92 | x_min = x_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0] 93 | 94 | y_mask = (masks * y.unsqueeze(0)) 95 | y_max = y_mask.flatten(1).max(-1)[0] 96 | y_min = y_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0] 97 | 98 | return torch.stack([x_min, y_min, x_max, y_max], 1) 99 | -------------------------------------------------------------------------------- /models/dino/ops/functions/ms_deform_attn_func.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | from __future__ import absolute_import 10 | from __future__ import print_function 11 | from __future__ import division 12 | 13 | import torch 14 | import torch.nn.functional as F 15 | from torch.autograd import Function 16 | from torch.autograd.function import once_differentiable 17 | 18 | import MultiScaleDeformableAttention as MSDA 19 | 20 | 21 | class MSDeformAttnFunction(Function): 22 | @staticmethod 23 | def forward(ctx, value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, im2col_step): 24 | ctx.im2col_step = im2col_step 25 | output = MSDA.ms_deform_attn_forward( 26 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, ctx.im2col_step) 27 | ctx.save_for_backward(value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights) 28 | return output 29 | 30 | @staticmethod 31 | @once_differentiable 32 | def backward(ctx, grad_output): 33 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights = ctx.saved_tensors 34 | grad_value, grad_sampling_loc, grad_attn_weight = \ 35 | MSDA.ms_deform_attn_backward( 36 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, grad_output, ctx.im2col_step) 37 | 38 | return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None 39 | 40 | 41 | def ms_deform_attn_core_pytorch(value, value_spatial_shapes, sampling_locations, attention_weights): 42 | # for debug and test only, 43 | # need to use cuda version instead 44 | N_, S_, M_, D_ = value.shape 45 | _, Lq_, M_, L_, P_, _ = sampling_locations.shape 46 | value_list = value.split([H_ * W_ for H_, W_ in value_spatial_shapes], dim=1) 47 | sampling_grids = 2 * sampling_locations - 1 48 | sampling_value_list = [] 49 | for lid_, (H_, W_) in enumerate(value_spatial_shapes): 50 | # N_, H_*W_, M_, D_ -> N_, H_*W_, M_*D_ -> N_, M_*D_, H_*W_ -> N_*M_, D_, H_, W_ 51 | value_l_ = value_list[lid_].flatten(2).transpose(1, 2).reshape(N_*M_, D_, H_, W_) 52 | # N_, Lq_, M_, P_, 2 -> N_, M_, Lq_, P_, 2 -> N_*M_, Lq_, P_, 2 53 | sampling_grid_l_ = sampling_grids[:, :, :, lid_].transpose(1, 2).flatten(0, 1) 54 | # N_*M_, D_, Lq_, P_ 55 | sampling_value_l_ = F.grid_sample(value_l_, sampling_grid_l_, 56 | mode='bilinear', padding_mode='zeros', align_corners=False) 57 | sampling_value_list.append(sampling_value_l_) 58 | # (N_, Lq_, M_, L_, P_) -> (N_, M_, Lq_, L_, P_) -> (N_, M_, 1, Lq_, L_*P_) 59 | attention_weights = attention_weights.transpose(1, 2).reshape(N_*M_, 1, Lq_, L_*P_) 60 | output = (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights).sum(-1).view(N_, M_*D_, Lq_) 61 | return output.transpose(1, 2).contiguous() 62 | -------------------------------------------------------------------------------- /models/ops/functions/ms_deform_attn_func.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Copyright (c) 2022 megvii-research. All Rights Reserved. 
3 | # ------------------------------------------------------------------------
4 | # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR)
5 | # Copyright (c) 2020 SenseTime. All Rights Reserved.
6 | # ------------------------------------------------------------------------
7 | # Modified from DETR (https://github.com/facebookresearch/detr)
8 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
9 | # ------------------------------------------------------------------------
10 |
11 |
12 | from __future__ import absolute_import
13 | from __future__ import print_function
14 | from __future__ import division
15 |
16 | import torch
17 | import torch.nn.functional as F
18 | from torch.autograd import Function
19 | from torch.autograd.function import once_differentiable
20 |
21 | import MultiScaleDeformableAttention as MSDA
22 |
23 |
24 | class MSDeformAttnFunction(Function):
25 | @staticmethod
26 | def forward(ctx, value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, im2col_step):
27 | ctx.im2col_step = im2col_step
28 | output = MSDA.ms_deform_attn_forward(
29 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, ctx.im2col_step)
30 | ctx.save_for_backward(value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights)
31 | return output
32 |
33 | @staticmethod
34 | @once_differentiable
35 | def backward(ctx, grad_output):
36 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights = ctx.saved_tensors
37 | grad_value, grad_sampling_loc, grad_attn_weight = \
38 | MSDA.ms_deform_attn_backward(
39 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, grad_output, ctx.im2col_step)
40 |
41 | return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None
42 |
43 |
44 | def ms_deform_attn_core_pytorch(value, value_spatial_shapes, sampling_locations, attention_weights):
45 | # for debug and test only,
46 | # need to use cuda version instead
47 | N_, S_, M_, D_ = value.shape # batch, pixel, multi head, channel
48 | _, Lq_, M_, L_, P_, _ = sampling_locations.shape # batch, query, multi head, n_levels, n_points, 2
49 | value_list = value.split([H_ * W_ for H_, W_ in value_spatial_shapes], dim=1) # split value by feature level
50 | sampling_grids = 2 * sampling_locations - 1 # [-1,1]
51 | sampling_value_list = []
52 | for lid_, (H_, W_) in enumerate(value_spatial_shapes):
53 | # N_, H_*W_, M_, D_ -> N_, H_*W_, M_*D_ -> N_, M_*D_, H_*W_ -> N_*M_, D_, H_, W_
54 | value_l_ = value_list[lid_].flatten(2).transpose(1, 2).reshape(N_*M_, D_, H_, W_)
55 | # N_, Lq_, M_, P_, 2 -> N_, M_, Lq_, P_, 2 -> N_*M_, Lq_, P_, 2
56 | sampling_grid_l_ = sampling_grids[:, :, :, lid_].transpose(1, 2).flatten(0, 1)
57 | # N_*M_, D_, Lq_, P_
58 | sampling_value_l_ = F.grid_sample(value_l_, sampling_grid_l_,
59 | mode='bilinear', padding_mode='zeros', align_corners=False)
60 | sampling_value_list.append(sampling_value_l_)
61 | # (N_, Lq_, M_, L_, P_) -> (N_, M_, Lq_, L_, P_) -> (N_, M_, 1, Lq_, L_*P_)
62 | attention_weights = attention_weights.transpose(1, 2).reshape(N_*M_, 1, Lq_, L_*P_)
63 | output = (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights).sum(-1).view(N_, M_*D_, Lq_)
64 | return output.transpose(1, 2).contiguous()
65 |
--------------------------------------------------------------------------------
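Note: ms_deform_attn_core_pytorch above is the pure-PyTorch reference that models/dino/ops/test.py compares the compiled CUDA kernel against. Below is a minimal usage sketch, not a file in this repository; the tensor shapes are arbitrary illustrative values mirroring the conventions in test.py, and the import path assumes the snippet is run from the repository root.

# Sketch (assumption: executed from the repo root with the package importable).
import torch
from models.ops.functions.ms_deform_attn_func import ms_deform_attn_core_pytorch

N, M, D = 1, 2, 2        # batch size, attention heads, channels per head
Lq, L, P = 2, 2, 2       # queries, feature levels, sampling points per level
shapes = torch.as_tensor([(6, 4), (3, 2)], dtype=torch.long)  # (H, W) of each level
S = sum((H * W).item() for H, W in shapes)                    # total pixels across levels

value = torch.rand(N, S, M, D)                                # flattened multi-level features
sampling_locations = torch.rand(N, Lq, M, L, P, 2)            # normalized to [0, 1]
attention_weights = torch.rand(N, Lq, M, L, P) + 1e-5
attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True)

output = ms_deform_attn_core_pytorch(value, shapes, sampling_locations, attention_weights)
print(output.shape)      # torch.Size([1, 2, 4]) == (N, Lq, M * D)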
/models/ops/build/lib.linux-x86_64-3.8/functions/ms_deform_attn_func.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Copyright (c) 2022 megvii-research. All Rights Reserved. 3 | # ------------------------------------------------------------------------ 4 | # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR) 5 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 6 | # ------------------------------------------------------------------------ 7 | # Modified from DETR (https://github.com/facebookresearch/detr) 8 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 9 | # ------------------------------------------------------------------------ 10 | 11 | 12 | from __future__ import absolute_import 13 | from __future__ import print_function 14 | from __future__ import division 15 | 16 | import torch 17 | import torch.nn.functional as F 18 | from torch.autograd import Function 19 | from torch.autograd.function import once_differentiable 20 | 21 | import MultiScaleDeformableAttention as MSDA 22 | 23 | 24 | class MSDeformAttnFunction(Function): 25 | @staticmethod 26 | def forward(ctx, value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, im2col_step): 27 | ctx.im2col_step = im2col_step 28 | output = MSDA.ms_deform_attn_forward( 29 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, ctx.im2col_step) 30 | ctx.save_for_backward(value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights) 31 | return output 32 | 33 | @staticmethod 34 | @once_differentiable 35 | def backward(ctx, grad_output): 36 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights = ctx.saved_tensors 37 | grad_value, grad_sampling_loc, grad_attn_weight = \ 38 | MSDA.ms_deform_attn_backward( 39 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, grad_output, ctx.im2col_step) 40 | 41 | return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None 42 | 43 | 44 | def ms_deform_attn_core_pytorch(value, value_spatial_shapes, sampling_locations, attention_weights): 45 | # for debug and test only, 46 | # need to use cuda version instead 47 | N_, S_, M_, D_ = value.shape # batch, pixel, multi head, channel 48 | _, Lq_, M_, L_, P_, _ = sampling_locations.shape # batch, pixel, multi head, n_levels, n_points, 2 49 | value_list = value.split([H_ * W_ for H_, W_ in value_spatial_shapes], dim=1) # 按level拆分value 50 | sampling_grids = 2 * sampling_locations - 1 # [-1,1] 51 | sampling_value_list = [] 52 | for lid_, (H_, W_) in enumerate(value_spatial_shapes): 53 | # N_, H_*W_, M_, D_ -> N_, H_*W_, M_*D_ -> N_, M_*D_, H_*W_ -> N_*M_, D_, H_, W_ 54 | value_l_ = value_list[lid_].flatten(2).transpose(1, 2).reshape(N_*M_, D_, H_, W_) 55 | # N_, Lq_, M_, P_, 2 -> N_, M_, Lq_, P_, 2 -> N_*M_, Lq_, P_, 2 56 | sampling_grid_l_ = sampling_grids[:, :, :, lid_].transpose(1, 2).flatten(0, 1) 57 | # N_*M_, D_, Lq_, P_ 58 | sampling_value_l_ = F.grid_sample(value_l_, sampling_grid_l_, 59 | mode='bilinear', padding_mode='zeros', align_corners=False) 60 | sampling_value_list.append(sampling_value_l_) 61 | # (N_, Lq_, M_, L_, P_) -> (N_, M_, Lq_, L_, P_) -> (N_, M_, 1, Lq_, L_*P_) 62 | attention_weights = attention_weights.transpose(1, 2).reshape(N_*M_, 1, Lq_, L_*P_) 63 | output = 
(torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights).sum(-1).view(N_, M_*D_, Lq_) 64 | return output.transpose(1, 2).contiguous() 65 | -------------------------------------------------------------------------------- /models/ops/build/lib.linux-x86_64-cpython-37/functions/ms_deform_attn_func.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Copyright (c) 2022 megvii-research. All Rights Reserved. 3 | # ------------------------------------------------------------------------ 4 | # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR) 5 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 6 | # ------------------------------------------------------------------------ 7 | # Modified from DETR (https://github.com/facebookresearch/detr) 8 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 9 | # ------------------------------------------------------------------------ 10 | 11 | 12 | from __future__ import absolute_import 13 | from __future__ import print_function 14 | from __future__ import division 15 | 16 | import torch 17 | import torch.nn.functional as F 18 | from torch.autograd import Function 19 | from torch.autograd.function import once_differentiable 20 | 21 | import MultiScaleDeformableAttention as MSDA 22 | 23 | 24 | class MSDeformAttnFunction(Function): 25 | @staticmethod 26 | def forward(ctx, value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, im2col_step): 27 | ctx.im2col_step = im2col_step 28 | output = MSDA.ms_deform_attn_forward( 29 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, ctx.im2col_step) 30 | ctx.save_for_backward(value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights) 31 | return output 32 | 33 | @staticmethod 34 | @once_differentiable 35 | def backward(ctx, grad_output): 36 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights = ctx.saved_tensors 37 | grad_value, grad_sampling_loc, grad_attn_weight = \ 38 | MSDA.ms_deform_attn_backward( 39 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, grad_output, ctx.im2col_step) 40 | 41 | return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None 42 | 43 | 44 | def ms_deform_attn_core_pytorch(value, value_spatial_shapes, sampling_locations, attention_weights): 45 | # for debug and test only, 46 | # need to use cuda version instead 47 | N_, S_, M_, D_ = value.shape # batch, pixel, multi head, channel 48 | _, Lq_, M_, L_, P_, _ = sampling_locations.shape # batch, pixel, multi head, n_levels, n_points, 2 49 | value_list = value.split([H_ * W_ for H_, W_ in value_spatial_shapes], dim=1) # 按level拆分value 50 | sampling_grids = 2 * sampling_locations - 1 # [-1,1] 51 | sampling_value_list = [] 52 | for lid_, (H_, W_) in enumerate(value_spatial_shapes): 53 | # N_, H_*W_, M_, D_ -> N_, H_*W_, M_*D_ -> N_, M_*D_, H_*W_ -> N_*M_, D_, H_, W_ 54 | value_l_ = value_list[lid_].flatten(2).transpose(1, 2).reshape(N_*M_, D_, H_, W_) 55 | # N_, Lq_, M_, P_, 2 -> N_, M_, Lq_, P_, 2 -> N_*M_, Lq_, P_, 2 56 | sampling_grid_l_ = sampling_grids[:, :, :, lid_].transpose(1, 2).flatten(0, 1) 57 | # N_*M_, D_, Lq_, P_ 58 | sampling_value_l_ = F.grid_sample(value_l_, sampling_grid_l_, 59 | mode='bilinear', padding_mode='zeros', align_corners=False) 
60 | sampling_value_list.append(sampling_value_l_)
61 | # (N_, Lq_, M_, L_, P_) -> (N_, M_, Lq_, L_, P_) -> (N_, M_, 1, Lq_, L_*P_)
62 | attention_weights = attention_weights.transpose(1, 2).reshape(N_*M_, 1, Lq_, L_*P_)
63 | output = (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights).sum(-1).view(N_, M_*D_, Lq_)
64 | return output.transpose(1, 2).contiguous()
65 |
--------------------------------------------------------------------------------
/tools/train_ddp.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | # ------------------------------------------------------------------------
3 | # Copyright (c) 2022 megvii-research. All Rights Reserved.
4 | # ------------------------------------------------------------------------
5 |
6 | # print every command as it is executed
7 | set -x
8 |
9 | PY_ARGS=${@:2}
10 |
11 | # report an error if any command in a pipeline fails
12 | set -o pipefail
13 | #sed -e : apply the sed edit expression directly on the command line;
14 | OUTPUT_BASE=$(echo $1 | sed -e "s/configs/exps/g" | sed -e "s/.args$//g")
15 | mkdir -p $OUTPUT_BASE
16 |
17 |
18 | cluster_spec=${AFO_ENV_CLUSTER_SPEC//\"/\\\"}
19 | echo "cluster spec is $cluster_spec"
20 | worker_list_command="import util.json_parser as json_parser;print(json_parser.parse(\"$cluster_spec\", \"worker\"))"
21 | echo "worker list command is $worker_list_command"
22 | eval worker_list=`python -c "$worker_list_command"`
23 | echo "worker list is $worker_list"
24 | worker_strs=(${worker_list//,/ })
25 | master=${worker_strs[0]}
26 | echo "master is $master"
27 | master_strs=(${master//:/ })
28 | master_addr=${master_strs[0]}
29 | master_port=${master_strs[1]}
30 | echo "master address is $master_addr"
31 | echo "master port is $master_port"
32 | index_command="import util.json_parser as json_parser;print(json_parser.parse(\"$cluster_spec\", \"index\"))"
33 | eval node_rank=`python -c "$index_command"`
34 | echo "node rank is $node_rank"
35 | dist_url="tcp://$master_addr:$master_port"
36 | echo "dist url is $dist_url"
37 | PYTHONPATH=$PYTHONPATH:../ \
38 | # python tools/run_net.py \
39 | # --num_shards 8 \
40 | # --shard_id $node_rank \
41 | # --dist_url $dist_url \
42 | # --cfg configs/verb/MVIT_B_32x2_CONV.yaml
43 |
44 | MASTER_ADDR=${MASTER_ADDR:-$master_addr}
45 | MASTER_PORT=${MASTER_PORT:-$master_port}
46 | NODE_RANK=${NODE_RANK:-$node_rank}
47 | # let "NNODES=GPUS/GPUS_PER_NODE"
48 |
49 | NODE_NUM=${#worker_strs[@]}
50 | echo "node num is $NODE_NUM"
51 |
52 | if ((NODE_RANK == 0)); then
53 | for RUN in $(seq 100); do
54 | ls $OUTPUT_BASE | grep run$RUN && continue
55 | OUTPUT_DIR=$OUTPUT_BASE/run$RUN
56 | mkdir $OUTPUT_DIR && break
57 | done
58 |
59 | # clean up *.pyc files
60 | rmpyc() {
61 | rm -rf $(find -name __pycache__)
62 | rm -rf $(find -name "*.pyc")
63 | }
64 |
65 | # run backup
66 | echo "Backing up to log dir: $OUTPUT_DIR"
67 | rmpyc && cp -r models datasets util main.py engine.py eval_detr.py seqmap submit_dance.py $1 $OUTPUT_DIR
68 | echo " ...Done"
69 |
70 | # tar src to avoid future editing
71 | cleanup() {
72 | echo "Packing source code"
73 | rmpyc
74 | # tar -zcf models datasets util main.py engine.py eval.py submit.py --remove-files
75 | echo " ...Done"
76 | }
77 |
78 | pushd $OUTPUT_DIR
79 | trap cleanup EXIT
80 |
81 | # log git status
82 | echo "Logging git status"
83 | git status > git_status
84 | git rev-parse HEAD > git_tag
85 | git diff > git_diff
86 | echo $PY_ARGS > desc
87 | echo " ...Done"
88 |
89 | else
90 | # 3 minutes
91 | sleep 180
92 | for RUN in $(seq 100); do
93 | ls $OUTPUT_BASE | grep run$RUN
&& continue 94 | let "ITERRUN=$RUN-1" 95 | OUTPUT_DIR=$OUTPUT_BASE/run$ITERRUN 96 | break 97 | done 98 | fi 99 | 100 | args=$(cat $1) 101 | 102 | # python -m torch.distributed.launch --nproc_per_node=8 --master_port 29502 --use_env main.py ${args} --output_dir $OUTPUT_DIR 103 | 104 | # python ./util/launch.py \ 105 | # --nnodes 2 \ 106 | # --node_rank ${NODE_RANK} \ 107 | # --master_addr ${MASTER_ADDR} \ 108 | # --master_port 29502 \ 109 | # --nproc_per_node 8 \ 110 | # python main.py "${args} --output_dir $OUTPUT_DIR" 111 | python -m torch.distributed.launch --nproc_per_node=8 --nnodes ${NODE_NUM} --node_rank ${NODE_RANK} --master_addr=${MASTER_ADDR} --master_port 29502 --use_env main.py ${args} --output_dir $OUTPUT_DIR 112 | -------------------------------------------------------------------------------- /models/ops/build/temp.linux-x86_64-cpython-37/build.ninja: -------------------------------------------------------------------------------- 1 | ninja_required_version = 1.3 2 | cxx = c++ 3 | nvcc = /usr/local/cuda/bin/nvcc 4 | 5 | cflags = -pthread -B /mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/yanfeng/software/anaconda3/envs/detr/compiler_compat -Wl,--sysroot=/ -Wsign-compare -DNDEBUG -g -fwrapv -O3 -Wall -Wstrict-prototypes -fPIC -DWITH_CUDA -I/mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/yanfeng/project/MOTRv2/MOTRv3/models/ops/src -I/mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/ganyiyang/software/Anaconda/envs/detr_yf/lib/python3.7/site-packages/torch/include -I/mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/ganyiyang/software/Anaconda/envs/detr_yf/lib/python3.7/site-packages/torch/include/torch/csrc/api/include -I/mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/ganyiyang/software/Anaconda/envs/detr_yf/lib/python3.7/site-packages/torch/include/TH -I/mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/ganyiyang/software/Anaconda/envs/detr_yf/lib/python3.7/site-packages/torch/include/THC -I/usr/local/cuda/include -I/mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/ganyiyang/software/Anaconda/envs/detr_yf/include/python3.7m -c 6 | post_cflags = -DTORCH_API_INCLUDE_EXTENSION_H -DTORCH_EXTENSION_NAME=MultiScaleDeformableAttention -D_GLIBCXX_USE_CXX11_ABI=0 -std=c++14 7 | cuda_cflags = -DWITH_CUDA -I/mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/yanfeng/project/MOTRv2/MOTRv3/models/ops/src -I/mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/ganyiyang/software/Anaconda/envs/detr_yf/lib/python3.7/site-packages/torch/include -I/mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/ganyiyang/software/Anaconda/envs/detr_yf/lib/python3.7/site-packages/torch/include/torch/csrc/api/include -I/mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/ganyiyang/software/Anaconda/envs/detr_yf/lib/python3.7/site-packages/torch/include/TH -I/mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/ganyiyang/software/Anaconda/envs/detr_yf/lib/python3.7/site-packages/torch/include/THC -I/usr/local/cuda/include -I/mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/ganyiyang/software/Anaconda/envs/detr_yf/include/python3.7m -c 8 | cuda_post_cflags = -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DCUDA_HAS_FP16=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ -DTORCH_API_INCLUDE_EXTENSION_H -DTORCH_EXTENSION_NAME=MultiScaleDeformableAttention -D_GLIBCXX_USE_CXX11_ABI=0 -gencode=arch=compute_70,code=sm_70 -std=c++14 9 | ldflags = 10 | 11 | rule compile 12 | command = $cxx -MMD -MF $out.d 
$cflags -c $in -o $out $post_cflags 13 | depfile = $out.d 14 | deps = gcc 15 | 16 | rule cuda_compile 17 | command = $nvcc $cuda_cflags -c $in -o $out $cuda_post_cflags 18 | 19 | 20 | 21 | build /mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/yanfeng/project/MOTRv2/MOTRv3/models/ops/build/temp.linux-x86_64-cpython-37/mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/yanfeng/project/MOTRv2/MOTRv3/models/ops/src/cpu/ms_deform_attn_cpu.o: compile /mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/yanfeng/project/MOTRv2/MOTRv3/models/ops/src/cpu/ms_deform_attn_cpu.cpp 22 | build /mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/yanfeng/project/MOTRv2/MOTRv3/models/ops/build/temp.linux-x86_64-cpython-37/mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/yanfeng/project/MOTRv2/MOTRv3/models/ops/src/cuda/ms_deform_attn_cuda.o: cuda_compile /mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/yanfeng/project/MOTRv2/MOTRv3/models/ops/src/cuda/ms_deform_attn_cuda.cu 23 | build /mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/yanfeng/project/MOTRv2/MOTRv3/models/ops/build/temp.linux-x86_64-cpython-37/mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/yanfeng/project/MOTRv2/MOTRv3/models/ops/src/vision.o: compile /mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/yanfeng/project/MOTRv2/MOTRv3/models/ops/src/vision.cpp 24 | 25 | 26 | 27 | 28 | 29 | -------------------------------------------------------------------------------- /models/yolo_pafpn.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # Copyright (c) 2014-2021 Megvii Inc. All rights reserved. 4 | 5 | import torch 6 | import torch.nn as nn 7 | 8 | from .darknet import CSPDarknet 9 | from .network_blocks import BaseConv, CSPLayer, DWConv 10 | 11 | 12 | class YOLOPAFPN(nn.Module): 13 | """ 14 | YOLOv3 model. Darknet 53 is the default backbone of this model. 
15 | """ 16 | 17 | def __init__( 18 | self, 19 | depth=1.0, 20 | width=1.0, 21 | in_features=("dark3", "dark4", "dark5"), 22 | in_channels=[256, 512, 1024], 23 | depthwise=False, 24 | act="silu", 25 | ): 26 | super().__init__() 27 | self.backbone = CSPDarknet(depth, width, depthwise=depthwise, act=act) 28 | self.in_features = in_features 29 | self.in_channels = in_channels 30 | Conv = DWConv if depthwise else BaseConv 31 | 32 | self.upsample = nn.Upsample(scale_factor=2, mode="nearest") 33 | self.lateral_conv0 = BaseConv( 34 | int(in_channels[2] * width), int(in_channels[1] * width), 1, 1, act=act 35 | ) 36 | self.C3_p4 = CSPLayer( 37 | int(2 * in_channels[1] * width), 38 | int(in_channels[1] * width), 39 | round(3 * depth), 40 | False, 41 | depthwise=depthwise, 42 | act=act, 43 | ) # cat 44 | 45 | self.reduce_conv1 = BaseConv( 46 | int(in_channels[1] * width), int(in_channels[0] * width), 1, 1, act=act 47 | ) 48 | self.C3_p3 = CSPLayer( 49 | int(2 * in_channels[0] * width), 50 | int(in_channels[0] * width), 51 | round(3 * depth), 52 | False, 53 | depthwise=depthwise, 54 | act=act, 55 | ) 56 | 57 | # bottom-up conv 58 | self.bu_conv2 = Conv( 59 | int(in_channels[0] * width), int(in_channels[0] * width), 3, 2, act=act 60 | ) 61 | self.C3_n3 = CSPLayer( 62 | int(2 * in_channels[0] * width), 63 | int(in_channels[1] * width), 64 | round(3 * depth), 65 | False, 66 | depthwise=depthwise, 67 | act=act, 68 | ) 69 | 70 | # bottom-up conv 71 | self.bu_conv1 = Conv( 72 | int(in_channels[1] * width), int(in_channels[1] * width), 3, 2, act=act 73 | ) 74 | self.C3_n4 = CSPLayer( 75 | int(2 * in_channels[1] * width), 76 | int(in_channels[2] * width), 77 | round(3 * depth), 78 | False, 79 | depthwise=depthwise, 80 | act=act, 81 | ) 82 | 83 | def forward(self, input): 84 | """ 85 | Args: 86 | inputs: input images. 87 | 88 | Returns: 89 | Tuple[Tensor]: FPN feature. 90 | """ 91 | 92 | # backbone 93 | out_features = self.backbone(input) 94 | features = [out_features[f] for f in self.in_features] 95 | [x2, x1, x0] = features 96 | 97 | fpn_out0 = self.lateral_conv0(x0) # 1024->512/32 98 | f_out0 = self.upsample(fpn_out0) # 512/16 99 | f_out0 = torch.cat([f_out0, x1], 1) # 512->1024/16 100 | f_out0 = self.C3_p4(f_out0) # 1024->512/16 101 | 102 | fpn_out1 = self.reduce_conv1(f_out0) # 512->256/16 103 | f_out1 = self.upsample(fpn_out1) # 256/8 104 | f_out1 = torch.cat([f_out1, x2], 1) # 256->512/8 105 | pan_out2 = self.C3_p3(f_out1) # 512->256/8 106 | 107 | p_out1 = self.bu_conv2(pan_out2) # 256->256/16 108 | p_out1 = torch.cat([p_out1, fpn_out1], 1) # 256->512/16 109 | pan_out1 = self.C3_n3(p_out1) # 512->512/16 110 | 111 | p_out0 = self.bu_conv1(pan_out1) # 512->512/32 112 | p_out0 = torch.cat([p_out0, fpn_out0], 1) # 512->1024/32 113 | pan_out0 = self.C3_n4(p_out0) # 1024->1024/32 114 | 115 | outputs = (pan_out2, pan_out1, pan_out0) 116 | return outputs 117 | -------------------------------------------------------------------------------- /models/position_encoding.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Copyright (c) 2022 megvii-research. All Rights Reserved. 3 | # ------------------------------------------------------------------------ 4 | # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR) 5 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 
6 | # ------------------------------------------------------------------------ 7 | # Modified from DETR (https://github.com/facebookresearch/detr) 8 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 9 | # ------------------------------------------------------------------------ 10 | 11 | 12 | """ 13 | Various positional encodings for the transformer. 14 | """ 15 | import math 16 | import torch 17 | from torch import nn 18 | 19 | from util.misc import NestedTensor 20 | 21 | 22 | class PositionEmbeddingSine(nn.Module): 23 | """ 24 | This is a more standard version of the position embedding, very similar to the one 25 | used by the Attention is all you need paper, generalized to work on images. 26 | """ 27 | def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None): 28 | super().__init__() 29 | self.num_pos_feats = num_pos_feats 30 | self.temperature = temperature 31 | self.normalize = normalize 32 | if scale is not None and normalize is False: 33 | raise ValueError("normalize should be True if scale is passed") 34 | if scale is None: 35 | scale = 2 * math.pi 36 | self.scale = scale 37 | 38 | def forward(self, tensor_list: NestedTensor): 39 | x = tensor_list.tensors 40 | mask = tensor_list.mask 41 | assert mask is not None 42 | not_mask = ~mask 43 | y_embed = not_mask.cumsum(1, dtype=torch.float32) 44 | x_embed = not_mask.cumsum(2, dtype=torch.float32) 45 | if self.normalize: 46 | eps = 1e-6 47 | y_embed = (y_embed - 0.5) / (y_embed[:, -1:, :] + eps) * self.scale 48 | x_embed = (x_embed - 0.5) / (x_embed[:, :, -1:] + eps) * self.scale 49 | 50 | dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device) 51 | dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats) 52 | 53 | pos_x = x_embed[:, :, :, None] / dim_t 54 | pos_y = y_embed[:, :, :, None] / dim_t 55 | pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3) 56 | pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3) 57 | pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) 58 | return pos 59 | 60 | 61 | class PositionEmbeddingLearned(nn.Module): 62 | """ 63 | Absolute pos embedding, learned. 
64 | """ 65 | def __init__(self, num_pos_feats=256): 66 | super().__init__() 67 | self.row_embed = nn.Embedding(50, num_pos_feats) 68 | self.col_embed = nn.Embedding(50, num_pos_feats) 69 | self.reset_parameters() 70 | 71 | def reset_parameters(self): 72 | nn.init.uniform_(self.row_embed.weight) 73 | nn.init.uniform_(self.col_embed.weight) 74 | 75 | def forward(self, tensor_list: NestedTensor): 76 | x = tensor_list.tensors 77 | h, w = x.shape[-2:] 78 | i = torch.arange(w, device=x.device) 79 | j = torch.arange(h, device=x.device) 80 | x_emb = self.col_embed(i) 81 | y_emb = self.row_embed(j) 82 | pos = torch.cat([ 83 | x_emb.unsqueeze(0).repeat(h, 1, 1), 84 | y_emb.unsqueeze(1).repeat(1, w, 1), 85 | ], dim=-1).permute(2, 0, 1).unsqueeze(0).repeat(x.shape[0], 1, 1, 1) 86 | return pos 87 | 88 | 89 | def build_position_encoding(args): 90 | N_steps = args.hidden_dim // 2 91 | if args.position_embedding in ('v2', 'sine'): 92 | # TODO find a better way of exposing other arguments 93 | position_embedding = PositionEmbeddingSine(N_steps, normalize=True) 94 | elif args.position_embedding in ('v3', 'learned'): 95 | position_embedding = PositionEmbeddingLearned(N_steps) 96 | else: 97 | raise ValueError(f"not supported {args.position_embedding}") 98 | 99 | return position_embedding 100 | -------------------------------------------------------------------------------- /models/dino/ops/test.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | from __future__ import absolute_import 10 | from __future__ import print_function 11 | from __future__ import division 12 | 13 | import time 14 | import torch 15 | import torch.nn as nn 16 | from torch.autograd import gradcheck 17 | 18 | from functions.ms_deform_attn_func import MSDeformAttnFunction, ms_deform_attn_core_pytorch 19 | 20 | 21 | N, M, D = 1, 2, 2 22 | Lq, L, P = 2, 2, 2 23 | shapes = torch.as_tensor([(6, 4), (3, 2)], dtype=torch.long).cuda() 24 | level_start_index = torch.cat((shapes.new_zeros((1, )), shapes.prod(1).cumsum(0)[:-1])) 25 | S = sum([(H*W).item() for H, W in shapes]) 26 | 27 | 28 | torch.manual_seed(3) 29 | 30 | 31 | @torch.no_grad() 32 | def check_forward_equal_with_pytorch_double(): 33 | value = torch.rand(N, S, M, D).cuda() * 0.01 34 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() 35 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 36 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) 37 | im2col_step = 2 38 | output_pytorch = ms_deform_attn_core_pytorch(value.double(), shapes, sampling_locations.double(), attention_weights.double()).detach().cpu() 39 | output_cuda = MSDeformAttnFunction.apply(value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step).detach().cpu() 40 | fwdok = torch.allclose(output_cuda, output_pytorch) 41 | max_abs_err = (output_cuda - output_pytorch).abs().max() 42 | max_rel_err = ((output_cuda - 
output_pytorch).abs() / output_pytorch.abs()).max() 43 | 44 | print(f'* {fwdok} check_forward_equal_with_pytorch_double: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}') 45 | 46 | 47 | @torch.no_grad() 48 | def check_forward_equal_with_pytorch_float(): 49 | value = torch.rand(N, S, M, D).cuda() * 0.01 50 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() 51 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 52 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) 53 | im2col_step = 2 54 | output_pytorch = ms_deform_attn_core_pytorch(value, shapes, sampling_locations, attention_weights).detach().cpu() 55 | output_cuda = MSDeformAttnFunction.apply(value, shapes, level_start_index, sampling_locations, attention_weights, im2col_step).detach().cpu() 56 | fwdok = torch.allclose(output_cuda, output_pytorch, rtol=1e-2, atol=1e-3) 57 | max_abs_err = (output_cuda - output_pytorch).abs().max() 58 | max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max() 59 | 60 | print(f'* {fwdok} check_forward_equal_with_pytorch_float: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}') 61 | 62 | 63 | def check_gradient_numerical(channels=4, grad_value=True, grad_sampling_loc=True, grad_attn_weight=True): 64 | 65 | value = torch.rand(N, S, M, channels).cuda() * 0.01 66 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() 67 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 68 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) 69 | im2col_step = 2 70 | func = MSDeformAttnFunction.apply 71 | 72 | value.requires_grad = grad_value 73 | sampling_locations.requires_grad = grad_sampling_loc 74 | attention_weights.requires_grad = grad_attn_weight 75 | 76 | gradok = gradcheck(func, (value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step)) 77 | 78 | print(f'* {gradok} check_gradient_numerical(D={channels})') 79 | 80 | 81 | if __name__ == '__main__': 82 | check_forward_equal_with_pytorch_double() 83 | check_forward_equal_with_pytorch_float() 84 | 85 | for channels in [30, 32, 64, 71, 1025, 2048, 3096]: 86 | check_gradient_numerical(channels, True, True, True) 87 | 88 | 89 | 90 | -------------------------------------------------------------------------------- /models/ops/test.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | from __future__ import absolute_import 10 | from __future__ import print_function 11 | from __future__ import division 12 | 13 | import time 14 | import torch 15 | import torch.nn as nn 16 | from torch.autograd import gradcheck 17 | 18 | from functions.ms_deform_attn_func import MSDeformAttnFunction, ms_deform_attn_core_pytorch 19 | 20 | 21 | N, M, D = 1, 2, 2 22 | Lq, L, P = 2, 2, 2 23 | shapes = torch.as_tensor([(6, 4), (3, 2)], dtype=torch.long).cuda() 24 | level_start_index = torch.cat((shapes.new_zeros((1, )), shapes.prod(1).cumsum(0)[:-1])) 25 | S = sum([(H*W).item() for H, W in shapes]) 26 | 27 | 28 | torch.manual_seed(3) 29 | 30 | 31 | @torch.no_grad() 32 | def check_forward_equal_with_pytorch_double(): 33 | value = torch.rand(N, S, M, D).cuda() * 0.01 34 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() 35 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 36 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) 37 | im2col_step = 2 38 | output_pytorch = ms_deform_attn_core_pytorch(value.double(), shapes, sampling_locations.double(), attention_weights.double()).detach().cpu() 39 | output_cuda = MSDeformAttnFunction.apply(value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step).detach().cpu() 40 | fwdok = torch.allclose(output_cuda, output_pytorch) 41 | max_abs_err = (output_cuda - output_pytorch).abs().max() 42 | max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max() 43 | 44 | print(f'* {fwdok} check_forward_equal_with_pytorch_double: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}') 45 | 46 | 47 | @torch.no_grad() 48 | def check_forward_equal_with_pytorch_float(): 49 | value = torch.rand(N, S, M, D, requires_grad=True).cuda() * 0.01 50 | sampling_locations = torch.rand(N, Lq, M, L, P, 2, requires_grad=True).cuda() 51 | attention_weights = torch.rand(N, Lq, M, L, P, requires_grad=True).cuda() + 1e-5 52 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) 53 | im2col_step = 2 54 | 55 | value = torch.autograd.Variable(value.data, requires_grad=True) 56 | sampling_locations = torch.autograd.Variable(sampling_locations.data, requires_grad=True) 57 | attention_weights = torch.autograd.Variable(attention_weights.data, requires_grad=True) 58 | 59 | t0 = time.time() 60 | output_cuda = MSDeformAttnFunction.apply(value, shapes, level_start_index, sampling_locations, attention_weights, im2col_step).cpu() 61 | print( time.time()-t0) 62 | t0 = time.time() 63 | output_pytorch = ms_deform_attn_core_pytorch(value, shapes, sampling_locations, attention_weights).detach().cpu() 64 | print( time.time()-t0) 65 | fwdok = torch.allclose(output_cuda, output_pytorch, rtol=1e-2, atol=1e-3) 66 | max_abs_err = (output_cuda - output_pytorch).abs().max() 67 | max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max() 68 | 69 | print(f'* {fwdok} check_forward_equal_with_pytorch_float: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}') 70 | 71 | 72 | def check_gradient_numerical(channels=4, grad_value=True, 
grad_sampling_loc=True, grad_attn_weight=True): 73 | 74 | value = torch.rand(N, S, M, channels).cuda() * 0.01 75 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() 76 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 77 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) 78 | im2col_step = 2 79 | func = MSDeformAttnFunction.apply 80 | 81 | value.requires_grad = grad_value 82 | sampling_locations.requires_grad = grad_sampling_loc 83 | attention_weights.requires_grad = grad_attn_weight 84 | 85 | gradok = gradcheck(func, (value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step)) 86 | 87 | print(f'* {gradok} check_gradient_numerical(D={channels})') 88 | 89 | 90 | if __name__ == '__main__': 91 | check_forward_equal_with_pytorch_double() 92 | check_forward_equal_with_pytorch_float() 93 | 94 | for channels in [30, 32, 64, 71, 1025, 2048, 3096]: 95 | check_gradient_numerical(channels, True, True, True) 96 | 97 | 98 | 99 | -------------------------------------------------------------------------------- /datasets/data_prefetcher.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Copyright (c) 2022 megvii-research. All Rights Reserved. 3 | # ------------------------------------------------------------------------ 4 | # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR) 5 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 6 | # ------------------------------------------------------------------------ 7 | # Modified from DETR (https://github.com/facebookresearch/detr) 8 | # Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved 9 | # ------------------------------------------------------------------------ 10 | 11 | 12 | import torch 13 | from functools import partial 14 | from models.structures import Instances 15 | 16 | def to_cuda(samples, targets, device): 17 | samples = samples.to(device, non_blocking=True) 18 | targets = [{k: v.to(device, non_blocking=True) for k, v in t.items()} for t in targets] 19 | return samples, targets 20 | 21 | 22 | def tensor_to_cuda(tensor: torch.Tensor, device): 23 | return tensor.to(device) 24 | 25 | 26 | def is_tensor_or_instances(data): 27 | return isinstance(data, torch.Tensor) or isinstance(data, Instances) 28 | 29 | 30 | def data_apply(data, check_func, apply_func): 31 | if isinstance(data, dict): 32 | for k in data.keys(): 33 | if check_func(data[k]): 34 | data[k] = apply_func(data[k]) 35 | elif isinstance(data[k], dict) or isinstance(data[k], list): 36 | data_apply(data[k], check_func, apply_func) 37 | else: 38 | raise ValueError() 39 | elif isinstance(data, list): 40 | for i in range(len(data)): 41 | if check_func(data[i]): 42 | data[i] = apply_func(data[i]) 43 | elif isinstance(data[i], dict) or isinstance(data[i], list): 44 | data_apply(data[i], check_func, apply_func) 45 | else: 46 | raise ValueError("invalid type {}".format(type(data[i]))) 47 | else: 48 | raise ValueError("invalid type {}".format(type(data))) 49 | return data 50 | 51 | 52 | def data_dict_to_cuda(data_dict, device): 53 | return data_apply(data_dict, is_tensor_or_instances, partial(tensor_to_cuda, device=device)) 54 | 55 | 56 | class data_prefetcher(): 57 | def __init__(self, loader, device, prefetch=True): 58 | self.loader = iter(loader) 59 | self.prefetch = prefetch 60 | self.device = device 61 | if prefetch: 62 | self.stream = torch.cuda.Stream() 63 | self.preload() 64 | 65 | def preload(self): 66 | try: 67 | self.next_samples, self.next_targets = next(self.loader) 68 | except StopIteration: 69 | self.next_samples = None 70 | self.next_targets = None 71 | return 72 | # if record_stream() doesn't work, another option is to make sure device inputs are created 73 | # on the main stream. 74 | # self.next_input_gpu = torch.empty_like(self.next_input, device='cuda') 75 | # self.next_target_gpu = torch.empty_like(self.next_target, device='cuda') 76 | # Need to make sure the memory allocated for next_* is not still in use by the main stream 77 | # at the time we start copying to next_*: 78 | # self.stream.wait_stream(torch.cuda.current_stream()) 79 | with torch.cuda.stream(self.stream): 80 | self.next_samples, self.next_targets = to_cuda(self.next_samples, self.next_targets, self.device) 81 | # more code for the alternative if record_stream() doesn't work: 82 | # copy_ will record the use of the pinned source tensor in this side stream. 83 | # self.next_input_gpu.copy_(self.next_input, non_blocking=True) 84 | # self.next_target_gpu.copy_(self.next_target, non_blocking=True) 85 | # self.next_input = self.next_input_gpu 86 | # self.next_target = self.next_target_gpu 87 | 88 | # With Amp, it isn't necessary to manually convert data to half. 
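        # Usage sketch (illustration only, not part of this file): the prefetcher overlaps the
        # host-to-device copy with compute on a side CUDA stream, so a training loop can be
        # driven roughly as
        #   prefetcher = data_prefetcher(data_loader, device, prefetch=True)
        #   samples, targets = prefetcher.next()
        #   while samples is not None:
        #       ...  # forward/backward on (samples, targets)
        #       samples, targets = prefetcher.next()
        # where data_loader is any torch DataLoader yielding (samples, targets) pairs.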
89 | # if args.fp16: 90 | # self.next_input = self.next_input.half() 91 | # else: 92 | 93 | def next(self): 94 | if self.prefetch: 95 | torch.cuda.current_stream().wait_stream(self.stream) 96 | samples = self.next_samples 97 | targets = self.next_targets 98 | if samples is not None: 99 | samples.record_stream(torch.cuda.current_stream()) 100 | if targets is not None: 101 | for t in targets: 102 | for k, v in t.items(): 103 | v.record_stream(torch.cuda.current_stream()) 104 | self.preload() 105 | else: 106 | try: 107 | samples, targets = next(self.loader) 108 | samples, targets = to_cuda(samples, targets, self.device) 109 | except StopIteration: 110 | print("catch_stop_iter") 111 | samples = None 112 | targets = None 113 | 114 | return samples, targets 115 | -------------------------------------------------------------------------------- /util/tool.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Copyright (c) 2022 megvii-research. All Rights Reserved. 3 | # ------------------------------------------------------------------------ 4 | # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR) 5 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 6 | # ------------------------------------------------------------------------ 7 | # Modified from DETR (https://github.com/facebookresearch/detr) 8 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 9 | # ------------------------------------------------------------------------ 10 | 11 | import torch 12 | import copy 13 | import numpy as np 14 | import collections 15 | 16 | def load_model(model, model_path, optimizer=None, resume=False, 17 | lr=None, lr_step=None): 18 | start_epoch = 0 19 | checkpoint = torch.load(model_path, map_location=lambda storage, loc: storage) 20 | print(f'loaded {model_path}') 21 | state_dict = checkpoint['model'] 22 | model_state_dict = model.state_dict() 23 | 24 | # check loaded parameters and created model parameters 25 | msg = 'If you see this, your model does not fully load the ' + \ 26 | 'pre-trained weight. Please make sure ' + \ 27 | 'you set the correct --num_classes for your own dataset.' 28 | state_dict_old = copy.deepcopy(state_dict) 29 | for k in state_dict_old: 30 | if k in model_state_dict: 31 | if state_dict[k].shape != model_state_dict[k].shape: 32 | print('Skip loading parameter {}, required shape{}, ' \ 33 | 'loaded shape{}. 
{}'.format( 34 | k, model_state_dict[k].shape, state_dict[k].shape, msg)) 35 | if 'class_embed' in k: 36 | print("load class_embed: {} shape={}".format(k, state_dict[k].shape)) 37 | if model_state_dict[k].shape[0] == 1: 38 | state_dict[k] = state_dict[k][1:2] 39 | elif model_state_dict[k].shape[0] == 2: 40 | state_dict[k] = state_dict[k][1:3] 41 | elif model_state_dict[k].shape[0] == 3: 42 | state_dict[k] = state_dict[k][1:4] 43 | elif model_state_dict[k].shape[0] == 11: 44 | state_dict[k] = state_dict[k][1:12] 45 | elif model_state_dict[k].shape[0] == 100: 46 | state_dict[k] = state_dict[k].repeat_interleave(model_state_dict[k].shape[0]//state_dict[k].shape[0]+1, dim=0)[:model_state_dict[k].shape[0]] 47 | elif model_state_dict[k].shape[0] == 91 and state_dict[k].shape[0] == 1: 48 | state_dict[k] = state_dict[k].repeat_interleave(91, dim=0) 49 | elif model_state_dict[k].shape[0] == 2000: 50 | state_dict[k] = state_dict[k].repeat_interleave(model_state_dict[k].shape[0]//state_dict[k].shape[0]+1, dim=0)[:model_state_dict[k].shape[0]] 51 | else: 52 | raise NotImplementedError('invalid shape: {}'.format(model_state_dict[k].shape)) 53 | continue 54 | state_dict[k] = model_state_dict[k] 55 | elif k.replace('in_proj_weight', 'in_proj.weight') in model_state_dict: 56 | k_dst = k.replace('in_proj_weight', 'in_proj.weight') 57 | print('{}->{}'.format(k, k_dst)) 58 | state_dict = collections.OrderedDict([(k_dst, v) if k_ == k else (k_, v) for k_, v in state_dict.items()]) 59 | elif k.replace('in_proj_bias', 'in_proj.bias') in model_state_dict: 60 | k_dst = k.replace('in_proj_bias', 'in_proj.bias') 61 | print('{}->{}'.format(k, k_dst)) 62 | state_dict = collections.OrderedDict([(k_dst, v) if k_ == k else (k_, v) for k_, v in state_dict.items()]) 63 | elif 'transformer.decoder.layers' in k and 'self_attn.in_proj' in k: 64 | k_dst_q = k.replace('in_proj_', 'in_proj_q.') 65 | k_dst_k = k.replace('in_proj_', 'in_proj_k.') 66 | k_dst_v = k.replace('in_proj_', 'in_proj_v.') 67 | print('{}->({},{},{})'.format(k, k_dst_q, k_dst_k, k_dst_v)) 68 | state_dict[k_dst_q], state_dict[k_dst_k], state_dict[k_dst_v] = torch.chunk(state_dict[k], 3, dim=0) 69 | else: 70 | print('Drop parameter {}.'.format(k) + msg) 71 | for k in model_state_dict: 72 | if not (k in state_dict): # pretrain model 73 | if 'decoder_two' in k: 74 | state_dict[k] = state_dict[k.replace('.decoder_two.', '.decoder.')] 75 | elif '_embed_two' in k: 76 | state_dict[k] = state_dict[k.replace('_embed_two.', '_embed.')] 77 | else: 78 | print('No param {}.'.format(k) + msg) 79 | state_dict[k] = model_state_dict[k] 80 | model.load_state_dict(state_dict, strict=False) 81 | 82 | # resume optimizer parameters 83 | if optimizer is not None and resume: 84 | if 'optimizer' in checkpoint: 85 | optimizer.load_state_dict(checkpoint['optimizer']) 86 | start_epoch = checkpoint['epoch'] 87 | start_lr = lr 88 | for step in lr_step: 89 | if start_epoch >= step: 90 | start_lr *= 0.1 91 | for param_group in optimizer.param_groups: 92 | param_group['lr'] = start_lr 93 | print('Resumed optimizer with start lr', start_lr) 94 | else: 95 | print('No optimizer parameters in checkpoint.') 96 | if optimizer is not None: 97 | return model, optimizer, start_epoch 98 | else: 99 | return model 100 | 101 | 102 | 103 | -------------------------------------------------------------------------------- /tools/similarity_analysis.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | from collections import defaultdict 
4 | from sklearn.decomposition import PCA 5 | 6 | 7 | # Compute the IoU between two sets of boxes 8 | def bboxes_iou(bboxes1,bboxes2): 9 | bboxes1 = np.transpose(bboxes1) 10 | bboxes2 = np.transpose(bboxes2) 11 | 12 | # Intersection of the two boxes: the top-left corner takes the max of the two boxes, the bottom-right corner takes the min 13 | int_ymin = np.maximum(bboxes1[0][:, None], bboxes2[0]) 14 | int_xmin = np.maximum(bboxes1[1][:, None], bboxes2[1]) 15 | int_ymax = np.minimum(bboxes1[2][:, None], bboxes2[2]) 16 | int_xmax = np.minimum(bboxes1[3][:, None], bboxes2[3]) 17 | 18 | # Width/height of the intersection: if the two boxes do not overlap, w/h is 0 (the raw values are negative, so take the max against 0) 19 | int_h = np.maximum(int_ymax-int_ymin,0.) 20 | int_w = np.maximum(int_xmax-int_xmin,0.) 21 | 22 | # Compute IoU 23 | int_vol = int_h * int_w # intersection area 24 | vol1 = (bboxes1[2] - bboxes1[0]) * (bboxes1[3] - bboxes1[1]) # area of bboxes1 25 | vol2 = (bboxes2[2] - bboxes2[0]) * (bboxes2[3] - bboxes2[1]) # area of bboxes2 26 | IOU = int_vol / (vol1[:, None] + vol2 - int_vol) # IoU = intersection / union 27 | return IOU 28 | 29 | 30 | 31 | root_data = 'tmp' 32 | 33 | # det2trk_weight = defaultdict(list) 34 | # trk2trk_weight = defaultdict(list) 35 | # detall2trk_weight = defaultdict(list) 36 | # for i in range(703): 37 | # print(i) 38 | # for j in range(6): 39 | 40 | # bboxes = np.load(os.path.join(root_data, 'box_%08d_%d.txt.npy'%(i,j)))[0] 41 | # classes = np.load(os.path.join(root_data, 'class_%08d_%d.txt.npy'%(i,j)))[0, :, 0] 42 | # weights = np.load(os.path.join(root_data, 'weight_%08d_%d.txt.npy'%(i,j))) 43 | 44 | # bboxes[:, [0,1]] -= bboxes[:, [2,3]]/2 45 | # bboxes[:, [2,3]] += bboxes[:, [0,1]] 46 | 47 | # indexes = np.where(classes>0)[0] 48 | 49 | # det_indexes = indexes[indexes<60] 50 | # trk_indexes = indexes[indexes>=60] 51 | 52 | # iou = bboxes_iou(bboxes[trk_indexes], bboxes[det_indexes]) 53 | # if len(trk_indexes) and len(det_indexes): 54 | # pair_idx = iou.argmax(-1) 55 | # pair_val = iou.max(-1) 56 | # pair_trk_idx = trk_indexes[pair_val>0.7] 57 | # pair_det_idx = det_indexes[pair_idx[pair_val>0.7]] 58 | # if len(pair_trk_idx) and len(pair_det_idx): 59 | # if weights[pair_trk_idx, pair_det_idx].mean() < 1: 60 | # det2trk_weight[j].append(weights[pair_trk_idx, pair_det_idx].mean()) 61 | # else: 62 | # print("1") 63 | # if weights[pair_trk_idx, pair_trk_idx].mean() < 1: 64 | # trk2trk_weight[j].append(weights[pair_trk_idx, pair_trk_idx].mean()) 65 | # else: 66 | # print("1") 67 | # if weights[pair_trk_idx, :60].sum(-1).mean() < 1: 68 | # detall2trk_weight[j].append(weights[pair_trk_idx, :60].sum(-1).mean()) 69 | # else: 70 | # print("1") 71 | 72 | # print(np.array(det2trk_weight[0]).mean(), np.array(det2trk_weight[1]).mean(), np.array(det2trk_weight[2]).mean(), np.array(det2trk_weight[3]).mean(), np.array(det2trk_weight[4]).mean(), np.array(det2trk_weight[5]).mean()) 73 | # print(np.array(trk2trk_weight[0]).mean(), np.array(trk2trk_weight[1]).mean(), np.array(trk2trk_weight[2]).mean(), np.array(trk2trk_weight[3]).mean(), np.array(trk2trk_weight[4]).mean(), np.array(trk2trk_weight[5]).mean()) 74 | # print(np.array(detall2trk_weight[0]).mean(), np.array(detall2trk_weight[1]).mean(), np.array(detall2trk_weight[2]).mean(), np.array(detall2trk_weight[3]).mean(), np.array(detall2trk_weight[4]).mean(), np.array(detall2trk_weight[5]).mean()) 75 | 76 | hs_all = defaultdict(list) 77 | hs_all_flatten = [] 78 | for i in range(703): 79 | scores = np.load(os.path.join(root_data, 'class_%08d_%d.txt.npy'%(i,5)))[0, :, 0] 80 | ids = np.load(os.path.join(root_data, 'ids_%08d.txt.npy'%(i)))[scores>0] 81 | hs = np.load(os.path.join(root_data, 'hs_%08d.txt.npy'%(i)))[scores>0] 82 | 83 | for id, h in
zip(ids, hs): 84 | hs_all[id].append(h) 85 | hs_all_flatten.append(h) 86 | 87 | pca = PCA(n_components=2) 88 | # newX = pca.fit_transform(X) 89 | pca.fit(hs_all_flatten) 90 | pca.transform(hs_all_flatten) 91 | 92 | 93 | 94 | 95 | 96 | stat_scores_det = defaultdict(lambda: defaultdict(int)) 97 | for line in np.loadtxt('tmp_det.txt'): 98 | stat_scores_det[int(line[0])][int(line[1])] = line[2] 99 | stat_scores_trk = defaultdict(lambda: defaultdict(int)) 100 | for line in np.loadtxt('tmp_trk.txt'): 101 | stat_scores_trk[int(line[0])][int(line[1])] = line[2] 102 | stat_scores_uni_det = defaultdict(lambda: defaultdict(int)) 103 | for line in np.loadtxt('tmp_uni_det.txt'): 104 | stat_scores_uni_det[int(line[0])][int(line[1])] = line[2] 105 | stat_scores_uni_trk = defaultdict(lambda: defaultdict(int)) 106 | for line in np.loadtxt('tmp_uni_trk.txt'): 107 | stat_scores_uni_trk[int(line[0])][int(line[1])] = line[2] 108 | 109 | 110 | count_bin_all = defaultdict(list) 111 | count_bin = defaultdict(int) 112 | for framid in stat_scores_trk: 113 | for obj_id in stat_scores_trk[framid]: 114 | if framid in stat_scores_uni_trk and obj_id in stat_scores_uni_trk[framid]: 115 | count_bin_all[int(stat_scores_trk[framid][obj_id]*10)].append(stat_scores_uni_trk[framid][obj_id]-stat_scores_trk[framid][obj_id]) 116 | if stat_scores_trk[framid][obj_id] > stat_scores_uni_trk[framid][obj_id]: 117 | count_bin[int(stat_scores_trk[framid][obj_id]*10)] -= 1 118 | else: 119 | count_bin[int(stat_scores_trk[framid][obj_id]*10)] += 1 120 | for i in range(10): 121 | print(np.array(count_bin_all[i]).mean(), np.array(count_bin_all[i]).std()) 122 | 123 | 124 | with open('tmp.txt', 'w') as fp: 125 | for framid in stat_scores_trk: 126 | for obj_id in stat_scores_trk[framid]: 127 | if framid in stat_scores_uni_trk and obj_id in stat_scores_uni_trk[framid]: 128 | # print(stat_scores_trk[framid][obj_id], stat_scores_uni_trk[framid][obj_id]) 129 | fp.write('%f %f\n'%(stat_scores_trk[framid][obj_id], stat_scores_uni_trk[framid][obj_id])) 130 | -------------------------------------------------------------------------------- /tools/visualize_tao.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Copyright (c) 2022 megvii-research. All Rights Reserved. 3 | # ------------------------------------------------------------------------ 4 | 5 | 6 | from collections import defaultdict 7 | from glob import glob 8 | import json 9 | import os 10 | import cv2 11 | import numpy as np 12 | import subprocess 13 | import random 14 | from tqdm import tqdm 15 | from PIL import Image, ImageDraw 16 | 17 | from scipy.optimize import linear_sum_assignment as linear_assignment 18 | 19 | # Compute the IoU between two sets of boxes 20 | def bboxes_iou(bboxes1,bboxes2): 21 | bboxes1 = np.transpose(bboxes1) 22 | bboxes2 = np.transpose(bboxes2) 23 | 24 | # Intersection of the two boxes: the top-left corner takes the max of the two boxes, the bottom-right corner takes the min 25 | int_ymin = np.maximum(bboxes1[0][:, None], bboxes2[0]) 26 | int_xmin = np.maximum(bboxes1[1][:, None], bboxes2[1]) 27 | int_ymax = np.minimum(bboxes1[2][:, None], bboxes2[2]) 28 | int_xmax = np.minimum(bboxes1[3][:, None], bboxes2[3]) 29 | 30 | # Width/height of the intersection: if the two boxes do not overlap, w/h is 0 (the raw values are negative, so take the max against 0) 31 | int_h = np.maximum(int_ymax-int_ymin,0.) 32 | int_w = np.maximum(int_xmax-int_xmin,0.)
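    # Worked example (for illustration): with a = [0, 0, 10, 10] and b = [5, 5, 15, 15]
    # the intersection corners above become [5, 5, 10, 10], so int_h = int_w = 5 and the
    # intersection area is 25; each box covers 100, giving IoU = 25 / (100 + 100 - 25) ≈ 0.14.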
33 |  34 | # Compute IoU 35 | int_vol = int_h * int_w # intersection area 36 | vol1 = (bboxes1[2] - bboxes1[0]) * (bboxes1[3] - bboxes1[1]) # area of bboxes1 37 | vol2 = (bboxes2[2] - bboxes2[0]) * (bboxes2[3] - bboxes2[1]) # area of bboxes2 38 | IOU = int_vol / (vol1[:, None] + vol2 - int_vol) # IoU = intersection / union 39 | return IOU 40 | 41 | def get_color(i): 42 | return [(i * 23 * j + 43) % 255 for j in range(3)] 43 | 44 | 45 | def show_gt(img_list, output="output.mp4"): 46 | h, w, _ = cv2.imread(img_list[0]).shape 47 | command = [ 48 | "anaconda3/envs/detrex/bin/ffmpeg", 49 | '-y', # overwrite output file if it exists 50 | '-f', 'rawvideo', 51 | '-vcodec','rawvideo', 52 | '-s', f'{w}x{h}', # size of one frame 53 | '-pix_fmt', 'bgr24', 54 | '-r', '20', # frames per second 55 | '-i', '-', # The input comes from a pipe 56 | '-s', f'{w//2*2}x{h//2*2}', 57 | '-an', # Tells FFMPEG not to expect any audio 58 | '-loglevel', 'error', 59 | # '-crf', '26', 60 | '-b:v', '0', 61 | '-pix_fmt', 'yuv420p' 62 | ] 63 | # writing_process = subprocess.Popen(command + [output], stdin=subprocess.PIPE) 64 | fps = 16 65 | size = (w,h) 66 | videowriter = cv2.VideoWriter(output,cv2.VideoWriter_fourcc('M','J','P','G'), fps, size) 67 | 68 | 69 | for i, path in enumerate(tqdm(sorted(img_list))): 70 | im = cv2.imread(path) 71 | det_bboxes = [] 72 | motr_bboxes = [] 73 | for det in det_db[path.replace('data/', '').replace('.jpg', '.txt').replace('dancetrack/', 'DanceTrack/')]: 74 | x1, y1, w, h, s = map(float, det.strip().split(',')) 75 | x1, y1, w, h = map(int, [x1, y1, w, h]) 76 | im = cv2.rectangle(im, (x1, y1), (x1+w, y1+h), (255, 255, 255), 2) 77 | im = cv2.putText(im, '%0.2f'%s, (x1, y1-5), cv2.FONT_HERSHEY_SIMPLEX, 0.3, (255, 255, 255), 1) 78 | det_bboxes.append([x1, y1, x1+w, y1+h]) 79 | 80 | det_bboxes = np.array(det_bboxes) 81 | motr_bboxes = np.array(motr_bboxes) 82 | ious = bboxes_iou(det_bboxes, motr_bboxes) 83 | matching = linear_assignment(-ious) 84 | matched = sum(ious[matching[0], matching[1]] > 0.5) 85 | im = cv2.putText(im, f"{matched}/{len(det_bboxes)}/{len(motr_bboxes)}", (10, 50), cv2.FONT_HERSHEY_SIMPLEX, 2, get_color(i), 3) 86 | cv2.putText(im, "{}".format(os.path.basename(path)[:-4]), (120,120), cv2.FONT_HERSHEY_SIMPLEX, 2, (255,255,255), 6) 87 | # writing_process.stdin.write(im.tobytes()) 88 | videowriter.write(im) 89 | 90 | videowriter.release() 91 | 92 | 93 | if __name__ == '__main__': 94 | 95 | labels_full = defaultdict(lambda : defaultdict(list)) 96 | imgid2name = defaultdict() 97 | def _add_mot_folder(mot_path, split_dir): 98 | print("Adding", split_dir) 99 | labels = json.load(open(os.path.join(mot_path, split_dir))) 100 | for ann in labels['images']: 101 | imgid2name[ann['id']] = ann['file_name'] 102 | for ann in labels['annotations']: 103 | vid = ann['video_id'] 104 | t = ann['image_id'] 105 | x, y, w, h = ann['bbox'] 106 | i = ann['track_id'] 107 | crowd = ann['iscrowd'] 108 | cl = ann['category_id'] 109 | labels_full[vid][t].append([x, y, w, h, i, crowd, cl]) 110 | return labels_full, imgid2name 111 | 112 | mot_path = 'data/' 113 | labels_full, imgid2name = _add_mot_folder(mot_path, 'tao/annotations/train.json') 114 | indices = [] 115 | vid_files = list(labels_full.keys()) 116 | for vid in vid_files: 117 | t_min = min(labels_full[vid].keys()) 118 | t_max = max(labels_full[vid].keys()) + 1 119 | for t in range(t_min, t_max): 120 | indices.append((vid, t)) 121 | 122 | vid_old = None 123 | random.shuffle(vid_files) 124 | videowriter = None 125 | for vid in vid_files: 126 | print(vid) 127 | t_min = min(labels_full[vid].keys())
128 | t_max = max(labels_full[vid].keys()) + 1 129 | for idx in range(t_min, t_max): 130 | # vid, idx = indices[idx] 131 | img_path = os.path.join(mot_path, 'tao/frames', imgid2name[idx]) 132 | img = Image.open(img_path) 133 | if vid != vid_old: 134 | vid_old = vid 135 | w, h = img._size 136 | fps = 1 137 | size = (w,h) 138 | if videowriter is not None: 139 | videowriter.release() 140 | videowriter = cv2.VideoWriter('tmp/'+imgid2name[idx].split('/')[-2]+'.avi',cv2.VideoWriter_fourcc('M','J','P','G'), fps, size) 141 | im = np.array(img) 142 | for *xywh, id, crowd, cl in labels_full[vid][idx]: 143 | x1, y1, w, h = xywh 144 | x1, y1, w, h = map(int, [x1, y1, w, h]) 145 | im = cv2.rectangle(im, (x1, y1), (x1+w, y1+h), (255, 255, 255), 2) 146 | im = cv2.putText(im, '%d'%id, (x1, y1-5), cv2.FONT_HERSHEY_SIMPLEX, 0.3, (255, 255, 255), 1) 147 | videowriter.write(im) 148 | 149 | videowriter.release() 150 | -------------------------------------------------------------------------------- /models/memory_bank.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Copyright (c) 2021 megvii-model. All Rights Reserved. 3 | # ------------------------------------------------------------------------ 4 | 5 | import torch 6 | import torch.nn.functional as F 7 | from torch import nn, Tensor 8 | 9 | from typing import List 10 | 11 | from models.structures import Instances 12 | 13 | 14 | class MemoryBank(nn.Module): 15 | def __init__(self, args, dim_in, hidden_dim, dim_out): 16 | super().__init__() 17 | self._build_layers(args, dim_in, hidden_dim, dim_out) 18 | for p in self.parameters(): 19 | if p.dim() > 1: 20 | nn.init.xavier_uniform_(p) 21 | 22 | def _build_layers(self, args, dim_in, hidden_dim, dim_out): 23 | self.save_thresh = args.memory_bank_score_thresh 24 | self.save_period = 3 25 | self.max_his_length = args.memory_bank_len 26 | 27 | self.save_proj = nn.Linear(dim_in, dim_in) 28 | 29 | self.temporal_attn = nn.MultiheadAttention(dim_in, 8, dropout=0) 30 | self.temporal_fc1 = nn.Linear(dim_in, hidden_dim) 31 | self.temporal_fc2 = nn.Linear(hidden_dim, dim_in) 32 | self.temporal_norm1 = nn.LayerNorm(dim_in) 33 | self.temporal_norm2 = nn.LayerNorm(dim_in) 34 | 35 | self.track_cls = nn.Linear(dim_in, 1) 36 | 37 | self.self_attn = None 38 | if args.memory_bank_with_self_attn: 39 | self.spatial_attn = nn.MultiheadAttention(dim_in, 8, dropout=0) 40 | self.spatial_fc1 = nn.Linear(dim_in, hidden_dim) 41 | self.spatial_fc2 = nn.Linear(hidden_dim, dim_in) 42 | self.spatial_norm1 = nn.LayerNorm(dim_in) 43 | self.spatial_norm2 = nn.LayerNorm(dim_in) 44 | else: 45 | self.spatial_attn = None 46 | 47 | def update(self, track_instances): 48 | embed = track_instances.output_embedding[:, None] #( N, 1, 256) 49 | scores = track_instances.scores 50 | mem_padding_mask = track_instances.mem_padding_mask 51 | device = embed.device 52 | 53 | save_period = track_instances.save_period 54 | if self.training: 55 | saved_idxes = scores > 0 56 | else: 57 | saved_idxes = (save_period == 0) & (scores > self.save_thresh) 58 | # saved_idxes = (save_period == 0) 59 | save_period[save_period > 0] -= 1 60 | save_period[saved_idxes] = self.save_period 61 | 62 | saved_embed = embed[saved_idxes] 63 | if len(saved_embed) > 0: 64 | prev_embed = track_instances.mem_bank[saved_idxes] 65 | save_embed = self.save_proj(saved_embed) 66 | mem_padding_mask[saved_idxes] = torch.cat([mem_padding_mask[saved_idxes, 1:], 
torch.zeros((len(saved_embed), 1), dtype=torch.bool, device=device)], dim=1) 67 | track_instances.mem_bank = track_instances.mem_bank.clone() 68 | track_instances.mem_bank[saved_idxes] = torch.cat([prev_embed[:, 1:], save_embed], dim=1) 69 | 70 | def _forward_spatial_attn(self, track_instances): 71 | if len(track_instances) == 0: 72 | return track_instances 73 | 74 | embed = track_instances.output_embedding 75 | dim = embed.shape[-1] 76 | query_pos = track_instances.query_pos[:, :dim] # 应该为query_pos = pos2posemb(track_instances.ref_pts) 77 | k = q = (embed + query_pos) 78 | v = embed 79 | embed2 = self.spatial_attn( 80 | q[:, None], 81 | k[:, None], 82 | v[:, None] 83 | )[0][:, 0] 84 | embed = self.spatial_norm1(embed + embed2) 85 | embed2 = self.spatial_fc2(F.relu(self.spatial_fc1(embed))) 86 | embed = self.spatial_norm2(embed + embed2) 87 | track_instances.output_embedding = embed 88 | return track_instances 89 | 90 | def _forward_track_cls(self, track_instances): 91 | track_instances.track_scores = self.track_cls(track_instances.output_embedding)[..., 0] 92 | return track_instances 93 | 94 | def _forward_temporal_attn(self, track_instances): 95 | if len(track_instances) == 0: 96 | return track_instances 97 | 98 | dim = track_instances.query_pos.shape[1] 99 | key_padding_mask = track_instances.mem_padding_mask 100 | 101 | valid_idxes = key_padding_mask[:, -1] == 0 102 | embed = track_instances.output_embedding[valid_idxes] # (n, 256) 103 | 104 | if len(embed) > 0: 105 | prev_embed = track_instances.mem_bank[valid_idxes] 106 | key_padding_mask = key_padding_mask[valid_idxes] 107 | embed2 = self.temporal_attn( 108 | embed[None], # (num_track, dim) to (1, num_track, dim) 109 | prev_embed.transpose(0, 1), # (num_track, mem_len, dim) to (mem_len, num_track, dim) 110 | prev_embed.transpose(0, 1), 111 | key_padding_mask=key_padding_mask, 112 | )[0][0] 113 | 114 | embed = self.temporal_norm1(embed + embed2) 115 | embed2 = self.temporal_fc2(F.relu(self.temporal_fc1(embed))) 116 | embed = self.temporal_norm2(embed + embed2) 117 | track_instances.output_embedding = track_instances.output_embedding.clone() 118 | track_instances.output_embedding[valid_idxes] = embed 119 | 120 | return track_instances 121 | 122 | def forward_temporal_attn(self, track_instances): 123 | return self._forward_temporal_attn(track_instances) 124 | 125 | def forward(self, track_instances: Instances, update_bank=True) -> Instances: 126 | track_instances = self._forward_temporal_attn(track_instances) 127 | if update_bank: 128 | self.update(track_instances) 129 | if self.spatial_attn is not None: 130 | track_instances = self._forward_spatial_attn(track_instances) 131 | if self.track_cls is not None: 132 | track_instances = self._forward_track_cls(track_instances) 133 | return track_instances 134 | 135 | 136 | def build_memory_bank(args, dim_in, hidden_dim, dim_out): 137 | name = args.memory_bank_type 138 | memory_banks = { 139 | 'MemoryBank': MemoryBank, 140 | } 141 | assert name in memory_banks 142 | return memory_banks[name](args, dim_in, hidden_dim, dim_out) 143 | -------------------------------------------------------------------------------- /datasets/samplers.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Copyright (c) 2022 megvii-research. All Rights Reserved. 
3 | # ------------------------------------------------------------------------ 4 | # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR) 5 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 6 | # ------------------------------------------------------------------------ 7 | # Modified from DETR (https://github.com/facebookresearch/detr) 8 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 9 | # ------------------------------------------------------------------------ 10 | 11 | 12 | import os 13 | import math 14 | import torch 15 | import torch.distributed as dist 16 | from torch.utils.data.sampler import Sampler 17 | 18 | 19 | class DistributedSampler(Sampler): 20 | """Sampler that restricts data loading to a subset of the dataset. 21 | It is especially useful in conjunction with 22 | :class:`torch.nn.parallel.DistributedDataParallel`. In such case, each 23 | process can pass a DistributedSampler instance as a DataLoader sampler, 24 | and load a subset of the original dataset that is exclusive to it. 25 | .. note:: 26 | Dataset is assumed to be of constant size. 27 | Arguments: 28 | dataset: Dataset used for sampling. 29 | num_replicas (optional): Number of processes participating in 30 | distributed training. 31 | rank (optional): Rank of the current process within num_replicas. 32 | """ 33 | 34 | def __init__(self, dataset, num_replicas=None, rank=None, local_rank=None, local_size=None, shuffle=True): 35 | if num_replicas is None: 36 | if not dist.is_available(): 37 | raise RuntimeError("Requires distributed package to be available") 38 | num_replicas = dist.get_world_size() 39 | if rank is None: 40 | if not dist.is_available(): 41 | raise RuntimeError("Requires distributed package to be available") 42 | rank = dist.get_rank() 43 | self.dataset = dataset 44 | self.num_replicas = num_replicas 45 | self.rank = rank 46 | self.epoch = 0 47 | self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.num_replicas)) 48 | self.total_size = self.num_samples * self.num_replicas 49 | self.shuffle = shuffle 50 | 51 | def __iter__(self): 52 | if self.shuffle: 53 | # deterministically shuffle based on epoch 54 | g = torch.Generator() 55 | g.manual_seed(self.epoch) 56 | indices = torch.randperm(len(self.dataset), generator=g).tolist() 57 | else: 58 | indices = torch.arange(len(self.dataset)).tolist() 59 | 60 | # add extra samples to make it evenly divisible 61 | if len(indices) * 2 < self.total_size: 62 | tmp = indices * self.total_size 63 | indices += tmp[: (self.total_size - len(indices))] 64 | else: 65 | indices += indices[: (self.total_size - len(indices))] 66 | assert len(indices) == self.total_size 67 | 68 | # subsample 69 | offset = self.num_samples * self.rank 70 | indices = indices[offset : offset + self.num_samples] 71 | assert len(indices) == self.num_samples 72 | 73 | return iter(indices) 74 | 75 | def __len__(self): 76 | return self.num_samples 77 | 78 | def set_epoch(self, epoch): 79 | self.epoch = epoch 80 | 81 | 82 | class NodeDistributedSampler(Sampler): 83 | """Sampler that restricts data loading to a subset of the dataset. 84 | It is especially useful in conjunction with 85 | :class:`torch.nn.parallel.DistributedDataParallel`. In such case, each 86 | process can pass a DistributedSampler instance as a DataLoader sampler, 87 | and load a subset of the original dataset that is exclusive to it. 88 | .. note:: 89 | Dataset is assumed to be of constant size. 90 | Arguments: 91 | dataset: Dataset used for sampling. 
92 | num_replicas (optional): Number of processes participating in 93 | distributed training. 94 | rank (optional): Rank of the current process within num_replicas. 95 | """ 96 | 97 | def __init__(self, dataset, num_replicas=None, rank=None, local_rank=None, local_size=None, shuffle=True): 98 | if num_replicas is None: 99 | if not dist.is_available(): 100 | raise RuntimeError("Requires distributed package to be available") 101 | num_replicas = dist.get_world_size() 102 | if rank is None: 103 | if not dist.is_available(): 104 | raise RuntimeError("Requires distributed package to be available") 105 | rank = dist.get_rank() 106 | if local_rank is None: 107 | local_rank = int(os.environ.get('LOCAL_RANK', 0)) 108 | if local_size is None: 109 | local_size = int(os.environ.get('LOCAL_SIZE', 1)) 110 | self.dataset = dataset 111 | self.shuffle = shuffle 112 | self.num_replicas = num_replicas 113 | self.num_parts = local_size 114 | self.rank = rank 115 | self.local_rank = local_rank 116 | self.epoch = 0 117 | self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.num_replicas)) 118 | self.total_size = self.num_samples * self.num_replicas 119 | 120 | self.total_size_parts = self.num_samples * self.num_replicas // self.num_parts 121 | 122 | def __iter__(self): 123 | if self.shuffle: 124 | # deterministically shuffle based on epoch 125 | g = torch.Generator() 126 | g.manual_seed(self.epoch) 127 | indices = torch.randperm(len(self.dataset), generator=g).tolist() 128 | else: 129 | indices = torch.arange(len(self.dataset)).tolist() 130 | indices = [i for i in indices if i % self.num_parts == self.local_rank] 131 | 132 | # add extra samples to make it evenly divisible 133 | indices += indices[:(self.total_size_parts - len(indices))] 134 | assert len(indices) == self.total_size_parts 135 | 136 | # subsample 137 | indices = indices[self.rank // self.num_parts:self.total_size_parts:self.num_replicas // self.num_parts] 138 | assert len(indices) == self.num_samples 139 | 140 | return iter(indices) 141 | 142 | def __len__(self): 143 | return self.num_samples 144 | 145 | def set_epoch(self, epoch): 146 | self.epoch = epoch 147 | -------------------------------------------------------------------------------- /models/yolox.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # Copyright (c) 2014-2021 Megvii Inc. All rights reserved. 
4 | 5 | import torch 6 | from torch import nn, Tensor 7 | from typing import List 8 | 9 | from .yolo_head import YOLOXHead 10 | from .yolo_pafpn import YOLOPAFPN 11 | from util.misc import (NestedTensor, nested_tensor_from_tensor_list, 12 | accuracy, get_world_size, interpolate, get_rank, 13 | is_dist_avail_and_initialized, inverse_sigmoid) 14 | 15 | 16 | def _max_by_axis(the_list): 17 | # type: (List[List[int]]) -> List[int] 18 | maxes = the_list[0] 19 | for sublist in the_list[1:]: 20 | for index, item in enumerate(sublist): 21 | maxes[index] = max(maxes[index], item) 22 | return maxes 23 | 24 | def nested_tensor_from_tensor_list(tensor_list: List[Tensor], size_divisibility: int = 0): 25 | # TODO make this more general 26 | if tensor_list[0].ndim == 3: 27 | # TODO make it support different-sized images 28 | 29 | max_size = _max_by_axis([list(img.shape) for img in tensor_list]) 30 | if size_divisibility > 0: 31 | stride = size_divisibility 32 | # the last two dims are H,W, both subject to divisibility requirement 33 | max_size[-1] = (max_size[-1] + (stride - 1)) // stride * stride 34 | max_size[-2] = (max_size[-2] + (stride - 1)) // stride * stride 35 | 36 | # min_size = tuple(min(s) for s in zip(*[img.shape for img in tensor_list])) 37 | batch_shape = [len(tensor_list)] + max_size 38 | b, c, h, w = batch_shape 39 | dtype = tensor_list[0].dtype 40 | device = tensor_list[0].device 41 | tensor = torch.zeros(batch_shape, dtype=dtype, device=device) 42 | for img, pad_img in zip(tensor_list, tensor): 43 | pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) 44 | else: 45 | raise ValueError('not supported') 46 | return tensor 47 | 48 | 49 | 50 | class YOLOX(nn.Module): 51 | """ 52 | YOLOX model module. The module list is defined by create_yolov3_modules function. 53 | The network returns loss values from three YOLO layers during training 54 | and detection results during test. 
55 | """ 56 | 57 | def __init__(self, backbone=None, head=None): 58 | super().__init__() 59 | if backbone is None: 60 | backbone = YOLOPAFPN() 61 | if head is None: 62 | head = YOLOXHead(80) 63 | 64 | self.backbone = backbone 65 | self.head = head 66 | 67 | def forward(self, x, targets=None): 68 | # fpn output content features of [dark3, dark4, dark5] 69 | fpn_outs = self.backbone(x) 70 | 71 | if self.training: 72 | assert targets is not None 73 | loss, iou_loss, conf_loss, cls_loss, l1_loss, num_fg = self.head( 74 | fpn_outs, targets, x 75 | ) 76 | outputs = { 77 | "total_loss": loss, 78 | "iou_loss": iou_loss, 79 | "l1_loss": l1_loss, 80 | "conf_loss": conf_loss, 81 | "cls_loss": cls_loss, 82 | "num_fg": num_fg, 83 | } 84 | else: 85 | outputs = self.head(fpn_outs) 86 | 87 | return outputs 88 | 89 | @torch.no_grad() 90 | def inference_single_image(self, img, ori_img_size, track_instances=None): 91 | if not isinstance(img, NestedTensor): 92 | img = nested_tensor_from_tensor_list(img, size_divisibility=32) 93 | output = self.forward(img) 94 | 95 | out = {'pred_logits': outputs_class[-1], 'pred_boxes': outputs_coord[-1]} 96 | 97 | # _, _, img_h, img_w = img.shape 98 | # scale = max(ori_img_size[0]/img_h, ori_img_size[1]/img_w) 99 | # output[..., :4] *= scale 100 | # output = output[output[..., 4]>0.5] 101 | 102 | # import cv2 103 | # res[..., [0,1]] -= res[..., [2,3]]/2 104 | # res[..., [2,3]] += res[..., [0,1]] 105 | # ori_img = ori_img.cpu().numpy() 106 | # for o in res.cpu().numpy(): 107 | # cv2.rectangle(ori_img, pt1 = (int(o[0]), int(0[1])), pt2 =(int(o[2]), int(0[3])), color = (0, 0, 255), thickness = 2) 108 | # cv2.imwrite('tmp.png', ori_img) 109 | return output 110 | 111 | 112 | class PostProcess(nn.Module): 113 | """ This module converts the model's output into the format expected by the coco api""" 114 | 115 | @torch.no_grad() 116 | def forward(self, outputs, target_sizes): 117 | """ Perform the computation 118 | Parameters: 119 | outputs: raw outputs of the model 120 | target_sizes: tensor of dimension [batch_size x 2] containing the size of each images of the batch 121 | For evaluation, this must be the original image size (before any data augmentation) 122 | For visualization, this should be the image size after data augment, but before padding 123 | """ 124 | out_logits, out_bbox = outputs['pred_logits'], outputs['pred_boxes'] 125 | 126 | assert len(out_logits) == len(target_sizes) 127 | assert target_sizes.shape[1] == 2 128 | 129 | prob = out_logits.sigmoid() 130 | topk_values, topk_indexes = torch.topk(prob.view(out_logits.shape[0], -1), 100, dim=1) 131 | scores = topk_values 132 | topk_boxes = topk_indexes // out_logits.shape[2] 133 | labels = topk_indexes % out_logits.shape[2] 134 | boxes = box_ops.box_cxcywh_to_xyxy(out_bbox) 135 | boxes = torch.gather(boxes, 1, topk_boxes.unsqueeze(-1).repeat(1,1,4)) 136 | 137 | # and from relative [0, 1] to absolute [0, height] coordinates 138 | img_h, img_w = target_sizes.unbind(1) 139 | scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1) 140 | boxes = boxes * scale_fct[:, None, :] 141 | 142 | results = [{'scores': s, 'labels': l, 'boxes': b} for s, l, b in zip(scores, labels, boxes)] 143 | 144 | return results 145 | 146 | 147 | def build(args): 148 | 149 | def init_yolo(M): 150 | for m in M.modules(): 151 | if isinstance(m, nn.BatchNorm2d): 152 | m.eps = 1e-3 153 | m.momentum = 0.03 154 | 155 | in_channels = [256, 512, 1024] 156 | depth = 1.33 157 | width = 1.25 158 | num_classes = 1 159 | backbone = YOLOPAFPN(depth, width, 
in_channels=in_channels) 160 | head = YOLOXHead(num_classes, width, in_channels=in_channels) 161 | model = YOLOX(backbone, head) 162 | 163 | model.apply(init_yolo) 164 | model.head.initialize_biases(1e-2) 165 | 166 | return model, None, None -------------------------------------------------------------------------------- /models/dino/position_encoding.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # DINO 3 | # Copyright (c) 2022 IDEA. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------ 6 | # Conditional DETR 7 | # Copyright (c) 2021 Microsoft. All Rights Reserved. 8 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 9 | # ------------------------------------------------------------------------ 10 | # Copied from DETR (https://github.com/facebookresearch/detr) 11 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 12 | # ------------------------------------------------------------------------ 13 | 14 | """ 15 | Various positional encodings for the transformer. 16 | """ 17 | import math 18 | import torch 19 | from torch import nn 20 | 21 | from util.misc import NestedTensor 22 | 23 | 24 | class PositionEmbeddingSine(nn.Module): 25 | """ 26 | This is a more standard version of the position embedding, very similar to the one 27 | used by the Attention is all you need paper, generalized to work on images. 28 | """ 29 | def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None): 30 | super().__init__() 31 | self.num_pos_feats = num_pos_feats 32 | self.temperature = temperature 33 | self.normalize = normalize 34 | if scale is not None and normalize is False: 35 | raise ValueError("normalize should be True if scale is passed") 36 | if scale is None: 37 | scale = 2 * math.pi 38 | self.scale = scale 39 | 40 | def forward(self, tensor_list: NestedTensor): 41 | x = tensor_list.tensors 42 | mask = tensor_list.mask 43 | assert mask is not None 44 | not_mask = ~mask 45 | y_embed = not_mask.cumsum(1, dtype=torch.float32) 46 | x_embed = not_mask.cumsum(2, dtype=torch.float32) 47 | if self.normalize: 48 | eps = 1e-6 49 | y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale 50 | x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale 51 | 52 | dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device) 53 | dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats) 54 | 55 | pos_x = x_embed[:, :, :, None] / dim_t 56 | pos_y = y_embed[:, :, :, None] / dim_t 57 | pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3) 58 | pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3) 59 | pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) 60 | return pos 61 | 62 | class PositionEmbeddingSineHW(nn.Module): 63 | """ 64 | This is a more standard version of the position embedding, very similar to the one 65 | used by the Attention is all you need paper, generalized to work on images. 
66 | """ 67 | def __init__(self, num_pos_feats=64, temperatureH=10000, temperatureW=10000, normalize=False, scale=None): 68 | super().__init__() 69 | self.num_pos_feats = num_pos_feats 70 | self.temperatureH = temperatureH 71 | self.temperatureW = temperatureW 72 | self.normalize = normalize 73 | if scale is not None and normalize is False: 74 | raise ValueError("normalize should be True if scale is passed") 75 | if scale is None: 76 | scale = 2 * math.pi 77 | self.scale = scale 78 | 79 | def forward(self, tensor_list: NestedTensor): 80 | x = tensor_list.tensors 81 | mask = tensor_list.mask 82 | assert mask is not None 83 | not_mask = ~mask 84 | y_embed = not_mask.cumsum(1, dtype=torch.float32) 85 | x_embed = not_mask.cumsum(2, dtype=torch.float32) 86 | 87 | 88 | 89 | if self.normalize: 90 | eps = 1e-6 91 | y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale 92 | x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale 93 | 94 | dim_tx = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device) 95 | dim_tx = self.temperatureW ** (2 * torch.div(dim_tx, 2, rounding_mode="floor") / self.num_pos_feats) 96 | pos_x = x_embed[:, :, :, None] / dim_tx 97 | 98 | dim_ty = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device) 99 | dim_ty = self.temperatureH ** (2 * torch.div(dim_ty, 2, rounding_mode="floor") / self.num_pos_feats) 100 | pos_y = y_embed[:, :, :, None] / dim_ty 101 | 102 | pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3) 103 | pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3) 104 | pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) 105 | 106 | 107 | 108 | return pos 109 | 110 | class PositionEmbeddingLearned(nn.Module): 111 | """ 112 | Absolute pos embedding, learned. 
113 | """ 114 | def __init__(self, num_pos_feats=256): 115 | super().__init__() 116 | self.row_embed = nn.Embedding(50, num_pos_feats) 117 | self.col_embed = nn.Embedding(50, num_pos_feats) 118 | self.reset_parameters() 119 | 120 | def reset_parameters(self): 121 | nn.init.uniform_(self.row_embed.weight) 122 | nn.init.uniform_(self.col_embed.weight) 123 | 124 | def forward(self, tensor_list: NestedTensor): 125 | x = tensor_list.tensors 126 | h, w = x.shape[-2:] 127 | i = torch.arange(w, device=x.device) 128 | j = torch.arange(h, device=x.device) 129 | x_emb = self.col_embed(i) 130 | y_emb = self.row_embed(j) 131 | pos = torch.cat([ 132 | x_emb.unsqueeze(0).repeat(h, 1, 1), 133 | y_emb.unsqueeze(1).repeat(1, w, 1), 134 | ], dim=-1).permute(2, 0, 1).unsqueeze(0).repeat(x.shape[0], 1, 1, 1) 135 | return pos 136 | 137 | 138 | def build_position_encoding(args): 139 | N_steps = args.hidden_dim // 2 140 | if args.position_embedding in ('v2', 'sine'): 141 | # TODO find a better way of exposing other arguments 142 | position_embedding = PositionEmbeddingSineHW( 143 | N_steps, 144 | temperatureH=args.pe_temperatureH, 145 | temperatureW=args.pe_temperatureW, 146 | normalize=True 147 | ) 148 | elif args.position_embedding in ('v3', 'learned'): 149 | position_embedding = PositionEmbeddingLearned(N_steps) 150 | else: 151 | raise ValueError(f"not supported {args.position_embedding}") 152 | 153 | return position_embedding 154 | -------------------------------------------------------------------------------- /models/darknet.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # Copyright (c) 2014-2021 Megvii Inc. All rights reserved. 4 | 5 | from torch import nn 6 | 7 | from .network_blocks import BaseConv, CSPLayer, DWConv, Focus, ResLayer, SPPBottleneck 8 | 9 | 10 | class Darknet(nn.Module): 11 | # number of blocks from dark2 to dark5. 12 | depth2blocks = {21: [1, 2, 2, 1], 53: [2, 8, 8, 4]} 13 | 14 | def __init__( 15 | self, 16 | depth, 17 | in_channels=3, 18 | stem_out_channels=32, 19 | out_features=("dark3", "dark4", "dark5"), 20 | ): 21 | """ 22 | Args: 23 | depth (int): depth of darknet used in model, usually use [21, 53] for this param. 24 | in_channels (int): number of input channels, for example, use 3 for RGB image. 25 | stem_out_channels (int): number of output chanels of darknet stem. 26 | It decides channels of darknet layer2 to layer5. 27 | out_features (Tuple[str]): desired output layer name. 28 | """ 29 | super().__init__() 30 | assert out_features, "please provide output features of Darknet" 31 | self.out_features = out_features 32 | self.stem = nn.Sequential( 33 | BaseConv(in_channels, stem_out_channels, ksize=3, stride=1, act="lrelu"), 34 | *self.make_group_layer(stem_out_channels, num_blocks=1, stride=2), 35 | ) 36 | in_channels = stem_out_channels * 2 # 64 37 | 38 | num_blocks = Darknet.depth2blocks[depth] 39 | # create darknet with `stem_out_channels` and `num_blocks` layers. 40 | # to make model structure more clear, we don't use `for` statement in python. 
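        # Each make_group_layer(c, n, stride=2) below expands to
        #   BaseConv(c, 2 * c, ksize=3, stride=2) followed by n ResLayer(2 * c) blocks,
        # i.e. every dark stage halves the resolution and doubles the channel count.
        # With depth=53 this uses num_blocks = [2, 8, 8, 4] residual blocks for dark2..dark5
        # (a reading of the definitions above, for illustration).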
41 | self.dark2 = nn.Sequential( 42 | *self.make_group_layer(in_channels, num_blocks[0], stride=2) 43 | ) 44 | in_channels *= 2 # 128 45 | self.dark3 = nn.Sequential( 46 | *self.make_group_layer(in_channels, num_blocks[1], stride=2) 47 | ) 48 | in_channels *= 2 # 256 49 | self.dark4 = nn.Sequential( 50 | *self.make_group_layer(in_channels, num_blocks[2], stride=2) 51 | ) 52 | in_channels *= 2 # 512 53 | 54 | self.dark5 = nn.Sequential( 55 | *self.make_group_layer(in_channels, num_blocks[3], stride=2), 56 | *self.make_spp_block([in_channels, in_channels * 2], in_channels * 2), 57 | ) 58 | 59 | def make_group_layer(self, in_channels: int, num_blocks: int, stride: int = 1): 60 | "starts with conv layer then has `num_blocks` `ResLayer`" 61 | return [ 62 | BaseConv(in_channels, in_channels * 2, ksize=3, stride=stride, act="lrelu"), 63 | *[(ResLayer(in_channels * 2)) for _ in range(num_blocks)], 64 | ] 65 | 66 | def make_spp_block(self, filters_list, in_filters): 67 | m = nn.Sequential( 68 | *[ 69 | BaseConv(in_filters, filters_list[0], 1, stride=1, act="lrelu"), 70 | BaseConv(filters_list[0], filters_list[1], 3, stride=1, act="lrelu"), 71 | SPPBottleneck( 72 | in_channels=filters_list[1], 73 | out_channels=filters_list[0], 74 | activation="lrelu", 75 | ), 76 | BaseConv(filters_list[0], filters_list[1], 3, stride=1, act="lrelu"), 77 | BaseConv(filters_list[1], filters_list[0], 1, stride=1, act="lrelu"), 78 | ] 79 | ) 80 | return m 81 | 82 | def forward(self, x): 83 | outputs = {} 84 | x = self.stem(x) 85 | outputs["stem"] = x 86 | x = self.dark2(x) 87 | outputs["dark2"] = x 88 | x = self.dark3(x) 89 | outputs["dark3"] = x 90 | x = self.dark4(x) 91 | outputs["dark4"] = x 92 | x = self.dark5(x) 93 | outputs["dark5"] = x 94 | return {k: v for k, v in outputs.items() if k in self.out_features} 95 | 96 | 97 | class CSPDarknet(nn.Module): 98 | def __init__( 99 | self, 100 | dep_mul, 101 | wid_mul, 102 | out_features=("dark3", "dark4", "dark5"), 103 | depthwise=False, 104 | act="silu", 105 | ): 106 | super().__init__() 107 | assert out_features, "please provide output features of Darknet" 108 | self.out_features = out_features 109 | Conv = DWConv if depthwise else BaseConv 110 | 111 | base_channels = int(wid_mul * 64) # 64 112 | base_depth = max(round(dep_mul * 3), 1) # 3 113 | 114 | # stem 115 | self.stem = Focus(3, base_channels, ksize=3, act=act) 116 | 117 | # dark2 118 | self.dark2 = nn.Sequential( 119 | Conv(base_channels, base_channels * 2, 3, 2, act=act), 120 | CSPLayer( 121 | base_channels * 2, 122 | base_channels * 2, 123 | n=base_depth, 124 | depthwise=depthwise, 125 | act=act, 126 | ), 127 | ) 128 | 129 | # dark3 130 | self.dark3 = nn.Sequential( 131 | Conv(base_channels * 2, base_channels * 4, 3, 2, act=act), 132 | CSPLayer( 133 | base_channels * 4, 134 | base_channels * 4, 135 | n=base_depth * 3, 136 | depthwise=depthwise, 137 | act=act, 138 | ), 139 | ) 140 | 141 | # dark4 142 | self.dark4 = nn.Sequential( 143 | Conv(base_channels * 4, base_channels * 8, 3, 2, act=act), 144 | CSPLayer( 145 | base_channels * 8, 146 | base_channels * 8, 147 | n=base_depth * 3, 148 | depthwise=depthwise, 149 | act=act, 150 | ), 151 | ) 152 | 153 | # dark5 154 | self.dark5 = nn.Sequential( 155 | Conv(base_channels * 8, base_channels * 16, 3, 2, act=act), 156 | SPPBottleneck(base_channels * 16, base_channels * 16, activation=act), 157 | CSPLayer( 158 | base_channels * 16, 159 | base_channels * 16, 160 | n=base_depth, 161 | shortcut=False, 162 | depthwise=depthwise, 163 | act=act, 164 | ), 165 | ) 166 | 167 | 
def forward(self, x): 168 | outputs = {} 169 | x = self.stem(x) 170 | outputs["stem"] = x 171 | x = self.dark2(x) 172 | outputs["dark2"] = x 173 | x = self.dark3(x) 174 | outputs["dark3"] = x 175 | x = self.dark4(x) 176 | outputs["dark4"] = x 177 | x = self.dark5(x) 178 | outputs["dark5"] = x 179 | return {k: v for k, v in outputs.items() if k in self.out_features} 180 | -------------------------------------------------------------------------------- /util/plot_utils.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Copyright (c) 2022 megvii-research. All Rights Reserved. 3 | # ------------------------------------------------------------------------ 4 | # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR) 5 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 6 | # ------------------------------------------------------------------------ 7 | # Modified from DETR (https://github.com/facebookresearch/detr) 8 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 9 | # ------------------------------------------------------------------------ 10 | 11 | 12 | """ 13 | Plotting utilities to visualize training logs. 14 | """ 15 | import cv2 16 | import torch 17 | import pandas as pd 18 | import numpy as np 19 | import seaborn as sns 20 | import matplotlib.pyplot as plt 21 | 22 | from torch import Tensor 23 | 24 | from pathlib import Path, PurePath 25 | 26 | 27 | def plot_logs(logs, fields=('class_error', 'loss_bbox_unscaled', 'mAP'), ewm_col=0, log_name='log.txt'): 28 | ''' 29 | Function to plot specific fields from training log(s). Plots both training and test results. 30 | 31 | :: Inputs - logs = list containing Path objects, each pointing to individual dir with a log file 32 | - fields = which results to plot from each log file - plots both training and test for each field. 33 | - ewm_col = optional, which column to use as the exponential weighted smoothing of the plots 34 | - log_name = optional, name of log file if different than default 'log.txt'. 35 | 36 | :: Outputs - matplotlib plots of results in fields, color coded for each log file. 37 | - solid lines are training results, dashed lines are test results. 
38 | 39 | ''' 40 | func_name = "plot_utils.py::plot_logs" 41 | 42 | # verify logs is a list of Paths (list[Paths]) or single Pathlib object Path, 43 | # convert single Path to list to avoid 'not iterable' error 44 | 45 | if not isinstance(logs, list): 46 | if isinstance(logs, PurePath): 47 | logs = [logs] 48 | print(f"{func_name} info: logs param expects a list argument, converted to list[Path].") 49 | else: 50 | raise ValueError(f"{func_name} - invalid argument for logs parameter.\n \ 51 | Expect list[Path] or single Path obj, received {type(logs)}") 52 | 53 | # verify valid dir(s) and that every item in list is Path object 54 | for i, dir in enumerate(logs): 55 | if not isinstance(dir, PurePath): 56 | raise ValueError(f"{func_name} - non-Path object in logs argument of {type(dir)}: \n{dir}") 57 | if dir.exists(): 58 | continue 59 | raise ValueError(f"{func_name} - invalid directory in logs argument:\n{dir}") 60 | 61 | # load log file(s) and plot 62 | dfs = [pd.read_json(Path(p) / log_name, lines=True) for p in logs] 63 | 64 | fig, axs = plt.subplots(ncols=len(fields), figsize=(16, 5)) 65 | 66 | for df, color in zip(dfs, sns.color_palette(n_colors=len(logs))): 67 | for j, field in enumerate(fields): 68 | if field == 'mAP': 69 | coco_eval = pd.DataFrame(pd.np.stack(df.test_coco_eval.dropna().values)[:, 1]).ewm(com=ewm_col).mean() 70 | axs[j].plot(coco_eval, c=color) 71 | else: 72 | df.interpolate().ewm(com=ewm_col).mean().plot( 73 | y=[f'train_{field}', f'test_{field}'], 74 | ax=axs[j], 75 | color=[color] * 2, 76 | style=['-', '--'] 77 | ) 78 | for ax, field in zip(axs, fields): 79 | ax.legend([Path(p).name for p in logs]) 80 | ax.set_title(field) 81 | 82 | 83 | def plot_precision_recall(files, naming_scheme='iter'): 84 | if naming_scheme == 'exp_id': 85 | # name becomes exp_id 86 | names = [f.parts[-3] for f in files] 87 | elif naming_scheme == 'iter': 88 | names = [f.stem for f in files] 89 | else: 90 | raise ValueError(f'not supported {naming_scheme}') 91 | fig, axs = plt.subplots(ncols=2, figsize=(16, 5)) 92 | for f, color, name in zip(files, sns.color_palette("Blues", n_colors=len(files)), names): 93 | data = torch.load(f) 94 | # precision is n_iou, n_points, n_cat, n_area, max_det 95 | precision = data['precision'] 96 | recall = data['params'].recThrs 97 | scores = data['scores'] 98 | # take precision for all classes, all areas and 100 detections 99 | precision = precision[0, :, :, 0, -1].mean(1) 100 | scores = scores[0, :, :, 0, -1].mean(1) 101 | prec = precision.mean() 102 | rec = data['recall'][0, :, 0, -1].mean() 103 | print(f'{naming_scheme} {name}: mAP@50={prec * 100: 05.1f}, ' + 104 | f'score={scores.mean():0.3f}, ' + 105 | f'f1={2 * prec * rec / (prec + rec + 1e-8):0.3f}' 106 | ) 107 | axs[0].plot(recall, precision, c=color) 108 | axs[1].plot(recall, scores, c=color) 109 | 110 | axs[0].set_title('Precision / Recall') 111 | axs[0].legend(names) 112 | axs[1].set_title('Scores / Recall') 113 | axs[1].legend(names) 114 | return fig, axs 115 | 116 | 117 | def draw_boxes(image: Tensor, boxes: Tensor, color=(0, 255, 0), texts=None) -> np.ndarray: 118 | if isinstance(image, Tensor): 119 | cv_image = image.detach().cpu().numpy() 120 | else: 121 | cv_image = image 122 | if isinstance(boxes, Tensor): 123 | cv_boxes = boxes.detach().cpu().numpy() 124 | else: 125 | cv_boxes = boxes 126 | 127 | tl = round(0.002 * max(image.shape[0:2])) + 1 # line thickness 128 | tf = max(tl - 1, 1) 129 | for i in range(len(boxes)): 130 | box = cv_boxes[i] 131 | x1, y1 = box[0:2] 132 | x2, y2 = box[2:4] 133 
| cv2.rectangle(cv_image, (int(x1), int(y1)), (int(x2), int(y2)), color=color) 134 | if texts is not None: 135 | cv2.putText(cv_image, texts[i], (int(x1), int(y1+10)), 0, tl/3, [225, 255, 255], 136 | thickness=tf, 137 | lineType=cv2.LINE_AA) 138 | return cv_image 139 | 140 | 141 | def draw_ref_pts(image: Tensor, ref_pts: Tensor) -> np.ndarray: 142 | if isinstance(image, Tensor): 143 | cv_image = image.detach().cpu().numpy() 144 | else: 145 | cv_image = image 146 | if isinstance(ref_pts, Tensor): 147 | cv_pts = ref_pts.detach().cpu().numpy() 148 | else: 149 | cv_pts = ref_pts 150 | for i in range(len(cv_pts)): 151 | x, y, is_pos = cv_pts[i] 152 | color = (0, 1, 0) if is_pos else (1, 1, 1) 153 | cv2.circle(cv_image, (int(x), int(y)), 2, color) 154 | return cv_image 155 | 156 | 157 | def image_hwc2chw(image: np.ndarray): 158 | image = np.ascontiguousarray(image.transpose(2, 0, 1)) 159 | return image 160 | -------------------------------------------------------------------------------- /models/network_blocks.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # Copyright (c) 2014-2021 Megvii Inc. All rights reserved. 4 | 5 | import torch 6 | import torch.nn as nn 7 | 8 | 9 | class SiLU(nn.Module): 10 | """export-friendly version of nn.SiLU()""" 11 | 12 | @staticmethod 13 | def forward(x): 14 | return x * torch.sigmoid(x) 15 | 16 | 17 | def get_activation(name="silu", inplace=True): 18 | if name == "silu": 19 | # module = nn.SiLU(inplace=inplace) 20 | module = SiLU() 21 | elif name == "relu": 22 | module = nn.ReLU(inplace=inplace) 23 | elif name == "lrelu": 24 | module = nn.LeakyReLU(0.1, inplace=inplace) 25 | else: 26 | raise AttributeError("Unsupported act type: {}".format(name)) 27 | return module 28 | 29 | 30 | class BaseConv(nn.Module): 31 | """A Conv2d -> Batchnorm -> silu/leaky relu block""" 32 | 33 | def __init__( 34 | self, in_channels, out_channels, ksize, stride, groups=1, bias=False, act="silu" 35 | ): 36 | super().__init__() 37 | # same padding 38 | pad = (ksize - 1) // 2 39 | self.conv = nn.Conv2d( 40 | in_channels, 41 | out_channels, 42 | kernel_size=ksize, 43 | stride=stride, 44 | padding=pad, 45 | groups=groups, 46 | bias=bias, 47 | ) 48 | self.bn = nn.BatchNorm2d(out_channels) 49 | self.act = get_activation(act, inplace=True) 50 | 51 | def forward(self, x): 52 | return self.act(self.bn(self.conv(x))) 53 | 54 | def fuseforward(self, x): 55 | return self.act(self.conv(x)) 56 | 57 | 58 | class DWConv(nn.Module): 59 | """Depthwise Conv + Conv""" 60 | 61 | def __init__(self, in_channels, out_channels, ksize, stride=1, act="silu"): 62 | super().__init__() 63 | self.dconv = BaseConv( 64 | in_channels, 65 | in_channels, 66 | ksize=ksize, 67 | stride=stride, 68 | groups=in_channels, 69 | act=act, 70 | ) 71 | self.pconv = BaseConv( 72 | in_channels, out_channels, ksize=1, stride=1, groups=1, act=act 73 | ) 74 | 75 | def forward(self, x): 76 | x = self.dconv(x) 77 | return self.pconv(x) 78 | 79 | 80 | class Bottleneck(nn.Module): 81 | # Standard bottleneck 82 | def __init__( 83 | self, 84 | in_channels, 85 | out_channels, 86 | shortcut=True, 87 | expansion=0.5, 88 | depthwise=False, 89 | act="silu", 90 | ): 91 | super().__init__() 92 | hidden_channels = int(out_channels * expansion) 93 | Conv = DWConv if depthwise else BaseConv 94 | self.conv1 = BaseConv(in_channels, hidden_channels, 1, stride=1, act=act) 95 | self.conv2 = Conv(hidden_channels, out_channels, 3, stride=1, act=act) 96 | 
self.use_add = shortcut and in_channels == out_channels 97 | 98 | def forward(self, x): 99 | y = self.conv2(self.conv1(x)) 100 | if self.use_add: 101 | y = y + x 102 | return y 103 | 104 | 105 | class ResLayer(nn.Module): 106 | "Residual layer with `in_channels` inputs." 107 | 108 | def __init__(self, in_channels: int): 109 | super().__init__() 110 | mid_channels = in_channels // 2 111 | self.layer1 = BaseConv( 112 | in_channels, mid_channels, ksize=1, stride=1, act="lrelu" 113 | ) 114 | self.layer2 = BaseConv( 115 | mid_channels, in_channels, ksize=3, stride=1, act="lrelu" 116 | ) 117 | 118 | def forward(self, x): 119 | out = self.layer2(self.layer1(x)) 120 | return x + out 121 | 122 | 123 | class SPPBottleneck(nn.Module): 124 | """Spatial pyramid pooling layer used in YOLOv3-SPP""" 125 | 126 | def __init__( 127 | self, in_channels, out_channels, kernel_sizes=(5, 9, 13), activation="silu" 128 | ): 129 | super().__init__() 130 | hidden_channels = in_channels // 2 131 | self.conv1 = BaseConv(in_channels, hidden_channels, 1, stride=1, act=activation) 132 | self.m = nn.ModuleList( 133 | [ 134 | nn.MaxPool2d(kernel_size=ks, stride=1, padding=ks // 2) 135 | for ks in kernel_sizes 136 | ] 137 | ) 138 | conv2_channels = hidden_channels * (len(kernel_sizes) + 1) 139 | self.conv2 = BaseConv(conv2_channels, out_channels, 1, stride=1, act=activation) 140 | 141 | def forward(self, x): 142 | x = self.conv1(x) 143 | x = torch.cat([x] + [m(x) for m in self.m], dim=1) 144 | x = self.conv2(x) 145 | return x 146 | 147 | 148 | class CSPLayer(nn.Module): 149 | """C3 in yolov5, CSP Bottleneck with 3 convolutions""" 150 | 151 | def __init__( 152 | self, 153 | in_channels, 154 | out_channels, 155 | n=1, 156 | shortcut=True, 157 | expansion=0.5, 158 | depthwise=False, 159 | act="silu", 160 | ): 161 | """ 162 | Args: 163 | in_channels (int): input channels. 164 | out_channels (int): output channels. 165 | n (int): number of Bottlenecks. Default value: 1. 
166 | """ 167 | # ch_in, ch_out, number, shortcut, groups, expansion 168 | super().__init__() 169 | hidden_channels = int(out_channels * expansion) # hidden channels 170 | self.conv1 = BaseConv(in_channels, hidden_channels, 1, stride=1, act=act) 171 | self.conv2 = BaseConv(in_channels, hidden_channels, 1, stride=1, act=act) 172 | self.conv3 = BaseConv(2 * hidden_channels, out_channels, 1, stride=1, act=act) 173 | module_list = [ 174 | Bottleneck( 175 | hidden_channels, hidden_channels, shortcut, 1.0, depthwise, act=act 176 | ) 177 | for _ in range(n) 178 | ] 179 | self.m = nn.Sequential(*module_list) 180 | 181 | def forward(self, x): 182 | x_1 = self.conv1(x) 183 | x_2 = self.conv2(x) 184 | x_1 = self.m(x_1) 185 | x = torch.cat((x_1, x_2), dim=1) 186 | return self.conv3(x) 187 | 188 | 189 | class Focus(nn.Module): 190 | """Focus width and height information into channel space.""" 191 | 192 | def __init__(self, in_channels, out_channels, ksize=1, stride=1, act="silu"): 193 | super().__init__() 194 | self.conv = BaseConv(in_channels * 4, out_channels, ksize, stride, act=act) 195 | 196 | def forward(self, x): 197 | # shape of x (b,c,w,h) -> y(b,4c,w/2,h/2) 198 | patch_top_left = x[..., ::2, ::2] 199 | patch_top_right = x[..., ::2, 1::2] 200 | patch_bot_left = x[..., 1::2, ::2] 201 | patch_bot_right = x[..., 1::2, 1::2] 202 | x = torch.cat( 203 | ( 204 | patch_top_left, 205 | patch_bot_left, 206 | patch_top_right, 207 | patch_bot_right, 208 | ), 209 | dim=1, 210 | ) 211 | return self.conv(x) 212 | -------------------------------------------------------------------------------- /models/dino/ops/modules/ms_deform_attn.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | from __future__ import absolute_import 10 | from __future__ import print_function 11 | from __future__ import division 12 | 13 | import warnings 14 | import math 15 | 16 | import torch 17 | from torch import nn 18 | import torch.nn.functional as F 19 | from torch.nn.init import xavier_uniform_, constant_ 20 | 21 | from ..functions import MSDeformAttnFunction 22 | 23 | 24 | def _is_power_of_2(n): 25 | if (not isinstance(n, int)) or (n < 0): 26 | raise ValueError("invalid input for _is_power_of_2: {} (type: {})".format(n, type(n))) 27 | return (n & (n-1) == 0) and n != 0 28 | 29 | 30 | class MSDeformAttn(nn.Module): 31 | def __init__(self, d_model=256, n_levels=4, n_heads=8, n_points=4): 32 | """ 33 | Multi-Scale Deformable Attention Module 34 | :param d_model hidden dimension 35 | :param n_levels number of feature levels 36 | :param n_heads number of attention heads 37 | :param n_points number of sampling points per attention head per feature level 38 | """ 39 | super().__init__() 40 | if d_model % n_heads != 0: 41 | raise ValueError('d_model must be divisible by n_heads, but got {} and {}'.format(d_model, n_heads)) 42 | _d_per_head = d_model // n_heads 43 | # you'd better set _d_per_head to a power of 2 which is more efficient in our CUDA implementation 44 | if not _is_power_of_2(_d_per_head): 45 | warnings.warn("You'd better set d_model in MSDeformAttn to make the dimension of each attention head a power of 2 " 46 | "which is more efficient in our CUDA implementation.") 47 | 48 | self.im2col_step = 64 49 | 50 | self.d_model = d_model 51 | self.n_levels = n_levels 52 | self.n_heads = n_heads 53 | self.n_points = n_points 54 | 55 | self.sampling_offsets = nn.Linear(d_model, n_heads * n_levels * n_points * 2) 56 | self.attention_weights = nn.Linear(d_model, n_heads * n_levels * n_points) 57 | self.value_proj = nn.Linear(d_model, d_model) 58 | self.output_proj = nn.Linear(d_model, d_model) 59 | 60 | self._reset_parameters() 61 | 62 | def _reset_parameters(self): 63 | constant_(self.sampling_offsets.weight.data, 0.) 64 | thetas = torch.arange(self.n_heads, dtype=torch.float32) * (2.0 * math.pi / self.n_heads) 65 | grid_init = torch.stack([thetas.cos(), thetas.sin()], -1) 66 | grid_init = (grid_init / grid_init.abs().max(-1, keepdim=True)[0]).view(self.n_heads, 1, 1, 2).repeat(1, self.n_levels, self.n_points, 1) 67 | for i in range(self.n_points): 68 | grid_init[:, :, i, :] *= i + 1 69 | with torch.no_grad(): 70 | self.sampling_offsets.bias = nn.Parameter(grid_init.view(-1)) 71 | constant_(self.attention_weights.weight.data, 0.) 72 | constant_(self.attention_weights.bias.data, 0.) 73 | xavier_uniform_(self.value_proj.weight.data) 74 | constant_(self.value_proj.bias.data, 0.) 75 | xavier_uniform_(self.output_proj.weight.data) 76 | constant_(self.output_proj.bias.data, 0.) 
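    # Note added for clarity (not part of the original source): the bias initialization above
    # spreads the initial sampling offsets around each reference point, one direction per head
    # (angle 2*pi*h/n_heads), normalized to the unit square and scaled by (i+1) for the i-th
    # sampling point, so at the start of training each head already samples a widening spread
    # of locations instead of collapsing every sample onto the reference point itself.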
77 | 78 | def forward(self, query, reference_points, input_flatten, input_spatial_shapes, input_level_start_index, input_padding_mask=None): 79 | """ 80 | :param query (N, Length_{query}, C) 81 | :param reference_points (N, Length_{query}, n_levels, 2), range in [0, 1], top-left (0,0), bottom-right (1, 1), including padding area 82 | or (N, Length_{query}, n_levels, 4), add additional (w, h) to form reference boxes 83 | :param input_flatten (N, \sum_{l=0}^{L-1} H_l \cdot W_l, C) 84 | :param input_spatial_shapes (n_levels, 2), [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})] 85 | :param input_level_start_index (n_levels, ), [0, H_0*W_0, H_0*W_0+H_1*W_1, H_0*W_0+H_1*W_1+H_2*W_2, ..., H_0*W_0+H_1*W_1+...+H_{L-1}*W_{L-1}] 86 | :param input_padding_mask (N, \sum_{l=0}^{L-1} H_l \cdot W_l), True for padding elements, False for non-padding elements 87 | 88 | :return output (N, Length_{query}, C) 89 | """ 90 | N, Len_q, _ = query.shape 91 | N, Len_in, _ = input_flatten.shape 92 | assert (input_spatial_shapes[:, 0] * input_spatial_shapes[:, 1]).sum() == Len_in 93 | 94 | value = self.value_proj(input_flatten) 95 | if input_padding_mask is not None: 96 | value = value.masked_fill(input_padding_mask[..., None], float(0)) 97 | value = value.view(N, Len_in, self.n_heads, self.d_model // self.n_heads) 98 | sampling_offsets = self.sampling_offsets(query).view(N, Len_q, self.n_heads, self.n_levels, self.n_points, 2) 99 | attention_weights = self.attention_weights(query).view(N, Len_q, self.n_heads, self.n_levels * self.n_points) 100 | attention_weights = F.softmax(attention_weights, -1).view(N, Len_q, self.n_heads, self.n_levels, self.n_points) 101 | # N, Len_q, n_heads, n_levels, n_points, 2 102 | if reference_points.shape[-1] == 2: 103 | offset_normalizer = torch.stack([input_spatial_shapes[..., 1], input_spatial_shapes[..., 0]], -1) 104 | sampling_locations = reference_points[:, :, None, :, None, :] \ 105 | + sampling_offsets / offset_normalizer[None, None, None, :, None, :] 106 | elif reference_points.shape[-1] == 4: 107 | sampling_locations = reference_points[:, :, None, :, None, :2] \ 108 | + sampling_offsets / self.n_points * reference_points[:, :, None, :, None, 2:] * 0.5 109 | else: 110 | raise ValueError( 111 | 'Last dim of reference_points must be 2 or 4, but get {} instead.'.format(reference_points.shape[-1])) 112 | 113 | # for amp 114 | if value.dtype == torch.float16: 115 | # for mixed precision 116 | output = MSDeformAttnFunction.apply( 117 | value.to(torch.float32), input_spatial_shapes, input_level_start_index, sampling_locations.to(torch.float32), attention_weights, self.im2col_step) 118 | output = output.to(torch.float16) 119 | output = self.output_proj(output) 120 | return output 121 | 122 | 123 | output = MSDeformAttnFunction.apply( 124 | value, input_spatial_shapes, input_level_start_index, sampling_locations, attention_weights, self.im2col_step) 125 | output = self.output_proj(output) 126 | return output 127 | -------------------------------------------------------------------------------- /models/dino/utils.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # DINO 3 | # Copyright (c) 2022 IDEA. All Rights Reserved. 
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------ 6 | 7 | import torch 8 | from torch import nn, Tensor 9 | 10 | import math 11 | import torch.nn.functional as F 12 | from torch import nn 13 | 14 | 15 | def gen_encoder_output_proposals(memory:Tensor, memory_padding_mask:Tensor, spatial_shapes:Tensor, learnedwh=None): 16 | """ 17 | Input: 18 | - memory: bs, \sum{hw}, d_model 19 | - memory_padding_mask: bs, \sum{hw} 20 | - spatial_shapes: nlevel, 2 21 | - learnedwh: 2 22 | Output: 23 | - output_memory: bs, \sum{hw}, d_model 24 | - output_proposals: bs, \sum{hw}, 4 25 | """ 26 | N_, S_, C_ = memory.shape 27 | base_scale = 4.0 28 | proposals = [] 29 | _cur = 0 30 | for lvl, (H_, W_) in enumerate(spatial_shapes): 31 | mask_flatten_ = memory_padding_mask[:, _cur:(_cur + H_ * W_)].view(N_, H_, W_, 1) 32 | valid_H = torch.sum(~mask_flatten_[:, :, 0, 0], 1) 33 | valid_W = torch.sum(~mask_flatten_[:, 0, :, 0], 1) 34 | 35 | grid_y, grid_x = torch.meshgrid(torch.linspace(0, H_ - 1, H_, dtype=torch.float32, device=memory.device), 36 | torch.linspace(0, W_ - 1, W_, dtype=torch.float32, device=memory.device)) 37 | grid = torch.cat([grid_x.unsqueeze(-1), grid_y.unsqueeze(-1)], -1) # H_, W_, 2 38 | 39 | scale = torch.cat([valid_W.unsqueeze(-1), valid_H.unsqueeze(-1)], 1).view(N_, 1, 1, 2) 40 | grid = (grid.unsqueeze(0).expand(N_, -1, -1, -1) + 0.5) / scale 41 | 42 | if learnedwh is not None: 43 | wh = torch.ones_like(grid) * learnedwh.sigmoid() * (2.0 ** lvl) 44 | else: 45 | wh = torch.ones_like(grid) * 0.05 * (2.0 ** lvl) 46 | 47 | proposal = torch.cat((grid, wh), -1).view(N_, -1, 4) 48 | proposals.append(proposal) 49 | _cur += (H_ * W_) 50 | 51 | output_proposals = torch.cat(proposals, 1) 52 | output_proposals_valid = ((output_proposals > 0.01) & (output_proposals < 0.99)).all(-1, keepdim=True) 53 | output_proposals = torch.log(output_proposals / (1 - output_proposals)) # unsigmoid 54 | output_proposals = output_proposals.masked_fill(memory_padding_mask.unsqueeze(-1), float('inf')) 55 | output_proposals = output_proposals.masked_fill(~output_proposals_valid, float('inf')) 56 | 57 | output_memory = memory 58 | output_memory = output_memory.masked_fill(memory_padding_mask.unsqueeze(-1), float(0)) 59 | output_memory = output_memory.masked_fill(~output_proposals_valid, float(0)) 60 | 61 | return output_memory, output_proposals 62 | 63 | 64 | class RandomBoxPerturber(): 65 | def __init__(self, x_noise_scale=0.2, y_noise_scale=0.2, w_noise_scale=0.2, h_noise_scale=0.2) -> None: 66 | self.noise_scale = torch.Tensor([x_noise_scale, y_noise_scale, w_noise_scale, h_noise_scale]) 67 | 68 | def __call__(self, refanchors: Tensor) -> Tensor: 69 | nq, bs, query_dim = refanchors.shape 70 | device = refanchors.device 71 | 72 | noise_raw = torch.rand_like(refanchors) 73 | noise_scale = self.noise_scale.to(device)[:query_dim] 74 | 75 | new_refanchors = refanchors * (1 + (noise_raw - 0.5) * noise_scale) 76 | return new_refanchors.clamp_(0, 1) 77 | 78 | 79 | def sigmoid_focal_loss(inputs, targets, num_boxes, alpha: float = 0.25, gamma: float = 2): 80 | """ 81 | Loss used in RetinaNet for dense detection: https://arxiv.org/abs/1708.02002. 82 | Args: 83 | inputs: A float tensor of arbitrary shape. 84 | The predictions for each example. 85 | targets: A float tensor with the same shape as inputs. 
Stores the binary 86 | classification label for each element in inputs 87 | (0 for the negative class and 1 for the positive class). 88 | alpha: (optional) Weighting factor in range (0,1) to balance 89 | positive vs negative examples. Default = -1 (no weighting). 90 | gamma: Exponent of the modulating factor (1 - p_t) to 91 | balance easy vs hard examples. 92 | Returns: 93 | Loss tensor 94 | """ 95 | prob = inputs.sigmoid() 96 | ce_loss = F.binary_cross_entropy_with_logits(inputs, targets, reduction="none") 97 | p_t = prob * targets + (1 - prob) * (1 - targets) 98 | loss = ce_loss * ((1 - p_t) ** gamma) 99 | 100 | if alpha >= 0: 101 | alpha_t = alpha * targets + (1 - alpha) * (1 - targets) 102 | loss = alpha_t * loss 103 | 104 | return loss.mean(1).sum() / num_boxes 105 | 106 | 107 | class MLP(nn.Module): 108 | """ Very simple multi-layer perceptron (also called FFN)""" 109 | 110 | def __init__(self, input_dim, hidden_dim, output_dim, num_layers): 111 | super().__init__() 112 | self.num_layers = num_layers 113 | h = [hidden_dim] * (num_layers - 1) 114 | self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim])) 115 | 116 | def forward(self, x): 117 | for i, layer in enumerate(self.layers): 118 | x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x) 119 | return x 120 | 121 | 122 | def _get_activation_fn(activation, d_model=256, batch_dim=0): 123 | """Return an activation function given a string""" 124 | if activation == "relu": 125 | return F.relu 126 | if activation == "gelu": 127 | return F.gelu 128 | if activation == "glu": 129 | return F.glu 130 | if activation == "prelu": 131 | return nn.PReLU() 132 | if activation == "selu": 133 | return F.selu 134 | 135 | raise RuntimeError(F"activation should be relu/gelu, not {activation}.") 136 | 137 | 138 | def gen_sineembed_for_position(pos_tensor): 139 | # n_query, bs, _ = pos_tensor.size() 140 | # sineembed_tensor = torch.zeros(n_query, bs, 256) 141 | scale = 2 * math.pi 142 | dim_t = torch.arange(128, dtype=torch.float32, device=pos_tensor.device) 143 | dim_t = 10000 ** (2 * torch.div(dim_t, 2, rounding_mode="floor") / 128) 144 | x_embed = pos_tensor[:, :, 0] * scale 145 | y_embed = pos_tensor[:, :, 1] * scale 146 | pos_x = x_embed[:, :, None] / dim_t 147 | pos_y = y_embed[:, :, None] / dim_t 148 | pos_x = torch.stack((pos_x[:, :, 0::2].sin(), pos_x[:, :, 1::2].cos()), dim=3).flatten(2) 149 | pos_y = torch.stack((pos_y[:, :, 0::2].sin(), pos_y[:, :, 1::2].cos()), dim=3).flatten(2) 150 | if pos_tensor.size(-1) == 2: 151 | pos = torch.cat((pos_y, pos_x), dim=2) 152 | elif pos_tensor.size(-1) == 4: 153 | w_embed = pos_tensor[:, :, 2] * scale 154 | pos_w = w_embed[:, :, None] / dim_t 155 | pos_w = torch.stack((pos_w[:, :, 0::2].sin(), pos_w[:, :, 1::2].cos()), dim=3).flatten(2) 156 | 157 | h_embed = pos_tensor[:, :, 3] * scale 158 | pos_h = h_embed[:, :, None] / dim_t 159 | pos_h = torch.stack((pos_h[:, :, 0::2].sin(), pos_h[:, :, 1::2].cos()), dim=3).flatten(2) 160 | 161 | pos = torch.cat((pos_y, pos_x, pos_w, pos_h), dim=2) 162 | else: 163 | raise ValueError("Unknown pos_tensor shape(-1):{}".format(pos_tensor.size(-1))) 164 | return pos -------------------------------------------------------------------------------- /models/ops/modules/ms_deform_attn.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Copyright (c) 2022 megvii-research. All Rights Reserved. 
3 | # ------------------------------------------------------------------------ 4 | # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR) 5 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 6 | # ------------------------------------------------------------------------ 7 | # Modified from DETR (https://github.com/facebookresearch/detr) 8 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 9 | # ------------------------------------------------------------------------ 10 | 11 | 12 | from __future__ import absolute_import 13 | from __future__ import print_function 14 | from __future__ import division 15 | 16 | import warnings 17 | import math 18 | 19 | import torch 20 | from torch import nn 21 | import torch.nn.functional as F 22 | from torch.nn.init import xavier_uniform_, constant_ 23 | 24 | from ..functions import MSDeformAttnFunction, ms_deform_attn_core_pytorch 25 | features_grad=0.0 26 | 27 | def _is_power_of_2(n): 28 | if (not isinstance(n, int)) or (n < 0): 29 | raise ValueError("invalid input for _is_power_of_2: {} (type: {})".format(n, type(n))) 30 | return (n & (n-1) == 0) and n != 0 31 | 32 | 33 | class MSDeformAttn(nn.Module): 34 | def __init__(self, d_model=256, n_levels=4, n_heads=8, n_points=4, sigmoid_attn=False, im2col_step=64): 35 | """ 36 | Multi-Scale Deformable Attention Module 37 | :param d_model hidden dimension 38 | :param n_levels number of feature levels 39 | :param n_heads number of attention heads 40 | :param n_points number of sampling points per attention head per feature level 41 | """ 42 | super().__init__() 43 | if d_model % n_heads != 0: 44 | raise ValueError('d_model must be divisible by n_heads, but got {} and {}'.format(d_model, n_heads)) 45 | _d_per_head = d_model // n_heads 46 | # you'd better set _d_per_head to a power of 2 which is more efficient in our CUDA implementation 47 | if not _is_power_of_2(_d_per_head): 48 | warnings.warn("You'd better set d_model in MSDeformAttn to make the dimension of each attention head a power of 2 " 49 | "which is more efficient in our CUDA implementation.") 50 | 51 | self.im2col_step = im2col_step 52 | self.sigmoid_attn = sigmoid_attn 53 | 54 | self.d_model = d_model 55 | self.n_levels = n_levels 56 | self.n_heads = n_heads 57 | self.n_points = n_points 58 | 59 | self.sampling_offsets = nn.Linear(d_model, n_heads * n_levels * n_points * 2) 60 | self.attention_weights = nn.Linear(d_model, n_heads * n_levels * n_points) 61 | self.value_proj = nn.Linear(d_model, d_model) 62 | self.output_proj = nn.Linear(d_model, d_model) 63 | 64 | self._reset_parameters() 65 | 66 | def _reset_parameters(self): 67 | constant_(self.sampling_offsets.weight.data, 0.) 68 | thetas = torch.arange(self.n_heads, dtype=torch.float32) * (2.0 * math.pi / self.n_heads) 69 | grid_init = torch.stack([thetas.cos(), thetas.sin()], -1) 70 | grid_init = (grid_init / grid_init.abs().max(-1, keepdim=True)[0]).view(self.n_heads, 1, 1, 2).repeat(1, self.n_levels, self.n_points, 1) 71 | for i in range(self.n_points): 72 | grid_init[:, :, i, :] *= i + 1 73 | with torch.no_grad(): 74 | self.sampling_offsets.bias = nn.Parameter(grid_init.view(-1)) 75 | constant_(self.attention_weights.weight.data, 0.) 76 | constant_(self.attention_weights.bias.data, 0.) 77 | xavier_uniform_(self.value_proj.weight.data) 78 | constant_(self.value_proj.bias.data, 0.) 79 | xavier_uniform_(self.output_proj.weight.data) 80 | constant_(self.output_proj.bias.data, 0.) 
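    # Worked example, added for illustration (the sizes are hypothetical, not taken from the repo):
    # for N=1 and n_levels=4 feature maps of (100, 167), (50, 84), (25, 42), (13, 21),
    #     Len_in = 100*167 + 50*84 + 25*42 + 13*21 = 16700 + 4200 + 1050 + 273 = 22223,
    # input_spatial_shapes = [[100, 167], [50, 84], [25, 42], [13, 21]], and
    # input_level_start_index = [0, 16700, 20900, 21950]; these are the cumulative offsets that
    # the forward pass below asserts against and passes to the CUDA kernel.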
81 | 82 | def forward(self, query, reference_points, input_flatten, input_spatial_shapes, input_level_start_index, input_padding_mask=None): 83 | """ 84 | :param query (N, Length_{query}, C) 85 | :param reference_points (N, Length_{query}, n_levels, 2), range in [0, 1], top-left (0,0), bottom-right (1, 1), including padding area 86 | or (N, Length_{query}, n_levels, 4), add additional (w, h) to form reference boxes 87 | :param input_flatten (N, \sum_{l=0}^{L-1} H_l \cdot W_l, C) 88 | :param input_spatial_shapes (n_levels, 2), [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})] 89 | :param input_level_start_index (n_levels, ), [0, H_0*W_0, H_0*W_0+H_1*W_1, H_0*W_0+H_1*W_1+H_2*W_2, ..., H_0*W_0+H_1*W_1+...+H_{L-1}*W_{L-1}] 90 | :param input_padding_mask (N, \sum_{l=0}^{L-1} H_l \cdot W_l), True for padding elements, False for non-padding elements 91 | 92 | :return output (N, Length_{query}, C) 93 | """ 94 | N, Len_q, _ = query.shape 95 | N, Len_in, _ = input_flatten.shape 96 | assert (input_spatial_shapes[:, 0] * input_spatial_shapes[:, 1]).sum() == Len_in 97 | 98 | value = self.value_proj(input_flatten) 99 | if input_padding_mask is not None: 100 | value.masked_fill_(input_padding_mask[..., None], float(0)) 101 | value = value.view(N, Len_in, self.n_heads, self.d_model // self.n_heads) 102 | sampling_offsets = self.sampling_offsets(query).view(N, Len_q, self.n_heads, self.n_levels, self.n_points, 2) 103 | attention_weights = self.attention_weights(query).view(N, Len_q, self.n_heads, self.n_levels * self.n_points) 104 | if self.sigmoid_attn: 105 | attention_weights = attention_weights.sigmoid().view(N, Len_q, self.n_heads, self.n_levels, self.n_points) 106 | else: 107 | attention_weights = F.softmax(attention_weights, -1).view(N, Len_q, self.n_heads, self.n_levels, self.n_points) 108 | # N, Len_q, n_heads, n_levels, n_points, 2 109 | if reference_points.shape[-1] == 2: 110 | sampling_locations = reference_points[:, :, None, :, None, :] \ 111 | + sampling_offsets / input_spatial_shapes[None, None, None, :, None, (1, 0)] 112 | elif reference_points.shape[-1] == 4: 113 | sampling_locations = reference_points[:, :, None, :, None, :2] \ 114 | + sampling_offsets / self.n_points * reference_points[:, :, None, :, None, 2:] * 0.5 115 | else: 116 | raise ValueError( 117 | 'Last dim of reference_points must be 2 or 4, but get {} instead.'.format(reference_points.shape[-1])) 118 | 119 | # def extract(g): 120 | # global features_grad 121 | # features_grad = g 122 | # value.requires_grad=True 123 | # value.register_hook(extract) 124 | 125 | output = MSDeformAttnFunction.apply(value, input_spatial_shapes, input_level_start_index, sampling_locations, attention_weights, self.im2col_step) 126 | # output = MSDeformAttnFunction.apply(value.double(), input_spatial_shapes, input_level_start_index, sampling_locations.double(), attention_weights.double(), self.im2col_step).float() 127 | # output = ms_deform_attn_core_pytorch(value, input_spatial_shapes, sampling_locations, attention_weights) 128 | 129 | output = self.output_proj(output) 130 | return output 131 | -------------------------------------------------------------------------------- /models/ops/build/lib.linux-x86_64-3.8/modules/ms_deform_attn.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Copyright (c) 2022 megvii-research. All Rights Reserved. 
3 | # ------------------------------------------------------------------------ 4 | # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR) 5 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 6 | # ------------------------------------------------------------------------ 7 | # Modified from DETR (https://github.com/facebookresearch/detr) 8 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 9 | # ------------------------------------------------------------------------ 10 | 11 | 12 | from __future__ import absolute_import 13 | from __future__ import print_function 14 | from __future__ import division 15 | 16 | import warnings 17 | import math 18 | 19 | import torch 20 | from torch import nn 21 | import torch.nn.functional as F 22 | from torch.nn.init import xavier_uniform_, constant_ 23 | 24 | from ..functions import MSDeformAttnFunction, ms_deform_attn_core_pytorch 25 | features_grad=0.0 26 | 27 | def _is_power_of_2(n): 28 | if (not isinstance(n, int)) or (n < 0): 29 | raise ValueError("invalid input for _is_power_of_2: {} (type: {})".format(n, type(n))) 30 | return (n & (n-1) == 0) and n != 0 31 | 32 | 33 | class MSDeformAttn(nn.Module): 34 | def __init__(self, d_model=256, n_levels=4, n_heads=8, n_points=4, sigmoid_attn=False, im2col_step=64): 35 | """ 36 | Multi-Scale Deformable Attention Module 37 | :param d_model hidden dimension 38 | :param n_levels number of feature levels 39 | :param n_heads number of attention heads 40 | :param n_points number of sampling points per attention head per feature level 41 | """ 42 | super().__init__() 43 | if d_model % n_heads != 0: 44 | raise ValueError('d_model must be divisible by n_heads, but got {} and {}'.format(d_model, n_heads)) 45 | _d_per_head = d_model // n_heads 46 | # you'd better set _d_per_head to a power of 2 which is more efficient in our CUDA implementation 47 | if not _is_power_of_2(_d_per_head): 48 | warnings.warn("You'd better set d_model in MSDeformAttn to make the dimension of each attention head a power of 2 " 49 | "which is more efficient in our CUDA implementation.") 50 | 51 | self.im2col_step = im2col_step 52 | self.sigmoid_attn = sigmoid_attn 53 | 54 | self.d_model = d_model 55 | self.n_levels = n_levels 56 | self.n_heads = n_heads 57 | self.n_points = n_points 58 | 59 | self.sampling_offsets = nn.Linear(d_model, n_heads * n_levels * n_points * 2) 60 | self.attention_weights = nn.Linear(d_model, n_heads * n_levels * n_points) 61 | self.value_proj = nn.Linear(d_model, d_model) 62 | self.output_proj = nn.Linear(d_model, d_model) 63 | 64 | self._reset_parameters() 65 | 66 | def _reset_parameters(self): 67 | constant_(self.sampling_offsets.weight.data, 0.) 68 | thetas = torch.arange(self.n_heads, dtype=torch.float32) * (2.0 * math.pi / self.n_heads) 69 | grid_init = torch.stack([thetas.cos(), thetas.sin()], -1) 70 | grid_init = (grid_init / grid_init.abs().max(-1, keepdim=True)[0]).view(self.n_heads, 1, 1, 2).repeat(1, self.n_levels, self.n_points, 1) 71 | for i in range(self.n_points): 72 | grid_init[:, :, i, :] *= i + 1 73 | with torch.no_grad(): 74 | self.sampling_offsets.bias = nn.Parameter(grid_init.view(-1)) 75 | constant_(self.attention_weights.weight.data, 0.) 76 | constant_(self.attention_weights.bias.data, 0.) 77 | xavier_uniform_(self.value_proj.weight.data) 78 | constant_(self.value_proj.bias.data, 0.) 79 | xavier_uniform_(self.output_proj.weight.data) 80 | constant_(self.output_proj.bias.data, 0.) 
81 | 82 | def forward(self, query, reference_points, input_flatten, input_spatial_shapes, input_level_start_index, input_padding_mask=None): 83 | """ 84 | :param query (N, Length_{query}, C) 85 | :param reference_points (N, Length_{query}, n_levels, 2), range in [0, 1], top-left (0,0), bottom-right (1, 1), including padding area 86 | or (N, Length_{query}, n_levels, 4), add additional (w, h) to form reference boxes 87 | :param input_flatten (N, \sum_{l=0}^{L-1} H_l \cdot W_l, C) 88 | :param input_spatial_shapes (n_levels, 2), [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})] 89 | :param input_level_start_index (n_levels, ), [0, H_0*W_0, H_0*W_0+H_1*W_1, H_0*W_0+H_1*W_1+H_2*W_2, ..., H_0*W_0+H_1*W_1+...+H_{L-1}*W_{L-1}] 90 | :param input_padding_mask (N, \sum_{l=0}^{L-1} H_l \cdot W_l), True for padding elements, False for non-padding elements 91 | 92 | :return output (N, Length_{query}, C) 93 | """ 94 | N, Len_q, _ = query.shape 95 | N, Len_in, _ = input_flatten.shape 96 | assert (input_spatial_shapes[:, 0] * input_spatial_shapes[:, 1]).sum() == Len_in 97 | 98 | value = self.value_proj(input_flatten) 99 | if input_padding_mask is not None: 100 | value.masked_fill_(input_padding_mask[..., None], float(0)) 101 | value = value.view(N, Len_in, self.n_heads, self.d_model // self.n_heads) 102 | sampling_offsets = self.sampling_offsets(query).view(N, Len_q, self.n_heads, self.n_levels, self.n_points, 2) 103 | attention_weights = self.attention_weights(query).view(N, Len_q, self.n_heads, self.n_levels * self.n_points) 104 | if self.sigmoid_attn: 105 | attention_weights = attention_weights.sigmoid().view(N, Len_q, self.n_heads, self.n_levels, self.n_points) 106 | else: 107 | attention_weights = F.softmax(attention_weights, -1).view(N, Len_q, self.n_heads, self.n_levels, self.n_points) 108 | # N, Len_q, n_heads, n_levels, n_points, 2 109 | if reference_points.shape[-1] == 2: 110 | sampling_locations = reference_points[:, :, None, :, None, :] \ 111 | + sampling_offsets / input_spatial_shapes[None, None, None, :, None, (1, 0)] 112 | elif reference_points.shape[-1] == 4: 113 | sampling_locations = reference_points[:, :, None, :, None, :2] \ 114 | + sampling_offsets / self.n_points * reference_points[:, :, None, :, None, 2:] * 0.5 115 | else: 116 | raise ValueError( 117 | 'Last dim of reference_points must be 2 or 4, but get {} instead.'.format(reference_points.shape[-1])) 118 | 119 | # def extract(g): 120 | # global features_grad 121 | # features_grad = g 122 | # value.requires_grad=True 123 | # value.register_hook(extract) 124 | 125 | output = MSDeformAttnFunction.apply(value, input_spatial_shapes, input_level_start_index, sampling_locations, attention_weights, self.im2col_step) 126 | # output = MSDeformAttnFunction.apply(value.double(), input_spatial_shapes, input_level_start_index, sampling_locations.double(), attention_weights.double(), self.im2col_step).float() 127 | # output = ms_deform_attn_core_pytorch(value, input_spatial_shapes, sampling_locations, attention_weights) 128 | 129 | output = self.output_proj(output) 130 | return output 131 | --------------------------------------------------------------------------------
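To make the interface above concrete, here is a minimal usage sketch of the MSDeformAttn module from models/ops/modules/ms_deform_attn.py. It is illustrative only: it assumes the MultiScaleDeformableAttention CUDA extension has been compiled, a GPU is available, and the repository root is on PYTHONPATH; the feature-map sizes, query count, and variable names are invented for the example.

import torch
from models.ops.modules.ms_deform_attn import MSDeformAttn

device = torch.device("cuda")
attn = MSDeformAttn(d_model=256, n_levels=2, n_heads=8, n_points=4).to(device)

# Two feature levels (32x32 and 16x16), flattened and concatenated along the length dim.
spatial_shapes = torch.as_tensor([[32, 32], [16, 16]], dtype=torch.long, device=device)
level_start_index = torch.cat(
    (spatial_shapes.new_zeros(1), (spatial_shapes[:, 0] * spatial_shapes[:, 1]).cumsum(0)[:-1])
)                                                                   # [0, 1024]
len_in = int((spatial_shapes[:, 0] * spatial_shapes[:, 1]).sum())   # 32*32 + 16*16 = 1280

input_flatten = torch.rand(1, len_in, 256, device=device)       # (N, sum(H_l*W_l), C)
query = torch.rand(1, 300, 256, device=device)                   # (N, Len_q, C)
reference_points = torch.rand(1, 300, 2, 2, device=device)       # (N, Len_q, n_levels, 2) in [0, 1]

output = attn(query, reference_points, input_flatten, spatial_shapes, level_start_index)
print(output.shape)  # expected: torch.Size([1, 300, 256])

A reference_points tensor with a trailing dimension of 4 (reference boxes) is also accepted, in which case the sampling offsets are scaled by the box width and height, as handled in the forward method above.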