├── requirements.txt
├── .gitignore
├── models
│   ├── ops
│   │   ├── MultiScaleDeformableAttention.egg-info
│   │   │   ├── dependency_links.txt
│   │   │   ├── top_level.txt
│   │   │   ├── PKG-INFO
│   │   │   └── SOURCES.txt
│   │   ├── build
│   │   │   ├── temp.linux-x86_64-cpython-37
│   │   │   │   ├── .ninja_deps
│   │   │   │   ├── mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/yanfeng/project/MOTRv2/MOTRv3/models/ops/src
│   │   │   │   │   ├── vision.o
│   │   │   │   │   ├── cpu
│   │   │   │   │   │   └── ms_deform_attn_cpu.o
│   │   │   │   │   └── cuda
│   │   │   │   │       └── ms_deform_attn_cuda.o
│   │   │   │   ├── .ninja_log
│   │   │   │   └── build.ninja
│   │   │   ├── lib.linux-x86_64-3.8
│   │   │   │   ├── MultiScaleDeformableAttention.cpython-38-x86_64-linux-gnu.so
│   │   │   │   ├── modules
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   └── ms_deform_attn.py
│   │   │   │   └── functions
│   │   │   │       ├── __init__.py
│   │   │   │       └── ms_deform_attn_func.py
│   │   │   ├── lib.linux-x86_64-cpython-37
│   │   │   │   ├── MultiScaleDeformableAttention.cpython-37m-x86_64-linux-gnu.so
│   │   │   │   ├── modules
│   │   │   │   │   └── __init__.py
│   │   │   │   └── functions
│   │   │   │       ├── __init__.py
│   │   │   │       └── ms_deform_attn_func.py
│   │   │   └── temp.linux-x86_64-3.8
│   │   │       └── mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/yanfeng/project/MOTRv2/CO-MOT/models/ops/src
│   │   │           ├── vision.o
│   │   │           ├── cpu
│   │   │           │   └── ms_deform_attn_cpu.o
│   │   │           └── cuda
│   │   │               └── ms_deform_attn_cuda.o
│   │   ├── MultiScaleDeformableAttention.cpython-37m-x86_64-linux-gnu.so
│   │   ├── MultiScaleDeformableAttention.cpython-38-x86_64-linux-gnu.so
│   │   ├── dist
│   │   │   └── MultiScaleDeformableAttention-1.0-py3.7-linux-x86_64.egg
│   │   ├── make.sh
│   │   ├── modules
│   │   │   ├── __init__.py
│   │   │   └── ms_deform_attn.py
│   │   ├── functions
│   │   │   ├── __init__.py
│   │   │   └── ms_deform_attn_func.py
│   │   ├── src
│   │   │   ├── vision.cpp
│   │   │   ├── cuda
│   │   │   │   └── ms_deform_attn_cuda.h
│   │   │   ├── cpu
│   │   │   │   ├── ms_deform_attn_cpu.h
│   │   │   │   └── ms_deform_attn_cpu.cpp
│   │   │   └── ms_deform_attn.h
│   │   ├── setup.py
│   │   └── test.py
│   ├── structures
│   │   └── __init__.py
│   ├── dino
│   │   ├── __init__.py
│   │   ├── ops
│   │   │   ├── modules
│   │   │   │   ├── __init__.py
│   │   │   │   └── ms_deform_attn.py
│   │   │   ├── functions
│   │   │   │   ├── __init__.py
│   │   │   │   └── ms_deform_attn_func.py
│   │   │   ├── make.sh
│   │   │   ├── src
│   │   │   │   ├── vision.cpp
│   │   │   │   ├── cuda
│   │   │   │   │   └── ms_deform_attn_cuda.h
│   │   │   │   ├── cpu
│   │   │   │   │   ├── ms_deform_attn_cpu.h
│   │   │   │   │   └── ms_deform_attn_cpu.cpp
│   │   │   │   └── ms_deform_attn.h
│   │   │   ├── setup.py
│   │   │   └── test.py
│   │   ├── position_encoding.py
│   │   └── utils.py
│   ├── registry.py
│   ├── __init__.py
│   ├── yolo_fpn.py
│   ├── losses.py
│   ├── yolo_pafpn.py
│   ├── position_encoding.py
│   ├── memory_bank.py
│   ├── yolox.py
│   ├── darknet.py
│   └── network_blocks.py
├── tools
│   ├── show_user_using_nvidia.sh
│   ├── copy_back.sh
│   ├── debug.sh
│   ├── simple_inference.sh
│   ├── simplebdd_inference.sh
│   ├── simplemot_inference.sh
│   ├── batch_diff.py
│   ├── merge_dance_tracklets.sh
│   ├── resume.sh
│   ├── run_dist_launch.sh
│   ├── eval_dance.sh
│   ├── run_dist_slurm.sh
│   ├── make_detdb.py
│   ├── coco_evel.py
│   ├── train.sh
│   ├── clip_train.py
│   ├── merge_dance_tracklets.py
│   ├── train_ddp.sh
│   ├── similarity_analysis.py
│   └── visualize_tao.py
├── datasets
│   ├── alignment.txt
│   ├── __init__.py
│   ├── panoptic_eval.py
│   ├── data_prefetcher.py
│   └── samplers.py
├── util
│   ├── json_parser.py
│   ├── __init__.py
│   ├── checkpoint.py
│   ├── box_ops.py
│   ├── tool.py
│   └── plot_utils.py
├── configs
│   └── motrv2ch_uni5cost3ggoon.args
└── .vscode
    └── launch.json
/requirements.txt:
--------------------------------------------------------------------------------
1 | tqdm
2 | scipy
3 | opencv-python
4 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | tracker*/
2 | exps
3 | __pycache__
4 | tmp
5 | checkpoints
--------------------------------------------------------------------------------
/models/ops/MultiScaleDeformableAttention.egg-info/dependency_links.txt:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/models/ops/MultiScaleDeformableAttention.egg-info/top_level.txt:
--------------------------------------------------------------------------------
1 | MultiScaleDeformableAttention
2 | functions
3 | modules
4 |
--------------------------------------------------------------------------------
/models/ops/build/temp.linux-x86_64-cpython-37/.ninja_deps:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BingfengYan/CO-MOT/HEAD/models/ops/build/temp.linux-x86_64-cpython-37/.ninja_deps
--------------------------------------------------------------------------------
/models/ops/MultiScaleDeformableAttention.cpython-37m-x86_64-linux-gnu.so:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BingfengYan/CO-MOT/HEAD/models/ops/MultiScaleDeformableAttention.cpython-37m-x86_64-linux-gnu.so
--------------------------------------------------------------------------------
/models/ops/MultiScaleDeformableAttention.cpython-38-x86_64-linux-gnu.so:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BingfengYan/CO-MOT/HEAD/models/ops/MultiScaleDeformableAttention.cpython-38-x86_64-linux-gnu.so
--------------------------------------------------------------------------------
/models/ops/dist/MultiScaleDeformableAttention-1.0-py3.7-linux-x86_64.egg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BingfengYan/CO-MOT/HEAD/models/ops/dist/MultiScaleDeformableAttention-1.0-py3.7-linux-x86_64.egg
--------------------------------------------------------------------------------
/tools/show_user_using_nvidia.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | pids=$(fuser -v /dev/nvidia* | cut -d' ' -f3- | tr ' ' '\n' | sort -u)
4 | for pid in $pids
5 | do
6 | echo "PID: $pid CWD: $(readlink /proc/$pid/cwd)"
7 | done
--------------------------------------------------------------------------------
/models/ops/build/lib.linux-x86_64-3.8/MultiScaleDeformableAttention.cpython-38-x86_64-linux-gnu.so:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BingfengYan/CO-MOT/HEAD/models/ops/build/lib.linux-x86_64-3.8/MultiScaleDeformableAttention.cpython-38-x86_64-linux-gnu.so
--------------------------------------------------------------------------------
/models/ops/build/lib.linux-x86_64-cpython-37/MultiScaleDeformableAttention.cpython-37m-x86_64-linux-gnu.so:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BingfengYan/CO-MOT/HEAD/models/ops/build/lib.linux-x86_64-cpython-37/MultiScaleDeformableAttention.cpython-37m-x86_64-linux-gnu.so
--------------------------------------------------------------------------------
/datasets/alignment.txt:
--------------------------------------------------------------------------------
1 | Describe this image in detail.
2 | Take a look at this image and describe what you notice.
3 | Please provide a detailed description of the picture.
4 | Could you describe the contents of this image for me?
--------------------------------------------------------------------------------
/util/json_parser.py:
--------------------------------------------------------------------------------
1 | import json
2 | import sys
3 |
4 | def parse(json_str, key):
5 |     str_dict = json.loads(json_str)
6 |     val = str_dict[key]
7 |     if type(val) == list:
8 |         return ",".join(val)
9 |     else:
10 |         return val
11 | 
12 | if __name__ == '__main__':
13 |     # print the parsed value so shell callers can capture it
14 |     print(parse(sys.argv[1], sys.argv[2]))
--------------------------------------------------------------------------------
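A minimal sketch of how parse() behaves when imported from util/json_parser.py (the JSON payloads below are made up for illustration):

    from util.json_parser import parse

    # a list value comes back comma-joined, a scalar comes back unchanged
    print(parse('{"gpus": ["0", "1", "3"]}', 'gpus'))       # -> 0,1,3
    print(parse('{"exp_name": "tracker"}', 'exp_name'))     # -> tracker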
/models/ops/build/temp.linux-x86_64-3.8/mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/yanfeng/project/MOTRv2/CO-MOT/models/ops/src/vision.o:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BingfengYan/CO-MOT/HEAD/models/ops/build/temp.linux-x86_64-3.8/mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/yanfeng/project/MOTRv2/CO-MOT/models/ops/src/vision.o
--------------------------------------------------------------------------------
/models/ops/build/temp.linux-x86_64-cpython-37/mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/yanfeng/project/MOTRv2/MOTRv3/models/ops/src/vision.o:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BingfengYan/CO-MOT/HEAD/models/ops/build/temp.linux-x86_64-cpython-37/mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/yanfeng/project/MOTRv2/MOTRv3/models/ops/src/vision.o
--------------------------------------------------------------------------------
/tools/copy_back.sh:
--------------------------------------------------------------------------------
1 | # ------------------------------------------------------------------------
2 | # Copyright (c) 2022 megvii-research. All Rights Reserved.
3 | # ------------------------------------------------------------------------
4 |
5 |
6 | set -x
7 |
8 | cp $1/*.py .
9 | cp $1/models/*.py models
10 | cp $1/datasets/*.py datasets
11 |
--------------------------------------------------------------------------------
/models/ops/build/temp.linux-x86_64-3.8/mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/yanfeng/project/MOTRv2/CO-MOT/models/ops/src/cpu/ms_deform_attn_cpu.o:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BingfengYan/CO-MOT/HEAD/models/ops/build/temp.linux-x86_64-3.8/mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/yanfeng/project/MOTRv2/CO-MOT/models/ops/src/cpu/ms_deform_attn_cpu.o
--------------------------------------------------------------------------------
/models/ops/build/temp.linux-x86_64-3.8/mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/yanfeng/project/MOTRv2/CO-MOT/models/ops/src/cuda/ms_deform_attn_cuda.o:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BingfengYan/CO-MOT/HEAD/models/ops/build/temp.linux-x86_64-3.8/mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/yanfeng/project/MOTRv2/CO-MOT/models/ops/src/cuda/ms_deform_attn_cuda.o
--------------------------------------------------------------------------------
/models/ops/build/temp.linux-x86_64-cpython-37/mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/yanfeng/project/MOTRv2/MOTRv3/models/ops/src/cpu/ms_deform_attn_cpu.o:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BingfengYan/CO-MOT/HEAD/models/ops/build/temp.linux-x86_64-cpython-37/mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/yanfeng/project/MOTRv2/MOTRv3/models/ops/src/cpu/ms_deform_attn_cpu.o
--------------------------------------------------------------------------------
/models/ops/build/temp.linux-x86_64-cpython-37/mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/yanfeng/project/MOTRv2/MOTRv3/models/ops/src/cuda/ms_deform_attn_cuda.o:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BingfengYan/CO-MOT/HEAD/models/ops/build/temp.linux-x86_64-cpython-37/mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/yanfeng/project/MOTRv2/MOTRv3/models/ops/src/cuda/ms_deform_attn_cuda.o
--------------------------------------------------------------------------------
/models/ops/MultiScaleDeformableAttention.egg-info/PKG-INFO:
--------------------------------------------------------------------------------
1 | Metadata-Version: 2.1
2 | Name: MultiScaleDeformableAttention
3 | Version: 1.0
4 | Summary: PyTorch Wrapper for CUDA Functions of Multi-Scale Deformable Attention
5 | Home-page: https://github.com/fundamentalvision/Deformable-DETR
6 | Author: Weijie Su
7 | License: UNKNOWN
8 | Platform: UNKNOWN
9 |
10 | UNKNOWN
11 |
12 |
--------------------------------------------------------------------------------
/tools/debug.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | # ------------------------------------------------------------------------
3 | # Copyright (c) 2022 megvii-research. All Rights Reserved.
4 | # ------------------------------------------------------------------------
5 |
6 |
7 | set -x
8 |
9 | args=$(cat $1)
10 |
11 | export CUDA_LAUNCH_BLOCKING=1
12 | python main.py ${args} --output_dir /tmp/clip_mot_v2
13 |
--------------------------------------------------------------------------------
/tools/simple_inference.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | # ------------------------------------------------------------------------
3 | # Copyright (c) 2022 megvii-research. All Rights Reserved.
4 | # ------------------------------------------------------------------------
5 |
6 |
7 | set -x
8 | set -o pipefail
9 |
10 | # args=$(cat configs/motrv2.args)
11 | args=$(cat $1)
12 | python3 submit_dance.py ${args} --exp_name tracker --resume $2 $3
13 |
--------------------------------------------------------------------------------
/tools/simplebdd_inference.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | # ------------------------------------------------------------------------
3 | # Copyright (c) 2022 megvii-research. All Rights Reserved.
4 | # ------------------------------------------------------------------------
5 |
6 |
7 | set -x
8 | set -o pipefail
9 |
10 | # args=$(cat configs/motrv2.args)
11 | args=$(cat $1)
12 | python3 submit_bdd.py ${args} --exp_name tracker --resume $2 $3
13 |
--------------------------------------------------------------------------------
/tools/simplemot_inference.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | # ------------------------------------------------------------------------
3 | # Copyright (c) 2022 megvii-research. All Rights Reserved.
4 | # ------------------------------------------------------------------------
5 |
6 |
7 | set -x
8 | set -o pipefail
9 |
10 | # args=$(cat configs/motrv2.args)
11 | args=$(cat $1)
12 | python3 submit_mot.py ${args} --exp_name tracker --resume $2 $3
13 |
--------------------------------------------------------------------------------
/models/structures/__init__.py:
--------------------------------------------------------------------------------
1 | # ------------------------------------------------------------------------
2 | # Modified from Detectron2 (https://github.com/facebookresearch/detectron2)
3 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
4 | # ------------------------------------------------------------------------
5 | from .boxes import Boxes, BoxMode, pairwise_iou, pairwise_ioa, matched_boxlist_iou
6 | from .instances import Instances
7 |
8 | __all__ = [k for k in globals().keys() if not k.startswith("_")]
--------------------------------------------------------------------------------
/configs/motrv2ch_uni5cost3ggoon.args:
--------------------------------------------------------------------------------
1 | --meta_arch motr_unincost
2 | --dataset_file e2e_dance
3 | --epoch 20
4 | --with_box_refine
5 | --lr_drop 8
6 | --lr 2e-4
7 | --lr_backbone 2e-5
8 | --pretrained xx/checkpoint0019.pth
9 | --batch_size 1
10 | --sample_mode random_interval
11 | --sample_interval 10
12 | --sampler_lengths 5
13 | --merger_dropout 0
14 | --dropout 0
15 | --random_drop 0.1
16 | --fp_ratio 0.3
17 | --query_interaction_layer GQIM
18 | --num_queries 60
19 | --append_crowd
20 | --use_checkpoint
21 | --mot_path xxx/data/
22 | --match_type gmatch
23 | --g_size 3
24 |
--------------------------------------------------------------------------------
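The .args file is just a flat list of command-line flags: the shell scripts read it with `args=$(cat $1)` and splat it onto main.py or submit_dance.py. A rough sketch of re-parsing it in Python under the same assumption (the two flags picked out here are only examples):

    import argparse, shlex

    parser = argparse.ArgumentParser()
    parser.add_argument('--meta_arch')
    parser.add_argument('--g_size', type=int)

    flags = shlex.split(open('configs/motrv2ch_uni5cost3ggoon.args').read())
    known, _unknown = parser.parse_known_args(flags)
    print(known.meta_arch, known.g_size)   # motr_unincost 3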
/models/dino/__init__.py:
--------------------------------------------------------------------------------
1 | # ------------------------------------------------------------------------
2 | # Conditional DETR
3 | # Copyright (c) 2021 Microsoft. All Rights Reserved.
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
5 | # ------------------------------------------------------------------------
6 | # Copied from DETR (https://github.com/facebookresearch/detr)
7 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
8 | # ------------------------------------------------------------------------
9 |
10 | from .dino import build_dino
11 |
--------------------------------------------------------------------------------
/models/ops/make.sh:
--------------------------------------------------------------------------------
1 | # ------------------------------------------------------------------------------------------------
2 | # Deformable DETR
3 | # Copyright (c) 2020 SenseTime. All Rights Reserved.
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
5 | # ------------------------------------------------------------------------------------------------
6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
7 | # ------------------------------------------------------------------------------------------------
8 |
9 | python setup.py build install
10 |
--------------------------------------------------------------------------------
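After make.sh finishes, the compiled op should be importable under the package name declared in top_level.txt. A quick sanity check, assuming the build and install succeeded (models/ops/test.py exercises the op more thoroughly):

    import torch                                   # import torch first so the extension's symbols resolve
    import MultiScaleDeformableAttention as MSDA

    print(MSDA.ms_deform_attn_forward)             # the op bound in src/vision.cpp
    print(MSDA.ms_deform_attn_backward)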
/models/dino/ops/modules/__init__.py:
--------------------------------------------------------------------------------
1 | # ------------------------------------------------------------------------------------------------
2 | # Deformable DETR
3 | # Copyright (c) 2020 SenseTime. All Rights Reserved.
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
5 | # ------------------------------------------------------------------------------------------------
6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
7 | # ------------------------------------------------------------------------------------------------
8 |
9 | from .ms_deform_attn import MSDeformAttn
10 |
--------------------------------------------------------------------------------
/models/dino/ops/functions/__init__.py:
--------------------------------------------------------------------------------
1 | # ------------------------------------------------------------------------------------------------
2 | # Deformable DETR
3 | # Copyright (c) 2020 SenseTime. All Rights Reserved.
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
5 | # ------------------------------------------------------------------------------------------------
6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
7 | # ------------------------------------------------------------------------------------------------
8 |
9 | from .ms_deform_attn_func import MSDeformAttnFunction
10 |
11 |
--------------------------------------------------------------------------------
/util/__init__.py:
--------------------------------------------------------------------------------
1 | # ------------------------------------------------------------------------
2 | # Copyright (c) 2022 megvii-research. All Rights Reserved.
3 | # ------------------------------------------------------------------------
4 | # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR)
5 | # Copyright (c) 2020 SenseTime. All Rights Reserved.
6 | # ------------------------------------------------------------------------
7 | # Modified from DETR (https://github.com/facebookresearch/detr)
8 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
9 | # ------------------------------------------------------------------------
10 |
11 |
--------------------------------------------------------------------------------
/tools/batch_diff.py:
--------------------------------------------------------------------------------
1 | # ------------------------------------------------------------------------
2 | # Copyright (c) 2022 megvii-research. All Rights Reserved.
3 | # ------------------------------------------------------------------------
4 |
5 |
6 | import argparse
7 | from glob import glob
8 | from subprocess import run
9 |
10 |
11 | parser = argparse.ArgumentParser()
12 | parser.add_argument('src')
13 | parser.add_argument('dst')
14 | args = parser.parse_args()
15 |
16 |
17 | for src in glob(args.src+'/*/*.py') + glob(args.src+'/*.py'):
18 | dst = src.replace(args.src, args.dst)
19 | if run(['diff', src, dst]).returncode != 0:
20 | print('code --diff', src, dst)
21 |
--------------------------------------------------------------------------------
/models/dino/ops/make.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | # ------------------------------------------------------------------------------------------------
3 | # Deformable DETR
4 | # Copyright (c) 2020 SenseTime. All Rights Reserved.
5 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
6 | # ------------------------------------------------------------------------------------------------
7 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
8 | # ------------------------------------------------------------------------------------------------
9 |
10 |
11 | # TORCH_CUDA_ARCH_LIST="8.0" CUDA_HOME='/path/to/your/cuda/dir'
12 | python setup.py build install
13 |
--------------------------------------------------------------------------------
/models/ops/modules/__init__.py:
--------------------------------------------------------------------------------
1 | # ------------------------------------------------------------------------
2 | # Copyright (c) 2022 megvii-research. All Rights Reserved.
3 | # ------------------------------------------------------------------------
4 | # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR)
5 | # Copyright (c) 2020 SenseTime. All Rights Reserved.
6 | # ------------------------------------------------------------------------
7 | # Modified from DETR (https://github.com/facebookresearch/detr)
8 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
9 | # ------------------------------------------------------------------------
10 |
11 |
12 | from .ms_deform_attn import MSDeformAttn
--------------------------------------------------------------------------------
/models/ops/build/lib.linux-x86_64-3.8/modules/__init__.py:
--------------------------------------------------------------------------------
1 | # ------------------------------------------------------------------------
2 | # Copyright (c) 2022 megvii-research. All Rights Reserved.
3 | # ------------------------------------------------------------------------
4 | # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR)
5 | # Copyright (c) 2020 SenseTime. All Rights Reserved.
6 | # ------------------------------------------------------------------------
7 | # Modified from DETR (https://github.com/facebookresearch/detr)
8 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
9 | # ------------------------------------------------------------------------
10 |
11 |
12 | from .ms_deform_attn import MSDeformAttn
--------------------------------------------------------------------------------
/tools/merge_dance_tracklets.sh:
--------------------------------------------------------------------------------
1 | # ------------------------------------------------------------------------
2 | # Copyright (c) 2022 megvii-research. All Rights Reserved.
3 | # ------------------------------------------------------------------------
4 |
5 | python tools/merge_dance_tracklets.py $1 $2
6 |
7 | # python3 ../TrackEval/scripts/run_mot_challenge.py \
8 | # --SPLIT_TO_EVAL val \
9 | # --METRICS HOTA \
10 | # --GT_FOLDER /data/datasets/dancetrack/val \
11 | # --SEQMAP_FILE seqmap \
12 | # --SKIP_SPLIT_FOL True \
13 | # --TRACKER_SUB_FOLDER tracker \
14 | # --TRACKERS_TO_EVAL $2 \
15 | # --USE_PARALLEL True \
16 | # --NUM_PARALLEL_CORES 8 \
17 | # --PLOT_CURVES False \
18 | # --TRACKERS_FOLDER '' | tee -a $2/eval.log
19 |
--------------------------------------------------------------------------------
/models/ops/build/lib.linux-x86_64-cpython-37/modules/__init__.py:
--------------------------------------------------------------------------------
1 | # ------------------------------------------------------------------------
2 | # Copyright (c) 2022 megvii-research. All Rights Reserved.
3 | # ------------------------------------------------------------------------
4 | # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR)
5 | # Copyright (c) 2020 SenseTime. All Rights Reserved.
6 | # ------------------------------------------------------------------------
7 | # Modified from DETR (https://github.com/facebookresearch/detr)
8 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
9 | # ------------------------------------------------------------------------
10 |
11 |
12 | from .ms_deform_attn import MSDeformAttn
--------------------------------------------------------------------------------
/models/ops/functions/__init__.py:
--------------------------------------------------------------------------------
1 | # ------------------------------------------------------------------------
2 | # Copyright (c) 2022 megvii-research. All Rights Reserved.
3 | # ------------------------------------------------------------------------
4 | # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR)
5 | # Copyright (c) 2020 SenseTime. All Rights Reserved.
6 | # ------------------------------------------------------------------------
7 | # Modified from DETR (https://github.com/facebookresearch/detr)
8 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
9 | # ------------------------------------------------------------------------
10 |
11 |
12 | from .ms_deform_attn_func import MSDeformAttnFunction, ms_deform_attn_core_pytorch
13 |
14 |
--------------------------------------------------------------------------------
/models/ops/build/lib.linux-x86_64-3.8/functions/__init__.py:
--------------------------------------------------------------------------------
1 | # ------------------------------------------------------------------------
2 | # Copyright (c) 2022 megvii-research. All Rights Reserved.
3 | # ------------------------------------------------------------------------
4 | # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR)
5 | # Copyright (c) 2020 SenseTime. All Rights Reserved.
6 | # ------------------------------------------------------------------------
7 | # Modified from DETR (https://github.com/facebookresearch/detr)
8 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
9 | # ------------------------------------------------------------------------
10 |
11 |
12 | from .ms_deform_attn_func import MSDeformAttnFunction, ms_deform_attn_core_pytorch
13 |
14 |
--------------------------------------------------------------------------------
/models/ops/build/lib.linux-x86_64-cpython-37/functions/__init__.py:
--------------------------------------------------------------------------------
1 | # ------------------------------------------------------------------------
2 | # Copyright (c) 2022 megvii-research. All Rights Reserved.
3 | # ------------------------------------------------------------------------
4 | # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR)
5 | # Copyright (c) 2020 SenseTime. All Rights Reserved.
6 | # ------------------------------------------------------------------------
7 | # Modified from DETR (https://github.com/facebookresearch/detr)
8 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
9 | # ------------------------------------------------------------------------
10 |
11 |
12 | from .ms_deform_attn_func import MSDeformAttnFunction, ms_deform_attn_core_pytorch
13 |
14 |
--------------------------------------------------------------------------------
/models/ops/src/vision.cpp:
--------------------------------------------------------------------------------
1 | /*!
2 | **************************************************************************************************
3 | * Deformable DETR
4 | * Copyright (c) 2020 SenseTime. All Rights Reserved.
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details]
6 | **************************************************************************************************
7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
8 | **************************************************************************************************
9 | */
10 |
11 | #include "ms_deform_attn.h"
12 |
13 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
14 | m.def("ms_deform_attn_forward", &ms_deform_attn_forward, "ms_deform_attn_forward");
15 | m.def("ms_deform_attn_backward", &ms_deform_attn_backward, "ms_deform_attn_backward");
16 | }
17 |
--------------------------------------------------------------------------------
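The two symbols bound above are what functions/ms_deform_attn_func.py wraps into an autograd op. A rough sketch of that wrapper pattern, following the Deformable DETR convention (treat the exact argument layout as an assumption, not a spec):

    import torch
    import MultiScaleDeformableAttention as MSDA

    class MSDeformAttnFunction(torch.autograd.Function):
        @staticmethod
        def forward(ctx, value, spatial_shapes, level_start_index,
                    sampling_locations, attention_weights, im2col_step):
            ctx.im2col_step = im2col_step
            output = MSDA.ms_deform_attn_forward(
                value, spatial_shapes, level_start_index,
                sampling_locations, attention_weights, ctx.im2col_step)
            ctx.save_for_backward(value, spatial_shapes, level_start_index,
                                  sampling_locations, attention_weights)
            return output

        @staticmethod
        def backward(ctx, grad_output):
            (value, spatial_shapes, level_start_index,
             sampling_locations, attention_weights) = ctx.saved_tensors
            grad_value, grad_sampling_loc, grad_attn_weight = \
                MSDA.ms_deform_attn_backward(
                    value, spatial_shapes, level_start_index,
                    sampling_locations, attention_weights,
                    grad_output.contiguous(), ctx.im2col_step)
            # no gradients for the shape/index tensors or for im2col_step
            return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None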
/models/dino/ops/src/vision.cpp:
--------------------------------------------------------------------------------
1 | /*!
2 | **************************************************************************************************
3 | * Deformable DETR
4 | * Copyright (c) 2020 SenseTime. All Rights Reserved.
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details]
6 | **************************************************************************************************
7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
8 | **************************************************************************************************
9 | */
10 |
11 | #include "ms_deform_attn.h"
12 |
13 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
14 | m.def("ms_deform_attn_forward", &ms_deform_attn_forward, "ms_deform_attn_forward");
15 | m.def("ms_deform_attn_backward", &ms_deform_attn_backward, "ms_deform_attn_backward");
16 | }
17 |
--------------------------------------------------------------------------------
/models/ops/build/temp.linux-x86_64-cpython-37/.ninja_log:
--------------------------------------------------------------------------------
1 | # ninja log v5
2 | 8 50326 1682736456000000000 /mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/yanfeng/project/MOTRv2/MOTRv3/models/ops/build/temp.linux-x86_64-cpython-37/mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/yanfeng/project/MOTRv2/MOTRv3/models/ops/src/cpu/ms_deform_attn_cpu.o 1a7a04fa8aa332bc
3 | 14 91444 1682736491000000000 /mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/yanfeng/project/MOTRv2/MOTRv3/models/ops/build/temp.linux-x86_64-cpython-37/mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/yanfeng/project/MOTRv2/MOTRv3/models/ops/src/vision.o 781d7dd8aea58757
4 | 3 109768 1682736515000000000 /mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/yanfeng/project/MOTRv2/MOTRv3/models/ops/build/temp.linux-x86_64-cpython-37/mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/yanfeng/project/MOTRv2/MOTRv3/models/ops/src/cuda/ms_deform_attn_cuda.o 67f3872547af6227
5 |
--------------------------------------------------------------------------------
/tools/resume.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | # ------------------------------------------------------------------------
3 | # Copyright (c) 2022 megvii-research. All Rights Reserved.
4 | # ------------------------------------------------------------------------
5 |
6 |
7 | set -x
8 |
9 | set -o pipefail
10 |
11 | OUTPUT_DIR=$1
12 |
13 | # clean up *.pyc files
14 | rmpyc() {
15 | rm -rf $(find -name __pycache__)
16 | rm -rf $(find -name "*.pyc")
17 | }
18 |
19 | # tar src to avoid future editing
20 | cleanup() {
21 | echo "Packing source code"
22 | rmpyc
23 | # tar -zcf models datasets util main.py engine.py eval.py submit.py --remove-files
24 | echo " ...Done"
25 | }
26 |
27 |
28 | pushd $OUTPUT_DIR
29 | trap cleanup EXIT
30 |
31 | args=$(cat *.args)
32 | python -m torch.distributed.launch --nproc_per_node=8 --use_env main.py ${args} --resume checkpoint.pth --output_dir . |& tee -a resume.log
33 | popd
34 |
--------------------------------------------------------------------------------
/models/ops/src/cuda/ms_deform_attn_cuda.h:
--------------------------------------------------------------------------------
1 | /*!
2 | **************************************************************************************************
3 | * Deformable DETR
4 | * Copyright (c) 2020 SenseTime. All Rights Reserved.
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details]
6 | **************************************************************************************************
7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
8 | **************************************************************************************************
9 | */
10 |
11 | #pragma once
12 | #include <torch/extension.h>
13 |
14 | at::Tensor ms_deform_attn_cuda_forward(
15 | const at::Tensor &value,
16 | const at::Tensor &spatial_shapes,
17 | const at::Tensor &level_start_index,
18 | const at::Tensor &sampling_loc,
19 | const at::Tensor &attn_weight,
20 | const int im2col_step);
21 |
22 | std::vector<at::Tensor> ms_deform_attn_cuda_backward(
23 | const at::Tensor &value,
24 | const at::Tensor &spatial_shapes,
25 | const at::Tensor &level_start_index,
26 | const at::Tensor &sampling_loc,
27 | const at::Tensor &attn_weight,
28 | const at::Tensor &grad_output,
29 | const int im2col_step);
30 |
31 |
--------------------------------------------------------------------------------
/models/dino/ops/src/cuda/ms_deform_attn_cuda.h:
--------------------------------------------------------------------------------
1 | /*!
2 | **************************************************************************************************
3 | * Deformable DETR
4 | * Copyright (c) 2020 SenseTime. All Rights Reserved.
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details]
6 | **************************************************************************************************
7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
8 | **************************************************************************************************
9 | */
10 |
11 | #pragma once
12 | #include <torch/extension.h>
13 |
14 | at::Tensor ms_deform_attn_cuda_forward(
15 | const at::Tensor &value,
16 | const at::Tensor &spatial_shapes,
17 | const at::Tensor &level_start_index,
18 | const at::Tensor &sampling_loc,
19 | const at::Tensor &attn_weight,
20 | const int im2col_step);
21 |
22 | std::vector<at::Tensor> ms_deform_attn_cuda_backward(
23 | const at::Tensor &value,
24 | const at::Tensor &spatial_shapes,
25 | const at::Tensor &level_start_index,
26 | const at::Tensor &sampling_loc,
27 | const at::Tensor &attn_weight,
28 | const at::Tensor &grad_output,
29 | const int im2col_step);
30 |
31 |
--------------------------------------------------------------------------------
/tools/run_dist_launch.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | # ------------------------------------------------------------------------
3 | # Copyright (c) 2022 megvii-research. All Rights Reserved.
4 | # ------------------------------------------------------------------------
5 | # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR)
6 | # Copyright (c) 2020 SenseTime. All Rights Reserved.
7 | # ------------------------------------------------------------------------
8 | # Modified from DETR (https://github.com/facebookresearch/detr)
9 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
10 | # ------------------------------------------------------------------------
11 |
12 |
13 | set -x
14 |
15 | GPUS=$1
16 | RUN_COMMAND=${@:2}
17 | if [ $GPUS -lt 8 ]; then
18 | GPUS_PER_NODE=${GPUS_PER_NODE:-$GPUS}
19 | else
20 | GPUS_PER_NODE=${GPUS_PER_NODE:-8}
21 | fi
22 | MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"}
23 | MASTER_PORT=${MASTER_PORT:-"29500"}
24 | NODE_RANK=${NODE_RANK:-0}
25 |
26 | let "NNODES=GPUS/GPUS_PER_NODE"
27 |
28 | python3 ./tools/launch.py \
29 | --nnodes ${NNODES} \
30 | --node_rank ${NODE_RANK} \
31 | --master_addr ${MASTER_ADDR} \
32 | --master_port ${MASTER_PORT} \
33 | --nproc_per_node ${GPUS_PER_NODE} \
34 | ${RUN_COMMAND}
--------------------------------------------------------------------------------
/models/ops/src/cpu/ms_deform_attn_cpu.h:
--------------------------------------------------------------------------------
1 | /*!
2 | **************************************************************************************************
3 | * Deformable DETR
4 | * Copyright (c) 2020 SenseTime. All Rights Reserved.
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details]
6 | **************************************************************************************************
7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
8 | **************************************************************************************************
9 | */
10 |
11 | #pragma once
12 | #include <torch/extension.h>
13 |
14 | at::Tensor
15 | ms_deform_attn_cpu_forward(
16 | const at::Tensor &value,
17 | const at::Tensor &spatial_shapes,
18 | const at::Tensor &level_start_index,
19 | const at::Tensor &sampling_loc,
20 | const at::Tensor &attn_weight,
21 | const int im2col_step);
22 |
23 | std::vector<at::Tensor>
24 | ms_deform_attn_cpu_backward(
25 | const at::Tensor &value,
26 | const at::Tensor &spatial_shapes,
27 | const at::Tensor &level_start_index,
28 | const at::Tensor &sampling_loc,
29 | const at::Tensor &attn_weight,
30 | const at::Tensor &grad_output,
31 | const int im2col_step);
32 |
33 |
34 |
--------------------------------------------------------------------------------
/models/dino/ops/src/cpu/ms_deform_attn_cpu.h:
--------------------------------------------------------------------------------
1 | /*!
2 | **************************************************************************************************
3 | * Deformable DETR
4 | * Copyright (c) 2020 SenseTime. All Rights Reserved.
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details]
6 | **************************************************************************************************
7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
8 | **************************************************************************************************
9 | */
10 |
11 | #pragma once
12 | #include <torch/extension.h>
13 |
14 | at::Tensor
15 | ms_deform_attn_cpu_forward(
16 | const at::Tensor &value,
17 | const at::Tensor &spatial_shapes,
18 | const at::Tensor &level_start_index,
19 | const at::Tensor &sampling_loc,
20 | const at::Tensor &attn_weight,
21 | const int im2col_step);
22 |
23 | std::vector<at::Tensor>
24 | ms_deform_attn_cpu_backward(
25 | const at::Tensor &value,
26 | const at::Tensor &spatial_shapes,
27 | const at::Tensor &level_start_index,
28 | const at::Tensor &sampling_loc,
29 | const at::Tensor &attn_weight,
30 | const at::Tensor &grad_output,
31 | const int im2col_step);
32 |
33 |
34 |
--------------------------------------------------------------------------------
/tools/eval_dance.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | # ------------------------------------------------------------------------
3 | # Copyright (c) 2022 megvii-research. All Rights Reserved.
4 | # ------------------------------------------------------------------------
5 |
6 |
7 | set -x
8 |
9 | set -o pipefail
10 |
11 | OUTPUT_DIR=$1
12 |
13 | # clean up *.pyc files
14 | rmpyc() {
15 | rm -rf $(find -name __pycache__)
16 | rm -rf $(find -name "*.pyc")
17 | }
18 |
19 |
20 | cp submit_dance.py $OUTPUT_DIR
21 |
22 | pushd $OUTPUT_DIR
23 |
24 | args=$(cat *.args)
25 | # rlaunch --cpu 8 --gpu 1 --memory 24000 --positive-tags 2080ti -P 13 -- python3 submit_dance.py ${args} --resume checkpoint.pth --exp_name tracker
26 | python3 submit_dance.py ${args} --resume checkpoint.pth --exp_name tracker
27 |
28 | popd
29 |
30 | # python3 ../TrackEval/scripts/run_mot_challenge.py \
31 | # --SPLIT_TO_EVAL val \
32 | # --METRICS HOTA CLEAR Identity \
33 | # --GT_FOLDER /data/datasets/dancetrack/val \
34 | # --SEQMAP_FILE seqmap \
35 | # --SKIP_SPLIT_FOL True \
36 | # --TRACKER_SUB_FOLDER tracker \
37 | # --TRACKERS_TO_EVAL $OUTPUT_DIR \
38 | # --USE_PARALLEL True \
39 | # --NUM_PARALLEL_CORES 8 \
40 | # --PLOT_CURVES False \
41 | # --TRACKERS_FOLDER '' | tee -a $OUTPUT_DIR/eval.log
42 |
--------------------------------------------------------------------------------
/tools/run_dist_slurm.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | # ------------------------------------------------------------------------
3 | # Copyright (c) 2022 megvii-research. All Rights Reserved.
4 | # ------------------------------------------------------------------------
5 | # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR)
6 | # Copyright (c) 2020 SenseTime. All Rights Reserved.
7 | # ------------------------------------------------------------------------
8 | # Modified from DETR (https://github.com/facebookresearch/detr)
9 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
10 | # ------------------------------------------------------------------------
11 |
12 |
13 | set -x
14 |
15 | PARTITION=$1
16 | JOB_NAME=$2
17 | GPUS=$3
18 | RUN_COMMAND=${@:4}
19 | if [ $GPUS -lt 8 ]; then
20 | GPUS_PER_NODE=${GPUS_PER_NODE:-$GPUS}
21 | else
22 | GPUS_PER_NODE=${GPUS_PER_NODE:-8}
23 | fi
24 | CPUS_PER_TASK=${CPUS_PER_TASK:-4}
25 | SRUN_ARGS=${SRUN_ARGS:-""}
26 |
27 | srun -p ${PARTITION} \
28 | --job-name=${JOB_NAME} \
29 | --gres=gpu:${GPUS_PER_NODE} \
30 | --ntasks=${GPUS} \
31 | --ntasks-per-node=${GPUS_PER_NODE} \
32 | --cpus-per-task=${CPUS_PER_TASK} \
33 | --kill-on-bad-exit=1 \
34 | ${SRUN_ARGS} \
35 | ${RUN_COMMAND}
36 |
37 |
--------------------------------------------------------------------------------
/tools/make_detdb.py:
--------------------------------------------------------------------------------
1 | # ------------------------------------------------------------------------
2 | # Copyright (c) 2022 megvii-research. All Rights Reserved.
3 | # ------------------------------------------------------------------------
4 |
5 |
6 | from glob import glob
7 | import json
8 | from concurrent.futures import ThreadPoolExecutor
9 | from threading import Lock
10 |
11 | from tqdm import tqdm
12 |
13 | det_db = {}
14 | to_cache = []
15 |
16 | for file in glob("/data/Dataset/mot/crowdhuman/train_image/*.txt"):
17 | to_cache.append(file)
18 |
19 | for file in glob("/data/Dataset/mot/dancetrack/*/*/img1/*.txt"):
20 | to_cache.append(file)
21 |
22 | for file in glob("/data/Dataset/mot/MOT17/images/*/*/img1/*.txt"):
23 | to_cache.append(file)
24 |
25 | for file in glob("/data/Dataset/mot/MOT20/train/*/img1/*.txt"):
26 | to_cache.append(file)
27 |
28 | for file in glob("/data/Dataset/mot/HIE20/train/*/img1/*.txt"):
29 | to_cache.append(file)
30 |
31 | pbar = tqdm(total=len(to_cache))
32 |
33 | mutex = Lock()
34 | def cache(file):
35 | with open(file) as f:
36 | tmp = [l for l in f]
37 | with mutex:
38 | det_db[file] = tmp
39 | pbar.update()
40 |
41 | with ThreadPoolExecutor(max_workers=48) as exe:
42 | for file in to_cache:
43 | exe.submit(cache, file)
44 |
45 | with open("/data/Dataset/mot/det_db_oc_sort_full.json", 'w') as f:
46 | json.dump(det_db, f)
47 |
48 |
--------------------------------------------------------------------------------
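The resulting JSON maps each per-frame detection .txt path to its raw text lines. A rough sketch of reading the cache back (the key below is an invented path that simply follows the same directory layout):

    import json

    with open("/data/Dataset/mot/det_db_oc_sort_full.json") as f:
        det_db = json.load(f)

    # every value is the list of raw lines from one det file, exactly as cached above
    key = "/data/Dataset/mot/dancetrack/train/dancetrack0001/img1/00000001.txt"
    for line in det_db.get(key, []):
        print(line.strip())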
/models/ops/MultiScaleDeformableAttention.egg-info/SOURCES.txt:
--------------------------------------------------------------------------------
1 | setup.py
2 | /mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/yanfeng/project/MOTRv2/CO-MOT/models/ops/src/vision.cpp
3 | /mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/yanfeng/project/MOTRv2/CO-MOT/models/ops/src/cpu/ms_deform_attn_cpu.cpp
4 | /mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/yanfeng/project/MOTRv2/CO-MOT/models/ops/src/cuda/ms_deform_attn_cuda.cu
5 | /mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/yanfeng/project/MOTRv2/MOTRv2/models/ops/src/vision.cpp
6 | /mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/yanfeng/project/MOTRv2/MOTRv2/models/ops/src/cpu/ms_deform_attn_cpu.cpp
7 | /mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/yanfeng/project/MOTRv2/MOTRv2/models/ops/src/cuda/ms_deform_attn_cuda.cu
8 | /mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/yanfeng/project/MOTRv2/MOTRv3/models/ops/src/vision.cpp
9 | /mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/yanfeng/project/MOTRv2/MOTRv3/models/ops/src/cpu/ms_deform_attn_cpu.cpp
10 | /mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/yanfeng/project/MOTRv2/MOTRv3/models/ops/src/cuda/ms_deform_attn_cuda.cu
11 | MultiScaleDeformableAttention.egg-info/PKG-INFO
12 | MultiScaleDeformableAttention.egg-info/SOURCES.txt
13 | MultiScaleDeformableAttention.egg-info/dependency_links.txt
14 | MultiScaleDeformableAttention.egg-info/top_level.txt
15 | functions/__init__.py
16 | functions/ms_deform_attn_func.py
17 | modules/__init__.py
18 | modules/ms_deform_attn.py
--------------------------------------------------------------------------------
/models/ops/src/cpu/ms_deform_attn_cpu.cpp:
--------------------------------------------------------------------------------
1 | /*!
2 | **************************************************************************************************
3 | * Deformable DETR
4 | * Copyright (c) 2020 SenseTime. All Rights Reserved.
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details]
6 | **************************************************************************************************
7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
8 | **************************************************************************************************
9 | */
10 |
11 | #include <vector>
12 | 
13 | #include <ATen/ATen.h>
14 | #include <ATen/cuda/CUDAContext.h>
15 |
16 |
17 | at::Tensor
18 | ms_deform_attn_cpu_forward(
19 | const at::Tensor &value,
20 | const at::Tensor &spatial_shapes,
21 | const at::Tensor &level_start_index,
22 | const at::Tensor &sampling_loc,
23 | const at::Tensor &attn_weight,
24 | const int im2col_step)
25 | {
26 | AT_ERROR("Not implement on cpu");
27 | }
28 |
29 | std::vector<at::Tensor>
30 | ms_deform_attn_cpu_backward(
31 | const at::Tensor &value,
32 | const at::Tensor &spatial_shapes,
33 | const at::Tensor &level_start_index,
34 | const at::Tensor &sampling_loc,
35 | const at::Tensor &attn_weight,
36 | const at::Tensor &grad_output,
37 | const int im2col_step)
38 | {
39 | AT_ERROR("Not implement on cpu");
40 | }
41 |
42 |
--------------------------------------------------------------------------------
/models/dino/ops/src/cpu/ms_deform_attn_cpu.cpp:
--------------------------------------------------------------------------------
1 | /*!
2 | **************************************************************************************************
3 | * Deformable DETR
4 | * Copyright (c) 2020 SenseTime. All Rights Reserved.
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details]
6 | **************************************************************************************************
7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
8 | **************************************************************************************************
9 | */
10 |
11 | #include <vector>
12 | 
13 | #include <ATen/ATen.h>
14 | #include <ATen/cuda/CUDAContext.h>
15 |
16 |
17 | at::Tensor
18 | ms_deform_attn_cpu_forward(
19 | const at::Tensor &value,
20 | const at::Tensor &spatial_shapes,
21 | const at::Tensor &level_start_index,
22 | const at::Tensor &sampling_loc,
23 | const at::Tensor &attn_weight,
24 | const int im2col_step)
25 | {
26 | AT_ERROR("Not implement on cpu");
27 | }
28 |
29 | std::vector<at::Tensor>
30 | ms_deform_attn_cpu_backward(
31 | const at::Tensor &value,
32 | const at::Tensor &spatial_shapes,
33 | const at::Tensor &level_start_index,
34 | const at::Tensor &sampling_loc,
35 | const at::Tensor &attn_weight,
36 | const at::Tensor &grad_output,
37 | const int im2col_step)
38 | {
39 | AT_ERROR("Not implement on cpu");
40 | }
41 |
42 |
--------------------------------------------------------------------------------
/tools/coco_evel.py:
--------------------------------------------------------------------------------
1 | import os
2 | import argparse
3 | import numpy as np
4 | from collections import defaultdict
5 | from pycocotools.coco import COCO
6 | from pycocotools.cocoeval import COCOeval
7 |
8 |
9 | parser = argparse.ArgumentParser('Deformable DETR Detector', add_help=False)
10 | parser.add_argument('--det_root', default='tracker', type=str)
11 | args = parser.parse_args()
12 |
13 | cocoGt = COCO(annotation_file='data/dancetrack/annotations/val.json')
14 |
15 |
16 | det_root = args.det_root
17 | tracklets = defaultdict()
18 |
19 | detRes = []
20 | for img_id in cocoGt.getImgIds():
21 | img = cocoGt.loadImgs(img_id)
22 |
23 | vid_name = img[0]['file_name'][:14]
24 | frame_id = img[0]['frame_id']
25 |
26 | if vid_name not in tracklets:
27 | tracklets[vid_name] = defaultdict(list)
28 | for line in open(os.path.join(det_root, vid_name+'.txt')):
29 | t, id, *xywhs = line.split(',')[:7]
30 | t, id = map(int, (t, id))
31 | tracklets[vid_name][t].append((id, *map(float, xywhs)))
32 |
33 | labels = tracklets[vid_name][frame_id]
34 |
35 | for l in labels:
36 | ann = defaultdict()
37 | ann['image_id'] = img[0]['id']
38 | ann['bbox'] = list(l[1:5])
39 | ann['category_id'] = 1
40 | ann['score'] = l[5]
41 | detRes.append(ann)
42 |
43 | cocoDt = cocoGt.loadRes(detRes)  # load our own generated results (path/filename of the result JSON file)
44 | cocoEval = COCOeval(cocoGt, cocoDt, "bbox")
45 | cocoEval.evaluate()
46 | cocoEval.accumulate()
47 | cocoEval.summarize()
--------------------------------------------------------------------------------
/tools/train.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | # ------------------------------------------------------------------------
3 | # Copyright (c) 2022 megvii-research. All Rights Reserved.
4 | # ------------------------------------------------------------------------
5 |
6 | # echo every command as it is executed
7 | set -x
8 |
9 | PY_ARGS=${@:2}
10 |
11 | # fail and report an error if any command in a pipeline fails
12 | set -o pipefail
13 | # sed -e: apply the sed edit expressions directly on the command line
14 | OUTPUT_BASE=$(echo $1 | sed -e "s/configs/exps/g" | sed -e "s/.args$//g")
15 | mkdir -p $OUTPUT_BASE
16 |
17 | for RUN in $(seq 100); do
18 | ls $OUTPUT_BASE | grep run$RUN && continue
19 | OUTPUT_DIR=$OUTPUT_BASE/run$RUN
20 | mkdir $OUTPUT_DIR && break
21 | done
22 |
23 | # clean up *.pyc files
24 | rmpyc() {
25 | rm -rf $(find -name __pycache__)
26 | rm -rf $(find -name "*.pyc")
27 | }
28 |
29 | # run backup
30 | echo "Backing up to log dir: $OUTPUT_DIR"
31 | rmpyc && cp -r models datasets util main.py engine.py eval_detr.py seqmap submit_dance.py $1 $OUTPUT_DIR
32 | echo " ...Done"
33 |
34 | # tar src to avoid future editing
35 | cleanup() {
36 | echo "Packing source code"
37 | rmpyc
38 | # tar -zcf models datasets util main.py engine.py eval.py submit.py --remove-files
39 | echo " ...Done"
40 | }
41 |
42 | args=$(cat $1)
43 |
44 | pushd $OUTPUT_DIR
45 | trap cleanup EXIT
46 |
47 | # log git status
48 | echo "Logging git status"
49 | git status > git_status
50 | git rev-parse HEAD > git_tag
51 | git diff > git_diff
52 | echo $PY_ARGS > desc
53 | echo " ...Done"
54 |
55 | python -m torch.distributed.launch --nproc_per_node=4 --master_port 29504 --use_env main.py ${args} --output_dir $OUTPUT_DIR |& tee -a output.log
56 |
--------------------------------------------------------------------------------
/tools/clip_train.py:
--------------------------------------------------------------------------------
1 | import os
2 | import cv2
3 | import numpy as np
4 | from collections import defaultdict
5 |
6 |
7 | root_data = 'data/MOT/MOT17_all/train'
8 | vids = os.listdir(root_data)
9 |
10 | for v in vids:
11 | if 'SDP' in v:
12 | labels_full = defaultdict(list)
13 | gt_path = os.path.join(root_data, v, 'gt', 'gt.txt')
14 | for l in open(gt_path):
15 | t, i, *xywh = l.strip().split(',')
16 | labels_full[int(t)].append([i, *xywh])
17 | imgs_root = os.path.join(root_data, v, 'img1')
18 | imgs_path = sorted(os.listdir(imgs_root))
19 |
20 | for ith, img_p in enumerate(imgs_path):
21 | if ith < (len(imgs_path)+1)//2:
22 | save_img = os.path.join(imgs_root, img_p).replace('MOT17_all', 'MOT17')
23 | save_label = os.path.join(root_data, v, 'gt', 'gt.txt').replace('MOT17_all', 'MOT17')
24 | print('train: %s' % save_img)
25 | else:
26 | save_img = os.path.join(imgs_root, img_p).replace('MOT17_all', 'MOT17').replace('train', 'val')
27 | save_label = os.path.join(root_data, v, 'gt', 'gt.txt').replace('MOT17_all', 'MOT17').replace('train', 'val')
28 | print('val: %s' % save_img)
29 | os.makedirs(os.path.dirname(save_label), exist_ok=True)
30 | with open(save_label, 'a+') as f:
31 | if ith+1 in labels_full:
32 | for l in labels_full[ith+1]:
33 | f.write('%d,%s,%s,%s,%s,%s,%s,%s,%s\n'%(ith+1, *l))
34 | img = cv2.imread(os.path.join(imgs_root, img_p))
35 | os.makedirs(os.path.dirname(save_img), exist_ok=True)
36 | cv2.imwrite(save_img, img)
--------------------------------------------------------------------------------
/datasets/__init__.py:
--------------------------------------------------------------------------------
1 | # ------------------------------------------------------------------------
2 | # Copyright (c) 2022 megvii-research. All Rights Reserved.
3 | # ------------------------------------------------------------------------
4 | # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR)
5 | # Copyright (c) 2020 SenseTime. All Rights Reserved.
6 | # ------------------------------------------------------------------------
7 | # Modified from DETR (https://github.com/facebookresearch/detr)
8 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
9 | # ------------------------------------------------------------------------
10 |
11 | from .dance import build as build_e2e_dance
12 | from .dance_test import build as build_e2e_dance_test
13 | from .tao import build as build_e2e_tao
14 | from .joint import build as build_e2e_joint
15 | from .mot import build as build_e2e_mot
16 | from .all import build as build_e2e_all
17 | from .bdd100k import build as build_e2e_bdd
18 | from .bdd100kcoco import build as build_e2e_bddcc
19 |
20 |
21 | def build_dataset(image_set, args):
22 | if args.dataset_file == 'e2e_joint':
23 | return build_e2e_joint(image_set, args)
24 | elif args.dataset_file == 'e2e_dance':
25 | return build_e2e_dance(image_set, args)
26 | elif args.dataset_file == 'e2e_dance_test':
27 | return build_e2e_dance_test(image_set, args)
28 | elif args.dataset_file == 'e2e_all':
29 | return build_e2e_all(image_set, args)
30 | elif args.dataset_file == 'e2e_bdd':
31 | return build_e2e_bdd(image_set, args)
32 | elif args.dataset_file == 'e2e_tao':
33 | return build_e2e_tao(image_set, args)
34 | elif args.dataset_file == 'e2e_bddcc':
35 | return build_e2e_bddcc(image_set, args)
36 | elif args.dataset_file == 'e2e_mot':
37 | return build_e2e_mot(image_set, args)
38 | raise ValueError(f'dataset {args.dataset_file} not supported')
39 |
--------------------------------------------------------------------------------
/util/checkpoint.py:
--------------------------------------------------------------------------------
1 | # ------------------------------------------------------------------------
2 | # Copyright (c) 2022 megvii-research. All Rights Reserved.
3 | # ------------------------------------------------------------------------
4 | # Modified from pytorch-checkpoint (https://github.com/csrhddlam/pytorch-checkpoint)
5 | # ------------------------------------------------------------------------
6 |
7 | import torch
8 |
9 |
10 | def check_require_grad(t):
11 | return isinstance(t, torch.Tensor) and t.requires_grad
12 |
13 |
14 | class CheckpointFunction(torch.autograd.Function):
15 | @staticmethod
16 | def forward(ctx, run_function, length, *args):
17 | ctx.run_function = run_function
18 | ctx.input_tensors = list(args[:length])
19 | ctx.input_params = list(args[length:])
20 | with torch.no_grad():
21 | output_tensors = ctx.run_function(*ctx.input_tensors)
22 | return output_tensors
23 |
24 | @staticmethod
25 | def backward(ctx, *output_grads):
26 | for i in range(len(ctx.input_tensors)):
27 | temp = ctx.input_tensors[i]
28 | if check_require_grad(temp):
29 | ctx.input_tensors[i] = temp.detach()
30 | ctx.input_tensors[i].requires_grad = temp.requires_grad
31 | with torch.enable_grad():
32 | output_tensors = ctx.run_function(*ctx.input_tensors)
33 | to_autograd = list(filter(check_require_grad, ctx.input_tensors))
34 | output_tensors, output_grads = zip(*filter(lambda t: t[0].requires_grad, zip(output_tensors, output_grads)))
35 | input_grads = torch.autograd.grad(output_tensors, to_autograd + ctx.input_params, output_grads, allow_unused=True)
36 | input_grads = list(input_grads)
37 | for i in range(len(ctx.input_tensors)):
38 | if not check_require_grad(ctx.input_tensors[i]):
39 | input_grads.insert(i, None)
40 | return (None, None) + tuple(input_grads)
41 |
--------------------------------------------------------------------------------
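A small, hypothetical usage sketch of CheckpointFunction: the forward pass runs under no_grad and the activations are recomputed inside backward, trading compute for memory. The run function returns a tuple so that backward can pair each output with its incoming gradient; the layer and shapes below are made up.

    import torch
    import torch.nn as nn
    from util.checkpoint import CheckpointFunction

    layer = nn.Linear(16, 16)
    x = torch.randn(4, 16, requires_grad=True)

    def run(inp):
        return (layer(inp),)  # tuple output so backward can zip outputs with grads

    # signature: (run_function, number_of_tensor_inputs, *tensor_inputs, *parameters)
    out, = CheckpointFunction.apply(run, 1, x, *layer.parameters())
    out.sum().backward()
    print(x.grad.shape, layer.weight.grad.shape)
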
/models/registry.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # @Author: Yihao Chen
3 | # @Date: 2021-08-16 16:03:17
4 | # @Last Modified by: Shilong Liu
5 | # @Last Modified time: 2022-01-23 15:26
6 | # modified from mmcv
7 |
8 | import inspect
9 | from functools import partial
10 |
11 |
12 | class Registry(object):
13 |
14 | def __init__(self, name):
15 | self._name = name
16 | self._module_dict = dict()
17 |
18 | def __repr__(self):
19 | format_str = self.__class__.__name__ + '(name={}, items={})'.format(
20 | self._name, list(self._module_dict.keys()))
21 | return format_str
22 |
23 | def __len__(self):
24 | return len(self._module_dict)
25 |
26 | @property
27 | def name(self):
28 | return self._name
29 |
30 | @property
31 | def module_dict(self):
32 | return self._module_dict
33 |
34 | def get(self, key):
35 | return self._module_dict.get(key, None)
36 |
37 | def registe_with_name(self, module_name=None, force=False):
38 | return partial(self.register, module_name=module_name, force=force)
39 |
40 | def register(self, module_build_function, module_name=None, force=False):
41 | """Register a module build function.
42 | Args:
43 | module_build_function (callable): build function to be registered.
44 | """
45 | if not inspect.isfunction(module_build_function):
46 | raise TypeError('module_build_function must be a function, but got {}'.format(
47 | type(module_build_function)))
48 | if module_name is None:
49 | module_name = module_build_function.__name__
50 | if not force and module_name in self._module_dict:
51 | raise KeyError('{} is already registered in {}'.format(
52 | module_name, self.name))
53 | self._module_dict[module_name] = module_build_function
54 |
55 | return module_build_function
56 |
57 | MODULE_BUILD_FUNCS = Registry('model build functions')
58 |
59 |
--------------------------------------------------------------------------------
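A self-contained sketch of the intended usage: a build function registers itself under a string key via registe_with_name (as the method is spelled here) and is later retrieved with get. The function below is hypothetical, and importing models.registry assumes the models package imports cleanly.

    from models.registry import MODULE_BUILD_FUNCS

    @MODULE_BUILD_FUNCS.registe_with_name(module_name='toy_detector')
    def build_toy_detector(args):
        # a real build function constructs and returns the model (and criterion, etc.)
        return {'hidden_dim': args.hidden_dim}

    build_func = MODULE_BUILD_FUNCS.get('toy_detector')
    assert build_func is build_toy_detector
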
/models/ops/src/ms_deform_attn.h:
--------------------------------------------------------------------------------
1 | /*!
2 | **************************************************************************************************
3 | * Deformable DETR
4 | * Copyright (c) 2020 SenseTime. All Rights Reserved.
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details]
6 | **************************************************************************************************
7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
8 | **************************************************************************************************
9 | */
10 |
11 | #pragma once
12 |
13 | #include "cpu/ms_deform_attn_cpu.h"
14 |
15 | #ifdef WITH_CUDA
16 | #include "cuda/ms_deform_attn_cuda.h"
17 | #endif
18 |
19 |
20 | at::Tensor
21 | ms_deform_attn_forward(
22 | const at::Tensor &value,
23 | const at::Tensor &spatial_shapes,
24 | const at::Tensor &level_start_index,
25 | const at::Tensor &sampling_loc,
26 | const at::Tensor &attn_weight,
27 | const int im2col_step)
28 | {
29 | if (value.type().is_cuda())
30 | {
31 | #ifdef WITH_CUDA
32 | return ms_deform_attn_cuda_forward(
33 | value, spatial_shapes, level_start_index, sampling_loc, attn_weight, im2col_step);
34 | #else
35 | AT_ERROR("Not compiled with GPU support");
36 | #endif
37 | }
38 | AT_ERROR("Not implemented on the CPU");
39 | }
40 |
41 | std::vector<at::Tensor>
42 | ms_deform_attn_backward(
43 | const at::Tensor &value,
44 | const at::Tensor &spatial_shapes,
45 | const at::Tensor &level_start_index,
46 | const at::Tensor &sampling_loc,
47 | const at::Tensor &attn_weight,
48 | const at::Tensor &grad_output,
49 | const int im2col_step)
50 | {
51 | if (value.type().is_cuda())
52 | {
53 | #ifdef WITH_CUDA
54 | return ms_deform_attn_cuda_backward(
55 | value, spatial_shapes, level_start_index, sampling_loc, attn_weight, grad_output, im2col_step);
56 | #else
57 | AT_ERROR("Not compiled with GPU support");
58 | #endif
59 | }
60 | AT_ERROR("Not implemented on the CPU");
61 | }
62 |
63 |
--------------------------------------------------------------------------------
/models/dino/ops/src/ms_deform_attn.h:
--------------------------------------------------------------------------------
1 | /*!
2 | **************************************************************************************************
3 | * Deformable DETR
4 | * Copyright (c) 2020 SenseTime. All Rights Reserved.
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details]
6 | **************************************************************************************************
7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
8 | **************************************************************************************************
9 | */
10 |
11 | #pragma once
12 |
13 | #include "cpu/ms_deform_attn_cpu.h"
14 |
15 | #ifdef WITH_CUDA
16 | #include "cuda/ms_deform_attn_cuda.h"
17 | #endif
18 |
19 |
20 | at::Tensor
21 | ms_deform_attn_forward(
22 | const at::Tensor &value,
23 | const at::Tensor &spatial_shapes,
24 | const at::Tensor &level_start_index,
25 | const at::Tensor &sampling_loc,
26 | const at::Tensor &attn_weight,
27 | const int im2col_step)
28 | {
29 | if (value.type().is_cuda())
30 | {
31 | #ifdef WITH_CUDA
32 | return ms_deform_attn_cuda_forward(
33 | value, spatial_shapes, level_start_index, sampling_loc, attn_weight, im2col_step);
34 | #else
35 | AT_ERROR("Not compiled with GPU support");
36 | #endif
37 | }
38 | AT_ERROR("Not implemented on the CPU");
39 | }
40 |
41 | std::vector<at::Tensor>
42 | ms_deform_attn_backward(
43 | const at::Tensor &value,
44 | const at::Tensor &spatial_shapes,
45 | const at::Tensor &level_start_index,
46 | const at::Tensor &sampling_loc,
47 | const at::Tensor &attn_weight,
48 | const at::Tensor &grad_output,
49 | const int im2col_step)
50 | {
51 | if (value.type().is_cuda())
52 | {
53 | #ifdef WITH_CUDA
54 | return ms_deform_attn_cuda_backward(
55 | value, spatial_shapes, level_start_index, sampling_loc, attn_weight, grad_output, im2col_step);
56 | #else
57 | AT_ERROR("Not compiled with GPU support");
58 | #endif
59 | }
60 | AT_ERROR("Not implemented on the CPU");
61 | }
62 |
63 |
--------------------------------------------------------------------------------
/datasets/panoptic_eval.py:
--------------------------------------------------------------------------------
1 | # ------------------------------------------------------------------------
2 | # Copyright (c) 2022 megvii-research. All Rights Reserved.
3 | # ------------------------------------------------------------------------
4 | # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR)
5 | # Copyright (c) 2020 SenseTime. All Rights Reserved.
6 | # ------------------------------------------------------------------------
7 | # Modified from DETR (https://github.com/facebookresearch/detr)
8 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
9 | # ------------------------------------------------------------------------
10 |
11 |
12 | import json
13 | import os
14 |
15 | import util.misc as utils
16 |
17 | try:
18 | from panopticapi.evaluation import pq_compute
19 | except ImportError:
20 | pass
21 |
22 |
23 | class PanopticEvaluator(object):
24 | def __init__(self, ann_file, ann_folder, output_dir="panoptic_eval"):
25 | self.gt_json = ann_file
26 | self.gt_folder = ann_folder
27 | if utils.is_main_process():
28 | if not os.path.exists(output_dir):
29 | os.mkdir(output_dir)
30 | self.output_dir = output_dir
31 | self.predictions = []
32 |
33 | def update(self, predictions):
34 | for p in predictions:
35 | with open(os.path.join(self.output_dir, p["file_name"]), "wb") as f:
36 | f.write(p.pop("png_string"))
37 |
38 | self.predictions += predictions
39 |
40 | def synchronize_between_processes(self):
41 | all_predictions = utils.all_gather(self.predictions)
42 | merged_predictions = []
43 | for p in all_predictions:
44 | merged_predictions += p
45 | self.predictions = merged_predictions
46 |
47 | def summarize(self):
48 | if utils.is_main_process():
49 | json_data = {"annotations": self.predictions}
50 | predictions_json = os.path.join(self.output_dir, "predictions.json")
51 | with open(predictions_json, "w") as f:
52 | f.write(json.dumps(json_data))
53 | return pq_compute(self.gt_json, predictions_json, gt_folder=self.gt_folder, pred_folder=self.output_dir)
54 | return None
55 |
--------------------------------------------------------------------------------
/tools/merge_dance_tracklets.py:
--------------------------------------------------------------------------------
1 | # ------------------------------------------------------------------------
2 | # Copyright (c) 2022 megvii-research. All Rights Reserved.
3 | # ------------------------------------------------------------------------
4 |
5 |
6 | import argparse
7 | from collections import defaultdict
8 | import os
9 | from pathlib import Path
10 |
11 | parser = argparse.ArgumentParser()
12 | parser.add_argument('input_dir', type=Path)
13 | parser.add_argument('output_dir', type=Path)
14 | parser.add_argument('--t_min', default=20)
15 | parser.add_argument('--t_max', default=100)
16 | args = parser.parse_args()
17 |
18 |
19 | class FindUnionSet(dict):
20 | def find(self, src):
21 | if src in self:
22 | return self.find(self[src])
23 | return src
24 |
25 | def merge(self, dst, src):
26 | self[self.find(src)] = self.find(dst)
27 |
28 |
29 | for seq in os.listdir(args.input_dir):
30 | print(args.input_dir / seq)
31 | with open(args.input_dir / seq) as f:
32 | lines = f.readlines()
33 | instance_timestamps = defaultdict(list)
34 | for line in lines:
35 | f_id, id = map(int, line.split(',')[:2])
36 | instance_timestamps[id].append(f_id)
37 | instances = list(instance_timestamps.keys())
38 | fid_map = FindUnionSet()
39 | for i in instances:
40 | for j in instances:
41 | if fid_map.find(i) == fid_map.find(j):
42 | continue
43 | end_t = max(instance_timestamps[i])
44 | start_t = min(instance_timestamps[j])
45 | if sum([0 <= start_t - max(pts) < args.t_max for pts in instance_timestamps.values()]) > 1:
46 | continue
47 | if sum([0 <= min(pts) - end_t < args.t_max for pts in instance_timestamps.values()]) > 1:
48 | continue
49 | dt = start_t - end_t
50 | if args.t_min < dt < args.t_max:
51 | print(f"{i}<-{j}", end_t, start_t, start_t - end_t)
52 | fid_map.merge(i, j)
53 |
54 | os.makedirs(args.output_dir / 'tracker', exist_ok=True)
55 | with open(args.output_dir / 'tracker' / seq, 'w') as f:
56 | for line in lines:
57 | f_id, id, *info = line.split(',')
58 | id = str(fid_map.find(int(id)))
59 | f.write(','.join([f_id, id, *info]))
60 |
--------------------------------------------------------------------------------
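FindUnionSet above is a minimal union-find built on dict; a standalone sketch of its behaviour with made-up tracklet IDs:

    class FindUnionSet(dict):
        def find(self, src):
            if src in self:
                return self.find(self[src])
            return src

        def merge(self, dst, src):
            self[self.find(src)] = self.find(dst)

    fid_map = FindUnionSet()
    fid_map.merge(3, 7)   # tracklet 7 is absorbed into tracklet 3
    fid_map.merge(3, 9)   # tracklet 9 as well
    assert fid_map.find(7) == fid_map.find(9) == 3
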
/models/__init__.py:
--------------------------------------------------------------------------------
1 | # ------------------------------------------------------------------------
2 | # Deformable DETR
3 | # Copyright (c) 2020 SenseTime. All Rights Reserved.
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
5 | # ------------------------------------------------------------------------
6 | # Modified from DETR (https://github.com/facebookresearch/detr)
7 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
8 | # ------------------------------------------------------------------------
9 |
10 | from .deformable_transformer_plus import DeformableTransformer
11 | from .deformable_transformer_cross import DeformableTransformer as DeformableTransformerCross
12 | from .ftransformer import DetrTransformerDecoder
13 | def build_deforamble_transformer(args):
14 | arch_catalog = {
15 | 'DeformableTransformer': DeformableTransformer,
16 | 'DeformableTransformerCross': DeformableTransformerCross,
17 | }
18 | assert args.trans_mode in arch_catalog, 'invalid arch: {}'.format(args.trans_mode)
19 | build_func = arch_catalog[args.trans_mode]
20 |
21 | return build_func(
22 | d_model=args.hidden_dim,
23 | nhead=args.nheads,
24 | num_encoder_layers=args.enc_layers,
25 | num_decoder_layers=args.dec_layers,
26 | dim_feedforward=args.dim_feedforward,
27 | dropout=args.dropout,
28 | activation="relu",
29 | return_intermediate_dec=True,
30 | num_feature_levels=args.num_feature_levels,
31 | dec_n_points=args.dec_n_points,
32 | enc_n_points=args.enc_n_points,
33 | two_stage=args.two_stage,
34 | two_stage_num_proposals=args.num_queries,
35 | decoder_self_cross=not args.decoder_cross_self,
36 | sigmoid_attn=args.sigmoid_attn,
37 | extra_track_attn=args.extra_track_attn,
38 | memory_bank=args.memory_bank_type == 'MemoryBankFeat'
39 | )
40 |
41 |
42 | from .motr import build as build_motr
43 | from .motr_uninC import build as build_motr_uninC
44 | from .motr_uninCost import build as build_motr_uninCost
45 |
46 |
47 | from .tmotr_uni import build as build_tmotr_uni
48 |
49 | def build_model(args):
50 | arch_catalog = {
51 | 'motr': build_motr,
52 | 'motr_uninc': build_motr_uninC,
53 | 'motr_unincost': build_motr_uninCost,
54 | 'tmotr_uni': build_tmotr_uni,
55 | }
56 | assert args.meta_arch in arch_catalog, 'invalid arch: {}'.format(args.meta_arch)
57 | build_func = arch_catalog[args.meta_arch]
58 | return build_func(args)
59 |
60 |
61 |
--------------------------------------------------------------------------------
/models/yolo_fpn.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- encoding: utf-8 -*-
3 | # Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
4 |
5 | import torch
6 | import torch.nn as nn
7 |
8 | from .darknet import Darknet
9 | from .network_blocks import BaseConv
10 |
11 |
12 | class YOLOFPN(nn.Module):
13 | """
14 | YOLOFPN module. Darknet 53 is the default backbone of this model.
15 | """
16 |
17 | def __init__(
18 | self,
19 | depth=53,
20 | in_features=["dark3", "dark4", "dark5"],
21 | ):
22 | super().__init__()
23 |
24 | self.backbone = Darknet(depth)
25 | self.in_features = in_features
26 |
27 | # out 1
28 | self.out1_cbl = self._make_cbl(512, 256, 1)
29 | self.out1 = self._make_embedding([256, 512], 512 + 256)
30 |
31 | # out 2
32 | self.out2_cbl = self._make_cbl(256, 128, 1)
33 | self.out2 = self._make_embedding([128, 256], 256 + 128)
34 |
35 | # upsample
36 | self.upsample = nn.Upsample(scale_factor=2, mode="nearest")
37 |
38 | def _make_cbl(self, _in, _out, ks):
39 | return BaseConv(_in, _out, ks, stride=1, act="lrelu")
40 |
41 | def _make_embedding(self, filters_list, in_filters):
42 | m = nn.Sequential(
43 | *[
44 | self._make_cbl(in_filters, filters_list[0], 1),
45 | self._make_cbl(filters_list[0], filters_list[1], 3),
46 | self._make_cbl(filters_list[1], filters_list[0], 1),
47 | self._make_cbl(filters_list[0], filters_list[1], 3),
48 | self._make_cbl(filters_list[1], filters_list[0], 1),
49 | ]
50 | )
51 | return m
52 |
53 | def load_pretrained_model(self, filename="./weights/darknet53.mix.pth"):
54 | with open(filename, "rb") as f:
55 | state_dict = torch.load(f, map_location="cpu")
56 | print("loading pretrained weights...")
57 | self.backbone.load_state_dict(state_dict)
58 |
59 | def forward(self, inputs):
60 | """
61 | Args:
62 | inputs (Tensor): input image.
63 |
64 | Returns:
65 | Tuple[Tensor]: FPN output features.
66 | """
67 | # backbone
68 | out_features = self.backbone(inputs)
69 | x2, x1, x0 = [out_features[f] for f in self.in_features]
70 |
71 | # yolo branch 1
72 | x1_in = self.out1_cbl(x0)
73 | x1_in = self.upsample(x1_in)
74 | x1_in = torch.cat([x1_in, x1], 1)
75 | out_dark4 = self.out1(x1_in)
76 |
77 | # yolo branch 2
78 | x2_in = self.out2_cbl(out_dark4)
79 | x2_in = self.upsample(x2_in)
80 | x2_in = torch.cat([x2_in, x2], 1)
81 | out_dark3 = self.out2(x2_in)
82 |
83 | outputs = (out_dark3, out_dark4, x0)
84 | return outputs
85 |
--------------------------------------------------------------------------------
/models/ops/setup.py:
--------------------------------------------------------------------------------
1 | # ------------------------------------------------------------------------------------------------
2 | # Deformable DETR
3 | # Copyright (c) 2020 SenseTime. All Rights Reserved.
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
5 | # ------------------------------------------------------------------------------------------------
6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
7 | # ------------------------------------------------------------------------------------------------
8 |
9 | import os
10 | import glob
11 |
12 | import torch
13 |
14 | from torch.utils.cpp_extension import CUDA_HOME
15 | from torch.utils.cpp_extension import CppExtension
16 | from torch.utils.cpp_extension import CUDAExtension
17 |
18 | from setuptools import find_packages
19 | from setuptools import setup
20 |
21 | requirements = ["torch", "torchvision"]
22 |
23 | def get_extensions():
24 | this_dir = os.path.dirname(os.path.abspath(__file__))
25 | extensions_dir = os.path.join(this_dir, "src")
26 |
27 | main_file = glob.glob(os.path.join(extensions_dir, "*.cpp"))
28 | source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp"))
29 | source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu"))
30 |
31 | sources = main_file + source_cpu
32 | extension = CppExtension
33 | extra_compile_args = {"cxx": []}
34 | define_macros = []
35 |
36 | if torch.cuda.is_available() and CUDA_HOME is not None:
37 | extension = CUDAExtension
38 | sources += source_cuda
39 | define_macros += [("WITH_CUDA", None)]
40 | extra_compile_args["nvcc"] = [
41 | "-DCUDA_HAS_FP16=1",
42 | "-D__CUDA_NO_HALF_OPERATORS__",
43 | "-D__CUDA_NO_HALF_CONVERSIONS__",
44 | "-D__CUDA_NO_HALF2_OPERATORS__",
45 | ]
46 | else:
47 | raise NotImplementedError('CUDA is not available')
48 |
49 | sources = [os.path.join(extensions_dir, s) for s in sources]
50 | include_dirs = [extensions_dir]
51 | ext_modules = [
52 | extension(
53 | "MultiScaleDeformableAttention",
54 | sources,
55 | include_dirs=include_dirs,
56 | define_macros=define_macros,
57 | extra_compile_args=extra_compile_args,
58 | )
59 | ]
60 | return ext_modules
61 |
62 | setup(
63 | name="MultiScaleDeformableAttention",
64 | version="1.0",
65 | author="Weijie Su",
66 | url="https://github.com/fundamentalvision/Deformable-DETR",
67 | description="PyTorch Wrapper for CUDA Functions of Multi-Scale Deformable Attention",
68 | packages=find_packages(exclude=("configs", "tests",)),
69 | ext_modules=get_extensions(),
70 | cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension},
71 | )
72 |
--------------------------------------------------------------------------------
/models/dino/ops/setup.py:
--------------------------------------------------------------------------------
1 | # ------------------------------------------------------------------------------------------------
2 | # Deformable DETR
3 | # Copyright (c) 2020 SenseTime. All Rights Reserved.
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
5 | # ------------------------------------------------------------------------------------------------
6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
7 | # ------------------------------------------------------------------------------------------------
8 |
9 | import os
10 | import glob
11 |
12 | import torch
13 |
14 | from torch.utils.cpp_extension import CUDA_HOME
15 | from torch.utils.cpp_extension import CppExtension
16 | from torch.utils.cpp_extension import CUDAExtension
17 |
18 | from setuptools import find_packages
19 | from setuptools import setup
20 |
21 | requirements = ["torch", "torchvision"]
22 |
23 | def get_extensions():
24 | this_dir = os.path.dirname(os.path.abspath(__file__))
25 | extensions_dir = os.path.join(this_dir, "src")
26 |
27 | main_file = glob.glob(os.path.join(extensions_dir, "*.cpp"))
28 | source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp"))
29 | source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu"))
30 |
31 | sources = main_file + source_cpu
32 | extension = CppExtension
33 | extra_compile_args = {"cxx": []}
34 | define_macros = []
35 |
36 |
37 |
38 | if torch.cuda.is_available() and CUDA_HOME is not None:
39 | extension = CUDAExtension
40 | sources += source_cuda
41 | define_macros += [("WITH_CUDA", None)]
42 | extra_compile_args["nvcc"] = [
43 | "-DCUDA_HAS_FP16=1",
44 | "-D__CUDA_NO_HALF_OPERATORS__",
45 | "-D__CUDA_NO_HALF_CONVERSIONS__",
46 | "-D__CUDA_NO_HALF2_OPERATORS__",
47 | ]
48 | else:
49 | raise NotImplementedError('CUDA is not available')
50 |
51 | sources = [os.path.join(extensions_dir, s) for s in sources]
52 | include_dirs = [extensions_dir]
53 | ext_modules = [
54 | extension(
55 | "MultiScaleDeformableAttention",
56 | sources,
57 | include_dirs=include_dirs,
58 | define_macros=define_macros,
59 | extra_compile_args=extra_compile_args,
60 | )
61 | ]
62 | return ext_modules
63 |
64 | setup(
65 | name="MultiScaleDeformableAttention",
66 | version="1.0",
67 | author="Weijie Su",
68 | url="https://github.com/fundamentalvision/Deformable-DETR",
69 | description="PyTorch Wrapper for CUDA Functions of Multi-Scale Deformable Attention",
70 | packages=find_packages(exclude=("configs", "tests",)),
71 | ext_modules=get_extensions(),
72 | cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension},
73 | )
74 |
--------------------------------------------------------------------------------
/.vscode/launch.json:
--------------------------------------------------------------------------------
1 | {
2 | // Use IntelliSense to learn about possible attributes.
3 | // Hover to view descriptions of existing attributes.
4 | // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
5 | "version": "0.2.0",
6 | "configurations": [
7 | {
8 | "name": "Python: Current File",
9 | "type": "python",
10 | "request": "launch",
11 | "program": "${file}",
12 | "console": "integratedTerminal",
13 | "justMyCode": true,
14 | "env": {"CUDA_VISIBLE_DEVICES":"0", "CUBLAS_WORKSPACE_CONFIG":":4096:8"},
15 | // "args": ["--meta_arch", "motr_unincost", "--dataset_file", "e2e_dance", "--epoch", "20", "--with_box_refine", "--lr_drop", "8", "--lr", "2e-4", "--lr_backbone", "2e-5", "--pretrained", "/home/hadoop-vacv/yanfeng/project/MOTRv2/MOTRv3/exps/motrv2ch_uni5cost6g/run2/checkpoint.pth", "--batch_size", "2", "--sample_mode", "random_interval", "--sample_interval", "10", "--sampler_lengths", "5", "--merger_dropout", "0", "--dropout", "0", "--random_drop", "0.1", "--fp_ratio", "0.3", "--query_interaction_layer", "GQIM", "--num_queries", "60", "--append_crowd", "--use_checkpoint", "--mot_path", "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-vacv/yanfeng/data/", "--match_type", "gmatch", "--g_size", "3", "--resume", "/home/hadoop-vacv/yanfeng/project/MOTRv2/CO-MOT/exps/motrv2ch_uni5cost3ggoon/run1/checkpoint.pth"]
16 |
17 | // "args": ["--meta_arch", "dino", "--dataset_file", "e2e_dance", "--epoch", "20", "--with_box_refine", "--lr_drop", "8", "--lr", "2e-4", "--lr_backbone", "2e-5", "--pretrained", "/home/hadoop-vacv/yanfeng/project/MOTRv2/MOTRv3/exps/motrv2ch_uni5cost6g/run2/checkpoint.pth", "--batch_size", "2", "--sample_mode", "random_interval", "--sample_interval", "10", "--sampler_lengths", "5", "--merger_dropout", "0", "--dropout", "0", "--random_drop", "0.1", "--fp_ratio", "0.3", "--query_interaction_layer", "GQIM", "--num_queries", "900", "--append_crowd", "--use_checkpoint", "--mot_path", "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-vacv/yanfeng/data/", "--match_type", "HungarianMatcher", "--g_size", "1", "--num_feature_levels", "5", "--dim_feedforward", "2048", "--resume", "/home/hadoop-vacv/yanfeng/project/MOTRv2/CO-MOT/checkpoints/dino_0031_5scale.pth"]
18 | "args": ["--meta_arch", "mot_dino", "--dataset_file", "e2e_dance", "--epoch", "20", "--with_box_refine", "--lr_drop", "8", "--lr", "2e-4", "--lr_backbone", "2e-5", "--pretrained", "/home/hadoop-vacv/yanfeng/project/MOTRv2/CO-MOT/checkpoints/dino_0031_5scale.pth", "--batch_size", "2", "--sample_mode", "random_interval", "--sample_interval", "10", "--sampler_lengths", "5", "--merger_dropout", "0", "--dropout", "0", "--random_drop", "0.1", "--fp_ratio", "0.3", "--query_interaction_layer", "GQIM", "--num_queries", "900", "--append_crowd", "--use_checkpoint", "--mot_path", "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-vacv/yanfeng/data/", "--match_type", "HungarianMatcher", "--g_size", "1", "--num_feature_levels", "5", "--dim_feedforward", "2048"]
19 | }
20 | ]
21 | }
--------------------------------------------------------------------------------
/models/losses.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- encoding: utf-8 -*-
3 | # Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
4 |
5 | import torch
6 | import torch.nn as nn
7 | import torch.nn.functional as F
8 |
9 |
10 | class IOUloss(nn.Module):
11 | def __init__(self, reduction="none", loss_type="iou"):
12 | super(IOUloss, self).__init__()
13 | self.reduction = reduction
14 | self.loss_type = loss_type
15 |
16 | def forward(self, pred, target):
17 | assert pred.shape[0] == target.shape[0]
18 |
19 | pred = pred.view(-1, 4)
20 | target = target.view(-1, 4)
21 | tl = torch.max(
22 | (pred[:, :2] - pred[:, 2:] / 2), (target[:, :2] - target[:, 2:] / 2)
23 | )
24 | br = torch.min(
25 | (pred[:, :2] + pred[:, 2:] / 2), (target[:, :2] + target[:, 2:] / 2)
26 | )
27 |
28 | area_p = torch.prod(pred[:, 2:], 1)
29 | area_g = torch.prod(target[:, 2:], 1)
30 |
31 | en = (tl < br).type(tl.type()).prod(dim=1)
32 | area_i = torch.prod(br - tl, 1) * en
33 | iou = (area_i) / (area_p + area_g - area_i + 1e-16)
34 |
35 | if self.loss_type == "iou":
36 | loss = 1 - iou ** 2
37 | elif self.loss_type == "giou":
38 | c_tl = torch.min(
39 | (pred[:, :2] - pred[:, 2:] / 2), (target[:, :2] - target[:, 2:] / 2)
40 | )
41 | c_br = torch.max(
42 | (pred[:, :2] + pred[:, 2:] / 2), (target[:, :2] + target[:, 2:] / 2)
43 | )
44 | area_c = torch.prod(c_br - c_tl, 1)
45 | giou = iou - (area_c - area_i) / area_c.clamp(1e-16)
46 | loss = 1 - giou.clamp(min=-1.0, max=1.0)
47 |
48 | if self.reduction == "mean":
49 | loss = loss.mean()
50 | elif self.reduction == "sum":
51 | loss = loss.sum()
52 |
53 | return loss
54 |
55 |
56 | def sigmoid_focal_loss(inputs, targets, num_boxes, alpha: float = 0.25, gamma: float = 2):
57 | """
58 | Loss used in RetinaNet for dense detection: https://arxiv.org/abs/1708.02002.
59 | Args:
60 | inputs: A float tensor of arbitrary shape.
61 | The predictions for each example.
62 | targets: A float tensor with the same shape as inputs. Stores the binary
63 | classification label for each element in inputs
64 | (0 for the negative class and 1 for the positive class).
65 | alpha: (optional) Weighting factor in range (0,1) to balance
66 | positive vs negative examples. Default = 0.25.
67 | gamma: Exponent of the modulating factor (1 - p_t) to
68 | balance easy vs hard examples.
69 | Returns:
70 | Loss tensor
71 | """
72 | prob = inputs.sigmoid()
73 | ce_loss = F.binary_cross_entropy_with_logits(inputs, targets, reduction="none")
74 | p_t = prob * targets + (1 - prob) * (1 - targets)
75 | loss = ce_loss * ((1 - p_t) ** gamma)
76 |
77 | if alpha >= 0:
78 | alpha_t = alpha * targets + (1 - alpha) * (1 - targets)
79 | loss = alpha_t * loss
80 | #return loss.mean(0).sum() / num_boxes
81 | return loss.sum() / num_boxes
--------------------------------------------------------------------------------
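A quick toy check of sigmoid_focal_loss (inputs are hypothetical, and the import assumes the models package loads cleanly): with all-zero logits every probability is 0.5, so p_t = 0.5 and the modulating factor (1 - p_t)**gamma is 0.25, giving a small finite loss.

    import torch
    from models.losses import sigmoid_focal_loss

    logits = torch.zeros(2, 3)                      # raw predictions, arbitrary shape
    targets = torch.tensor([[1., 0., 0.],
                            [0., 1., 0.]])          # binary labels, same shape
    loss = sigmoid_focal_loss(logits, targets, num_boxes=2)
    print(loss)                                     # scalar: summed, divided by num_boxes
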
/util/box_ops.py:
--------------------------------------------------------------------------------
1 | # ------------------------------------------------------------------------
2 | # Copyright (c) 2022 megvii-research. All Rights Reserved.
3 | # ------------------------------------------------------------------------
4 | # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR)
5 | # Copyright (c) 2020 SenseTime. All Rights Reserved.
6 | # ------------------------------------------------------------------------
7 | # Modified from DETR (https://github.com/facebookresearch/detr)
8 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
9 | # ------------------------------------------------------------------------
10 |
11 |
12 | """
13 | Utilities for bounding box manipulation and GIoU.
14 | """
15 | import torch
16 | from torchvision.ops.boxes import box_area
17 |
18 |
19 | def box_cxcywh_to_xyxy(x):
20 | x_c, y_c, w, h = x.unbind(-1)
21 | b = [(x_c - 0.5 * w), (y_c - 0.5 * h),
22 | (x_c + 0.5 * w), (y_c + 0.5 * h)]
23 | return torch.stack(b, dim=-1)
24 |
25 |
26 | def box_xyxy_to_cxcywh(x):
27 | x0, y0, x1, y1 = x.unbind(-1)
28 | b = [(x0 + x1) / 2, (y0 + y1) / 2,
29 | (x1 - x0), (y1 - y0)]
30 | return torch.stack(b, dim=-1)
31 |
32 |
33 | # modified from torchvision to also return the union
34 | def box_iou(boxes1, boxes2):
35 | area1 = box_area(boxes1)
36 | area2 = box_area(boxes2)
37 |
38 | lt = torch.max(boxes1[:, None, :2], boxes2[:, :2]) # [N,M,2]
39 | rb = torch.min(boxes1[:, None, 2:], boxes2[:, 2:]) # [N,M,2]
40 |
41 | wh = (rb - lt).clamp(min=0) # [N,M,2]
42 | inter = wh[:, :, 0] * wh[:, :, 1] # [N,M]
43 |
44 | union = area1[:, None] + area2 - inter
45 |
46 | iou = inter / union
47 | return iou, union
48 |
49 |
50 | def generalized_box_iou(boxes1, boxes2):
51 | """
52 | Generalized IoU from https://giou.stanford.edu/
53 |
54 | The boxes should be in [x0, y0, x1, y1] format
55 |
56 | Returns a [N, M] pairwise matrix, where N = len(boxes1)
57 | and M = len(boxes2)
58 | """
59 | # degenerate boxes gives inf / nan results
60 | # so do an early check
61 | assert (boxes1[:, 2:] >= boxes1[:, :2]).all()
62 | assert (boxes2[:, 2:] >= boxes2[:, :2]).all()
63 | iou, union = box_iou(boxes1, boxes2)
64 |
65 | lt = torch.min(boxes1[:, None, :2], boxes2[:, :2])
66 | rb = torch.max(boxes1[:, None, 2:], boxes2[:, 2:])
67 |
68 | wh = (rb - lt).clamp(min=0) # [N,M,2]
69 | area = wh[:, :, 0] * wh[:, :, 1]
70 |
71 | return iou - (area - union) / area
72 |
73 |
74 | def masks_to_boxes(masks):
75 | """Compute the bounding boxes around the provided masks
76 |
77 | The masks should be in format [N, H, W] where N is the number of masks, (H, W) are the spatial dimensions.
78 |
79 | Returns a [N, 4] tensors, with the boxes in xyxy format
80 | """
81 | if masks.numel() == 0:
82 | return torch.zeros((0, 4), device=masks.device)
83 |
84 | h, w = masks.shape[-2:]
85 |
86 | y = torch.arange(0, h, dtype=torch.float)
87 | x = torch.arange(0, w, dtype=torch.float)
88 | y, x = torch.meshgrid(y, x)
89 |
90 | x_mask = (masks * x.unsqueeze(0))
91 | x_max = x_mask.flatten(1).max(-1)[0]
92 | x_min = x_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0]
93 |
94 | y_mask = (masks * y.unsqueeze(0))
95 | y_max = y_mask.flatten(1).max(-1)[0]
96 | y_min = y_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0]
97 |
98 | return torch.stack([x_min, y_min, x_max, y_max], 1)
99 |
--------------------------------------------------------------------------------
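A minimal sanity check of the helpers above on hand-picked boxes: an identical pair gives IoU = GIoU = 1, a disjoint pair gives IoU = 0 and a negative GIoU.

    import torch
    from util.box_ops import box_cxcywh_to_xyxy, generalized_box_iou

    a = box_cxcywh_to_xyxy(torch.tensor([[0.5, 0.5, 1.0, 1.0]]))  # -> [[0, 0, 1, 1]]
    b = torch.tensor([[0.0, 0.0, 1.0, 1.0],   # same box in xyxy
                      [2.0, 2.0, 3.0, 3.0]])  # disjoint box
    print(generalized_box_iou(a, b))          # approx [[1.0, -0.78]]
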
/models/dino/ops/functions/ms_deform_attn_func.py:
--------------------------------------------------------------------------------
1 | # ------------------------------------------------------------------------------------------------
2 | # Deformable DETR
3 | # Copyright (c) 2020 SenseTime. All Rights Reserved.
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
5 | # ------------------------------------------------------------------------------------------------
6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
7 | # ------------------------------------------------------------------------------------------------
8 |
9 | from __future__ import absolute_import
10 | from __future__ import print_function
11 | from __future__ import division
12 |
13 | import torch
14 | import torch.nn.functional as F
15 | from torch.autograd import Function
16 | from torch.autograd.function import once_differentiable
17 |
18 | import MultiScaleDeformableAttention as MSDA
19 |
20 |
21 | class MSDeformAttnFunction(Function):
22 | @staticmethod
23 | def forward(ctx, value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, im2col_step):
24 | ctx.im2col_step = im2col_step
25 | output = MSDA.ms_deform_attn_forward(
26 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, ctx.im2col_step)
27 | ctx.save_for_backward(value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights)
28 | return output
29 |
30 | @staticmethod
31 | @once_differentiable
32 | def backward(ctx, grad_output):
33 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights = ctx.saved_tensors
34 | grad_value, grad_sampling_loc, grad_attn_weight = \
35 | MSDA.ms_deform_attn_backward(
36 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, grad_output, ctx.im2col_step)
37 |
38 | return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None
39 |
40 |
41 | def ms_deform_attn_core_pytorch(value, value_spatial_shapes, sampling_locations, attention_weights):
42 | # for debug and test only,
43 | # need to use cuda version instead
44 | N_, S_, M_, D_ = value.shape
45 | _, Lq_, M_, L_, P_, _ = sampling_locations.shape
46 | value_list = value.split([H_ * W_ for H_, W_ in value_spatial_shapes], dim=1)
47 | sampling_grids = 2 * sampling_locations - 1
48 | sampling_value_list = []
49 | for lid_, (H_, W_) in enumerate(value_spatial_shapes):
50 | # N_, H_*W_, M_, D_ -> N_, H_*W_, M_*D_ -> N_, M_*D_, H_*W_ -> N_*M_, D_, H_, W_
51 | value_l_ = value_list[lid_].flatten(2).transpose(1, 2).reshape(N_*M_, D_, H_, W_)
52 | # N_, Lq_, M_, P_, 2 -> N_, M_, Lq_, P_, 2 -> N_*M_, Lq_, P_, 2
53 | sampling_grid_l_ = sampling_grids[:, :, :, lid_].transpose(1, 2).flatten(0, 1)
54 | # N_*M_, D_, Lq_, P_
55 | sampling_value_l_ = F.grid_sample(value_l_, sampling_grid_l_,
56 | mode='bilinear', padding_mode='zeros', align_corners=False)
57 | sampling_value_list.append(sampling_value_l_)
58 | # (N_, Lq_, M_, L_, P_) -> (N_, M_, Lq_, L_, P_) -> (N_, M_, 1, Lq_, L_*P_)
59 | attention_weights = attention_weights.transpose(1, 2).reshape(N_*M_, 1, Lq_, L_*P_)
60 | output = (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights).sum(-1).view(N_, M_*D_, Lq_)
61 | return output.transpose(1, 2).contiguous()
62 |
--------------------------------------------------------------------------------
/models/ops/functions/ms_deform_attn_func.py:
--------------------------------------------------------------------------------
1 | # ------------------------------------------------------------------------
2 | # Copyright (c) 2022 megvii-research. All Rights Reserved.
3 | # ------------------------------------------------------------------------
4 | # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR)
5 | # Copyright (c) 2020 SenseTime. All Rights Reserved.
6 | # ------------------------------------------------------------------------
7 | # Modified from DETR (https://github.com/facebookresearch/detr)
8 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
9 | # ------------------------------------------------------------------------
10 |
11 |
12 | from __future__ import absolute_import
13 | from __future__ import print_function
14 | from __future__ import division
15 |
16 | import torch
17 | import torch.nn.functional as F
18 | from torch.autograd import Function
19 | from torch.autograd.function import once_differentiable
20 |
21 | import MultiScaleDeformableAttention as MSDA
22 |
23 |
24 | class MSDeformAttnFunction(Function):
25 | @staticmethod
26 | def forward(ctx, value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, im2col_step):
27 | ctx.im2col_step = im2col_step
28 | output = MSDA.ms_deform_attn_forward(
29 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, ctx.im2col_step)
30 | ctx.save_for_backward(value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights)
31 | return output
32 |
33 | @staticmethod
34 | @once_differentiable
35 | def backward(ctx, grad_output):
36 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights = ctx.saved_tensors
37 | grad_value, grad_sampling_loc, grad_attn_weight = \
38 | MSDA.ms_deform_attn_backward(
39 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, grad_output, ctx.im2col_step)
40 |
41 | return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None
42 |
43 |
44 | def ms_deform_attn_core_pytorch(value, value_spatial_shapes, sampling_locations, attention_weights):
45 | # for debug and test only,
46 | # need to use cuda version instead
47 | N_, S_, M_, D_ = value.shape # batch, pixel, multi head, channel
48 | _, Lq_, M_, L_, P_, _ = sampling_locations.shape # batch, pixel, multi head, n_levels, n_points, 2
49 | value_list = value.split([H_ * W_ for H_, W_ in value_spatial_shapes], dim=1) # split value by feature level
50 | sampling_grids = 2 * sampling_locations - 1 # [-1,1]
51 | sampling_value_list = []
52 | for lid_, (H_, W_) in enumerate(value_spatial_shapes):
53 | # N_, H_*W_, M_, D_ -> N_, H_*W_, M_*D_ -> N_, M_*D_, H_*W_ -> N_*M_, D_, H_, W_
54 | value_l_ = value_list[lid_].flatten(2).transpose(1, 2).reshape(N_*M_, D_, H_, W_)
55 | # N_, Lq_, M_, P_, 2 -> N_, M_, Lq_, P_, 2 -> N_*M_, Lq_, P_, 2
56 | sampling_grid_l_ = sampling_grids[:, :, :, lid_].transpose(1, 2).flatten(0, 1)
57 | # N_*M_, D_, Lq_, P_
58 | sampling_value_l_ = F.grid_sample(value_l_, sampling_grid_l_,
59 | mode='bilinear', padding_mode='zeros', align_corners=False)
60 | sampling_value_list.append(sampling_value_l_)
61 | # (N_, Lq_, M_, L_, P_) -> (N_, M_, Lq_, L_, P_) -> (N_, M_, 1, Lq_, L_*P_)
62 | attention_weights = attention_weights.transpose(1, 2).reshape(N_*M_, 1, Lq_, L_*P_)
63 | output = (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights).sum(-1).view(N_, M_*D_, Lq_)
64 | return output.transpose(1, 2).contiguous()
65 |
--------------------------------------------------------------------------------
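For shape intuition, a hypothetical call to the pure-PyTorch reference above with arbitrary sizes; note that importing this module still requires the compiled MultiScaleDeformableAttention extension, since it is imported at the top of the file.

    import torch
    from models.ops.functions.ms_deform_attn_func import ms_deform_attn_core_pytorch

    N, M, D = 1, 2, 4                                # batch, heads, channels per head
    shapes = torch.tensor([[8, 8], [4, 4]])          # (H, W) of each feature level
    S = int((shapes[:, 0] * shapes[:, 1]).sum())     # total pixels across levels
    Lq, L, P = 5, 2, 3                               # queries, levels, points per level

    value = torch.rand(N, S, M, D)
    sampling_locations = torch.rand(N, Lq, M, L, P, 2)   # normalized to [0, 1]
    attention_weights = torch.softmax(torch.rand(N, Lq, M, L * P), -1).view(N, Lq, M, L, P)

    out = ms_deform_attn_core_pytorch(value, shapes, sampling_locations, attention_weights)
    print(out.shape)                                 # torch.Size([1, 5, 8]) = (N, Lq, M * D)
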
/models/ops/build/lib.linux-x86_64-3.8/functions/ms_deform_attn_func.py:
--------------------------------------------------------------------------------
1 | # ------------------------------------------------------------------------
2 | # Copyright (c) 2022 megvii-research. All Rights Reserved.
3 | # ------------------------------------------------------------------------
4 | # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR)
5 | # Copyright (c) 2020 SenseTime. All Rights Reserved.
6 | # ------------------------------------------------------------------------
7 | # Modified from DETR (https://github.com/facebookresearch/detr)
8 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
9 | # ------------------------------------------------------------------------
10 |
11 |
12 | from __future__ import absolute_import
13 | from __future__ import print_function
14 | from __future__ import division
15 |
16 | import torch
17 | import torch.nn.functional as F
18 | from torch.autograd import Function
19 | from torch.autograd.function import once_differentiable
20 |
21 | import MultiScaleDeformableAttention as MSDA
22 |
23 |
24 | class MSDeformAttnFunction(Function):
25 | @staticmethod
26 | def forward(ctx, value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, im2col_step):
27 | ctx.im2col_step = im2col_step
28 | output = MSDA.ms_deform_attn_forward(
29 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, ctx.im2col_step)
30 | ctx.save_for_backward(value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights)
31 | return output
32 |
33 | @staticmethod
34 | @once_differentiable
35 | def backward(ctx, grad_output):
36 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights = ctx.saved_tensors
37 | grad_value, grad_sampling_loc, grad_attn_weight = \
38 | MSDA.ms_deform_attn_backward(
39 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, grad_output, ctx.im2col_step)
40 |
41 | return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None
42 |
43 |
44 | def ms_deform_attn_core_pytorch(value, value_spatial_shapes, sampling_locations, attention_weights):
45 | # for debug and test only,
46 | # need to use cuda version instead
47 | N_, S_, M_, D_ = value.shape # batch, pixel, multi head, channel
48 | _, Lq_, M_, L_, P_, _ = sampling_locations.shape # batch, pixel, multi head, n_levels, n_points, 2
49 | value_list = value.split([H_ * W_ for H_, W_ in value_spatial_shapes], dim=1) # split value by feature level
50 | sampling_grids = 2 * sampling_locations - 1 # [-1,1]
51 | sampling_value_list = []
52 | for lid_, (H_, W_) in enumerate(value_spatial_shapes):
53 | # N_, H_*W_, M_, D_ -> N_, H_*W_, M_*D_ -> N_, M_*D_, H_*W_ -> N_*M_, D_, H_, W_
54 | value_l_ = value_list[lid_].flatten(2).transpose(1, 2).reshape(N_*M_, D_, H_, W_)
55 | # N_, Lq_, M_, P_, 2 -> N_, M_, Lq_, P_, 2 -> N_*M_, Lq_, P_, 2
56 | sampling_grid_l_ = sampling_grids[:, :, :, lid_].transpose(1, 2).flatten(0, 1)
57 | # N_*M_, D_, Lq_, P_
58 | sampling_value_l_ = F.grid_sample(value_l_, sampling_grid_l_,
59 | mode='bilinear', padding_mode='zeros', align_corners=False)
60 | sampling_value_list.append(sampling_value_l_)
61 | # (N_, Lq_, M_, L_, P_) -> (N_, M_, Lq_, L_, P_) -> (N_, M_, 1, Lq_, L_*P_)
62 | attention_weights = attention_weights.transpose(1, 2).reshape(N_*M_, 1, Lq_, L_*P_)
63 | output = (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights).sum(-1).view(N_, M_*D_, Lq_)
64 | return output.transpose(1, 2).contiguous()
65 |
--------------------------------------------------------------------------------
/models/ops/build/lib.linux-x86_64-cpython-37/functions/ms_deform_attn_func.py:
--------------------------------------------------------------------------------
1 | # ------------------------------------------------------------------------
2 | # Copyright (c) 2022 megvii-research. All Rights Reserved.
3 | # ------------------------------------------------------------------------
4 | # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR)
5 | # Copyright (c) 2020 SenseTime. All Rights Reserved.
6 | # ------------------------------------------------------------------------
7 | # Modified from DETR (https://github.com/facebookresearch/detr)
8 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
9 | # ------------------------------------------------------------------------
10 |
11 |
12 | from __future__ import absolute_import
13 | from __future__ import print_function
14 | from __future__ import division
15 |
16 | import torch
17 | import torch.nn.functional as F
18 | from torch.autograd import Function
19 | from torch.autograd.function import once_differentiable
20 |
21 | import MultiScaleDeformableAttention as MSDA
22 |
23 |
24 | class MSDeformAttnFunction(Function):
25 | @staticmethod
26 | def forward(ctx, value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, im2col_step):
27 | ctx.im2col_step = im2col_step
28 | output = MSDA.ms_deform_attn_forward(
29 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, ctx.im2col_step)
30 | ctx.save_for_backward(value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights)
31 | return output
32 |
33 | @staticmethod
34 | @once_differentiable
35 | def backward(ctx, grad_output):
36 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights = ctx.saved_tensors
37 | grad_value, grad_sampling_loc, grad_attn_weight = \
38 | MSDA.ms_deform_attn_backward(
39 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, grad_output, ctx.im2col_step)
40 |
41 | return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None
42 |
43 |
44 | def ms_deform_attn_core_pytorch(value, value_spatial_shapes, sampling_locations, attention_weights):
45 | # for debug and test only,
46 | # need to use cuda version instead
47 | N_, S_, M_, D_ = value.shape # batch, pixel, multi head, channel
48 | _, Lq_, M_, L_, P_, _ = sampling_locations.shape # batch, pixel, multi head, n_levels, n_points, 2
49 | value_list = value.split([H_ * W_ for H_, W_ in value_spatial_shapes], dim=1) # split value by feature level
50 | sampling_grids = 2 * sampling_locations - 1 # [-1,1]
51 | sampling_value_list = []
52 | for lid_, (H_, W_) in enumerate(value_spatial_shapes):
53 | # N_, H_*W_, M_, D_ -> N_, H_*W_, M_*D_ -> N_, M_*D_, H_*W_ -> N_*M_, D_, H_, W_
54 | value_l_ = value_list[lid_].flatten(2).transpose(1, 2).reshape(N_*M_, D_, H_, W_)
55 | # N_, Lq_, M_, P_, 2 -> N_, M_, Lq_, P_, 2 -> N_*M_, Lq_, P_, 2
56 | sampling_grid_l_ = sampling_grids[:, :, :, lid_].transpose(1, 2).flatten(0, 1)
57 | # N_*M_, D_, Lq_, P_
58 | sampling_value_l_ = F.grid_sample(value_l_, sampling_grid_l_,
59 | mode='bilinear', padding_mode='zeros', align_corners=False)
60 | sampling_value_list.append(sampling_value_l_)
61 | # (N_, Lq_, M_, L_, P_) -> (N_, M_, Lq_, L_, P_) -> (N_, M_, 1, Lq_, L_*P_)
62 | attention_weights = attention_weights.transpose(1, 2).reshape(N_*M_, 1, Lq_, L_*P_)
63 | output = (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights).sum(-1).view(N_, M_*D_, Lq_)
64 | return output.transpose(1, 2).contiguous()
65 |
--------------------------------------------------------------------------------
/tools/train_ddp.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | # ------------------------------------------------------------------------
3 | # Copyright (c) 2022 megvii-research. All Rights Reserved.
4 | # ------------------------------------------------------------------------
5 |
6 | # print every command before executing it
7 | set -x
8 |
9 | PY_ARGS=${@:2}
10 |
11 | # report an error if the script fails
12 | set -o pipefail
13 | # sed -e: apply the sed edit expression directly in command-line mode
14 | OUTPUT_BASE=$(echo $1 | sed -e "s/configs/exps/g" | sed -e "s/.args$//g")
15 | mkdir -p $OUTPUT_BASE
16 |
17 |
18 | cluster_spec=${AFO_ENV_CLUSTER_SPEC//\"/\\\"}
19 | echo "cluster spec is $cluster_spec"
20 | worker_list_command="import util.json_parser as json_parser;print(json_parser.parse(\"$cluster_spec\", \"worker\"))"
21 | echo "worker list command is $worker_list_command"
22 | eval worker_list=`python -c "$worker_list_command"`
23 | echo "worker list is $worker_list"
24 | worker_strs=(${worker_list//,/ })
25 | master=${worker_strs[0]}
26 | echo "master is $master"
27 | master_strs=(${master//:/ })
28 | master_addr=${master_strs[0]}
29 | master_port=${master_strs[1]}
30 | echo "master address is $master_addr"
31 | echo "master port is $master_port"
32 | index_command="import util.json_parser as json_parser;print(json_parser.parse(\"$cluster_spec\", \"index\"))"
33 | eval node_rank=`python -c "$index_command"`
34 | echo "node rank is $node_rank"
35 | dist_url="tcp://$master_addr:$master_port"
36 | echo "dist url is $dist_url"
37 | PYTHONPATH=$PYTHONPATH:../ \
38 | # python tools/run_net.py \
39 | # --num_shards 8 \
40 | # --shard_id $node_rank \
41 | # --dist_url $dist_url \
42 | # --cfg configs/verb/MVIT_B_32x2_CONV.yaml
43 |
44 | MASTER_ADDR=${MASTER_ADDR:-$master_addr}
45 | MASTER_PORT=${MASTER_PORT:-$master_port}
46 | NODE_RANK=${NODE_RANK:-$node_rank}
47 | # let "NNODES=GPUS/GPUS_PER_NODE"
48 |
49 | NODE_NUM=${#worker_strs[@]}
50 | echo "node num is $NODE_NUM"
51 |
52 | if ((NODE_RANK == 0)); then
53 | for RUN in $(seq 100); do
54 | ls $OUTPUT_BASE | grep run$RUN && continue
55 | OUTPUT_DIR=$OUTPUT_BASE/run$RUN
56 | mkdir $OUTPUT_DIR && break
57 | done
58 |
59 | # clean up *.pyc files
60 | rmpyc() {
61 | rm -rf $(find -name __pycache__)
62 | rm -rf $(find -name "*.pyc")
63 | }
64 |
65 | # run backup
66 | echo "Backing up to log dir: $OUTPUT_DIR"
67 | rmpyc && cp -r models datasets util main.py engine.py eval_detr.py seqmap submit_dance.py $1 $OUTPUT_DIR
68 | echo " ...Done"
69 |
70 | # tar src to avoid future editing
71 | cleanup() {
72 | echo "Packing source code"
73 | rmpyc
74 | # tar -zcf models datasets util main.py engine.py eval.py submit.py --remove-files
75 | echo " ...Done"
76 | }
77 |
78 | pushd $OUTPUT_DIR
79 | trap cleanup EXIT
80 |
81 | # log git status
82 | echo "Logging git status"
83 | git status > git_status
84 | git rev-parse HEAD > git_tag
85 | git diff > git_diff
86 | echo $PY_ARGS > desc
87 | echo " ...Done"
88 |
89 | else
90 | # 3 minutes
91 | sleep 180
92 | for RUN in $(seq 100); do
93 | ls $OUTPUT_BASE | grep run$RUN && continue
94 | let "ITERRUN=$RUN-1"
95 | OUTPUT_DIR=$OUTPUT_BASE/run$ITERRUN
96 | break
97 | done
98 | fi
99 |
100 | args=$(cat $1)
101 |
102 | # python -m torch.distributed.launch --nproc_per_node=8 --master_port 29502 --use_env main.py ${args} --output_dir $OUTPUT_DIR
103 |
104 | # python ./util/launch.py \
105 | # --nnodes 2 \
106 | # --node_rank ${NODE_RANK} \
107 | # --master_addr ${MASTER_ADDR} \
108 | # --master_port 29502 \
109 | # --nproc_per_node 8 \
110 | # python main.py "${args} --output_dir $OUTPUT_DIR"
111 | python -m torch.distributed.launch --nproc_per_node=8 --nnodes ${NODE_NUM} --node_rank ${NODE_RANK} --master_addr=${MASTER_ADDR} --master_port 29502 --use_env main.py ${args} --output_dir $OUTPUT_DIR
112 |
--------------------------------------------------------------------------------
/models/ops/build/temp.linux-x86_64-cpython-37/build.ninja:
--------------------------------------------------------------------------------
1 | ninja_required_version = 1.3
2 | cxx = c++
3 | nvcc = /usr/local/cuda/bin/nvcc
4 |
5 | cflags = -pthread -B /mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/yanfeng/software/anaconda3/envs/detr/compiler_compat -Wl,--sysroot=/ -Wsign-compare -DNDEBUG -g -fwrapv -O3 -Wall -Wstrict-prototypes -fPIC -DWITH_CUDA -I/mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/yanfeng/project/MOTRv2/MOTRv3/models/ops/src -I/mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/ganyiyang/software/Anaconda/envs/detr_yf/lib/python3.7/site-packages/torch/include -I/mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/ganyiyang/software/Anaconda/envs/detr_yf/lib/python3.7/site-packages/torch/include/torch/csrc/api/include -I/mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/ganyiyang/software/Anaconda/envs/detr_yf/lib/python3.7/site-packages/torch/include/TH -I/mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/ganyiyang/software/Anaconda/envs/detr_yf/lib/python3.7/site-packages/torch/include/THC -I/usr/local/cuda/include -I/mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/ganyiyang/software/Anaconda/envs/detr_yf/include/python3.7m -c
6 | post_cflags = -DTORCH_API_INCLUDE_EXTENSION_H -DTORCH_EXTENSION_NAME=MultiScaleDeformableAttention -D_GLIBCXX_USE_CXX11_ABI=0 -std=c++14
7 | cuda_cflags = -DWITH_CUDA -I/mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/yanfeng/project/MOTRv2/MOTRv3/models/ops/src -I/mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/ganyiyang/software/Anaconda/envs/detr_yf/lib/python3.7/site-packages/torch/include -I/mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/ganyiyang/software/Anaconda/envs/detr_yf/lib/python3.7/site-packages/torch/include/torch/csrc/api/include -I/mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/ganyiyang/software/Anaconda/envs/detr_yf/lib/python3.7/site-packages/torch/include/TH -I/mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/ganyiyang/software/Anaconda/envs/detr_yf/lib/python3.7/site-packages/torch/include/THC -I/usr/local/cuda/include -I/mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/ganyiyang/software/Anaconda/envs/detr_yf/include/python3.7m -c
8 | cuda_post_cflags = -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DCUDA_HAS_FP16=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ -DTORCH_API_INCLUDE_EXTENSION_H -DTORCH_EXTENSION_NAME=MultiScaleDeformableAttention -D_GLIBCXX_USE_CXX11_ABI=0 -gencode=arch=compute_70,code=sm_70 -std=c++14
9 | ldflags =
10 |
11 | rule compile
12 | command = $cxx -MMD -MF $out.d $cflags -c $in -o $out $post_cflags
13 | depfile = $out.d
14 | deps = gcc
15 |
16 | rule cuda_compile
17 | command = $nvcc $cuda_cflags -c $in -o $out $cuda_post_cflags
18 |
19 |
20 |
21 | build /mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/yanfeng/project/MOTRv2/MOTRv3/models/ops/build/temp.linux-x86_64-cpython-37/mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/yanfeng/project/MOTRv2/MOTRv3/models/ops/src/cpu/ms_deform_attn_cpu.o: compile /mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/yanfeng/project/MOTRv2/MOTRv3/models/ops/src/cpu/ms_deform_attn_cpu.cpp
22 | build /mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/yanfeng/project/MOTRv2/MOTRv3/models/ops/build/temp.linux-x86_64-cpython-37/mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/yanfeng/project/MOTRv2/MOTRv3/models/ops/src/cuda/ms_deform_attn_cuda.o: cuda_compile /mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/yanfeng/project/MOTRv2/MOTRv3/models/ops/src/cuda/ms_deform_attn_cuda.cu
23 | build /mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/yanfeng/project/MOTRv2/MOTRv3/models/ops/build/temp.linux-x86_64-cpython-37/mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/yanfeng/project/MOTRv2/MOTRv3/models/ops/src/vision.o: compile /mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/yanfeng/project/MOTRv2/MOTRv3/models/ops/src/vision.cpp
24 |
25 |
26 |
27 |
28 |
29 |
--------------------------------------------------------------------------------
/models/yolo_pafpn.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- encoding: utf-8 -*-
3 | # Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
4 |
5 | import torch
6 | import torch.nn as nn
7 |
8 | from .darknet import CSPDarknet
9 | from .network_blocks import BaseConv, CSPLayer, DWConv
10 |
11 |
12 | class YOLOPAFPN(nn.Module):
13 | """
14 | YOLO PAFPN module. CSPDarknet is the default backbone of this model.
15 | """
16 |
17 | def __init__(
18 | self,
19 | depth=1.0,
20 | width=1.0,
21 | in_features=("dark3", "dark4", "dark5"),
22 | in_channels=[256, 512, 1024],
23 | depthwise=False,
24 | act="silu",
25 | ):
26 | super().__init__()
27 | self.backbone = CSPDarknet(depth, width, depthwise=depthwise, act=act)
28 | self.in_features = in_features
29 | self.in_channels = in_channels
30 | Conv = DWConv if depthwise else BaseConv
31 |
32 | self.upsample = nn.Upsample(scale_factor=2, mode="nearest")
33 | self.lateral_conv0 = BaseConv(
34 | int(in_channels[2] * width), int(in_channels[1] * width), 1, 1, act=act
35 | )
36 | self.C3_p4 = CSPLayer(
37 | int(2 * in_channels[1] * width),
38 | int(in_channels[1] * width),
39 | round(3 * depth),
40 | False,
41 | depthwise=depthwise,
42 | act=act,
43 | ) # cat
44 |
45 | self.reduce_conv1 = BaseConv(
46 | int(in_channels[1] * width), int(in_channels[0] * width), 1, 1, act=act
47 | )
48 | self.C3_p3 = CSPLayer(
49 | int(2 * in_channels[0] * width),
50 | int(in_channels[0] * width),
51 | round(3 * depth),
52 | False,
53 | depthwise=depthwise,
54 | act=act,
55 | )
56 |
57 | # bottom-up conv
58 | self.bu_conv2 = Conv(
59 | int(in_channels[0] * width), int(in_channels[0] * width), 3, 2, act=act
60 | )
61 | self.C3_n3 = CSPLayer(
62 | int(2 * in_channels[0] * width),
63 | int(in_channels[1] * width),
64 | round(3 * depth),
65 | False,
66 | depthwise=depthwise,
67 | act=act,
68 | )
69 |
70 | # bottom-up conv
71 | self.bu_conv1 = Conv(
72 | int(in_channels[1] * width), int(in_channels[1] * width), 3, 2, act=act
73 | )
74 | self.C3_n4 = CSPLayer(
75 | int(2 * in_channels[1] * width),
76 | int(in_channels[2] * width),
77 | round(3 * depth),
78 | False,
79 | depthwise=depthwise,
80 | act=act,
81 | )
82 |
83 | def forward(self, input):
84 | """
85 | Args:
86 |             input: input images.
87 | 
88 |         Returns:
89 |             Tuple[Tensor]: FPN features (pan_out2, pan_out1, pan_out0) at strides 8, 16, 32.
90 | """
91 |
92 | # backbone
93 | out_features = self.backbone(input)
94 | features = [out_features[f] for f in self.in_features]
95 | [x2, x1, x0] = features
96 |
97 | fpn_out0 = self.lateral_conv0(x0) # 1024->512/32
98 | f_out0 = self.upsample(fpn_out0) # 512/16
99 | f_out0 = torch.cat([f_out0, x1], 1) # 512->1024/16
100 | f_out0 = self.C3_p4(f_out0) # 1024->512/16
101 |
102 | fpn_out1 = self.reduce_conv1(f_out0) # 512->256/16
103 | f_out1 = self.upsample(fpn_out1) # 256/8
104 | f_out1 = torch.cat([f_out1, x2], 1) # 256->512/8
105 | pan_out2 = self.C3_p3(f_out1) # 512->256/8
106 |
107 | p_out1 = self.bu_conv2(pan_out2) # 256->256/16
108 | p_out1 = torch.cat([p_out1, fpn_out1], 1) # 256->512/16
109 | pan_out1 = self.C3_n3(p_out1) # 512->512/16
110 |
111 | p_out0 = self.bu_conv1(pan_out1) # 512->512/32
112 | p_out0 = torch.cat([p_out0, fpn_out0], 1) # 512->1024/32
113 | pan_out0 = self.C3_n4(p_out0) # 1024->1024/32
114 |
115 | outputs = (pan_out2, pan_out1, pan_out0)
116 | return outputs
117 |
--------------------------------------------------------------------------------
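
A minimal usage sketch for YOLOPAFPN above, assuming the default depth/width multipliers and a `models.yolo_pafpn` import path inferred from this layout; the channel counts and strides follow the /8, /16, /32 comments in forward():

    import torch
    from models.yolo_pafpn import YOLOPAFPN  # import path assumed from this layout

    neck = YOLOPAFPN(depth=1.0, width=1.0).eval()
    x = torch.randn(1, 3, 640, 640)
    with torch.no_grad():
        p3, p4, p5 = neck(x)   # (pan_out2, pan_out1, pan_out0)
    # p3: (1, 256, 80, 80)   stride 8
    # p4: (1, 512, 40, 40)   stride 16
    # p5: (1, 1024, 20, 20)  stride 32
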
/models/position_encoding.py:
--------------------------------------------------------------------------------
1 | # ------------------------------------------------------------------------
2 | # Copyright (c) 2022 megvii-research. All Rights Reserved.
3 | # ------------------------------------------------------------------------
4 | # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR)
5 | # Copyright (c) 2020 SenseTime. All Rights Reserved.
6 | # ------------------------------------------------------------------------
7 | # Modified from DETR (https://github.com/facebookresearch/detr)
8 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
9 | # ------------------------------------------------------------------------
10 |
11 |
12 | """
13 | Various positional encodings for the transformer.
14 | """
15 | import math
16 | import torch
17 | from torch import nn
18 |
19 | from util.misc import NestedTensor
20 |
21 |
22 | class PositionEmbeddingSine(nn.Module):
23 | """
24 | This is a more standard version of the position embedding, very similar to the one
25 | used by the Attention is all you need paper, generalized to work on images.
26 | """
27 | def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None):
28 | super().__init__()
29 | self.num_pos_feats = num_pos_feats
30 | self.temperature = temperature
31 | self.normalize = normalize
32 | if scale is not None and normalize is False:
33 | raise ValueError("normalize should be True if scale is passed")
34 | if scale is None:
35 | scale = 2 * math.pi
36 | self.scale = scale
37 |
38 | def forward(self, tensor_list: NestedTensor):
39 | x = tensor_list.tensors
40 | mask = tensor_list.mask
41 | assert mask is not None
42 | not_mask = ~mask
43 | y_embed = not_mask.cumsum(1, dtype=torch.float32)
44 | x_embed = not_mask.cumsum(2, dtype=torch.float32)
45 | if self.normalize:
46 | eps = 1e-6
47 | y_embed = (y_embed - 0.5) / (y_embed[:, -1:, :] + eps) * self.scale
48 | x_embed = (x_embed - 0.5) / (x_embed[:, :, -1:] + eps) * self.scale
49 |
50 | dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device)
51 | dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats)
52 |
53 | pos_x = x_embed[:, :, :, None] / dim_t
54 | pos_y = y_embed[:, :, :, None] / dim_t
55 | pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3)
56 | pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3)
57 | pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2)
58 | return pos
59 |
60 |
61 | class PositionEmbeddingLearned(nn.Module):
62 | """
63 | Absolute pos embedding, learned.
64 | """
65 | def __init__(self, num_pos_feats=256):
66 | super().__init__()
67 | self.row_embed = nn.Embedding(50, num_pos_feats)
68 | self.col_embed = nn.Embedding(50, num_pos_feats)
69 | self.reset_parameters()
70 |
71 | def reset_parameters(self):
72 | nn.init.uniform_(self.row_embed.weight)
73 | nn.init.uniform_(self.col_embed.weight)
74 |
75 | def forward(self, tensor_list: NestedTensor):
76 | x = tensor_list.tensors
77 | h, w = x.shape[-2:]
78 | i = torch.arange(w, device=x.device)
79 | j = torch.arange(h, device=x.device)
80 | x_emb = self.col_embed(i)
81 | y_emb = self.row_embed(j)
82 | pos = torch.cat([
83 | x_emb.unsqueeze(0).repeat(h, 1, 1),
84 | y_emb.unsqueeze(1).repeat(1, w, 1),
85 | ], dim=-1).permute(2, 0, 1).unsqueeze(0).repeat(x.shape[0], 1, 1, 1)
86 | return pos
87 |
88 |
89 | def build_position_encoding(args):
90 | N_steps = args.hidden_dim // 2
91 | if args.position_embedding in ('v2', 'sine'):
92 | # TODO find a better way of exposing other arguments
93 | position_embedding = PositionEmbeddingSine(N_steps, normalize=True)
94 | elif args.position_embedding in ('v3', 'learned'):
95 | position_embedding = PositionEmbeddingLearned(N_steps)
96 | else:
97 | raise ValueError(f"not supported {args.position_embedding}")
98 |
99 | return position_embedding
100 |
--------------------------------------------------------------------------------
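
A minimal sketch of the sine/cosine interleaving PositionEmbeddingSine applies per coordinate, here for a single scalar position with illustrative num_pos_feats=4 and temperature=10000:

    import math
    import torch

    num_pos_feats, temperature = 4, 10000
    x = 0.25 * 2 * math.pi  # a normalized position already scaled by 2*pi, as in the module

    dim_t = torch.arange(num_pos_feats, dtype=torch.float32)
    dim_t = temperature ** (2 * (dim_t // 2) / num_pos_feats)  # consecutive sin/cos pairs share one frequency
    pos = x / dim_t
    emb = torch.stack((pos[0::2].sin(), pos[1::2].cos()), dim=1).flatten()
    # emb = [sin(x), cos(x), sin(x / sqrt(T)), cos(x / sqrt(T))] for num_pos_feats=4
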
/models/dino/ops/test.py:
--------------------------------------------------------------------------------
1 | # ------------------------------------------------------------------------------------------------
2 | # Deformable DETR
3 | # Copyright (c) 2020 SenseTime. All Rights Reserved.
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
5 | # ------------------------------------------------------------------------------------------------
6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
7 | # ------------------------------------------------------------------------------------------------
8 |
9 | from __future__ import absolute_import
10 | from __future__ import print_function
11 | from __future__ import division
12 |
13 | import time
14 | import torch
15 | import torch.nn as nn
16 | from torch.autograd import gradcheck
17 |
18 | from functions.ms_deform_attn_func import MSDeformAttnFunction, ms_deform_attn_core_pytorch
19 |
20 |
21 | N, M, D = 1, 2, 2
22 | Lq, L, P = 2, 2, 2
23 | shapes = torch.as_tensor([(6, 4), (3, 2)], dtype=torch.long).cuda()
24 | level_start_index = torch.cat((shapes.new_zeros((1, )), shapes.prod(1).cumsum(0)[:-1]))
25 | S = sum([(H*W).item() for H, W in shapes])
26 |
27 |
28 | torch.manual_seed(3)
29 |
30 |
31 | @torch.no_grad()
32 | def check_forward_equal_with_pytorch_double():
33 | value = torch.rand(N, S, M, D).cuda() * 0.01
34 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda()
35 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5
36 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True)
37 | im2col_step = 2
38 | output_pytorch = ms_deform_attn_core_pytorch(value.double(), shapes, sampling_locations.double(), attention_weights.double()).detach().cpu()
39 | output_cuda = MSDeformAttnFunction.apply(value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step).detach().cpu()
40 | fwdok = torch.allclose(output_cuda, output_pytorch)
41 | max_abs_err = (output_cuda - output_pytorch).abs().max()
42 | max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max()
43 |
44 | print(f'* {fwdok} check_forward_equal_with_pytorch_double: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}')
45 |
46 |
47 | @torch.no_grad()
48 | def check_forward_equal_with_pytorch_float():
49 | value = torch.rand(N, S, M, D).cuda() * 0.01
50 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda()
51 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5
52 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True)
53 | im2col_step = 2
54 | output_pytorch = ms_deform_attn_core_pytorch(value, shapes, sampling_locations, attention_weights).detach().cpu()
55 | output_cuda = MSDeformAttnFunction.apply(value, shapes, level_start_index, sampling_locations, attention_weights, im2col_step).detach().cpu()
56 | fwdok = torch.allclose(output_cuda, output_pytorch, rtol=1e-2, atol=1e-3)
57 | max_abs_err = (output_cuda - output_pytorch).abs().max()
58 | max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max()
59 |
60 | print(f'* {fwdok} check_forward_equal_with_pytorch_float: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}')
61 |
62 |
63 | def check_gradient_numerical(channels=4, grad_value=True, grad_sampling_loc=True, grad_attn_weight=True):
64 |
65 | value = torch.rand(N, S, M, channels).cuda() * 0.01
66 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda()
67 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5
68 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True)
69 | im2col_step = 2
70 | func = MSDeformAttnFunction.apply
71 |
72 | value.requires_grad = grad_value
73 | sampling_locations.requires_grad = grad_sampling_loc
74 | attention_weights.requires_grad = grad_attn_weight
75 |
76 | gradok = gradcheck(func, (value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step))
77 |
78 | print(f'* {gradok} check_gradient_numerical(D={channels})')
79 |
80 |
81 | if __name__ == '__main__':
82 | check_forward_equal_with_pytorch_double()
83 | check_forward_equal_with_pytorch_float()
84 |
85 | for channels in [30, 32, 64, 71, 1025, 2048, 3096]:
86 | check_gradient_numerical(channels, True, True, True)
87 |
88 |
89 |
90 |
--------------------------------------------------------------------------------
/models/ops/test.py:
--------------------------------------------------------------------------------
1 | # ------------------------------------------------------------------------------------------------
2 | # Deformable DETR
3 | # Copyright (c) 2020 SenseTime. All Rights Reserved.
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
5 | # ------------------------------------------------------------------------------------------------
6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
7 | # ------------------------------------------------------------------------------------------------
8 |
9 | from __future__ import absolute_import
10 | from __future__ import print_function
11 | from __future__ import division
12 |
13 | import time
14 | import torch
15 | import torch.nn as nn
16 | from torch.autograd import gradcheck
17 |
18 | from functions.ms_deform_attn_func import MSDeformAttnFunction, ms_deform_attn_core_pytorch
19 |
20 |
21 | N, M, D = 1, 2, 2
22 | Lq, L, P = 2, 2, 2
23 | shapes = torch.as_tensor([(6, 4), (3, 2)], dtype=torch.long).cuda()
24 | level_start_index = torch.cat((shapes.new_zeros((1, )), shapes.prod(1).cumsum(0)[:-1]))
25 | S = sum([(H*W).item() for H, W in shapes])
26 |
27 |
28 | torch.manual_seed(3)
29 |
30 |
31 | @torch.no_grad()
32 | def check_forward_equal_with_pytorch_double():
33 | value = torch.rand(N, S, M, D).cuda() * 0.01
34 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda()
35 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5
36 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True)
37 | im2col_step = 2
38 | output_pytorch = ms_deform_attn_core_pytorch(value.double(), shapes, sampling_locations.double(), attention_weights.double()).detach().cpu()
39 | output_cuda = MSDeformAttnFunction.apply(value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step).detach().cpu()
40 | fwdok = torch.allclose(output_cuda, output_pytorch)
41 | max_abs_err = (output_cuda - output_pytorch).abs().max()
42 | max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max()
43 |
44 | print(f'* {fwdok} check_forward_equal_with_pytorch_double: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}')
45 |
46 |
47 | @torch.no_grad()
48 | def check_forward_equal_with_pytorch_float():
49 | value = torch.rand(N, S, M, D, requires_grad=True).cuda() * 0.01
50 | sampling_locations = torch.rand(N, Lq, M, L, P, 2, requires_grad=True).cuda()
51 | attention_weights = torch.rand(N, Lq, M, L, P, requires_grad=True).cuda() + 1e-5
52 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True)
53 | im2col_step = 2
54 |
55 |     value = value.detach().requires_grad_(True)
56 |     sampling_locations = sampling_locations.detach().requires_grad_(True)
57 |     attention_weights = attention_weights.detach().requires_grad_(True)
58 |
59 | t0 = time.time()
60 | output_cuda = MSDeformAttnFunction.apply(value, shapes, level_start_index, sampling_locations, attention_weights, im2col_step).cpu()
61 |     print('cuda forward time: %.4fs' % (time.time() - t0))
62 | t0 = time.time()
63 | output_pytorch = ms_deform_attn_core_pytorch(value, shapes, sampling_locations, attention_weights).detach().cpu()
64 |     print('pytorch forward time: %.4fs' % (time.time() - t0))
65 | fwdok = torch.allclose(output_cuda, output_pytorch, rtol=1e-2, atol=1e-3)
66 | max_abs_err = (output_cuda - output_pytorch).abs().max()
67 | max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max()
68 |
69 | print(f'* {fwdok} check_forward_equal_with_pytorch_float: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}')
70 |
71 |
72 | def check_gradient_numerical(channels=4, grad_value=True, grad_sampling_loc=True, grad_attn_weight=True):
73 |
74 | value = torch.rand(N, S, M, channels).cuda() * 0.01
75 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda()
76 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5
77 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True)
78 | im2col_step = 2
79 | func = MSDeformAttnFunction.apply
80 |
81 | value.requires_grad = grad_value
82 | sampling_locations.requires_grad = grad_sampling_loc
83 | attention_weights.requires_grad = grad_attn_weight
84 |
85 | gradok = gradcheck(func, (value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step))
86 |
87 | print(f'* {gradok} check_gradient_numerical(D={channels})')
88 |
89 |
90 | if __name__ == '__main__':
91 | check_forward_equal_with_pytorch_double()
92 | check_forward_equal_with_pytorch_float()
93 |
94 | for channels in [30, 32, 64, 71, 1025, 2048, 3096]:
95 | check_gradient_numerical(channels, True, True, True)
96 |
97 |
98 |
99 |
--------------------------------------------------------------------------------
/datasets/data_prefetcher.py:
--------------------------------------------------------------------------------
1 | # ------------------------------------------------------------------------
2 | # Copyright (c) 2022 megvii-research. All Rights Reserved.
3 | # ------------------------------------------------------------------------
4 | # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR)
5 | # Copyright (c) 2020 SenseTime. All Rights Reserved.
6 | # ------------------------------------------------------------------------
7 | # Modified from DETR (https://github.com/facebookresearch/detr)
8 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
9 | # ------------------------------------------------------------------------
10 |
11 |
12 | import torch
13 | from functools import partial
14 | from models.structures import Instances
15 |
16 | def to_cuda(samples, targets, device):
17 | samples = samples.to(device, non_blocking=True)
18 | targets = [{k: v.to(device, non_blocking=True) for k, v in t.items()} for t in targets]
19 | return samples, targets
20 |
21 |
22 | def tensor_to_cuda(tensor: torch.Tensor, device):
23 | return tensor.to(device)
24 |
25 |
26 | def is_tensor_or_instances(data):
27 | return isinstance(data, torch.Tensor) or isinstance(data, Instances)
28 |
29 |
30 | def data_apply(data, check_func, apply_func):
31 | if isinstance(data, dict):
32 | for k in data.keys():
33 | if check_func(data[k]):
34 | data[k] = apply_func(data[k])
35 | elif isinstance(data[k], dict) or isinstance(data[k], list):
36 | data_apply(data[k], check_func, apply_func)
37 | else:
38 | raise ValueError()
39 | elif isinstance(data, list):
40 | for i in range(len(data)):
41 | if check_func(data[i]):
42 | data[i] = apply_func(data[i])
43 | elif isinstance(data[i], dict) or isinstance(data[i], list):
44 | data_apply(data[i], check_func, apply_func)
45 | else:
46 | raise ValueError("invalid type {}".format(type(data[i])))
47 | else:
48 | raise ValueError("invalid type {}".format(type(data)))
49 | return data
50 |
51 |
52 | def data_dict_to_cuda(data_dict, device):
53 | return data_apply(data_dict, is_tensor_or_instances, partial(tensor_to_cuda, device=device))
54 |
55 |
56 | class data_prefetcher():
57 | def __init__(self, loader, device, prefetch=True):
58 | self.loader = iter(loader)
59 | self.prefetch = prefetch
60 | self.device = device
61 | if prefetch:
62 | self.stream = torch.cuda.Stream()
63 | self.preload()
64 |
65 | def preload(self):
66 | try:
67 | self.next_samples, self.next_targets = next(self.loader)
68 | except StopIteration:
69 | self.next_samples = None
70 | self.next_targets = None
71 | return
72 | # if record_stream() doesn't work, another option is to make sure device inputs are created
73 | # on the main stream.
74 | # self.next_input_gpu = torch.empty_like(self.next_input, device='cuda')
75 | # self.next_target_gpu = torch.empty_like(self.next_target, device='cuda')
76 | # Need to make sure the memory allocated for next_* is not still in use by the main stream
77 | # at the time we start copying to next_*:
78 | # self.stream.wait_stream(torch.cuda.current_stream())
79 | with torch.cuda.stream(self.stream):
80 | self.next_samples, self.next_targets = to_cuda(self.next_samples, self.next_targets, self.device)
81 | # more code for the alternative if record_stream() doesn't work:
82 | # copy_ will record the use of the pinned source tensor in this side stream.
83 | # self.next_input_gpu.copy_(self.next_input, non_blocking=True)
84 | # self.next_target_gpu.copy_(self.next_target, non_blocking=True)
85 | # self.next_input = self.next_input_gpu
86 | # self.next_target = self.next_target_gpu
87 |
88 | # With Amp, it isn't necessary to manually convert data to half.
89 | # if args.fp16:
90 | # self.next_input = self.next_input.half()
91 | # else:
92 |
93 | def next(self):
94 | if self.prefetch:
95 | torch.cuda.current_stream().wait_stream(self.stream)
96 | samples = self.next_samples
97 | targets = self.next_targets
98 | if samples is not None:
99 | samples.record_stream(torch.cuda.current_stream())
100 | if targets is not None:
101 | for t in targets:
102 | for k, v in t.items():
103 | v.record_stream(torch.cuda.current_stream())
104 | self.preload()
105 | else:
106 | try:
107 | samples, targets = next(self.loader)
108 | samples, targets = to_cuda(samples, targets, self.device)
109 | except StopIteration:
110 | print("catch_stop_iter")
111 | samples = None
112 | targets = None
113 |
114 | return samples, targets
115 |
--------------------------------------------------------------------------------
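
A minimal, self-contained sketch of driving data_prefetcher, assuming a CUDA device; the toy dataset and collate function are placeholders that only match the (samples, targets) structure to_cuda() expects:

    import torch
    from torch.utils.data import DataLoader
    from datasets.data_prefetcher import data_prefetcher  # import path assumed from this layout

    def collate(batch):
        imgs = torch.stack([b[0] for b in batch])
        tgts = [b[1] for b in batch]           # list of dicts of tensors
        return imgs, tgts

    data = [(torch.randn(3, 64, 64), {'boxes': torch.rand(2, 4)}) for _ in range(8)]
    loader = DataLoader(data, batch_size=2, collate_fn=collate, pin_memory=True)

    prefetcher = data_prefetcher(loader, torch.device('cuda'), prefetch=True)
    samples, targets = prefetcher.next()
    while samples is not None:
        # model forward / loss / backward would go here
        samples, targets = prefetcher.next()
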
/util/tool.py:
--------------------------------------------------------------------------------
1 | # ------------------------------------------------------------------------
2 | # Copyright (c) 2022 megvii-research. All Rights Reserved.
3 | # ------------------------------------------------------------------------
4 | # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR)
5 | # Copyright (c) 2020 SenseTime. All Rights Reserved.
6 | # ------------------------------------------------------------------------
7 | # Modified from DETR (https://github.com/facebookresearch/detr)
8 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
9 | # ------------------------------------------------------------------------
10 |
11 | import torch
12 | import copy
13 | import numpy as np
14 | import collections
15 |
16 | def load_model(model, model_path, optimizer=None, resume=False,
17 | lr=None, lr_step=None):
18 | start_epoch = 0
19 | checkpoint = torch.load(model_path, map_location=lambda storage, loc: storage)
20 | print(f'loaded {model_path}')
21 | state_dict = checkpoint['model']
22 | model_state_dict = model.state_dict()
23 |
24 | # check loaded parameters and created model parameters
25 | msg = 'If you see this, your model does not fully load the ' + \
26 |           'pre-trained weights. Please make sure ' + \
27 | 'you set the correct --num_classes for your own dataset.'
28 | state_dict_old = copy.deepcopy(state_dict)
29 | for k in state_dict_old:
30 | if k in model_state_dict:
31 | if state_dict[k].shape != model_state_dict[k].shape:
32 |                 print('Skip loading parameter {}, required shape {}, '
33 |                       'loaded shape {}. {}'.format(
34 |                     k, model_state_dict[k].shape, state_dict[k].shape, msg))
35 | if 'class_embed' in k:
36 | print("load class_embed: {} shape={}".format(k, state_dict[k].shape))
37 | if model_state_dict[k].shape[0] == 1:
38 | state_dict[k] = state_dict[k][1:2]
39 | elif model_state_dict[k].shape[0] == 2:
40 | state_dict[k] = state_dict[k][1:3]
41 | elif model_state_dict[k].shape[0] == 3:
42 | state_dict[k] = state_dict[k][1:4]
43 | elif model_state_dict[k].shape[0] == 11:
44 | state_dict[k] = state_dict[k][1:12]
45 | elif model_state_dict[k].shape[0] == 100:
46 | state_dict[k] = state_dict[k].repeat_interleave(model_state_dict[k].shape[0]//state_dict[k].shape[0]+1, dim=0)[:model_state_dict[k].shape[0]]
47 | elif model_state_dict[k].shape[0] == 91 and state_dict[k].shape[0] == 1:
48 | state_dict[k] = state_dict[k].repeat_interleave(91, dim=0)
49 | elif model_state_dict[k].shape[0] == 2000:
50 | state_dict[k] = state_dict[k].repeat_interleave(model_state_dict[k].shape[0]//state_dict[k].shape[0]+1, dim=0)[:model_state_dict[k].shape[0]]
51 | else:
52 | raise NotImplementedError('invalid shape: {}'.format(model_state_dict[k].shape))
53 |                     continue
54 |                 state_dict[k] = model_state_dict[k]
55 | elif k.replace('in_proj_weight', 'in_proj.weight') in model_state_dict:
56 | k_dst = k.replace('in_proj_weight', 'in_proj.weight')
57 | print('{}->{}'.format(k, k_dst))
58 | state_dict = collections.OrderedDict([(k_dst, v) if k_ == k else (k_, v) for k_, v in state_dict.items()])
59 | elif k.replace('in_proj_bias', 'in_proj.bias') in model_state_dict:
60 | k_dst = k.replace('in_proj_bias', 'in_proj.bias')
61 | print('{}->{}'.format(k, k_dst))
62 | state_dict = collections.OrderedDict([(k_dst, v) if k_ == k else (k_, v) for k_, v in state_dict.items()])
63 | elif 'transformer.decoder.layers' in k and 'self_attn.in_proj' in k:
64 | k_dst_q = k.replace('in_proj_', 'in_proj_q.')
65 | k_dst_k = k.replace('in_proj_', 'in_proj_k.')
66 | k_dst_v = k.replace('in_proj_', 'in_proj_v.')
67 | print('{}->({},{},{})'.format(k, k_dst_q, k_dst_k, k_dst_v))
68 | state_dict[k_dst_q], state_dict[k_dst_k], state_dict[k_dst_v] = torch.chunk(state_dict[k], 3, dim=0)
69 | else:
70 | print('Drop parameter {}.'.format(k) + msg)
71 | for k in model_state_dict:
72 | if not (k in state_dict): # pretrain model
73 | if 'decoder_two' in k:
74 | state_dict[k] = state_dict[k.replace('.decoder_two.', '.decoder.')]
75 | elif '_embed_two' in k:
76 | state_dict[k] = state_dict[k.replace('_embed_two.', '_embed.')]
77 | else:
78 | print('No param {}.'.format(k) + msg)
79 | state_dict[k] = model_state_dict[k]
80 | model.load_state_dict(state_dict, strict=False)
81 |
82 | # resume optimizer parameters
83 | if optimizer is not None and resume:
84 | if 'optimizer' in checkpoint:
85 | optimizer.load_state_dict(checkpoint['optimizer'])
86 | start_epoch = checkpoint['epoch']
87 | start_lr = lr
88 | for step in lr_step:
89 | if start_epoch >= step:
90 | start_lr *= 0.1
91 | for param_group in optimizer.param_groups:
92 | param_group['lr'] = start_lr
93 | print('Resumed optimizer with start lr', start_lr)
94 | else:
95 | print('No optimizer parameters in checkpoint.')
96 | if optimizer is not None:
97 | return model, optimizer, start_epoch
98 | else:
99 | return model
100 |
101 |
102 |
103 |
--------------------------------------------------------------------------------
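
A minimal, self-contained sketch of the checkpoint format load_model expects and of the resume path; the tiny Linear model, file name, and hyper-parameters are placeholders, not the project's defaults:

    import torch
    from util.tool import load_model  # import path assumed from this layout

    model = torch.nn.Linear(4, 2)
    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-4)
    torch.save({'model': model.state_dict(),
                'optimizer': optimizer.state_dict(),
                'epoch': 5}, 'tmp_ckpt.pth')

    model, optimizer, start_epoch = load_model(
        model, 'tmp_ckpt.pth', optimizer=optimizer,
        resume=True, lr=2e-4, lr_step=[4])
    # start_epoch == 5; the lr is decayed once because start_epoch >= lr_step[0]
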
/tools/similarity_analysis.py:
--------------------------------------------------------------------------------
1 | import os
2 | import numpy as np
3 | from collections import defaultdict
4 | from sklearn.decomposition import PCA
5 |
6 |
7 | # Compute the IoU matrix between two sets of boxes
8 | def bboxes_iou(bboxes1,bboxes2):
9 | bboxes1 = np.transpose(bboxes1)
10 | bboxes2 = np.transpose(bboxes2)
11 |
12 |     # Intersection of the two boxes: its top-left corner is the max of the two top-left corners, its bottom-right corner is the min of the two bottom-right corners
13 | int_ymin = np.maximum(bboxes1[0][:, None], bboxes2[0])
14 | int_xmin = np.maximum(bboxes1[1][:, None], bboxes2[1])
15 | int_ymax = np.minimum(bboxes1[2][:, None], bboxes2[2])
16 | int_xmax = np.minimum(bboxes1[3][:, None], bboxes2[3])
17 |
18 |     # Width/height of the intersection: if the boxes do not overlap the raw w/h are negative, so clamp them to 0
19 | int_h = np.maximum(int_ymax-int_ymin,0.)
20 | int_w = np.maximum(int_xmax-int_xmin,0.)
21 |
22 |     # Compute IoU
23 |     int_vol = int_h * int_w # intersection area
24 |     vol1 = (bboxes1[2] - bboxes1[0]) * (bboxes1[3] - bboxes1[1]) # area of bboxes1
25 |     vol2 = (bboxes2[2] - bboxes2[0]) * (bboxes2[3] - bboxes2[1]) # area of bboxes2
26 |     IOU = int_vol / (vol1[:, None] + vol2 - int_vol) # IoU = intersection / union
27 | return IOU
28 |
29 |
30 |
31 | root_data = 'tmp'
32 |
33 | # det2trk_weight = defaultdict(list)
34 | # trk2trk_weight = defaultdict(list)
35 | # detall2trk_weight = defaultdict(list)
36 | # for i in range(703):
37 | # print(i)
38 | # for j in range(6):
39 |
40 | # bboxes = np.load(os.path.join(root_data, 'box_%08d_%d.txt.npy'%(i,j)))[0]
41 | # classes = np.load(os.path.join(root_data, 'class_%08d_%d.txt.npy'%(i,j)))[0, :, 0]
42 | # weights = np.load(os.path.join(root_data, 'weight_%08d_%d.txt.npy'%(i,j)))
43 |
44 | # bboxes[:, [0,1]] -= bboxes[:, [2,3]]/2
45 | # bboxes[:, [2,3]] += bboxes[:, [0,1]]
46 |
47 | # indexes = np.where(classes>0)[0]
48 |
49 | # det_indexes = indexes[indexes<60]
50 | # trk_indexes = indexes[indexes>=60]
51 |
52 | # iou = bboxes_iou(bboxes[trk_indexes], bboxes[det_indexes])
53 | # if len(trk_indexes) and len(det_indexes):
54 | # pair_idx = iou.argmax(-1)
55 | # pair_val = iou.max(-1)
56 | # pair_trk_idx = trk_indexes[pair_val>0.7]
57 | # pair_det_idx = det_indexes[pair_idx[pair_val>0.7]]
58 | # if len(pair_trk_idx) and len(pair_det_idx):
59 | # if weights[pair_trk_idx, pair_det_idx].mean() < 1:
60 | # det2trk_weight[j].append(weights[pair_trk_idx, pair_det_idx].mean())
61 | # else:
62 | # print("1")
63 | # if weights[pair_trk_idx, pair_trk_idx].mean() < 1:
64 | # trk2trk_weight[j].append(weights[pair_trk_idx, pair_trk_idx].mean())
65 | # else:
66 | # print("1")
67 | # if weights[pair_trk_idx, :60].sum(-1).mean() < 1:
68 | # detall2trk_weight[j].append(weights[pair_trk_idx, :60].sum(-1).mean())
69 | # else:
70 | # print("1")
71 |
72 | # print(np.array(det2trk_weight[0]).mean(), np.array(det2trk_weight[1]).mean(), np.array(det2trk_weight[2]).mean(), np.array(det2trk_weight[3]).mean(), np.array(det2trk_weight[4]).mean(), np.array(det2trk_weight[5]).mean())
73 | # print(np.array(trk2trk_weight[0]).mean(), np.array(trk2trk_weight[1]).mean(), np.array(trk2trk_weight[2]).mean(), np.array(trk2trk_weight[3]).mean(), np.array(trk2trk_weight[4]).mean(), np.array(trk2trk_weight[5]).mean())
74 | # print(np.array(detall2trk_weight[0]).mean(), np.array(detall2trk_weight[1]).mean(), np.array(detall2trk_weight[2]).mean(), np.array(detall2trk_weight[3]).mean(), np.array(detall2trk_weight[4]).mean(), np.array(detall2trk_weight[5]).mean())
75 |
76 | hs_all = defaultdict(list)
77 | hs_all_flatten = []
78 | for i in range(703):
79 | scores = np.load(os.path.join(root_data, 'class_%08d_%d.txt.npy'%(i,5)))[0, :, 0]
80 | ids = np.load(os.path.join(root_data, 'ids_%08d.txt.npy'%(i)))[scores>0]
81 | hs = np.load(os.path.join(root_data, 'hs_%08d.txt.npy'%(i)))[scores>0]
82 |
83 | for id, h in zip(ids, hs):
84 | hs_all[id].append(h)
85 | hs_all_flatten.append(h)
86 |
87 | pca = PCA(n_components=2)
88 | # project the collected hidden states to 2-D for visualization
89 | pca.fit(hs_all_flatten)
90 | hs_2d = pca.transform(hs_all_flatten)
91 |
92 |
93 |
94 |
95 |
96 | stat_scores_det = defaultdict(lambda: defaultdict(int))
97 | for line in np.loadtxt('tmp_det.txt'):
98 | stat_scores_det[int(line[0])][int(line[1])] = line[2]
99 | stat_scores_trk = defaultdict(lambda: defaultdict(int))
100 | for line in np.loadtxt('tmp_trk.txt'):
101 | stat_scores_trk[int(line[0])][int(line[1])] = line[2]
102 | stat_scores_uni_det = defaultdict(lambda: defaultdict(int))
103 | for line in np.loadtxt('tmp_uni_det.txt'):
104 | stat_scores_uni_det[int(line[0])][int(line[1])] = line[2]
105 | stat_scores_uni_trk = defaultdict(lambda: defaultdict(int))
106 | for line in np.loadtxt('tmp_uni_trk.txt'):
107 | stat_scores_uni_trk[int(line[0])][int(line[1])] = line[2]
108 |
109 |
110 | count_bin_all = defaultdict(list)
111 | count_bin = defaultdict(int)
112 | for framid in stat_scores_trk:
113 | for obj_id in stat_scores_trk[framid]:
114 | if framid in stat_scores_uni_trk and obj_id in stat_scores_uni_trk[framid]:
115 | count_bin_all[int(stat_scores_trk[framid][obj_id]*10)].append(stat_scores_uni_trk[framid][obj_id]-stat_scores_trk[framid][obj_id])
116 | if stat_scores_trk[framid][obj_id] > stat_scores_uni_trk[framid][obj_id]:
117 | count_bin[int(stat_scores_trk[framid][obj_id]*10)] -= 1
118 | else:
119 | count_bin[int(stat_scores_trk[framid][obj_id]*10)] += 1
120 | for i in range(10):
121 | print(np.array(count_bin_all[i]).mean(), np.array(count_bin_all[i]).std())
122 |
123 |
124 | with open('tmp.txt', 'w') as fp:
125 | for framid in stat_scores_trk:
126 | for obj_id in stat_scores_trk[framid]:
127 | if framid in stat_scores_uni_trk and obj_id in stat_scores_uni_trk[framid]:
128 | # print(stat_scores_trk[framid][obj_id], stat_scores_uni_trk[framid][obj_id])
129 | fp.write('%f %f\n'%(stat_scores_trk[framid][obj_id], stat_scores_uni_trk[framid][obj_id]))
130 |
--------------------------------------------------------------------------------
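
A small worked example of the bboxes_iou helper above (meant to run in the same module, since the script itself needs its data files); boxes are (y1, x1, y2, x2) corners, though any consistent corner ordering gives the same IoU:

    import numpy as np

    a = np.array([[0., 0., 10., 10.]])
    b = np.array([[5., 5., 15., 15.]])
    iou = bboxes_iou(a, b)
    # intersection = 5 * 5 = 25, union = 100 + 100 - 25 = 175
    print(iou)  # [[0.14285714]]
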
/tools/visualize_tao.py:
--------------------------------------------------------------------------------
1 | # ------------------------------------------------------------------------
2 | # Copyright (c) 2022 megvii-research. All Rights Reserved.
3 | # ------------------------------------------------------------------------
4 |
5 |
6 | from collections import defaultdict
7 | from glob import glob
8 | import json
9 | import os
10 | import cv2
11 | import numpy as np
12 | import subprocess
13 | import random
14 | from tqdm import tqdm
15 | from PIL import Image, ImageDraw
16 |
17 | from scipy.optimize import linear_sum_assignment as linear_assignment
18 |
19 | # Compute the IoU matrix between two sets of boxes
20 | def bboxes_iou(bboxes1,bboxes2):
21 | bboxes1 = np.transpose(bboxes1)
22 | bboxes2 = np.transpose(bboxes2)
23 |
24 |     # Intersection of the two boxes: its top-left corner is the max of the two top-left corners, its bottom-right corner is the min of the two bottom-right corners
25 | int_ymin = np.maximum(bboxes1[0][:, None], bboxes2[0])
26 | int_xmin = np.maximum(bboxes1[1][:, None], bboxes2[1])
27 | int_ymax = np.minimum(bboxes1[2][:, None], bboxes2[2])
28 | int_xmax = np.minimum(bboxes1[3][:, None], bboxes2[3])
29 |
30 |     # Width/height of the intersection: if the boxes do not overlap the raw w/h are negative, so clamp them to 0
31 | int_h = np.maximum(int_ymax-int_ymin,0.)
32 | int_w = np.maximum(int_xmax-int_xmin,0.)
33 |
34 |     # Compute IoU
35 |     int_vol = int_h * int_w # intersection area
36 |     vol1 = (bboxes1[2] - bboxes1[0]) * (bboxes1[3] - bboxes1[1]) # area of bboxes1
37 |     vol2 = (bboxes2[2] - bboxes2[0]) * (bboxes2[3] - bboxes2[1]) # area of bboxes2
38 |     IOU = int_vol / (vol1[:, None] + vol2 - int_vol) # IoU = intersection / union
39 | return IOU
40 |
41 | def get_color(i):
42 | return [(i * 23 * j + 43) % 255 for j in range(3)]
43 |
44 |
45 | def show_gt(img_list, output="output.mp4"):
46 | h, w, _ = cv2.imread(img_list[0]).shape
47 | command = [
48 | "anaconda3/envs/detrex/bin/ffmpeg",
49 | '-y', # overwrite output file if it exists
50 | '-f', 'rawvideo',
51 | '-vcodec','rawvideo',
52 | '-s', f'{w}x{h}', # size of one frame
53 | '-pix_fmt', 'bgr24',
54 | '-r', '20', # frames per second
55 | '-i', '-', # The imput comes from a pipe
56 | '-s', f'{w//2*2}x{h//2*2}',
57 | '-an', # Tells FFMPEG not to expect any audio
58 | '-loglevel', 'error',
59 | # '-crf', '26',
60 | '-b:v', '0',
61 | '-pix_fmt', 'yuv420p'
62 | ]
63 | # writing_process = subprocess.Popen(command + [output], stdin=subprocess.PIPE)
64 | fps = 16
65 | size = (w,h)
66 | videowriter = cv2.VideoWriter(output,cv2.VideoWriter_fourcc('M','J','P','G'), fps, size)
67 |
68 |
69 | for i, path in enumerate(tqdm(sorted(img_list))):
70 | im = cv2.imread(path)
71 | det_bboxes = []
72 | motr_bboxes = []
73 | for det in det_db[path.replace('data/', '').replace('.jpg', '.txt').replace('dancetrack/', 'DanceTrack/')]:
74 | x1, y1, w, h, s = map(float, det.strip().split(','))
75 | x1, y1, w, h = map(int, [x1, y1, w, h])
76 | im = cv2.rectangle(im, (x1, y1), (x1+w, y1+h), (255, 255, 255), 2)
77 | im = cv2.putText(im, '%0.2f'%s, (x1, y1-5), cv2.FONT_HERSHEY_SIMPLEX, 0.3, (255, 255, 255), 1)
78 | det_bboxes.append([x1, y1, x1+w, y1+h])
79 |
80 | det_bboxes = np.array(det_bboxes)
81 | motr_bboxes = np.array(motr_bboxes)
82 | ious = bboxes_iou(det_bboxes, motr_bboxes)
83 | matching = linear_assignment(-ious)
84 | matched = sum(ious[matching[0], matching[1]] > 0.5)
85 |         im = cv2.putText(im, f"{matched}/{len(det_bboxes)}/{len(motr_bboxes)}", (10, 50), cv2.FONT_HERSHEY_SIMPLEX, 2, get_color(i), 3)
86 | cv2.putText(im, "{}".format(os.path.basename(path)[:-4]), (120,120), cv2.FONT_HERSHEY_SIMPLEX, 2, (255,255,255), 6)
87 | # writing_process.stdin.write(im.tobytes())
88 | videowriter.write(im)
89 |
90 | videowriter.release()
91 |
92 |
93 | if __name__ == '__main__':
94 |
95 | labels_full = defaultdict(lambda : defaultdict(list))
96 | imgid2name = defaultdict()
97 | def _add_mot_folder(mot_path, split_dir):
98 | print("Adding", split_dir)
99 | labels = json.load(open(os.path.join(mot_path, split_dir)))
100 | for ann in labels['images']:
101 | imgid2name[ann['id']] = ann['file_name']
102 | for ann in labels['annotations']:
103 | vid = ann['video_id']
104 | t = ann['image_id']
105 | x, y, w, h = ann['bbox']
106 | i = ann['track_id']
107 | crowd = ann['iscrowd']
108 | cl = ann['category_id']
109 | labels_full[vid][t].append([x, y, w, h, i, crowd, cl])
110 | return labels_full, imgid2name
111 |
112 | mot_path = 'data/'
113 | labels_full, imgid2name = _add_mot_folder(mot_path, 'tao/annotations/train.json')
114 | indices = []
115 | vid_files = list(labels_full.keys())
116 | for vid in vid_files:
117 | t_min = min(labels_full[vid].keys())
118 | t_max = max(labels_full[vid].keys()) + 1
119 | for t in range(t_min, t_max):
120 | indices.append((vid, t))
121 |
122 | vid_old = None
123 | random.shuffle(vid_files)
124 | videowriter = None
125 | for vid in vid_files:
126 | print(vid)
127 | t_min = min(labels_full[vid].keys())
128 | t_max = max(labels_full[vid].keys()) + 1
129 | for idx in range(t_min, t_max):
130 | # vid, idx = indices[idx]
131 | img_path = os.path.join(mot_path, 'tao/frames', imgid2name[idx])
132 | img = Image.open(img_path)
133 | if vid != vid_old:
134 | vid_old = vid
135 |                 w, h = img.size
136 | fps = 1
137 | size = (w,h)
138 | if videowriter is not None:
139 | videowriter.release()
140 | videowriter = cv2.VideoWriter('tmp/'+imgid2name[idx].split('/')[-2]+'.avi',cv2.VideoWriter_fourcc('M','J','P','G'), fps, size)
141 | im = np.array(img)
142 | for *xywh, id, crowd, cl in labels_full[vid][idx]:
143 | x1, y1, w, h = xywh
144 | x1, y1, w, h = map(int, [x1, y1, w, h])
145 | im = cv2.rectangle(im, (x1, y1), (x1+w, y1+h), (255, 255, 255), 2)
146 | im = cv2.putText(im, '%d'%id, (x1, y1-5), cv2.FONT_HERSHEY_SIMPLEX, 0.3, (255, 255, 255), 1)
147 | videowriter.write(im)
148 |
149 | videowriter.release()
150 |
--------------------------------------------------------------------------------
/models/memory_bank.py:
--------------------------------------------------------------------------------
1 | # ------------------------------------------------------------------------
2 | # Copyright (c) 2021 megvii-model. All Rights Reserved.
3 | # ------------------------------------------------------------------------
4 |
5 | import torch
6 | import torch.nn.functional as F
7 | from torch import nn, Tensor
8 |
9 | from typing import List
10 |
11 | from models.structures import Instances
12 |
13 |
14 | class MemoryBank(nn.Module):
15 | def __init__(self, args, dim_in, hidden_dim, dim_out):
16 | super().__init__()
17 | self._build_layers(args, dim_in, hidden_dim, dim_out)
18 | for p in self.parameters():
19 | if p.dim() > 1:
20 | nn.init.xavier_uniform_(p)
21 |
22 | def _build_layers(self, args, dim_in, hidden_dim, dim_out):
23 | self.save_thresh = args.memory_bank_score_thresh
24 | self.save_period = 3
25 | self.max_his_length = args.memory_bank_len
26 |
27 | self.save_proj = nn.Linear(dim_in, dim_in)
28 |
29 | self.temporal_attn = nn.MultiheadAttention(dim_in, 8, dropout=0)
30 | self.temporal_fc1 = nn.Linear(dim_in, hidden_dim)
31 | self.temporal_fc2 = nn.Linear(hidden_dim, dim_in)
32 | self.temporal_norm1 = nn.LayerNorm(dim_in)
33 | self.temporal_norm2 = nn.LayerNorm(dim_in)
34 |
35 | self.track_cls = nn.Linear(dim_in, 1)
36 |
37 | self.self_attn = None
38 | if args.memory_bank_with_self_attn:
39 | self.spatial_attn = nn.MultiheadAttention(dim_in, 8, dropout=0)
40 | self.spatial_fc1 = nn.Linear(dim_in, hidden_dim)
41 | self.spatial_fc2 = nn.Linear(hidden_dim, dim_in)
42 | self.spatial_norm1 = nn.LayerNorm(dim_in)
43 | self.spatial_norm2 = nn.LayerNorm(dim_in)
44 | else:
45 | self.spatial_attn = None
46 |
47 | def update(self, track_instances):
48 | embed = track_instances.output_embedding[:, None] #( N, 1, 256)
49 | scores = track_instances.scores
50 | mem_padding_mask = track_instances.mem_padding_mask
51 | device = embed.device
52 |
53 | save_period = track_instances.save_period
54 | if self.training:
55 | saved_idxes = scores > 0
56 | else:
57 | saved_idxes = (save_period == 0) & (scores > self.save_thresh)
58 | # saved_idxes = (save_period == 0)
59 | save_period[save_period > 0] -= 1
60 | save_period[saved_idxes] = self.save_period
61 |
62 | saved_embed = embed[saved_idxes]
63 | if len(saved_embed) > 0:
64 | prev_embed = track_instances.mem_bank[saved_idxes]
65 | save_embed = self.save_proj(saved_embed)
66 | mem_padding_mask[saved_idxes] = torch.cat([mem_padding_mask[saved_idxes, 1:], torch.zeros((len(saved_embed), 1), dtype=torch.bool, device=device)], dim=1)
67 | track_instances.mem_bank = track_instances.mem_bank.clone()
68 | track_instances.mem_bank[saved_idxes] = torch.cat([prev_embed[:, 1:], save_embed], dim=1)
69 |
70 | def _forward_spatial_attn(self, track_instances):
71 | if len(track_instances) == 0:
72 | return track_instances
73 |
74 | embed = track_instances.output_embedding
75 | dim = embed.shape[-1]
76 |         query_pos = track_instances.query_pos[:, :dim]  # should be: query_pos = pos2posemb(track_instances.ref_pts)
77 | k = q = (embed + query_pos)
78 | v = embed
79 | embed2 = self.spatial_attn(
80 | q[:, None],
81 | k[:, None],
82 | v[:, None]
83 | )[0][:, 0]
84 | embed = self.spatial_norm1(embed + embed2)
85 | embed2 = self.spatial_fc2(F.relu(self.spatial_fc1(embed)))
86 | embed = self.spatial_norm2(embed + embed2)
87 | track_instances.output_embedding = embed
88 | return track_instances
89 |
90 | def _forward_track_cls(self, track_instances):
91 | track_instances.track_scores = self.track_cls(track_instances.output_embedding)[..., 0]
92 | return track_instances
93 |
94 | def _forward_temporal_attn(self, track_instances):
95 | if len(track_instances) == 0:
96 | return track_instances
97 |
98 | dim = track_instances.query_pos.shape[1]
99 | key_padding_mask = track_instances.mem_padding_mask
100 |
101 | valid_idxes = key_padding_mask[:, -1] == 0
102 | embed = track_instances.output_embedding[valid_idxes] # (n, 256)
103 |
104 | if len(embed) > 0:
105 | prev_embed = track_instances.mem_bank[valid_idxes]
106 | key_padding_mask = key_padding_mask[valid_idxes]
107 | embed2 = self.temporal_attn(
108 | embed[None], # (num_track, dim) to (1, num_track, dim)
109 | prev_embed.transpose(0, 1), # (num_track, mem_len, dim) to (mem_len, num_track, dim)
110 | prev_embed.transpose(0, 1),
111 | key_padding_mask=key_padding_mask,
112 | )[0][0]
113 |
114 | embed = self.temporal_norm1(embed + embed2)
115 | embed2 = self.temporal_fc2(F.relu(self.temporal_fc1(embed)))
116 | embed = self.temporal_norm2(embed + embed2)
117 | track_instances.output_embedding = track_instances.output_embedding.clone()
118 | track_instances.output_embedding[valid_idxes] = embed
119 |
120 | return track_instances
121 |
122 | def forward_temporal_attn(self, track_instances):
123 | return self._forward_temporal_attn(track_instances)
124 |
125 | def forward(self, track_instances: Instances, update_bank=True) -> Instances:
126 | track_instances = self._forward_temporal_attn(track_instances)
127 | if update_bank:
128 | self.update(track_instances)
129 | if self.spatial_attn is not None:
130 | track_instances = self._forward_spatial_attn(track_instances)
131 | if self.track_cls is not None:
132 | track_instances = self._forward_track_cls(track_instances)
133 | return track_instances
134 |
135 |
136 | def build_memory_bank(args, dim_in, hidden_dim, dim_out):
137 | name = args.memory_bank_type
138 | memory_banks = {
139 | 'MemoryBank': MemoryBank,
140 | }
141 | assert name in memory_banks
142 | return memory_banks[name](args, dim_in, hidden_dim, dim_out)
143 |
--------------------------------------------------------------------------------
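
A minimal sketch of constructing the memory bank through build_memory_bank; the Namespace fields mirror the args read in _build_layers and the values are illustrative, not the project's defaults:

    from argparse import Namespace
    from models.memory_bank import build_memory_bank  # import path assumed from this layout

    args = Namespace(memory_bank_type='MemoryBank',
                     memory_bank_score_thresh=0.5,
                     memory_bank_len=4,
                     memory_bank_with_self_attn=False)
    memory_bank = build_memory_bank(args, dim_in=256, hidden_dim=256, dim_out=256)
    # forward() expects an Instances object carrying output_embedding, query_pos,
    # scores, mem_bank, mem_padding_mask and save_period fields.
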
/datasets/samplers.py:
--------------------------------------------------------------------------------
1 | # ------------------------------------------------------------------------
2 | # Copyright (c) 2022 megvii-research. All Rights Reserved.
3 | # ------------------------------------------------------------------------
4 | # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR)
5 | # Copyright (c) 2020 SenseTime. All Rights Reserved.
6 | # ------------------------------------------------------------------------
7 | # Modified from DETR (https://github.com/facebookresearch/detr)
8 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
9 | # ------------------------------------------------------------------------
10 |
11 |
12 | import os
13 | import math
14 | import torch
15 | import torch.distributed as dist
16 | from torch.utils.data.sampler import Sampler
17 |
18 |
19 | class DistributedSampler(Sampler):
20 | """Sampler that restricts data loading to a subset of the dataset.
21 | It is especially useful in conjunction with
22 | :class:`torch.nn.parallel.DistributedDataParallel`. In such case, each
23 | process can pass a DistributedSampler instance as a DataLoader sampler,
24 | and load a subset of the original dataset that is exclusive to it.
25 | .. note::
26 | Dataset is assumed to be of constant size.
27 | Arguments:
28 | dataset: Dataset used for sampling.
29 | num_replicas (optional): Number of processes participating in
30 | distributed training.
31 | rank (optional): Rank of the current process within num_replicas.
32 | """
33 |
34 | def __init__(self, dataset, num_replicas=None, rank=None, local_rank=None, local_size=None, shuffle=True):
35 | if num_replicas is None:
36 | if not dist.is_available():
37 | raise RuntimeError("Requires distributed package to be available")
38 | num_replicas = dist.get_world_size()
39 | if rank is None:
40 | if not dist.is_available():
41 | raise RuntimeError("Requires distributed package to be available")
42 | rank = dist.get_rank()
43 | self.dataset = dataset
44 | self.num_replicas = num_replicas
45 | self.rank = rank
46 | self.epoch = 0
47 | self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.num_replicas))
48 | self.total_size = self.num_samples * self.num_replicas
49 | self.shuffle = shuffle
50 |
51 | def __iter__(self):
52 | if self.shuffle:
53 | # deterministically shuffle based on epoch
54 | g = torch.Generator()
55 | g.manual_seed(self.epoch)
56 | indices = torch.randperm(len(self.dataset), generator=g).tolist()
57 | else:
58 | indices = torch.arange(len(self.dataset)).tolist()
59 |
60 | # add extra samples to make it evenly divisible
61 | if len(indices) * 2 < self.total_size:
62 | tmp = indices * self.total_size
63 | indices += tmp[: (self.total_size - len(indices))]
64 | else:
65 | indices += indices[: (self.total_size - len(indices))]
66 | assert len(indices) == self.total_size
67 |
68 | # subsample
69 | offset = self.num_samples * self.rank
70 | indices = indices[offset : offset + self.num_samples]
71 | assert len(indices) == self.num_samples
72 |
73 | return iter(indices)
74 |
75 | def __len__(self):
76 | return self.num_samples
77 |
78 | def set_epoch(self, epoch):
79 | self.epoch = epoch
80 |
81 |
82 | class NodeDistributedSampler(Sampler):
83 | """Sampler that restricts data loading to a subset of the dataset.
84 | It is especially useful in conjunction with
85 | :class:`torch.nn.parallel.DistributedDataParallel`. In such case, each
86 | process can pass a DistributedSampler instance as a DataLoader sampler,
87 | and load a subset of the original dataset that is exclusive to it.
88 | .. note::
89 | Dataset is assumed to be of constant size.
90 | Arguments:
91 | dataset: Dataset used for sampling.
92 | num_replicas (optional): Number of processes participating in
93 | distributed training.
94 | rank (optional): Rank of the current process within num_replicas.
95 | """
96 |
97 | def __init__(self, dataset, num_replicas=None, rank=None, local_rank=None, local_size=None, shuffle=True):
98 | if num_replicas is None:
99 | if not dist.is_available():
100 | raise RuntimeError("Requires distributed package to be available")
101 | num_replicas = dist.get_world_size()
102 | if rank is None:
103 | if not dist.is_available():
104 | raise RuntimeError("Requires distributed package to be available")
105 | rank = dist.get_rank()
106 | if local_rank is None:
107 | local_rank = int(os.environ.get('LOCAL_RANK', 0))
108 | if local_size is None:
109 | local_size = int(os.environ.get('LOCAL_SIZE', 1))
110 | self.dataset = dataset
111 | self.shuffle = shuffle
112 | self.num_replicas = num_replicas
113 | self.num_parts = local_size
114 | self.rank = rank
115 | self.local_rank = local_rank
116 | self.epoch = 0
117 | self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.num_replicas))
118 | self.total_size = self.num_samples * self.num_replicas
119 |
120 | self.total_size_parts = self.num_samples * self.num_replicas // self.num_parts
121 |
122 | def __iter__(self):
123 | if self.shuffle:
124 | # deterministically shuffle based on epoch
125 | g = torch.Generator()
126 | g.manual_seed(self.epoch)
127 | indices = torch.randperm(len(self.dataset), generator=g).tolist()
128 | else:
129 | indices = torch.arange(len(self.dataset)).tolist()
130 | indices = [i for i in indices if i % self.num_parts == self.local_rank]
131 |
132 | # add extra samples to make it evenly divisible
133 | indices += indices[:(self.total_size_parts - len(indices))]
134 | assert len(indices) == self.total_size_parts
135 |
136 | # subsample
137 | indices = indices[self.rank // self.num_parts:self.total_size_parts:self.num_replicas // self.num_parts]
138 | assert len(indices) == self.num_samples
139 |
140 | return iter(indices)
141 |
142 | def __len__(self):
143 | return self.num_samples
144 |
145 | def set_epoch(self, epoch):
146 | self.epoch = epoch
147 |
--------------------------------------------------------------------------------
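
A minimal sketch of the DistributedSampler above; passing num_replicas and rank explicitly avoids initializing torch.distributed, and each rank then iterates a disjoint, epoch-shuffled slice:

    from torch.utils.data import DataLoader
    from datasets.samplers import DistributedSampler  # import path assumed from this layout

    dataset = list(range(10))                 # any fixed-size dataset works
    sampler = DistributedSampler(dataset, num_replicas=2, rank=0, shuffle=True)
    loader = DataLoader(dataset, batch_size=2, sampler=sampler)

    for epoch in range(2):
        sampler.set_epoch(epoch)              # reseeds the deterministic shuffle
        for batch in loader:
            pass                              # training step would go here
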
/models/yolox.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- encoding: utf-8 -*-
3 | # Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
4 |
5 | import torch
6 | from torch import nn, Tensor
7 | from typing import List
8 |
9 | from .yolo_head import YOLOXHead
10 | from .yolo_pafpn import YOLOPAFPN
11 | from util.misc import (NestedTensor, nested_tensor_from_tensor_list,
12 | accuracy, get_world_size, interpolate, get_rank,
13 | is_dist_avail_and_initialized, inverse_sigmoid)
14 | from util import box_ops
15 |
16 | def _max_by_axis(the_list):
17 | # type: (List[List[int]]) -> List[int]
18 | maxes = the_list[0]
19 | for sublist in the_list[1:]:
20 | for index, item in enumerate(sublist):
21 | maxes[index] = max(maxes[index], item)
22 | return maxes
23 |
24 | def nested_tensor_from_tensor_list(tensor_list: List[Tensor], size_divisibility: int = 0):
25 | # TODO make this more general
26 | if tensor_list[0].ndim == 3:
27 | # TODO make it support different-sized images
28 |
29 | max_size = _max_by_axis([list(img.shape) for img in tensor_list])
30 | if size_divisibility > 0:
31 | stride = size_divisibility
32 | # the last two dims are H,W, both subject to divisibility requirement
33 | max_size[-1] = (max_size[-1] + (stride - 1)) // stride * stride
34 | max_size[-2] = (max_size[-2] + (stride - 1)) // stride * stride
35 |
36 | # min_size = tuple(min(s) for s in zip(*[img.shape for img in tensor_list]))
37 | batch_shape = [len(tensor_list)] + max_size
38 | b, c, h, w = batch_shape
39 | dtype = tensor_list[0].dtype
40 | device = tensor_list[0].device
41 | tensor = torch.zeros(batch_shape, dtype=dtype, device=device)
42 | for img, pad_img in zip(tensor_list, tensor):
43 | pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img)
44 | else:
45 | raise ValueError('not supported')
46 | return tensor
47 |
48 |
49 |
50 | class YOLOX(nn.Module):
51 | """
52 |     YOLOX model module, composed of a backbone (YOLOPAFPN) and a detection head (YOLOXHead).
53 | The network returns loss values from three YOLO layers during training
54 | and detection results during test.
55 | """
56 |
57 | def __init__(self, backbone=None, head=None):
58 | super().__init__()
59 | if backbone is None:
60 | backbone = YOLOPAFPN()
61 | if head is None:
62 | head = YOLOXHead(80)
63 |
64 | self.backbone = backbone
65 | self.head = head
66 |
67 | def forward(self, x, targets=None):
68 | # fpn output content features of [dark3, dark4, dark5]
69 | fpn_outs = self.backbone(x)
70 |
71 | if self.training:
72 | assert targets is not None
73 | loss, iou_loss, conf_loss, cls_loss, l1_loss, num_fg = self.head(
74 | fpn_outs, targets, x
75 | )
76 | outputs = {
77 | "total_loss": loss,
78 | "iou_loss": iou_loss,
79 | "l1_loss": l1_loss,
80 | "conf_loss": conf_loss,
81 | "cls_loss": cls_loss,
82 | "num_fg": num_fg,
83 | }
84 | else:
85 | outputs = self.head(fpn_outs)
86 |
87 | return outputs
88 |
89 | @torch.no_grad()
90 | def inference_single_image(self, img, ori_img_size, track_instances=None):
91 | if not isinstance(img, NestedTensor):
92 | img = nested_tensor_from_tensor_list(img, size_divisibility=32)
93 | output = self.forward(img)
94 |
95 |         # out = {'pred_logits': outputs_class[-1], 'pred_boxes': outputs_coord[-1]}
96 |
97 | # _, _, img_h, img_w = img.shape
98 | # scale = max(ori_img_size[0]/img_h, ori_img_size[1]/img_w)
99 | # output[..., :4] *= scale
100 | # output = output[output[..., 4]>0.5]
101 |
102 | # import cv2
103 | # res[..., [0,1]] -= res[..., [2,3]]/2
104 | # res[..., [2,3]] += res[..., [0,1]]
105 | # ori_img = ori_img.cpu().numpy()
106 | # for o in res.cpu().numpy():
107 | # cv2.rectangle(ori_img, pt1 = (int(o[0]), int(0[1])), pt2 =(int(o[2]), int(0[3])), color = (0, 0, 255), thickness = 2)
108 | # cv2.imwrite('tmp.png', ori_img)
109 | return output
110 |
111 |
112 | class PostProcess(nn.Module):
113 | """ This module converts the model's output into the format expected by the coco api"""
114 |
115 | @torch.no_grad()
116 | def forward(self, outputs, target_sizes):
117 | """ Perform the computation
118 | Parameters:
119 | outputs: raw outputs of the model
120 | target_sizes: tensor of dimension [batch_size x 2] containing the size of each images of the batch
121 | For evaluation, this must be the original image size (before any data augmentation)
122 | For visualization, this should be the image size after data augment, but before padding
123 | """
124 | out_logits, out_bbox = outputs['pred_logits'], outputs['pred_boxes']
125 |
126 | assert len(out_logits) == len(target_sizes)
127 | assert target_sizes.shape[1] == 2
128 |
129 | prob = out_logits.sigmoid()
130 | topk_values, topk_indexes = torch.topk(prob.view(out_logits.shape[0], -1), 100, dim=1)
131 | scores = topk_values
132 | topk_boxes = topk_indexes // out_logits.shape[2]
133 | labels = topk_indexes % out_logits.shape[2]
134 | boxes = box_ops.box_cxcywh_to_xyxy(out_bbox)
135 | boxes = torch.gather(boxes, 1, topk_boxes.unsqueeze(-1).repeat(1,1,4))
136 |
137 | # and from relative [0, 1] to absolute [0, height] coordinates
138 | img_h, img_w = target_sizes.unbind(1)
139 | scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1)
140 | boxes = boxes * scale_fct[:, None, :]
141 |
142 | results = [{'scores': s, 'labels': l, 'boxes': b} for s, l, b in zip(scores, labels, boxes)]
143 |
144 | return results
145 |
146 |
147 | def build(args):
148 |
149 | def init_yolo(M):
150 | for m in M.modules():
151 | if isinstance(m, nn.BatchNorm2d):
152 | m.eps = 1e-3
153 | m.momentum = 0.03
154 |
155 | in_channels = [256, 512, 1024]
156 | depth = 1.33
157 | width = 1.25
158 | num_classes = 1
159 | backbone = YOLOPAFPN(depth, width, in_channels=in_channels)
160 | head = YOLOXHead(num_classes, width, in_channels=in_channels)
161 | model = YOLOX(backbone, head)
162 |
163 | model.apply(init_yolo)
164 | model.head.initialize_biases(1e-2)
165 |
166 | return model, None, None
--------------------------------------------------------------------------------
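
A minimal sketch of PostProcess on dummy outputs (batch 1, 300 queries, 1 class; all values illustrative), assuming models.yolox imports cleanly in your environment (it also pulls in .yolo_head, which is not shown in this dump):

    import torch
    from models.yolox import PostProcess  # import path assumed from this layout

    outputs = {'pred_logits': torch.randn(1, 300, 1),   # class logits per query
               'pred_boxes': torch.rand(1, 300, 4)}     # cxcywh boxes in [0, 1]
    target_sizes = torch.tensor([[720, 1280]])          # (h, w) of the original image

    results = PostProcess()(outputs, target_sizes)
    # results[0]['scores'] / ['labels'] / ['boxes'] hold the top-100 detections,
    # with boxes rescaled to absolute xyxy pixel coordinates.
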
/models/dino/position_encoding.py:
--------------------------------------------------------------------------------
1 | # ------------------------------------------------------------------------
2 | # DINO
3 | # Copyright (c) 2022 IDEA. All Rights Reserved.
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
5 | # ------------------------------------------------------------------------
6 | # Conditional DETR
7 | # Copyright (c) 2021 Microsoft. All Rights Reserved.
8 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
9 | # ------------------------------------------------------------------------
10 | # Copied from DETR (https://github.com/facebookresearch/detr)
11 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
12 | # ------------------------------------------------------------------------
13 |
14 | """
15 | Various positional encodings for the transformer.
16 | """
17 | import math
18 | import torch
19 | from torch import nn
20 |
21 | from util.misc import NestedTensor
22 |
23 |
24 | class PositionEmbeddingSine(nn.Module):
25 | """
26 | This is a more standard version of the position embedding, very similar to the one
27 | used by the Attention is all you need paper, generalized to work on images.
28 | """
29 | def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None):
30 | super().__init__()
31 | self.num_pos_feats = num_pos_feats
32 | self.temperature = temperature
33 | self.normalize = normalize
34 | if scale is not None and normalize is False:
35 | raise ValueError("normalize should be True if scale is passed")
36 | if scale is None:
37 | scale = 2 * math.pi
38 | self.scale = scale
39 |
40 | def forward(self, tensor_list: NestedTensor):
41 | x = tensor_list.tensors
42 | mask = tensor_list.mask
43 | assert mask is not None
44 | not_mask = ~mask
45 | y_embed = not_mask.cumsum(1, dtype=torch.float32)
46 | x_embed = not_mask.cumsum(2, dtype=torch.float32)
47 | if self.normalize:
48 | eps = 1e-6
49 | y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale
50 | x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale
51 |
52 | dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device)
53 | dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats)
54 |
55 | pos_x = x_embed[:, :, :, None] / dim_t
56 | pos_y = y_embed[:, :, :, None] / dim_t
57 | pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3)
58 | pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3)
59 | pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2)
60 | return pos
61 |
62 | class PositionEmbeddingSineHW(nn.Module):
63 | """
64 | This is a more standard version of the position embedding, very similar to the one
65 | used by the Attention is all you need paper, generalized to work on images.
66 | """
67 | def __init__(self, num_pos_feats=64, temperatureH=10000, temperatureW=10000, normalize=False, scale=None):
68 | super().__init__()
69 | self.num_pos_feats = num_pos_feats
70 | self.temperatureH = temperatureH
71 | self.temperatureW = temperatureW
72 | self.normalize = normalize
73 | if scale is not None and normalize is False:
74 | raise ValueError("normalize should be True if scale is passed")
75 | if scale is None:
76 | scale = 2 * math.pi
77 | self.scale = scale
78 |
79 | def forward(self, tensor_list: NestedTensor):
80 | x = tensor_list.tensors
81 | mask = tensor_list.mask
82 | assert mask is not None
83 | not_mask = ~mask
84 | y_embed = not_mask.cumsum(1, dtype=torch.float32)
85 | x_embed = not_mask.cumsum(2, dtype=torch.float32)
86 |
87 |
88 |
89 | if self.normalize:
90 | eps = 1e-6
91 | y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale
92 | x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale
93 |
94 | dim_tx = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device)
95 | dim_tx = self.temperatureW ** (2 * torch.div(dim_tx, 2, rounding_mode="floor") / self.num_pos_feats)
96 | pos_x = x_embed[:, :, :, None] / dim_tx
97 |
98 | dim_ty = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device)
99 | dim_ty = self.temperatureH ** (2 * torch.div(dim_ty, 2, rounding_mode="floor") / self.num_pos_feats)
100 | pos_y = y_embed[:, :, :, None] / dim_ty
101 |
102 | pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3)
103 | pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3)
104 | pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2)
105 |
106 |
107 |
108 | return pos
109 |
110 | class PositionEmbeddingLearned(nn.Module):
111 | """
112 | Absolute pos embedding, learned.
113 | """
114 | def __init__(self, num_pos_feats=256):
115 | super().__init__()
116 | self.row_embed = nn.Embedding(50, num_pos_feats)
117 | self.col_embed = nn.Embedding(50, num_pos_feats)
118 | self.reset_parameters()
119 |
120 | def reset_parameters(self):
121 | nn.init.uniform_(self.row_embed.weight)
122 | nn.init.uniform_(self.col_embed.weight)
123 |
124 | def forward(self, tensor_list: NestedTensor):
125 | x = tensor_list.tensors
126 | h, w = x.shape[-2:]
127 | i = torch.arange(w, device=x.device)
128 | j = torch.arange(h, device=x.device)
129 | x_emb = self.col_embed(i)
130 | y_emb = self.row_embed(j)
131 | pos = torch.cat([
132 | x_emb.unsqueeze(0).repeat(h, 1, 1),
133 | y_emb.unsqueeze(1).repeat(1, w, 1),
134 | ], dim=-1).permute(2, 0, 1).unsqueeze(0).repeat(x.shape[0], 1, 1, 1)
135 | return pos
136 |
137 |
138 | def build_position_encoding(args):
139 | N_steps = args.hidden_dim // 2
140 | if args.position_embedding in ('v2', 'sine'):
141 | # TODO find a better way of exposing other arguments
142 | position_embedding = PositionEmbeddingSineHW(
143 | N_steps,
144 | temperatureH=args.pe_temperatureH,
145 | temperatureW=args.pe_temperatureW,
146 | normalize=True
147 | )
148 | elif args.position_embedding in ('v3', 'learned'):
149 | position_embedding = PositionEmbeddingLearned(N_steps)
150 | else:
151 | raise ValueError(f"not supported {args.position_embedding}")
152 |
153 | return position_embedding
154 |
--------------------------------------------------------------------------------
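Note: as a quick sanity check of PositionEmbeddingSineHW above, the sketch below feeds it a dummy batch. A stand-in class replaces util.misc.NestedTensor (not part of this excerpt); the mask marks padded pixels, and the output shape is (batch, 2 * num_pos_feats, H, W):

import torch

class _DummyNestedTensor:                         # stand-in for util.misc.NestedTensor
    def __init__(self, tensors, mask):
        self.tensors, self.mask = tensors, mask

x = torch.randn(2, 256, 32, 40)                   # (batch, channels, H, W)
mask = torch.zeros(2, 32, 40, dtype=torch.bool)   # False = valid pixel, True = padding
mask[:, :, 35:] = True                            # pretend the last columns are padding

pe = PositionEmbeddingSineHW(num_pos_feats=128, normalize=True)
pos = pe(_DummyNestedTensor(x, mask))
print(pos.shape)                                  # torch.Size([2, 256, 32, 40])
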
/models/darknet.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- encoding: utf-8 -*-
3 | # Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
4 |
5 | from torch import nn
6 |
7 | from .network_blocks import BaseConv, CSPLayer, DWConv, Focus, ResLayer, SPPBottleneck
8 |
9 |
10 | class Darknet(nn.Module):
11 | # number of blocks from dark2 to dark5.
12 | depth2blocks = {21: [1, 2, 2, 1], 53: [2, 8, 8, 4]}
13 |
14 | def __init__(
15 | self,
16 | depth,
17 | in_channels=3,
18 | stem_out_channels=32,
19 | out_features=("dark3", "dark4", "dark5"),
20 | ):
21 | """
22 | Args:
23 | depth (int): depth of the darknet used in the model; typically 21 or 53.
24 | in_channels (int): number of input channels, for example, use 3 for RGB image.
25 | stem_out_channels (int): number of output channels of the darknet stem.
26 | It determines the channels of darknet layer2 to layer5.
27 | out_features (Tuple[str]): desired output layer name.
28 | """
29 | super().__init__()
30 | assert out_features, "please provide output features of Darknet"
31 | self.out_features = out_features
32 | self.stem = nn.Sequential(
33 | BaseConv(in_channels, stem_out_channels, ksize=3, stride=1, act="lrelu"),
34 | *self.make_group_layer(stem_out_channels, num_blocks=1, stride=2),
35 | )
36 | in_channels = stem_out_channels * 2 # 64
37 |
38 | num_blocks = Darknet.depth2blocks[depth]
39 | # create darknet with `stem_out_channels` and `num_blocks` layers.
40 | # to keep the model structure explicit, we avoid a `for` loop here.
41 | self.dark2 = nn.Sequential(
42 | *self.make_group_layer(in_channels, num_blocks[0], stride=2)
43 | )
44 | in_channels *= 2 # 128
45 | self.dark3 = nn.Sequential(
46 | *self.make_group_layer(in_channels, num_blocks[1], stride=2)
47 | )
48 | in_channels *= 2 # 256
49 | self.dark4 = nn.Sequential(
50 | *self.make_group_layer(in_channels, num_blocks[2], stride=2)
51 | )
52 | in_channels *= 2 # 512
53 |
54 | self.dark5 = nn.Sequential(
55 | *self.make_group_layer(in_channels, num_blocks[3], stride=2),
56 | *self.make_spp_block([in_channels, in_channels * 2], in_channels * 2),
57 | )
58 |
59 | def make_group_layer(self, in_channels: int, num_blocks: int, stride: int = 1):
60 | "starts with conv layer then has `num_blocks` `ResLayer`"
61 | return [
62 | BaseConv(in_channels, in_channels * 2, ksize=3, stride=stride, act="lrelu"),
63 | *[(ResLayer(in_channels * 2)) for _ in range(num_blocks)],
64 | ]
65 |
66 | def make_spp_block(self, filters_list, in_filters):
67 | m = nn.Sequential(
68 | *[
69 | BaseConv(in_filters, filters_list[0], 1, stride=1, act="lrelu"),
70 | BaseConv(filters_list[0], filters_list[1], 3, stride=1, act="lrelu"),
71 | SPPBottleneck(
72 | in_channels=filters_list[1],
73 | out_channels=filters_list[0],
74 | activation="lrelu",
75 | ),
76 | BaseConv(filters_list[0], filters_list[1], 3, stride=1, act="lrelu"),
77 | BaseConv(filters_list[1], filters_list[0], 1, stride=1, act="lrelu"),
78 | ]
79 | )
80 | return m
81 |
82 | def forward(self, x):
83 | outputs = {}
84 | x = self.stem(x)
85 | outputs["stem"] = x
86 | x = self.dark2(x)
87 | outputs["dark2"] = x
88 | x = self.dark3(x)
89 | outputs["dark3"] = x
90 | x = self.dark4(x)
91 | outputs["dark4"] = x
92 | x = self.dark5(x)
93 | outputs["dark5"] = x
94 | return {k: v for k, v in outputs.items() if k in self.out_features}
95 |
96 |
97 | class CSPDarknet(nn.Module):
98 | def __init__(
99 | self,
100 | dep_mul,
101 | wid_mul,
102 | out_features=("dark3", "dark4", "dark5"),
103 | depthwise=False,
104 | act="silu",
105 | ):
106 | super().__init__()
107 | assert out_features, "please provide output features of Darknet"
108 | self.out_features = out_features
109 | Conv = DWConv if depthwise else BaseConv
110 |
111 | base_channels = int(wid_mul * 64) # 64
112 | base_depth = max(round(dep_mul * 3), 1) # 3
113 |
114 | # stem
115 | self.stem = Focus(3, base_channels, ksize=3, act=act)
116 |
117 | # dark2
118 | self.dark2 = nn.Sequential(
119 | Conv(base_channels, base_channels * 2, 3, 2, act=act),
120 | CSPLayer(
121 | base_channels * 2,
122 | base_channels * 2,
123 | n=base_depth,
124 | depthwise=depthwise,
125 | act=act,
126 | ),
127 | )
128 |
129 | # dark3
130 | self.dark3 = nn.Sequential(
131 | Conv(base_channels * 2, base_channels * 4, 3, 2, act=act),
132 | CSPLayer(
133 | base_channels * 4,
134 | base_channels * 4,
135 | n=base_depth * 3,
136 | depthwise=depthwise,
137 | act=act,
138 | ),
139 | )
140 |
141 | # dark4
142 | self.dark4 = nn.Sequential(
143 | Conv(base_channels * 4, base_channels * 8, 3, 2, act=act),
144 | CSPLayer(
145 | base_channels * 8,
146 | base_channels * 8,
147 | n=base_depth * 3,
148 | depthwise=depthwise,
149 | act=act,
150 | ),
151 | )
152 |
153 | # dark5
154 | self.dark5 = nn.Sequential(
155 | Conv(base_channels * 8, base_channels * 16, 3, 2, act=act),
156 | SPPBottleneck(base_channels * 16, base_channels * 16, activation=act),
157 | CSPLayer(
158 | base_channels * 16,
159 | base_channels * 16,
160 | n=base_depth,
161 | shortcut=False,
162 | depthwise=depthwise,
163 | act=act,
164 | ),
165 | )
166 |
167 | def forward(self, x):
168 | outputs = {}
169 | x = self.stem(x)
170 | outputs["stem"] = x
171 | x = self.dark2(x)
172 | outputs["dark2"] = x
173 | x = self.dark3(x)
174 | outputs["dark3"] = x
175 | x = self.dark4(x)
176 | outputs["dark4"] = x
177 | x = self.dark5(x)
178 | outputs["dark5"] = x
179 | return {k: v for k, v in outputs.items() if k in self.out_features}
180 |
--------------------------------------------------------------------------------
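Note: with the depth/width multipliers used in the build() shown earlier (depth 1.33, width 1.25), CSPDarknet exposes dark3/dark4/dark5 at strides 8/16/32 with 320/640/1280 channels. A small shape check, assuming the module above is importable and a 640x640 input:

import torch

backbone = CSPDarknet(dep_mul=1.33, wid_mul=1.25, out_features=("dark3", "dark4", "dark5"))
backbone.eval()
with torch.no_grad():
    feats = backbone(torch.randn(1, 3, 640, 640))
for name, f in feats.items():
    print(name, tuple(f.shape))
# expected under these multipliers:
# dark3 (1, 320, 80, 80)
# dark4 (1, 640, 40, 40)
# dark5 (1, 1280, 20, 20)
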
/util/plot_utils.py:
--------------------------------------------------------------------------------
1 | # ------------------------------------------------------------------------
2 | # Copyright (c) 2022 megvii-research. All Rights Reserved.
3 | # ------------------------------------------------------------------------
4 | # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR)
5 | # Copyright (c) 2020 SenseTime. All Rights Reserved.
6 | # ------------------------------------------------------------------------
7 | # Modified from DETR (https://github.com/facebookresearch/detr)
8 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
9 | # ------------------------------------------------------------------------
10 |
11 |
12 | """
13 | Plotting utilities to visualize training logs.
14 | """
15 | import cv2
16 | import torch
17 | import pandas as pd
18 | import numpy as np
19 | import seaborn as sns
20 | import matplotlib.pyplot as plt
21 |
22 | from torch import Tensor
23 |
24 | from pathlib import Path, PurePath
25 |
26 |
27 | def plot_logs(logs, fields=('class_error', 'loss_bbox_unscaled', 'mAP'), ewm_col=0, log_name='log.txt'):
28 | '''
29 | Function to plot specific fields from training log(s). Plots both training and test results.
30 |
31 | :: Inputs - logs = list containing Path objects, each pointing to individual dir with a log file
32 | - fields = which results to plot from each log file - plots both training and test for each field.
33 | - ewm_col = optional, which column to use as the exponential weighted smoothing of the plots
34 | - log_name = optional, name of log file if different than default 'log.txt'.
35 |
36 | :: Outputs - matplotlib plots of results in fields, color coded for each log file.
37 | - solid lines are training results, dashed lines are test results.
38 |
39 | '''
40 | func_name = "plot_utils.py::plot_logs"
41 |
42 | # verify logs is a list of Paths (list[Paths]) or single Pathlib object Path,
43 | # convert single Path to list to avoid 'not iterable' error
44 |
45 | if not isinstance(logs, list):
46 | if isinstance(logs, PurePath):
47 | logs = [logs]
48 | print(f"{func_name} info: logs param expects a list argument, converted to list[Path].")
49 | else:
50 | raise ValueError(f"{func_name} - invalid argument for logs parameter.\n \
51 | Expect list[Path] or single Path obj, received {type(logs)}")
52 |
53 | # verify valid dir(s) and that every item in list is Path object
54 | for i, dir in enumerate(logs):
55 | if not isinstance(dir, PurePath):
56 | raise ValueError(f"{func_name} - non-Path object in logs argument of {type(dir)}: \n{dir}")
57 | if dir.exists():
58 | continue
59 | raise ValueError(f"{func_name} - invalid directory in logs argument:\n{dir}")
60 |
61 | # load log file(s) and plot
62 | dfs = [pd.read_json(Path(p) / log_name, lines=True) for p in logs]
63 |
64 | fig, axs = plt.subplots(ncols=len(fields), figsize=(16, 5))
65 |
66 | for df, color in zip(dfs, sns.color_palette(n_colors=len(logs))):
67 | for j, field in enumerate(fields):
68 | if field == 'mAP':
69 | coco_eval = pd.DataFrame(np.stack(df.test_coco_eval.dropna().values)[:, 1]).ewm(com=ewm_col).mean()  # pd.np was removed in pandas >= 1.0; use numpy directly
70 | axs[j].plot(coco_eval, c=color)
71 | else:
72 | df.interpolate().ewm(com=ewm_col).mean().plot(
73 | y=[f'train_{field}', f'test_{field}'],
74 | ax=axs[j],
75 | color=[color] * 2,
76 | style=['-', '--']
77 | )
78 | for ax, field in zip(axs, fields):
79 | ax.legend([Path(p).name for p in logs])
80 | ax.set_title(field)
81 |
82 |
83 | def plot_precision_recall(files, naming_scheme='iter'):
84 | if naming_scheme == 'exp_id':
85 | # name becomes exp_id
86 | names = [f.parts[-3] for f in files]
87 | elif naming_scheme == 'iter':
88 | names = [f.stem for f in files]
89 | else:
90 | raise ValueError(f'not supported {naming_scheme}')
91 | fig, axs = plt.subplots(ncols=2, figsize=(16, 5))
92 | for f, color, name in zip(files, sns.color_palette("Blues", n_colors=len(files)), names):
93 | data = torch.load(f)
94 | # precision is n_iou, n_points, n_cat, n_area, max_det
95 | precision = data['precision']
96 | recall = data['params'].recThrs
97 | scores = data['scores']
98 | # take precision for all classes, all areas and 100 detections
99 | precision = precision[0, :, :, 0, -1].mean(1)
100 | scores = scores[0, :, :, 0, -1].mean(1)
101 | prec = precision.mean()
102 | rec = data['recall'][0, :, 0, -1].mean()
103 | print(f'{naming_scheme} {name}: mAP@50={prec * 100: 05.1f}, ' +
104 | f'score={scores.mean():0.3f}, ' +
105 | f'f1={2 * prec * rec / (prec + rec + 1e-8):0.3f}'
106 | )
107 | axs[0].plot(recall, precision, c=color)
108 | axs[1].plot(recall, scores, c=color)
109 |
110 | axs[0].set_title('Precision / Recall')
111 | axs[0].legend(names)
112 | axs[1].set_title('Scores / Recall')
113 | axs[1].legend(names)
114 | return fig, axs
115 |
116 |
117 | def draw_boxes(image: Tensor, boxes: Tensor, color=(0, 255, 0), texts=None) -> np.ndarray:
118 | if isinstance(image, Tensor):
119 | cv_image = image.detach().cpu().numpy()
120 | else:
121 | cv_image = image
122 | if isinstance(boxes, Tensor):
123 | cv_boxes = boxes.detach().cpu().numpy()
124 | else:
125 | cv_boxes = boxes
126 |
127 | tl = round(0.002 * max(image.shape[0:2])) + 1 # line thickness
128 | tf = max(tl - 1, 1)
129 | for i in range(len(boxes)):
130 | box = cv_boxes[i]
131 | x1, y1 = box[0:2]
132 | x2, y2 = box[2:4]
133 | cv2.rectangle(cv_image, (int(x1), int(y1)), (int(x2), int(y2)), color=color)
134 | if texts is not None:
135 | cv2.putText(cv_image, texts[i], (int(x1), int(y1+10)), 0, tl/3, [225, 255, 255],
136 | thickness=tf,
137 | lineType=cv2.LINE_AA)
138 | return cv_image
139 |
140 |
141 | def draw_ref_pts(image: Tensor, ref_pts: Tensor) -> np.ndarray:
142 | if isinstance(image, Tensor):
143 | cv_image = image.detach().cpu().numpy()
144 | else:
145 | cv_image = image
146 | if isinstance(ref_pts, Tensor):
147 | cv_pts = ref_pts.detach().cpu().numpy()
148 | else:
149 | cv_pts = ref_pts
150 | for i in range(len(cv_pts)):
151 | x, y, is_pos = cv_pts[i]
152 | color = (0, 1, 0) if is_pos else (1, 1, 1)
153 | cv2.circle(cv_image, (int(x), int(y)), 2, color)
154 | return cv_image
155 |
156 |
157 | def image_hwc2chw(image: np.ndarray):
158 | image = np.ascontiguousarray(image.transpose(2, 0, 1))
159 | return image
160 |
--------------------------------------------------------------------------------
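Note: a minimal usage sketch for draw_boxes above, on a synthetic HWC image (the box coordinates and labels are made up for illustration):

import numpy as np

image = np.zeros((480, 640, 3), dtype=np.uint8)                                   # HWC canvas
boxes = np.array([[50, 60, 200, 220], [300, 100, 400, 300]], dtype=np.float32)    # x1, y1, x2, y2
drawn = draw_boxes(image, boxes, color=(0, 255, 0), texts=["id 1", "id 2"])
print(drawn.shape)                                                                # (480, 640, 3)
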
/models/network_blocks.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- encoding: utf-8 -*-
3 | # Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
4 |
5 | import torch
6 | import torch.nn as nn
7 |
8 |
9 | class SiLU(nn.Module):
10 | """export-friendly version of nn.SiLU()"""
11 |
12 | @staticmethod
13 | def forward(x):
14 | return x * torch.sigmoid(x)
15 |
16 |
17 | def get_activation(name="silu", inplace=True):
18 | if name == "silu":
19 | # module = nn.SiLU(inplace=inplace)
20 | module = SiLU()
21 | elif name == "relu":
22 | module = nn.ReLU(inplace=inplace)
23 | elif name == "lrelu":
24 | module = nn.LeakyReLU(0.1, inplace=inplace)
25 | else:
26 | raise AttributeError("Unsupported act type: {}".format(name))
27 | return module
28 |
29 |
30 | class BaseConv(nn.Module):
31 | """A Conv2d -> Batchnorm -> silu/leaky relu block"""
32 |
33 | def __init__(
34 | self, in_channels, out_channels, ksize, stride, groups=1, bias=False, act="silu"
35 | ):
36 | super().__init__()
37 | # same padding
38 | pad = (ksize - 1) // 2
39 | self.conv = nn.Conv2d(
40 | in_channels,
41 | out_channels,
42 | kernel_size=ksize,
43 | stride=stride,
44 | padding=pad,
45 | groups=groups,
46 | bias=bias,
47 | )
48 | self.bn = nn.BatchNorm2d(out_channels)
49 | self.act = get_activation(act, inplace=True)
50 |
51 | def forward(self, x):
52 | return self.act(self.bn(self.conv(x)))
53 |
54 | def fuseforward(self, x):
55 | return self.act(self.conv(x))
56 |
57 |
58 | class DWConv(nn.Module):
59 | """Depthwise Conv + Conv"""
60 |
61 | def __init__(self, in_channels, out_channels, ksize, stride=1, act="silu"):
62 | super().__init__()
63 | self.dconv = BaseConv(
64 | in_channels,
65 | in_channels,
66 | ksize=ksize,
67 | stride=stride,
68 | groups=in_channels,
69 | act=act,
70 | )
71 | self.pconv = BaseConv(
72 | in_channels, out_channels, ksize=1, stride=1, groups=1, act=act
73 | )
74 |
75 | def forward(self, x):
76 | x = self.dconv(x)
77 | return self.pconv(x)
78 |
79 |
80 | class Bottleneck(nn.Module):
81 | # Standard bottleneck
82 | def __init__(
83 | self,
84 | in_channels,
85 | out_channels,
86 | shortcut=True,
87 | expansion=0.5,
88 | depthwise=False,
89 | act="silu",
90 | ):
91 | super().__init__()
92 | hidden_channels = int(out_channels * expansion)
93 | Conv = DWConv if depthwise else BaseConv
94 | self.conv1 = BaseConv(in_channels, hidden_channels, 1, stride=1, act=act)
95 | self.conv2 = Conv(hidden_channels, out_channels, 3, stride=1, act=act)
96 | self.use_add = shortcut and in_channels == out_channels
97 |
98 | def forward(self, x):
99 | y = self.conv2(self.conv1(x))
100 | if self.use_add:
101 | y = y + x
102 | return y
103 |
104 |
105 | class ResLayer(nn.Module):
106 | "Residual layer with `in_channels` inputs."
107 |
108 | def __init__(self, in_channels: int):
109 | super().__init__()
110 | mid_channels = in_channels // 2
111 | self.layer1 = BaseConv(
112 | in_channels, mid_channels, ksize=1, stride=1, act="lrelu"
113 | )
114 | self.layer2 = BaseConv(
115 | mid_channels, in_channels, ksize=3, stride=1, act="lrelu"
116 | )
117 |
118 | def forward(self, x):
119 | out = self.layer2(self.layer1(x))
120 | return x + out
121 |
122 |
123 | class SPPBottleneck(nn.Module):
124 | """Spatial pyramid pooling layer used in YOLOv3-SPP"""
125 |
126 | def __init__(
127 | self, in_channels, out_channels, kernel_sizes=(5, 9, 13), activation="silu"
128 | ):
129 | super().__init__()
130 | hidden_channels = in_channels // 2
131 | self.conv1 = BaseConv(in_channels, hidden_channels, 1, stride=1, act=activation)
132 | self.m = nn.ModuleList(
133 | [
134 | nn.MaxPool2d(kernel_size=ks, stride=1, padding=ks // 2)
135 | for ks in kernel_sizes
136 | ]
137 | )
138 | conv2_channels = hidden_channels * (len(kernel_sizes) + 1)
139 | self.conv2 = BaseConv(conv2_channels, out_channels, 1, stride=1, act=activation)
140 |
141 | def forward(self, x):
142 | x = self.conv1(x)
143 | x = torch.cat([x] + [m(x) for m in self.m], dim=1)
144 | x = self.conv2(x)
145 | return x
146 |
147 |
148 | class CSPLayer(nn.Module):
149 | """C3 in yolov5, CSP Bottleneck with 3 convolutions"""
150 |
151 | def __init__(
152 | self,
153 | in_channels,
154 | out_channels,
155 | n=1,
156 | shortcut=True,
157 | expansion=0.5,
158 | depthwise=False,
159 | act="silu",
160 | ):
161 | """
162 | Args:
163 | in_channels (int): input channels.
164 | out_channels (int): output channels.
165 | n (int): number of Bottlenecks. Default value: 1.
166 | """
167 | # ch_in, ch_out, number, shortcut, groups, expansion
168 | super().__init__()
169 | hidden_channels = int(out_channels * expansion) # hidden channels
170 | self.conv1 = BaseConv(in_channels, hidden_channels, 1, stride=1, act=act)
171 | self.conv2 = BaseConv(in_channels, hidden_channels, 1, stride=1, act=act)
172 | self.conv3 = BaseConv(2 * hidden_channels, out_channels, 1, stride=1, act=act)
173 | module_list = [
174 | Bottleneck(
175 | hidden_channels, hidden_channels, shortcut, 1.0, depthwise, act=act
176 | )
177 | for _ in range(n)
178 | ]
179 | self.m = nn.Sequential(*module_list)
180 |
181 | def forward(self, x):
182 | x_1 = self.conv1(x)
183 | x_2 = self.conv2(x)
184 | x_1 = self.m(x_1)
185 | x = torch.cat((x_1, x_2), dim=1)
186 | return self.conv3(x)
187 |
188 |
189 | class Focus(nn.Module):
190 | """Focus width and height information into channel space."""
191 |
192 | def __init__(self, in_channels, out_channels, ksize=1, stride=1, act="silu"):
193 | super().__init__()
194 | self.conv = BaseConv(in_channels * 4, out_channels, ksize, stride, act=act)
195 |
196 | def forward(self, x):
197 | # shape of x (b,c,w,h) -> y(b,4c,w/2,h/2)
198 | patch_top_left = x[..., ::2, ::2]
199 | patch_top_right = x[..., ::2, 1::2]
200 | patch_bot_left = x[..., 1::2, ::2]
201 | patch_bot_right = x[..., 1::2, 1::2]
202 | x = torch.cat(
203 | (
204 | patch_top_left,
205 | patch_bot_left,
206 | patch_top_right,
207 | patch_bot_right,
208 | ),
209 | dim=1,
210 | )
211 | return self.conv(x)
212 |
--------------------------------------------------------------------------------
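Note: the Focus block above rearranges each 2x2 spatial neighbourhood into channels before its convolution, so a (B, 3, H, W) input reaches the conv as (B, 12, H/2, W/2). A quick shape check (sketch only):

import torch

focus = Focus(in_channels=3, out_channels=64, ksize=3, act="silu")
with torch.no_grad():
    y = focus(torch.randn(1, 3, 64, 64))
print(y.shape)   # torch.Size([1, 64, 32, 32]) -- spatial halved, channels set by out_channels
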
/models/dino/ops/modules/ms_deform_attn.py:
--------------------------------------------------------------------------------
1 | # ------------------------------------------------------------------------------------------------
2 | # Deformable DETR
3 | # Copyright (c) 2020 SenseTime. All Rights Reserved.
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
5 | # ------------------------------------------------------------------------------------------------
6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
7 | # ------------------------------------------------------------------------------------------------
8 |
9 | from __future__ import absolute_import
10 | from __future__ import print_function
11 | from __future__ import division
12 |
13 | import warnings
14 | import math
15 |
16 | import torch
17 | from torch import nn
18 | import torch.nn.functional as F
19 | from torch.nn.init import xavier_uniform_, constant_
20 |
21 | from ..functions import MSDeformAttnFunction
22 |
23 |
24 | def _is_power_of_2(n):
25 | if (not isinstance(n, int)) or (n < 0):
26 | raise ValueError("invalid input for _is_power_of_2: {} (type: {})".format(n, type(n)))
27 | return (n & (n-1) == 0) and n != 0
28 |
29 |
30 | class MSDeformAttn(nn.Module):
31 | def __init__(self, d_model=256, n_levels=4, n_heads=8, n_points=4):
32 | """
33 | Multi-Scale Deformable Attention Module
34 | :param d_model hidden dimension
35 | :param n_levels number of feature levels
36 | :param n_heads number of attention heads
37 | :param n_points number of sampling points per attention head per feature level
38 | """
39 | super().__init__()
40 | if d_model % n_heads != 0:
41 | raise ValueError('d_model must be divisible by n_heads, but got {} and {}'.format(d_model, n_heads))
42 | _d_per_head = d_model // n_heads
43 | # you'd better set _d_per_head to a power of 2 which is more efficient in our CUDA implementation
44 | if not _is_power_of_2(_d_per_head):
45 | warnings.warn("You'd better set d_model in MSDeformAttn to make the dimension of each attention head a power of 2 "
46 | "which is more efficient in our CUDA implementation.")
47 |
48 | self.im2col_step = 64
49 |
50 | self.d_model = d_model
51 | self.n_levels = n_levels
52 | self.n_heads = n_heads
53 | self.n_points = n_points
54 |
55 | self.sampling_offsets = nn.Linear(d_model, n_heads * n_levels * n_points * 2)
56 | self.attention_weights = nn.Linear(d_model, n_heads * n_levels * n_points)
57 | self.value_proj = nn.Linear(d_model, d_model)
58 | self.output_proj = nn.Linear(d_model, d_model)
59 |
60 | self._reset_parameters()
61 |
62 | def _reset_parameters(self):
63 | constant_(self.sampling_offsets.weight.data, 0.)
64 | thetas = torch.arange(self.n_heads, dtype=torch.float32) * (2.0 * math.pi / self.n_heads)
65 | grid_init = torch.stack([thetas.cos(), thetas.sin()], -1)
66 | grid_init = (grid_init / grid_init.abs().max(-1, keepdim=True)[0]).view(self.n_heads, 1, 1, 2).repeat(1, self.n_levels, self.n_points, 1)
67 | for i in range(self.n_points):
68 | grid_init[:, :, i, :] *= i + 1
69 | with torch.no_grad():
70 | self.sampling_offsets.bias = nn.Parameter(grid_init.view(-1))
71 | constant_(self.attention_weights.weight.data, 0.)
72 | constant_(self.attention_weights.bias.data, 0.)
73 | xavier_uniform_(self.value_proj.weight.data)
74 | constant_(self.value_proj.bias.data, 0.)
75 | xavier_uniform_(self.output_proj.weight.data)
76 | constant_(self.output_proj.bias.data, 0.)
77 |
78 | def forward(self, query, reference_points, input_flatten, input_spatial_shapes, input_level_start_index, input_padding_mask=None):
79 | """
80 | :param query (N, Length_{query}, C)
81 | :param reference_points (N, Length_{query}, n_levels, 2), range in [0, 1], top-left (0,0), bottom-right (1, 1), including padding area
82 | or (N, Length_{query}, n_levels, 4), add additional (w, h) to form reference boxes
83 | :param input_flatten (N, \sum_{l=0}^{L-1} H_l \cdot W_l, C)
84 | :param input_spatial_shapes (n_levels, 2), [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})]
85 | :param input_level_start_index (n_levels, ), [0, H_0*W_0, H_0*W_0+H_1*W_1, H_0*W_0+H_1*W_1+H_2*W_2, ..., H_0*W_0+H_1*W_1+...+H_{L-1}*W_{L-1}]
86 | :param input_padding_mask (N, \sum_{l=0}^{L-1} H_l \cdot W_l), True for padding elements, False for non-padding elements
87 |
88 | :return output (N, Length_{query}, C)
89 | """
90 | N, Len_q, _ = query.shape
91 | N, Len_in, _ = input_flatten.shape
92 | assert (input_spatial_shapes[:, 0] * input_spatial_shapes[:, 1]).sum() == Len_in
93 |
94 | value = self.value_proj(input_flatten)
95 | if input_padding_mask is not None:
96 | value = value.masked_fill(input_padding_mask[..., None], float(0))
97 | value = value.view(N, Len_in, self.n_heads, self.d_model // self.n_heads)
98 | sampling_offsets = self.sampling_offsets(query).view(N, Len_q, self.n_heads, self.n_levels, self.n_points, 2)
99 | attention_weights = self.attention_weights(query).view(N, Len_q, self.n_heads, self.n_levels * self.n_points)
100 | attention_weights = F.softmax(attention_weights, -1).view(N, Len_q, self.n_heads, self.n_levels, self.n_points)
101 | # N, Len_q, n_heads, n_levels, n_points, 2
102 | if reference_points.shape[-1] == 2:
103 | offset_normalizer = torch.stack([input_spatial_shapes[..., 1], input_spatial_shapes[..., 0]], -1)
104 | sampling_locations = reference_points[:, :, None, :, None, :] \
105 | + sampling_offsets / offset_normalizer[None, None, None, :, None, :]
106 | elif reference_points.shape[-1] == 4:
107 | sampling_locations = reference_points[:, :, None, :, None, :2] \
108 | + sampling_offsets / self.n_points * reference_points[:, :, None, :, None, 2:] * 0.5
109 | else:
110 | raise ValueError(
111 | 'Last dim of reference_points must be 2 or 4, but get {} instead.'.format(reference_points.shape[-1]))
112 |
113 | # for amp
114 | if value.dtype == torch.float16:
115 | # for mixed precision
116 | output = MSDeformAttnFunction.apply(
117 | value.to(torch.float32), input_spatial_shapes, input_level_start_index, sampling_locations.to(torch.float32), attention_weights, self.im2col_step)
118 | output = output.to(torch.float16)
119 | output = self.output_proj(output)
120 | return output
121 |
122 |
123 | output = MSDeformAttnFunction.apply(
124 | value, input_spatial_shapes, input_level_start_index, sampling_locations, attention_weights, self.im2col_step)
125 | output = self.output_proj(output)
126 | return output
127 |
--------------------------------------------------------------------------------
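Note: the forward pass above expects the multi-scale features to be flattened and described by input_spatial_shapes and input_level_start_index. The sketch below shows how those tensors are typically derived from a list of per-level feature maps; it is pure tensor bookkeeping and does not call the CUDA kernel:

import torch

# four feature levels of a 2-image batch, channels = d_model = 256
feature_maps = [torch.randn(2, 256, h, w) for h, w in [(64, 80), (32, 40), (16, 20), (8, 10)]]

spatial_shapes = torch.as_tensor([fm.shape[-2:] for fm in feature_maps], dtype=torch.long)
level_start_index = torch.cat(
    (spatial_shapes.new_zeros((1,)), spatial_shapes.prod(1).cumsum(0)[:-1])
)
input_flatten = torch.cat([fm.flatten(2).transpose(1, 2) for fm in feature_maps], dim=1)

print(spatial_shapes.tolist())       # [[64, 80], [32, 40], [16, 20], [8, 10]]
print(level_start_index.tolist())    # [0, 5120, 6400, 6720]
print(input_flatten.shape)           # torch.Size([2, 6800, 256])
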
/models/dino/utils.py:
--------------------------------------------------------------------------------
1 | # ------------------------------------------------------------------------
2 | # DINO
3 | # Copyright (c) 2022 IDEA. All Rights Reserved.
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
5 | # ------------------------------------------------------------------------
6 |
7 | import torch
8 | from torch import nn, Tensor
9 |
10 | import math
11 | import torch.nn.functional as F
12 | from torch import nn
13 |
14 |
15 | def gen_encoder_output_proposals(memory:Tensor, memory_padding_mask:Tensor, spatial_shapes:Tensor, learnedwh=None):
16 | """
17 | Input:
18 | - memory: bs, \sum{hw}, d_model
19 | - memory_padding_mask: bs, \sum{hw}
20 | - spatial_shapes: nlevel, 2
21 | - learnedwh: 2
22 | Output:
23 | - output_memory: bs, \sum{hw}, d_model
24 | - output_proposals: bs, \sum{hw}, 4
25 | """
26 | N_, S_, C_ = memory.shape
27 | base_scale = 4.0
28 | proposals = []
29 | _cur = 0
30 | for lvl, (H_, W_) in enumerate(spatial_shapes):
31 | mask_flatten_ = memory_padding_mask[:, _cur:(_cur + H_ * W_)].view(N_, H_, W_, 1)
32 | valid_H = torch.sum(~mask_flatten_[:, :, 0, 0], 1)
33 | valid_W = torch.sum(~mask_flatten_[:, 0, :, 0], 1)
34 |
35 | grid_y, grid_x = torch.meshgrid(torch.linspace(0, H_ - 1, H_, dtype=torch.float32, device=memory.device),
36 | torch.linspace(0, W_ - 1, W_, dtype=torch.float32, device=memory.device))
37 | grid = torch.cat([grid_x.unsqueeze(-1), grid_y.unsqueeze(-1)], -1) # H_, W_, 2
38 |
39 | scale = torch.cat([valid_W.unsqueeze(-1), valid_H.unsqueeze(-1)], 1).view(N_, 1, 1, 2)
40 | grid = (grid.unsqueeze(0).expand(N_, -1, -1, -1) + 0.5) / scale
41 |
42 | if learnedwh is not None:
43 | wh = torch.ones_like(grid) * learnedwh.sigmoid() * (2.0 ** lvl)
44 | else:
45 | wh = torch.ones_like(grid) * 0.05 * (2.0 ** lvl)
46 |
47 | proposal = torch.cat((grid, wh), -1).view(N_, -1, 4)
48 | proposals.append(proposal)
49 | _cur += (H_ * W_)
50 |
51 | output_proposals = torch.cat(proposals, 1)
52 | output_proposals_valid = ((output_proposals > 0.01) & (output_proposals < 0.99)).all(-1, keepdim=True)
53 | output_proposals = torch.log(output_proposals / (1 - output_proposals)) # unsigmoid
54 | output_proposals = output_proposals.masked_fill(memory_padding_mask.unsqueeze(-1), float('inf'))
55 | output_proposals = output_proposals.masked_fill(~output_proposals_valid, float('inf'))
56 |
57 | output_memory = memory
58 | output_memory = output_memory.masked_fill(memory_padding_mask.unsqueeze(-1), float(0))
59 | output_memory = output_memory.masked_fill(~output_proposals_valid, float(0))
60 |
61 | return output_memory, output_proposals
62 |
63 |
64 | class RandomBoxPerturber():
65 | def __init__(self, x_noise_scale=0.2, y_noise_scale=0.2, w_noise_scale=0.2, h_noise_scale=0.2) -> None:
66 | self.noise_scale = torch.Tensor([x_noise_scale, y_noise_scale, w_noise_scale, h_noise_scale])
67 |
68 | def __call__(self, refanchors: Tensor) -> Tensor:
69 | nq, bs, query_dim = refanchors.shape
70 | device = refanchors.device
71 |
72 | noise_raw = torch.rand_like(refanchors)
73 | noise_scale = self.noise_scale.to(device)[:query_dim]
74 |
75 | new_refanchors = refanchors * (1 + (noise_raw - 0.5) * noise_scale)
76 | return new_refanchors.clamp_(0, 1)
77 |
78 |
79 | def sigmoid_focal_loss(inputs, targets, num_boxes, alpha: float = 0.25, gamma: float = 2):
80 | """
81 | Loss used in RetinaNet for dense detection: https://arxiv.org/abs/1708.02002.
82 | Args:
83 | inputs: A float tensor of arbitrary shape.
84 | The predictions for each example.
85 | targets: A float tensor with the same shape as inputs. Stores the binary
86 | classification label for each element in inputs
87 | (0 for the negative class and 1 for the positive class).
88 | alpha: (optional) Weighting factor in range (0, 1) to balance
89 | positive vs negative examples. Default: 0.25; a negative value disables weighting.
90 | gamma: Exponent of the modulating factor (1 - p_t) to
91 | balance easy vs hard examples.
92 | Returns:
93 | Loss tensor
94 | """
95 | prob = inputs.sigmoid()
96 | ce_loss = F.binary_cross_entropy_with_logits(inputs, targets, reduction="none")
97 | p_t = prob * targets + (1 - prob) * (1 - targets)
98 | loss = ce_loss * ((1 - p_t) ** gamma)
99 |
100 | if alpha >= 0:
101 | alpha_t = alpha * targets + (1 - alpha) * (1 - targets)
102 | loss = alpha_t * loss
103 |
104 | return loss.mean(1).sum() / num_boxes
105 |
106 |
107 | class MLP(nn.Module):
108 | """ Very simple multi-layer perceptron (also called FFN)"""
109 |
110 | def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
111 | super().__init__()
112 | self.num_layers = num_layers
113 | h = [hidden_dim] * (num_layers - 1)
114 | self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim]))
115 |
116 | def forward(self, x):
117 | for i, layer in enumerate(self.layers):
118 | x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x)
119 | return x
120 |
121 |
122 | def _get_activation_fn(activation, d_model=256, batch_dim=0):
123 | """Return an activation function given a string"""
124 | if activation == "relu":
125 | return F.relu
126 | if activation == "gelu":
127 | return F.gelu
128 | if activation == "glu":
129 | return F.glu
130 | if activation == "prelu":
131 | return nn.PReLU()
132 | if activation == "selu":
133 | return F.selu
134 |
135 | raise RuntimeError(f"activation should be relu/gelu/glu/prelu/selu, not {activation}.")
136 |
137 |
138 | def gen_sineembed_for_position(pos_tensor):
139 | # n_query, bs, _ = pos_tensor.size()
140 | # sineembed_tensor = torch.zeros(n_query, bs, 256)
141 | scale = 2 * math.pi
142 | dim_t = torch.arange(128, dtype=torch.float32, device=pos_tensor.device)
143 | dim_t = 10000 ** (2 * torch.div(dim_t, 2, rounding_mode="floor") / 128)
144 | x_embed = pos_tensor[:, :, 0] * scale
145 | y_embed = pos_tensor[:, :, 1] * scale
146 | pos_x = x_embed[:, :, None] / dim_t
147 | pos_y = y_embed[:, :, None] / dim_t
148 | pos_x = torch.stack((pos_x[:, :, 0::2].sin(), pos_x[:, :, 1::2].cos()), dim=3).flatten(2)
149 | pos_y = torch.stack((pos_y[:, :, 0::2].sin(), pos_y[:, :, 1::2].cos()), dim=3).flatten(2)
150 | if pos_tensor.size(-1) == 2:
151 | pos = torch.cat((pos_y, pos_x), dim=2)
152 | elif pos_tensor.size(-1) == 4:
153 | w_embed = pos_tensor[:, :, 2] * scale
154 | pos_w = w_embed[:, :, None] / dim_t
155 | pos_w = torch.stack((pos_w[:, :, 0::2].sin(), pos_w[:, :, 1::2].cos()), dim=3).flatten(2)
156 |
157 | h_embed = pos_tensor[:, :, 3] * scale
158 | pos_h = h_embed[:, :, None] / dim_t
159 | pos_h = torch.stack((pos_h[:, :, 0::2].sin(), pos_h[:, :, 1::2].cos()), dim=3).flatten(2)
160 |
161 | pos = torch.cat((pos_y, pos_x, pos_w, pos_h), dim=2)
162 | else:
163 | raise ValueError("Unknown pos_tensor shape(-1):{}".format(pos_tensor.size(-1)))
164 | return pos
--------------------------------------------------------------------------------
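Note: gen_sineembed_for_position above maps normalized box coordinates to fixed sine/cosine features, 128 dims per coordinate, so a (num_query, bs, 2) input yields 256-dim embeddings and a (num_query, bs, 4) input yields 512. A short check, assuming the function is importable:

import torch

pts = torch.rand(300, 2, 2)     # (num_query, batch, cx cy) in [0, 1]
boxes = torch.rand(300, 2, 4)   # (num_query, batch, cx cy w h) in [0, 1]
print(gen_sineembed_for_position(pts).shape)    # torch.Size([300, 2, 256])
print(gen_sineembed_for_position(boxes).shape)  # torch.Size([300, 2, 512])
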
/models/ops/modules/ms_deform_attn.py:
--------------------------------------------------------------------------------
1 | # ------------------------------------------------------------------------
2 | # Copyright (c) 2022 megvii-research. All Rights Reserved.
3 | # ------------------------------------------------------------------------
4 | # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR)
5 | # Copyright (c) 2020 SenseTime. All Rights Reserved.
6 | # ------------------------------------------------------------------------
7 | # Modified from DETR (https://github.com/facebookresearch/detr)
8 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
9 | # ------------------------------------------------------------------------
10 |
11 |
12 | from __future__ import absolute_import
13 | from __future__ import print_function
14 | from __future__ import division
15 |
16 | import warnings
17 | import math
18 |
19 | import torch
20 | from torch import nn
21 | import torch.nn.functional as F
22 | from torch.nn.init import xavier_uniform_, constant_
23 |
24 | from ..functions import MSDeformAttnFunction, ms_deform_attn_core_pytorch
25 | features_grad = 0.0
26 |
27 | def _is_power_of_2(n):
28 | if (not isinstance(n, int)) or (n < 0):
29 | raise ValueError("invalid input for _is_power_of_2: {} (type: {})".format(n, type(n)))
30 | return (n & (n-1) == 0) and n != 0
31 |
32 |
33 | class MSDeformAttn(nn.Module):
34 | def __init__(self, d_model=256, n_levels=4, n_heads=8, n_points=4, sigmoid_attn=False, im2col_step=64):
35 | """
36 | Multi-Scale Deformable Attention Module
37 | :param d_model hidden dimension
38 | :param n_levels number of feature levels
39 | :param n_heads number of attention heads
40 | :param n_points number of sampling points per attention head per feature level
41 | """
42 | super().__init__()
43 | if d_model % n_heads != 0:
44 | raise ValueError('d_model must be divisible by n_heads, but got {} and {}'.format(d_model, n_heads))
45 | _d_per_head = d_model // n_heads
46 | # you'd better set _d_per_head to a power of 2 which is more efficient in our CUDA implementation
47 | if not _is_power_of_2(_d_per_head):
48 | warnings.warn("You'd better set d_model in MSDeformAttn to make the dimension of each attention head a power of 2 "
49 | "which is more efficient in our CUDA implementation.")
50 |
51 | self.im2col_step = im2col_step
52 | self.sigmoid_attn = sigmoid_attn
53 |
54 | self.d_model = d_model
55 | self.n_levels = n_levels
56 | self.n_heads = n_heads
57 | self.n_points = n_points
58 |
59 | self.sampling_offsets = nn.Linear(d_model, n_heads * n_levels * n_points * 2)
60 | self.attention_weights = nn.Linear(d_model, n_heads * n_levels * n_points)
61 | self.value_proj = nn.Linear(d_model, d_model)
62 | self.output_proj = nn.Linear(d_model, d_model)
63 |
64 | self._reset_parameters()
65 |
66 | def _reset_parameters(self):
67 | constant_(self.sampling_offsets.weight.data, 0.)
68 | thetas = torch.arange(self.n_heads, dtype=torch.float32) * (2.0 * math.pi / self.n_heads)
69 | grid_init = torch.stack([thetas.cos(), thetas.sin()], -1)
70 | grid_init = (grid_init / grid_init.abs().max(-1, keepdim=True)[0]).view(self.n_heads, 1, 1, 2).repeat(1, self.n_levels, self.n_points, 1)
71 | for i in range(self.n_points):
72 | grid_init[:, :, i, :] *= i + 1
73 | with torch.no_grad():
74 | self.sampling_offsets.bias = nn.Parameter(grid_init.view(-1))
75 | constant_(self.attention_weights.weight.data, 0.)
76 | constant_(self.attention_weights.bias.data, 0.)
77 | xavier_uniform_(self.value_proj.weight.data)
78 | constant_(self.value_proj.bias.data, 0.)
79 | xavier_uniform_(self.output_proj.weight.data)
80 | constant_(self.output_proj.bias.data, 0.)
81 |
82 | def forward(self, query, reference_points, input_flatten, input_spatial_shapes, input_level_start_index, input_padding_mask=None):
83 | """
84 | :param query (N, Length_{query}, C)
85 | :param reference_points (N, Length_{query}, n_levels, 2), range in [0, 1], top-left (0,0), bottom-right (1, 1), including padding area
86 | or (N, Length_{query}, n_levels, 4), add additional (w, h) to form reference boxes
87 | :param input_flatten (N, \sum_{l=0}^{L-1} H_l \cdot W_l, C)
88 | :param input_spatial_shapes (n_levels, 2), [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})]
89 | :param input_level_start_index (n_levels, ), [0, H_0*W_0, H_0*W_0+H_1*W_1, H_0*W_0+H_1*W_1+H_2*W_2, ..., H_0*W_0+H_1*W_1+...+H_{L-1}*W_{L-1}]
90 | :param input_padding_mask (N, \sum_{l=0}^{L-1} H_l \cdot W_l), True for padding elements, False for non-padding elements
91 |
92 | :return output (N, Length_{query}, C)
93 | """
94 | N, Len_q, _ = query.shape
95 | N, Len_in, _ = input_flatten.shape
96 | assert (input_spatial_shapes[:, 0] * input_spatial_shapes[:, 1]).sum() == Len_in
97 |
98 | value = self.value_proj(input_flatten)
99 | if input_padding_mask is not None:
100 | value.masked_fill_(input_padding_mask[..., None], float(0))
101 | value = value.view(N, Len_in, self.n_heads, self.d_model // self.n_heads)
102 | sampling_offsets = self.sampling_offsets(query).view(N, Len_q, self.n_heads, self.n_levels, self.n_points, 2)
103 | attention_weights = self.attention_weights(query).view(N, Len_q, self.n_heads, self.n_levels * self.n_points)
104 | if self.sigmoid_attn:
105 | attention_weights = attention_weights.sigmoid().view(N, Len_q, self.n_heads, self.n_levels, self.n_points)
106 | else:
107 | attention_weights = F.softmax(attention_weights, -1).view(N, Len_q, self.n_heads, self.n_levels, self.n_points)
108 | # N, Len_q, n_heads, n_levels, n_points, 2
109 | if reference_points.shape[-1] == 2:
110 | sampling_locations = reference_points[:, :, None, :, None, :] \
111 | + sampling_offsets / input_spatial_shapes[None, None, None, :, None, (1, 0)]
112 | elif reference_points.shape[-1] == 4:
113 | sampling_locations = reference_points[:, :, None, :, None, :2] \
114 | + sampling_offsets / self.n_points * reference_points[:, :, None, :, None, 2:] * 0.5
115 | else:
116 | raise ValueError(
117 | 'Last dim of reference_points must be 2 or 4, but get {} instead.'.format(reference_points.shape[-1]))
118 |
119 | # def extract(g):
120 | # global features_grad
121 | # features_grad = g
122 | # value.requires_grad=True
123 | # value.register_hook(extract)
124 |
125 | output = MSDeformAttnFunction.apply(value, input_spatial_shapes, input_level_start_index, sampling_locations, attention_weights, self.im2col_step)
126 | # output = MSDeformAttnFunction.apply(value.double(), input_spatial_shapes, input_level_start_index, sampling_locations.double(), attention_weights.double(), self.im2col_step).float()
127 | # output = ms_deform_attn_core_pytorch(value, input_spatial_shapes, sampling_locations, attention_weights)
128 |
129 | output = self.output_proj(output)
130 | return output
131 |
--------------------------------------------------------------------------------
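Note: this variant normalizes the sampling offsets by indexing input_spatial_shapes with (1, 0) directly, which is equivalent to the explicit torch.stack([W, H], -1) normalizer used in the DINO copy earlier. A tiny equivalence check:

import torch

spatial_shapes = torch.tensor([[64, 80], [32, 40], [16, 20], [8, 10]])  # (H, W) per level
a = spatial_shapes[..., (1, 0)]
b = torch.stack([spatial_shapes[..., 1], spatial_shapes[..., 0]], -1)
print(torch.equal(a, b))   # True -- both give (W, H) per level
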
/models/ops/build/lib.linux-x86_64-3.8/modules/ms_deform_attn.py:
--------------------------------------------------------------------------------
1 | # ------------------------------------------------------------------------
2 | # Copyright (c) 2022 megvii-research. All Rights Reserved.
3 | # ------------------------------------------------------------------------
4 | # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR)
5 | # Copyright (c) 2020 SenseTime. All Rights Reserved.
6 | # ------------------------------------------------------------------------
7 | # Modified from DETR (https://github.com/facebookresearch/detr)
8 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
9 | # ------------------------------------------------------------------------
10 |
11 |
12 | from __future__ import absolute_import
13 | from __future__ import print_function
14 | from __future__ import division
15 |
16 | import warnings
17 | import math
18 |
19 | import torch
20 | from torch import nn
21 | import torch.nn.functional as F
22 | from torch.nn.init import xavier_uniform_, constant_
23 |
24 | from ..functions import MSDeformAttnFunction, ms_deform_attn_core_pytorch
25 | features_grad = 0.0
26 |
27 | def _is_power_of_2(n):
28 | if (not isinstance(n, int)) or (n < 0):
29 | raise ValueError("invalid input for _is_power_of_2: {} (type: {})".format(n, type(n)))
30 | return (n & (n-1) == 0) and n != 0
31 |
32 |
33 | class MSDeformAttn(nn.Module):
34 | def __init__(self, d_model=256, n_levels=4, n_heads=8, n_points=4, sigmoid_attn=False, im2col_step=64):
35 | """
36 | Multi-Scale Deformable Attention Module
37 | :param d_model hidden dimension
38 | :param n_levels number of feature levels
39 | :param n_heads number of attention heads
40 | :param n_points number of sampling points per attention head per feature level
41 | """
42 | super().__init__()
43 | if d_model % n_heads != 0:
44 | raise ValueError('d_model must be divisible by n_heads, but got {} and {}'.format(d_model, n_heads))
45 | _d_per_head = d_model // n_heads
46 | # you'd better set _d_per_head to a power of 2 which is more efficient in our CUDA implementation
47 | if not _is_power_of_2(_d_per_head):
48 | warnings.warn("You'd better set d_model in MSDeformAttn to make the dimension of each attention head a power of 2 "
49 | "which is more efficient in our CUDA implementation.")
50 |
51 | self.im2col_step = im2col_step
52 | self.sigmoid_attn = sigmoid_attn
53 |
54 | self.d_model = d_model
55 | self.n_levels = n_levels
56 | self.n_heads = n_heads
57 | self.n_points = n_points
58 |
59 | self.sampling_offsets = nn.Linear(d_model, n_heads * n_levels * n_points * 2)
60 | self.attention_weights = nn.Linear(d_model, n_heads * n_levels * n_points)
61 | self.value_proj = nn.Linear(d_model, d_model)
62 | self.output_proj = nn.Linear(d_model, d_model)
63 |
64 | self._reset_parameters()
65 |
66 | def _reset_parameters(self):
67 | constant_(self.sampling_offsets.weight.data, 0.)
68 | thetas = torch.arange(self.n_heads, dtype=torch.float32) * (2.0 * math.pi / self.n_heads)
69 | grid_init = torch.stack([thetas.cos(), thetas.sin()], -1)
70 | grid_init = (grid_init / grid_init.abs().max(-1, keepdim=True)[0]).view(self.n_heads, 1, 1, 2).repeat(1, self.n_levels, self.n_points, 1)
71 | for i in range(self.n_points):
72 | grid_init[:, :, i, :] *= i + 1
73 | with torch.no_grad():
74 | self.sampling_offsets.bias = nn.Parameter(grid_init.view(-1))
75 | constant_(self.attention_weights.weight.data, 0.)
76 | constant_(self.attention_weights.bias.data, 0.)
77 | xavier_uniform_(self.value_proj.weight.data)
78 | constant_(self.value_proj.bias.data, 0.)
79 | xavier_uniform_(self.output_proj.weight.data)
80 | constant_(self.output_proj.bias.data, 0.)
81 |
82 | def forward(self, query, reference_points, input_flatten, input_spatial_shapes, input_level_start_index, input_padding_mask=None):
83 | """
84 | :param query (N, Length_{query}, C)
85 | :param reference_points (N, Length_{query}, n_levels, 2), range in [0, 1], top-left (0,0), bottom-right (1, 1), including padding area
86 | or (N, Length_{query}, n_levels, 4), add additional (w, h) to form reference boxes
87 | :param input_flatten (N, \sum_{l=0}^{L-1} H_l \cdot W_l, C)
88 | :param input_spatial_shapes (n_levels, 2), [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})]
89 | :param input_level_start_index (n_levels, ), [0, H_0*W_0, H_0*W_0+H_1*W_1, H_0*W_0+H_1*W_1+H_2*W_2, ..., H_0*W_0+H_1*W_1+...+H_{L-1}*W_{L-1}]
90 | :param input_padding_mask (N, \sum_{l=0}^{L-1} H_l \cdot W_l), True for padding elements, False for non-padding elements
91 |
92 | :return output (N, Length_{query}, C)
93 | """
94 | N, Len_q, _ = query.shape
95 | N, Len_in, _ = input_flatten.shape
96 | assert (input_spatial_shapes[:, 0] * input_spatial_shapes[:, 1]).sum() == Len_in
97 |
98 | value = self.value_proj(input_flatten)
99 | if input_padding_mask is not None:
100 | value.masked_fill_(input_padding_mask[..., None], float(0))
101 | value = value.view(N, Len_in, self.n_heads, self.d_model // self.n_heads)
102 | sampling_offsets = self.sampling_offsets(query).view(N, Len_q, self.n_heads, self.n_levels, self.n_points, 2)
103 | attention_weights = self.attention_weights(query).view(N, Len_q, self.n_heads, self.n_levels * self.n_points)
104 | if self.sigmoid_attn:
105 | attention_weights = attention_weights.sigmoid().view(N, Len_q, self.n_heads, self.n_levels, self.n_points)
106 | else:
107 | attention_weights = F.softmax(attention_weights, -1).view(N, Len_q, self.n_heads, self.n_levels, self.n_points)
108 | # N, Len_q, n_heads, n_levels, n_points, 2
109 | if reference_points.shape[-1] == 2:
110 | sampling_locations = reference_points[:, :, None, :, None, :] \
111 | + sampling_offsets / input_spatial_shapes[None, None, None, :, None, (1, 0)]
112 | elif reference_points.shape[-1] == 4:
113 | sampling_locations = reference_points[:, :, None, :, None, :2] \
114 | + sampling_offsets / self.n_points * reference_points[:, :, None, :, None, 2:] * 0.5
115 | else:
116 | raise ValueError(
117 | 'Last dim of reference_points must be 2 or 4, but get {} instead.'.format(reference_points.shape[-1]))
118 |
119 | # def extract(g):
120 | # global features_grad
121 | # features_grad = g
122 | # value.requires_grad=True
123 | # value.register_hook(extract)
124 |
125 | output = MSDeformAttnFunction.apply(value, input_spatial_shapes, input_level_start_index, sampling_locations, attention_weights, self.im2col_step)
126 | # output = MSDeformAttnFunction.apply(value.double(), input_spatial_shapes, input_level_start_index, sampling_locations.double(), attention_weights.double(), self.im2col_step).float()
127 | # output = ms_deform_attn_core_pytorch(value, input_spatial_shapes, sampling_locations, attention_weights)
128 |
129 | output = self.output_proj(output)
130 | return output
131 |
--------------------------------------------------------------------------------