├── requirements.txt ├── .gitignore ├── models ├── ops │ ├── MultiScaleDeformableAttention.egg-info │ │ ├── dependency_links.txt │ │ ├── top_level.txt │ │ ├── PKG-INFO │ │ └── SOURCES.txt │ ├── build │ │ ├── temp.linux-x86_64-cpython-37 │ │ │ ├── .ninja_deps │ │ │ ├── mnt │ │ │ │ └── dolphinfs │ │ │ │ │ └── hdd_pool │ │ │ │ │ └── docker │ │ │ │ │ └── user │ │ │ │ │ └── hadoop-vacv │ │ │ │ │ └── yanfeng │ │ │ │ │ └── project │ │ │ │ │ └── MOTRv2 │ │ │ │ │ └── MOTRv3 │ │ │ │ │ └── models │ │ │ │ │ └── ops │ │ │ │ │ └── src │ │ │ │ │ ├── vision.o │ │ │ │ │ ├── cpu │ │ │ │ │ └── ms_deform_attn_cpu.o │ │ │ │ │ └── cuda │ │ │ │ │ └── ms_deform_attn_cuda.o │ │ │ ├── .ninja_log │ │ │ └── build.ninja │ │ ├── lib.linux-x86_64-3.8 │ │ │ ├── MultiScaleDeformableAttention.cpython-38-x86_64-linux-gnu.so │ │ │ ├── modules │ │ │ │ ├── __init__.py │ │ │ │ └── ms_deform_attn.py │ │ │ └── functions │ │ │ │ ├── __init__.py │ │ │ │ └── ms_deform_attn_func.py │ │ ├── lib.linux-x86_64-cpython-37 │ │ │ ├── MultiScaleDeformableAttention.cpython-37m-x86_64-linux-gnu.so │ │ │ ├── modules │ │ │ │ └── __init__.py │ │ │ └── functions │ │ │ │ ├── __init__.py │ │ │ │ └── ms_deform_attn_func.py │ │ └── temp.linux-x86_64-3.8 │ │ │ └── mnt │ │ │ └── dolphinfs │ │ │ └── hdd_pool │ │ │ └── docker │ │ │ └── user │ │ │ └── hadoop-vacv │ │ │ └── yanfeng │ │ │ └── project │ │ │ └── MOTRv2 │ │ │ └── CO-MOT │ │ │ └── models │ │ │ └── ops │ │ │ └── src │ │ │ ├── vision.o │ │ │ ├── cpu │ │ │ └── ms_deform_attn_cpu.o │ │ │ └── cuda │ │ │ └── ms_deform_attn_cuda.o │ ├── MultiScaleDeformableAttention.cpython-37m-x86_64-linux-gnu.so │ ├── MultiScaleDeformableAttention.cpython-38-x86_64-linux-gnu.so │ ├── dist │ │ └── MultiScaleDeformableAttention-1.0-py3.7-linux-x86_64.egg │ ├── make.sh │ ├── modules │ │ ├── __init__.py │ │ └── ms_deform_attn.py │ ├── functions │ │ ├── __init__.py │ │ └── ms_deform_attn_func.py │ ├── src │ │ ├── vision.cpp │ │ ├── cuda │ │ │ └── ms_deform_attn_cuda.h │ │ ├── cpu │ │ │ ├── ms_deform_attn_cpu.h │ │ │ └── ms_deform_attn_cpu.cpp │ │ └── ms_deform_attn.h │ ├── setup.py │ └── test.py ├── structures │ └── __init__.py ├── dino │ ├── __init__.py │ ├── ops │ │ ├── modules │ │ │ ├── __init__.py │ │ │ └── ms_deform_attn.py │ │ ├── functions │ │ │ ├── __init__.py │ │ │ └── ms_deform_attn_func.py │ │ ├── make.sh │ │ ├── src │ │ │ ├── vision.cpp │ │ │ ├── cuda │ │ │ │ └── ms_deform_attn_cuda.h │ │ │ ├── cpu │ │ │ │ ├── ms_deform_attn_cpu.h │ │ │ │ └── ms_deform_attn_cpu.cpp │ │ │ └── ms_deform_attn.h │ │ ├── setup.py │ │ └── test.py │ ├── position_encoding.py │ └── utils.py ├── registry.py ├── __init__.py ├── yolo_fpn.py ├── losses.py ├── yolo_pafpn.py ├── position_encoding.py ├── memory_bank.py ├── yolox.py ├── darknet.py └── network_blocks.py ├── tools ├── show_user_using_nvidia.sh ├── copy_back.sh ├── debug.sh ├── simple_inference.sh ├── simplebdd_inference.sh ├── simplemot_inference.sh ├── batch_diff.py ├── merge_dance_tracklets.sh ├── resume.sh ├── run_dist_launch.sh ├── eval_dance.sh ├── run_dist_slurm.sh ├── make_detdb.py ├── coco_evel.py ├── train.sh ├── clip_train.py ├── merge_dance_tracklets.py ├── train_ddp.sh ├── similarity_analysis.py └── visualize_tao.py ├── datasets ├── alignment.txt ├── __init__.py ├── panoptic_eval.py ├── data_prefetcher.py └── samplers.py ├── util ├── json_parser.py ├── __init__.py ├── checkpoint.py ├── box_ops.py ├── tool.py └── plot_utils.py ├── configs └── motrv2ch_uni5cost3ggoon.args └── .vscode └── launch.json /requirements.txt: 
-------------------------------------------------------------------------------- 1 | tqdm 2 | scipy 3 | opencv-python 4 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | tracker*/ 2 | exps 3 | __pycache__ 4 | tmp 5 | checkpoints -------------------------------------------------------------------------------- /models/ops/MultiScaleDeformableAttention.egg-info/dependency_links.txt: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /models/ops/MultiScaleDeformableAttention.egg-info/top_level.txt: -------------------------------------------------------------------------------- 1 | MultiScaleDeformableAttention 2 | functions 3 | modules 4 | -------------------------------------------------------------------------------- /models/ops/build/temp.linux-x86_64-cpython-37/.ninja_deps: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BingfengYan/CO-MOT/HEAD/models/ops/build/temp.linux-x86_64-cpython-37/.ninja_deps -------------------------------------------------------------------------------- /models/ops/MultiScaleDeformableAttention.cpython-37m-x86_64-linux-gnu.so: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BingfengYan/CO-MOT/HEAD/models/ops/MultiScaleDeformableAttention.cpython-37m-x86_64-linux-gnu.so -------------------------------------------------------------------------------- /models/ops/MultiScaleDeformableAttention.cpython-38-x86_64-linux-gnu.so: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BingfengYan/CO-MOT/HEAD/models/ops/MultiScaleDeformableAttention.cpython-38-x86_64-linux-gnu.so -------------------------------------------------------------------------------- /models/ops/dist/MultiScaleDeformableAttention-1.0-py3.7-linux-x86_64.egg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BingfengYan/CO-MOT/HEAD/models/ops/dist/MultiScaleDeformableAttention-1.0-py3.7-linux-x86_64.egg -------------------------------------------------------------------------------- /tools/show_user_using_nvidia.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | pids=$(fuser -v /dev/nvidia* | cut -d' ' -f3- | tr ' ' '\n' | sort -u) 4 | for pid in $pids 5 | do 6 | echo "PID: $pid CWD: $(readlink /proc/$pid/cwd)" 7 | done -------------------------------------------------------------------------------- /models/ops/build/lib.linux-x86_64-3.8/MultiScaleDeformableAttention.cpython-38-x86_64-linux-gnu.so: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BingfengYan/CO-MOT/HEAD/models/ops/build/lib.linux-x86_64-3.8/MultiScaleDeformableAttention.cpython-38-x86_64-linux-gnu.so -------------------------------------------------------------------------------- /models/ops/build/lib.linux-x86_64-cpython-37/MultiScaleDeformableAttention.cpython-37m-x86_64-linux-gnu.so: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/BingfengYan/CO-MOT/HEAD/models/ops/build/lib.linux-x86_64-cpython-37/MultiScaleDeformableAttention.cpython-37m-x86_64-linux-gnu.so -------------------------------------------------------------------------------- /datasets/alignment.txt: -------------------------------------------------------------------------------- 1 | Describe this image in detail. 2 | Take a look at this image and describe what you notice. 3 | Please provide a detailed description of the picture. 4 | Could you describe the contents of this image for me? -------------------------------------------------------------------------------- /util/json_parser.py: -------------------------------------------------------------------------------- 1 | import json 2 | import sys 3 | 4 | def parse(json_str, key): 5 | str_dict = json.loads(json_str) 6 | val = str_dict[key] 7 | if type(val)==list: 8 | return ",".join(val) 9 | else: 10 | return val 11 | 12 | if __name__ == '__main__': 13 | print(parse(sys.argv[1], sys.argv[2])) -------------------------------------------------------------------------------- /models/ops/build/temp.linux-x86_64-3.8/mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/yanfeng/project/MOTRv2/CO-MOT/models/ops/src/vision.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BingfengYan/CO-MOT/HEAD/models/ops/build/temp.linux-x86_64-3.8/mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/yanfeng/project/MOTRv2/CO-MOT/models/ops/src/vision.o -------------------------------------------------------------------------------- /models/ops/build/temp.linux-x86_64-cpython-37/mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/yanfeng/project/MOTRv2/MOTRv3/models/ops/src/vision.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BingfengYan/CO-MOT/HEAD/models/ops/build/temp.linux-x86_64-cpython-37/mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/yanfeng/project/MOTRv2/MOTRv3/models/ops/src/vision.o -------------------------------------------------------------------------------- /tools/copy_back.sh: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Copyright (c) 2022 megvii-research. All Rights Reserved. 3 | # ------------------------------------------------------------------------ 4 | 5 | 6 | set -x 7 | 8 | cp $1/*.py . 
9 | cp $1/models/*.py models 10 | cp $1/datasets/*.py datasets 11 | -------------------------------------------------------------------------------- /models/ops/build/temp.linux-x86_64-3.8/mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/yanfeng/project/MOTRv2/CO-MOT/models/ops/src/cpu/ms_deform_attn_cpu.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BingfengYan/CO-MOT/HEAD/models/ops/build/temp.linux-x86_64-3.8/mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/yanfeng/project/MOTRv2/CO-MOT/models/ops/src/cpu/ms_deform_attn_cpu.o -------------------------------------------------------------------------------- /models/ops/build/temp.linux-x86_64-3.8/mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/yanfeng/project/MOTRv2/CO-MOT/models/ops/src/cuda/ms_deform_attn_cuda.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BingfengYan/CO-MOT/HEAD/models/ops/build/temp.linux-x86_64-3.8/mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/yanfeng/project/MOTRv2/CO-MOT/models/ops/src/cuda/ms_deform_attn_cuda.o -------------------------------------------------------------------------------- /models/ops/build/temp.linux-x86_64-cpython-37/mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/yanfeng/project/MOTRv2/MOTRv3/models/ops/src/cpu/ms_deform_attn_cpu.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BingfengYan/CO-MOT/HEAD/models/ops/build/temp.linux-x86_64-cpython-37/mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/yanfeng/project/MOTRv2/MOTRv3/models/ops/src/cpu/ms_deform_attn_cpu.o -------------------------------------------------------------------------------- /models/ops/build/temp.linux-x86_64-cpython-37/mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/yanfeng/project/MOTRv2/MOTRv3/models/ops/src/cuda/ms_deform_attn_cuda.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BingfengYan/CO-MOT/HEAD/models/ops/build/temp.linux-x86_64-cpython-37/mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/yanfeng/project/MOTRv2/MOTRv3/models/ops/src/cuda/ms_deform_attn_cuda.o -------------------------------------------------------------------------------- /models/ops/MultiScaleDeformableAttention.egg-info/PKG-INFO: -------------------------------------------------------------------------------- 1 | Metadata-Version: 2.1 2 | Name: MultiScaleDeformableAttention 3 | Version: 1.0 4 | Summary: PyTorch Wrapper for CUDA Functions of Multi-Scale Deformable Attention 5 | Home-page: https://github.com/fundamentalvision/Deformable-DETR 6 | Author: Weijie Su 7 | License: UNKNOWN 8 | Platform: UNKNOWN 9 | 10 | UNKNOWN 11 | 12 | -------------------------------------------------------------------------------- /tools/debug.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # ------------------------------------------------------------------------ 3 | # Copyright (c) 2022 megvii-research. All Rights Reserved. 
4 | # ------------------------------------------------------------------------ 5 | 6 | 7 | set -x 8 | 9 | args=$(cat $1) 10 | 11 | export CUDA_LAUNCH_BLOCKING=1 12 | python main.py ${args} --output_dir /tmp/clip_mot_v2 13 | -------------------------------------------------------------------------------- /tools/simple_inference.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # ------------------------------------------------------------------------ 3 | # Copyright (c) 2022 megvii-research. All Rights Reserved. 4 | # ------------------------------------------------------------------------ 5 | 6 | 7 | set -x 8 | set -o pipefail 9 | 10 | # args=$(cat configs/motrv2.args) 11 | args=$(cat $1) 12 | python3 submit_dance.py ${args} --exp_name tracker --resume $2 $3 13 | -------------------------------------------------------------------------------- /tools/simplebdd_inference.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # ------------------------------------------------------------------------ 3 | # Copyright (c) 2022 megvii-research. All Rights Reserved. 4 | # ------------------------------------------------------------------------ 5 | 6 | 7 | set -x 8 | set -o pipefail 9 | 10 | # args=$(cat configs/motrv2.args) 11 | args=$(cat $1) 12 | python3 submit_bdd.py ${args} --exp_name tracker --resume $2 $3 13 | -------------------------------------------------------------------------------- /tools/simplemot_inference.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # ------------------------------------------------------------------------ 3 | # Copyright (c) 2022 megvii-research. All Rights Reserved. 4 | # ------------------------------------------------------------------------ 5 | 6 | 7 | set -x 8 | set -o pipefail 9 | 10 | # args=$(cat configs/motrv2.args) 11 | args=$(cat $1) 12 | python3 submit_mot.py ${args} --exp_name tracker --resume $2 $3 13 | -------------------------------------------------------------------------------- /models/structures/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Modified from Detectron2 (https://github.com/facebookresearch/detectron2) 3 | # Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved 4 | # ------------------------------------------------------------------------ 5 | from .boxes import Boxes, BoxMode, pairwise_iou, pairwise_ioa, matched_boxlist_iou 6 | from .instances import Instances 7 | 8 | __all__ = [k for k in globals().keys() if not k.startswith("_")] -------------------------------------------------------------------------------- /configs/motrv2ch_uni5cost3ggoon.args: -------------------------------------------------------------------------------- 1 | --meta_arch motr_unincost 2 | --dataset_file e2e_dance 3 | --epoch 20 4 | --with_box_refine 5 | --lr_drop 8 6 | --lr 2e-4 7 | --lr_backbone 2e-5 8 | --pretrained xx/checkpoint0019.pth 9 | --batch_size 1 10 | --sample_mode random_interval 11 | --sample_interval 10 12 | --sampler_lengths 5 13 | --merger_dropout 0 14 | --dropout 0 15 | --random_drop 0.1 16 | --fp_ratio 0.3 17 | --query_interaction_layer GQIM 18 | --num_queries 60 19 | --append_crowd 20 | --use_checkpoint 21 | --mot_path xxx/data/ 22 | --match_type gmatch 23 | --g_size 3 24 | -------------------------------------------------------------------------------- /models/dino/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Conditional DETR 3 | # Copyright (c) 2021 Microsoft. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------ 6 | # Copied from DETR (https://github.com/facebookresearch/detr) 7 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 8 | # ------------------------------------------------------------------------ 9 | 10 | from .dino import build_dino 11 | -------------------------------------------------------------------------------- /models/ops/make.sh: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | python setup.py build install 10 | -------------------------------------------------------------------------------- /models/dino/ops/modules/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | from .ms_deform_attn import MSDeformAttn 10 | -------------------------------------------------------------------------------- /models/dino/ops/functions/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | from .ms_deform_attn_func import MSDeformAttnFunction 10 | 11 | -------------------------------------------------------------------------------- /util/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Copyright (c) 2022 megvii-research. All Rights Reserved. 3 | # ------------------------------------------------------------------------ 4 | # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR) 5 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 6 | # ------------------------------------------------------------------------ 7 | # Modified from DETR (https://github.com/facebookresearch/detr) 8 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 9 | # ------------------------------------------------------------------------ 10 | 11 | -------------------------------------------------------------------------------- /tools/batch_diff.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Copyright (c) 2022 megvii-research. All Rights Reserved. 3 | # ------------------------------------------------------------------------ 4 | 5 | 6 | import argparse 7 | from glob import glob 8 | from subprocess import run 9 | 10 | 11 | parser = argparse.ArgumentParser() 12 | parser.add_argument('src') 13 | parser.add_argument('dst') 14 | args = parser.parse_args() 15 | 16 | 17 | for src in glob(args.src+'/*/*.py') + glob(args.src+'/*.py'): 18 | dst = src.replace(args.src, args.dst) 19 | if run(['diff', src, dst]).returncode != 0: 20 | print('code --diff', src, dst) 21 | -------------------------------------------------------------------------------- /models/dino/ops/make.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # ------------------------------------------------------------------------------------------------ 3 | # Deformable DETR 4 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 
5 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | # ------------------------------------------------------------------------------------------------ 7 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | # ------------------------------------------------------------------------------------------------ 9 | 10 | 11 | # TORCH_CUDA_ARCH_LIST="8.0" CUDA_HOME='/path/to/your/cuda/dir' 12 | python setup.py build install 13 | -------------------------------------------------------------------------------- /models/ops/modules/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Copyright (c) 2022 megvii-research. All Rights Reserved. 3 | # ------------------------------------------------------------------------ 4 | # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR) 5 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 6 | # ------------------------------------------------------------------------ 7 | # Modified from DETR (https://github.com/facebookresearch/detr) 8 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 9 | # ------------------------------------------------------------------------ 10 | 11 | 12 | from .ms_deform_attn import MSDeformAttn -------------------------------------------------------------------------------- /models/ops/build/lib.linux-x86_64-3.8/modules/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Copyright (c) 2022 megvii-research. All Rights Reserved. 3 | # ------------------------------------------------------------------------ 4 | # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR) 5 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 6 | # ------------------------------------------------------------------------ 7 | # Modified from DETR (https://github.com/facebookresearch/detr) 8 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 9 | # ------------------------------------------------------------------------ 10 | 11 | 12 | from .ms_deform_attn import MSDeformAttn -------------------------------------------------------------------------------- /tools/merge_dance_tracklets.sh: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Copyright (c) 2022 megvii-research. All Rights Reserved. 
3 | # ------------------------------------------------------------------------ 4 | 5 | python tools/merge_dance_tracklets.py $1 $2 6 | 7 | # python3 ../TrackEval/scripts/run_mot_challenge.py \ 8 | # --SPLIT_TO_EVAL val \ 9 | # --METRICS HOTA \ 10 | # --GT_FOLDER /data/datasets/dancetrack/val \ 11 | # --SEQMAP_FILE seqmap \ 12 | # --SKIP_SPLIT_FOL True \ 13 | # --TRACKER_SUB_FOLDER tracker \ 14 | # --TRACKERS_TO_EVAL $2 \ 15 | # --USE_PARALLEL True \ 16 | # --NUM_PARALLEL_CORES 8 \ 17 | # --PLOT_CURVES False \ 18 | # --TRACKERS_FOLDER '' | tee -a $2/eval.log 19 | -------------------------------------------------------------------------------- /models/ops/build/lib.linux-x86_64-cpython-37/modules/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Copyright (c) 2022 megvii-research. All Rights Reserved. 3 | # ------------------------------------------------------------------------ 4 | # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR) 5 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 6 | # ------------------------------------------------------------------------ 7 | # Modified from DETR (https://github.com/facebookresearch/detr) 8 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 9 | # ------------------------------------------------------------------------ 10 | 11 | 12 | from .ms_deform_attn import MSDeformAttn -------------------------------------------------------------------------------- /models/ops/functions/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Copyright (c) 2022 megvii-research. All Rights Reserved. 3 | # ------------------------------------------------------------------------ 4 | # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR) 5 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 6 | # ------------------------------------------------------------------------ 7 | # Modified from DETR (https://github.com/facebookresearch/detr) 8 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 9 | # ------------------------------------------------------------------------ 10 | 11 | 12 | from .ms_deform_attn_func import MSDeformAttnFunction, ms_deform_attn_core_pytorch 13 | 14 | -------------------------------------------------------------------------------- /models/ops/build/lib.linux-x86_64-3.8/functions/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Copyright (c) 2022 megvii-research. All Rights Reserved. 3 | # ------------------------------------------------------------------------ 4 | # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR) 5 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 6 | # ------------------------------------------------------------------------ 7 | # Modified from DETR (https://github.com/facebookresearch/detr) 8 | # Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved 9 | # ------------------------------------------------------------------------ 10 | 11 | 12 | from .ms_deform_attn_func import MSDeformAttnFunction, ms_deform_attn_core_pytorch 13 | 14 | -------------------------------------------------------------------------------- /models/ops/build/lib.linux-x86_64-cpython-37/functions/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Copyright (c) 2022 megvii-research. All Rights Reserved. 3 | # ------------------------------------------------------------------------ 4 | # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR) 5 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 6 | # ------------------------------------------------------------------------ 7 | # Modified from DETR (https://github.com/facebookresearch/detr) 8 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 9 | # ------------------------------------------------------------------------ 10 | 11 | 12 | from .ms_deform_attn_func import MSDeformAttnFunction, ms_deform_attn_core_pytorch 13 | 14 | -------------------------------------------------------------------------------- /models/ops/src/vision.cpp: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | #include "ms_deform_attn.h" 12 | 13 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 14 | m.def("ms_deform_attn_forward", &ms_deform_attn_forward, "ms_deform_attn_forward"); 15 | m.def("ms_deform_attn_backward", &ms_deform_attn_backward, "ms_deform_attn_backward"); 16 | } 17 | -------------------------------------------------------------------------------- /models/dino/ops/src/vision.cpp: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | #include "ms_deform_attn.h" 12 | 13 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 14 | m.def("ms_deform_attn_forward", &ms_deform_attn_forward, "ms_deform_attn_forward"); 15 | m.def("ms_deform_attn_backward", &ms_deform_attn_backward, "ms_deform_attn_backward"); 16 | } 17 | -------------------------------------------------------------------------------- /models/ops/build/temp.linux-x86_64-cpython-37/.ninja_log: -------------------------------------------------------------------------------- 1 | # ninja log v5 2 | 8 50326 1682736456000000000 /mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/yanfeng/project/MOTRv2/MOTRv3/models/ops/build/temp.linux-x86_64-cpython-37/mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/yanfeng/project/MOTRv2/MOTRv3/models/ops/src/cpu/ms_deform_attn_cpu.o 1a7a04fa8aa332bc 3 | 14 91444 1682736491000000000 /mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/yanfeng/project/MOTRv2/MOTRv3/models/ops/build/temp.linux-x86_64-cpython-37/mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/yanfeng/project/MOTRv2/MOTRv3/models/ops/src/vision.o 781d7dd8aea58757 4 | 3 109768 1682736515000000000 /mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/yanfeng/project/MOTRv2/MOTRv3/models/ops/build/temp.linux-x86_64-cpython-37/mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/yanfeng/project/MOTRv2/MOTRv3/models/ops/src/cuda/ms_deform_attn_cuda.o 67f3872547af6227 5 | -------------------------------------------------------------------------------- /tools/resume.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # ------------------------------------------------------------------------ 3 | # Copyright (c) 2022 megvii-research. All Rights Reserved. 4 | # ------------------------------------------------------------------------ 5 | 6 | 7 | set -x 8 | 9 | set -o pipefail 10 | 11 | OUTPUT_DIR=$1 12 | 13 | # clean up *.pyc files 14 | rmpyc() { 15 | rm -rf $(find -name __pycache__) 16 | rm -rf $(find -name "*.pyc") 17 | } 18 | 19 | # tar src to avoid future editing 20 | cleanup() { 21 | echo "Packing source code" 22 | rmpyc 23 | # tar -zcf models datasets util main.py engine.py eval.py submit.py --remove-files 24 | echo " ...Done" 25 | } 26 | 27 | 28 | pushd $OUTPUT_DIR 29 | trap cleanup EXIT 30 | 31 | args=$(cat *.args) 32 | python -m torch.distributed.launch --nproc_per_node=8 --use_env main.py ${args} --resume checkpoint.pth --output_dir . |& tee -a resume.log 33 | popd 34 | -------------------------------------------------------------------------------- /models/ops/src/cuda/ms_deform_attn_cuda.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | #pragma once 12 | #include 13 | 14 | at::Tensor ms_deform_attn_cuda_forward( 15 | const at::Tensor &value, 16 | const at::Tensor &spatial_shapes, 17 | const at::Tensor &level_start_index, 18 | const at::Tensor &sampling_loc, 19 | const at::Tensor &attn_weight, 20 | const int im2col_step); 21 | 22 | std::vector ms_deform_attn_cuda_backward( 23 | const at::Tensor &value, 24 | const at::Tensor &spatial_shapes, 25 | const at::Tensor &level_start_index, 26 | const at::Tensor &sampling_loc, 27 | const at::Tensor &attn_weight, 28 | const at::Tensor &grad_output, 29 | const int im2col_step); 30 | 31 | -------------------------------------------------------------------------------- /models/dino/ops/src/cuda/ms_deform_attn_cuda.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | #pragma once 12 | #include 13 | 14 | at::Tensor ms_deform_attn_cuda_forward( 15 | const at::Tensor &value, 16 | const at::Tensor &spatial_shapes, 17 | const at::Tensor &level_start_index, 18 | const at::Tensor &sampling_loc, 19 | const at::Tensor &attn_weight, 20 | const int im2col_step); 21 | 22 | std::vector ms_deform_attn_cuda_backward( 23 | const at::Tensor &value, 24 | const at::Tensor &spatial_shapes, 25 | const at::Tensor &level_start_index, 26 | const at::Tensor &sampling_loc, 27 | const at::Tensor &attn_weight, 28 | const at::Tensor &grad_output, 29 | const int im2col_step); 30 | 31 | -------------------------------------------------------------------------------- /tools/run_dist_launch.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # ------------------------------------------------------------------------ 3 | # Copyright (c) 2022 megvii-research. All Rights Reserved. 4 | # ------------------------------------------------------------------------ 5 | # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR) 6 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 7 | # ------------------------------------------------------------------------ 8 | # Modified from DETR (https://github.com/facebookresearch/detr) 9 | # Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved 10 | # ------------------------------------------------------------------------ 11 | 12 | 13 | set -x 14 | 15 | GPUS=$1 16 | RUN_COMMAND=${@:2} 17 | if [ $GPUS -lt 8 ]; then 18 | GPUS_PER_NODE=${GPUS_PER_NODE:-$GPUS} 19 | else 20 | GPUS_PER_NODE=${GPUS_PER_NODE:-8} 21 | fi 22 | MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"} 23 | MASTER_PORT=${MASTER_PORT:-"29500"} 24 | NODE_RANK=${NODE_RANK:-0} 25 | 26 | let "NNODES=GPUS/GPUS_PER_NODE" 27 | 28 | python3 ./tools/launch.py \ 29 | --nnodes ${NNODES} \ 30 | --node_rank ${NODE_RANK} \ 31 | --master_addr ${MASTER_ADDR} \ 32 | --master_port ${MASTER_PORT} \ 33 | --nproc_per_node ${GPUS_PER_NODE} \ 34 | ${RUN_COMMAND} -------------------------------------------------------------------------------- /models/ops/src/cpu/ms_deform_attn_cpu.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | #pragma once 12 | #include 13 | 14 | at::Tensor 15 | ms_deform_attn_cpu_forward( 16 | const at::Tensor &value, 17 | const at::Tensor &spatial_shapes, 18 | const at::Tensor &level_start_index, 19 | const at::Tensor &sampling_loc, 20 | const at::Tensor &attn_weight, 21 | const int im2col_step); 22 | 23 | std::vector 24 | ms_deform_attn_cpu_backward( 25 | const at::Tensor &value, 26 | const at::Tensor &spatial_shapes, 27 | const at::Tensor &level_start_index, 28 | const at::Tensor &sampling_loc, 29 | const at::Tensor &attn_weight, 30 | const at::Tensor &grad_output, 31 | const int im2col_step); 32 | 33 | 34 | -------------------------------------------------------------------------------- /models/dino/ops/src/cpu/ms_deform_attn_cpu.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | #pragma once 12 | #include 13 | 14 | at::Tensor 15 | ms_deform_attn_cpu_forward( 16 | const at::Tensor &value, 17 | const at::Tensor &spatial_shapes, 18 | const at::Tensor &level_start_index, 19 | const at::Tensor &sampling_loc, 20 | const at::Tensor &attn_weight, 21 | const int im2col_step); 22 | 23 | std::vector 24 | ms_deform_attn_cpu_backward( 25 | const at::Tensor &value, 26 | const at::Tensor &spatial_shapes, 27 | const at::Tensor &level_start_index, 28 | const at::Tensor &sampling_loc, 29 | const at::Tensor &attn_weight, 30 | const at::Tensor &grad_output, 31 | const int im2col_step); 32 | 33 | 34 | -------------------------------------------------------------------------------- /tools/eval_dance.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # ------------------------------------------------------------------------ 3 | # Copyright (c) 2022 megvii-research. All Rights Reserved. 4 | # ------------------------------------------------------------------------ 5 | 6 | 7 | set -x 8 | 9 | set -o pipefail 10 | 11 | OUTPUT_DIR=$1 12 | 13 | # clean up *.pyc files 14 | rmpyc() { 15 | rm -rf $(find -name __pycache__) 16 | rm -rf $(find -name "*.pyc") 17 | } 18 | 19 | 20 | cp submit_dance.py $OUTPUT_DIR 21 | 22 | pushd $OUTPUT_DIR 23 | 24 | args=$(cat *.args) 25 | # rlaunch --cpu 8 --gpu 1 --memory 24000 --positive-tags 2080ti -P 13 -- python3 submit_dance.py ${args} --resume checkpoint.pth --exp_name tracker 26 | python3 submit_dance.py ${args} --resume checkpoint.pth --exp_name tracker 27 | 28 | popd 29 | 30 | # python3 ../TrackEval/scripts/run_mot_challenge.py \ 31 | # --SPLIT_TO_EVAL val \ 32 | # --METRICS HOTA CLEAR Identity \ 33 | # --GT_FOLDER /data/datasets/dancetrack/val \ 34 | # --SEQMAP_FILE seqmap \ 35 | # --SKIP_SPLIT_FOL True \ 36 | # --TRACKER_SUB_FOLDER tracker \ 37 | # --TRACKERS_TO_EVAL $OUTPUT_DIR \ 38 | # --USE_PARALLEL True \ 39 | # --NUM_PARALLEL_CORES 8 \ 40 | # --PLOT_CURVES False \ 41 | # --TRACKERS_FOLDER '' | tee -a $OUTPUT_DIR/eval.log 42 | -------------------------------------------------------------------------------- /tools/run_dist_slurm.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # ------------------------------------------------------------------------ 3 | # Copyright (c) 2022 megvii-research. All Rights Reserved. 4 | # ------------------------------------------------------------------------ 5 | # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR) 6 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 7 | # ------------------------------------------------------------------------ 8 | # Modified from DETR (https://github.com/facebookresearch/detr) 9 | # Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved 10 | # ------------------------------------------------------------------------ 11 | 12 | 13 | set -x 14 | 15 | PARTITION=$1 16 | JOB_NAME=$2 17 | GPUS=$3 18 | RUN_COMMAND=${@:4} 19 | if [ $GPUS -lt 8 ]; then 20 | GPUS_PER_NODE=${GPUS_PER_NODE:-$GPUS} 21 | else 22 | GPUS_PER_NODE=${GPUS_PER_NODE:-8} 23 | fi 24 | CPUS_PER_TASK=${CPUS_PER_TASK:-4} 25 | SRUN_ARGS=${SRUN_ARGS:-""} 26 | 27 | srun -p ${PARTITION} \ 28 | --job-name=${JOB_NAME} \ 29 | --gres=gpu:${GPUS_PER_NODE} \ 30 | --ntasks=${GPUS} \ 31 | --ntasks-per-node=${GPUS_PER_NODE} \ 32 | --cpus-per-task=${CPUS_PER_TASK} \ 33 | --kill-on-bad-exit=1 \ 34 | ${SRUN_ARGS} \ 35 | ${RUN_COMMAND} 36 | 37 | -------------------------------------------------------------------------------- /tools/make_detdb.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Copyright (c) 2022 megvii-research. All Rights Reserved. 3 | # ------------------------------------------------------------------------ 4 | 5 | 6 | from glob import glob 7 | import json 8 | from concurrent.futures import ThreadPoolExecutor 9 | from threading import Lock 10 | 11 | from tqdm import tqdm 12 | 13 | det_db = {} 14 | to_cache = [] 15 | 16 | for file in glob("/data/Dataset/mot/crowdhuman/train_image/*.txt"): 17 | to_cache.append(file) 18 | 19 | for file in glob("/data/Dataset/mot/dancetrack/*/*/img1/*.txt"): 20 | to_cache.append(file) 21 | 22 | for file in glob("/data/Dataset/mot/MOT17/images/*/*/img1/*.txt"): 23 | to_cache.append(file) 24 | 25 | for file in glob("/data/Dataset/mot/MOT20/train/*/img1/*.txt"): 26 | to_cache.append(file) 27 | 28 | for file in glob("/data/Dataset/mot/HIE20/train/*/img1/*.txt"): 29 | to_cache.append(file) 30 | 31 | pbar = tqdm(total=len(to_cache)) 32 | 33 | mutex = Lock() 34 | def cache(file): 35 | with open(file) as f: 36 | tmp = [l for l in f] 37 | with mutex: 38 | det_db[file] = tmp 39 | pbar.update() 40 | 41 | with ThreadPoolExecutor(max_workers=48) as exe: 42 | for file in to_cache: 43 | exe.submit(cache, file) 44 | 45 | with open("/data/Dataset/mot/det_db_oc_sort_full.json", 'w') as f: 46 | json.dump(det_db, f) 47 | 48 | -------------------------------------------------------------------------------- /models/ops/MultiScaleDeformableAttention.egg-info/SOURCES.txt: -------------------------------------------------------------------------------- 1 | setup.py 2 | /mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/yanfeng/project/MOTRv2/CO-MOT/models/ops/src/vision.cpp 3 | /mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/yanfeng/project/MOTRv2/CO-MOT/models/ops/src/cpu/ms_deform_attn_cpu.cpp 4 | /mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/yanfeng/project/MOTRv2/CO-MOT/models/ops/src/cuda/ms_deform_attn_cuda.cu 5 | /mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/yanfeng/project/MOTRv2/MOTRv2/models/ops/src/vision.cpp 6 | /mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/yanfeng/project/MOTRv2/MOTRv2/models/ops/src/cpu/ms_deform_attn_cpu.cpp 7 | /mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/yanfeng/project/MOTRv2/MOTRv2/models/ops/src/cuda/ms_deform_attn_cuda.cu 8 | /mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/yanfeng/project/MOTRv2/MOTRv3/models/ops/src/vision.cpp 9 | /mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/yanfeng/project/MOTRv2/MOTRv3/models/ops/src/cpu/ms_deform_attn_cpu.cpp 10 | /mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/yanfeng/project/MOTRv2/MOTRv3/models/ops/src/cuda/ms_deform_attn_cuda.cu 11 | 
MultiScaleDeformableAttention.egg-info/PKG-INFO 12 | MultiScaleDeformableAttention.egg-info/SOURCES.txt 13 | MultiScaleDeformableAttention.egg-info/dependency_links.txt 14 | MultiScaleDeformableAttention.egg-info/top_level.txt 15 | functions/__init__.py 16 | functions/ms_deform_attn_func.py 17 | modules/__init__.py 18 | modules/ms_deform_attn.py -------------------------------------------------------------------------------- /models/ops/src/cpu/ms_deform_attn_cpu.cpp: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | #include 12 | 13 | #include 14 | #include 15 | 16 | 17 | at::Tensor 18 | ms_deform_attn_cpu_forward( 19 | const at::Tensor &value, 20 | const at::Tensor &spatial_shapes, 21 | const at::Tensor &level_start_index, 22 | const at::Tensor &sampling_loc, 23 | const at::Tensor &attn_weight, 24 | const int im2col_step) 25 | { 26 | AT_ERROR("Not implement on cpu"); 27 | } 28 | 29 | std::vector 30 | ms_deform_attn_cpu_backward( 31 | const at::Tensor &value, 32 | const at::Tensor &spatial_shapes, 33 | const at::Tensor &level_start_index, 34 | const at::Tensor &sampling_loc, 35 | const at::Tensor &attn_weight, 36 | const at::Tensor &grad_output, 37 | const int im2col_step) 38 | { 39 | AT_ERROR("Not implement on cpu"); 40 | } 41 | 42 | -------------------------------------------------------------------------------- /models/dino/ops/src/cpu/ms_deform_attn_cpu.cpp: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | #include 12 | 13 | #include 14 | #include 15 | 16 | 17 | at::Tensor 18 | ms_deform_attn_cpu_forward( 19 | const at::Tensor &value, 20 | const at::Tensor &spatial_shapes, 21 | const at::Tensor &level_start_index, 22 | const at::Tensor &sampling_loc, 23 | const at::Tensor &attn_weight, 24 | const int im2col_step) 25 | { 26 | AT_ERROR("Not implement on cpu"); 27 | } 28 | 29 | std::vector 30 | ms_deform_attn_cpu_backward( 31 | const at::Tensor &value, 32 | const at::Tensor &spatial_shapes, 33 | const at::Tensor &level_start_index, 34 | const at::Tensor &sampling_loc, 35 | const at::Tensor &attn_weight, 36 | const at::Tensor &grad_output, 37 | const int im2col_step) 38 | { 39 | AT_ERROR("Not implement on cpu"); 40 | } 41 | 42 | -------------------------------------------------------------------------------- /tools/coco_evel.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import numpy as np 4 | from collections import defaultdict 5 | from pycocotools.coco import COCO 6 | from pycocotools.cocoeval import COCOeval 7 | 8 | 9 | parser = argparse.ArgumentParser('Deformable DETR Detector', add_help=False) 10 | parser.add_argument('--det_root', default='tracker', type=str) 11 | args = parser.parse_args() 12 | 13 | cocoGt = COCO(annotation_file='data/dancetrack/annotations/val.json') 14 | 15 | 16 | det_root = args.det_root 17 | tracklets = defaultdict() 18 | 19 | detRes = [] 20 | for img_id in cocoGt.getImgIds(): 21 | img = cocoGt.loadImgs(img_id) 22 | 23 | vid_name = img[0]['file_name'][:14] 24 | frame_id = img[0]['frame_id'] 25 | 26 | if vid_name not in tracklets: 27 | tracklets[vid_name] = defaultdict(list) 28 | for line in open(os.path.join(det_root, vid_name+'.txt')): 29 | t, id, *xywhs = line.split(',')[:7] 30 | t, id = map(int, (t, id)) 31 | tracklets[vid_name][t].append((id, *map(float, xywhs))) 32 | 33 | labels = tracklets[vid_name][frame_id] 34 | 35 | for l in labels: 36 | ann = defaultdict() 37 | ann['image_id'] = img[0]['id'] 38 | ann['bbox'] = list(l[1:5]) 39 | ann['category_id'] = 1 40 | ann['score'] = l[5] 41 | detRes.append(ann) 42 | 43 | cocoDt = cocoGt.loadRes(detRes) # path and filename of your own generated results, in json format 44 | cocoEval = COCOeval(cocoGt, cocoDt, "bbox") 45 | cocoEval.evaluate() 46 | cocoEval.accumulate() 47 | cocoEval.summarize() -------------------------------------------------------------------------------- /tools/train.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # ------------------------------------------------------------------------ 3 | # Copyright (c) 2022 megvii-research. All Rights Reserved. 
4 | # ------------------------------------------------------------------------ 5 | 6 | # print every command as it is executed 7 | set -x 8 | 9 | PY_ARGS=${@:2} 10 | 11 | # abort with an error if any command in a pipeline fails 12 | set -o pipefail 13 | # sed -e: apply the sed edit expression directly on the command line 14 | OUTPUT_BASE=$(echo $1 | sed -e "s/configs/exps/g" | sed -e "s/.args$//g") 15 | mkdir -p $OUTPUT_BASE 16 | 17 | for RUN in $(seq 100); do 18 | ls $OUTPUT_BASE | grep run$RUN && continue 19 | OUTPUT_DIR=$OUTPUT_BASE/run$RUN 20 | mkdir $OUTPUT_DIR && break 21 | done 22 | 23 | # clean up *.pyc files 24 | rmpyc() { 25 | rm -rf $(find -name __pycache__) 26 | rm -rf $(find -name "*.pyc") 27 | } 28 | 29 | # run backup 30 | echo "Backing up to log dir: $OUTPUT_DIR" 31 | rmpyc && cp -r models datasets util main.py engine.py eval_detr.py seqmap submit_dance.py $1 $OUTPUT_DIR 32 | echo " ...Done" 33 | 34 | # tar src to avoid future editing 35 | cleanup() { 36 | echo "Packing source code" 37 | rmpyc 38 | # tar -zcf models datasets util main.py engine.py eval.py submit.py --remove-files 39 | echo " ...Done" 40 | } 41 | 42 | args=$(cat $1) 43 | 44 | pushd $OUTPUT_DIR 45 | trap cleanup EXIT 46 | 47 | # log git status 48 | echo "Logging git status" 49 | git status > git_status 50 | git rev-parse HEAD > git_tag 51 | git diff > git_diff 52 | echo $PY_ARGS > desc 53 | echo " ...Done" 54 | 55 | python -m torch.distributed.launch --nproc_per_node=4 --master_port 29504 --use_env main.py ${args} --output_dir $OUTPUT_DIR |& tee -a output.log 56 | -------------------------------------------------------------------------------- /tools/clip_train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import cv2 3 | import numpy as np 4 | from collections import defaultdict 5 | 6 | 7 | root_data = 'data/MOT/MOT17_all/train' 8 | vids = os.listdir(root_data) 9 | 10 | for v in vids: 11 | if 'SDP' in v: 12 | labels_full = defaultdict(list) 13 | gt_path = os.path.join(root_data, v, 'gt', 'gt.txt') 14 | for l in open(gt_path): 15 | t, i, *xywh = l.strip().split(',') 16 | labels_full[int(t)].append([i, *xywh]) 17 | imgs_root = os.path.join(root_data, v, 'img1') 18 | imgs_path = sorted(os.listdir(imgs_root)) 19 | 20 | for ith, img_p in enumerate(imgs_path): 21 | if ith < (len(imgs_path)+1)//2: 22 | save_img = os.path.join(imgs_root, img_p).replace('MOT17_all', 'MOT17') 23 | save_label = os.path.join(root_data, v, 'gt', 'gt.txt').replace('MOT17_all', 'MOT17') 24 | print('train: %s' % save_img) 25 | else: 26 | save_img = os.path.join(imgs_root, img_p).replace('MOT17_all', 'MOT17').replace('train', 'val') 27 | save_label = os.path.join(root_data, v, 'gt', 'gt.txt').replace('MOT17_all', 'MOT17').replace('train', 'val') 28 | print('val: %s' % save_img) 29 | os.makedirs(os.path.dirname(save_label), exist_ok=True) 30 | with open(save_label, 'a+') as f: 31 | if ith+1 in labels_full: 32 | for l in labels_full[ith+1]: 33 | f.write('%d,%s,%s,%s,%s,%s,%s,%s,%s\n'%(ith+1, *l)) 34 | img = cv2.imread(os.path.join(imgs_root, img_p)) 35 | os.makedirs(os.path.dirname(save_img), exist_ok=True) 36 | cv2.imwrite(save_img, img) -------------------------------------------------------------------------------- /datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Copyright (c) 2022 megvii-research. All Rights Reserved. 
3 | # ------------------------------------------------------------------------ 4 | # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR) 5 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 6 | # ------------------------------------------------------------------------ 7 | # Modified from DETR (https://github.com/facebookresearch/detr) 8 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 9 | # ------------------------------------------------------------------------ 10 | 11 | from .dance import build as build_e2e_dance 12 | from .dance_test import build as build_e2e_dance_test 13 | from .tao import build as build_e2e_tao 14 | from .joint import build as build_e2e_joint 15 | from .mot import build as build_e2e_mot 16 | from .all import build as build_e2e_all 17 | from .bdd100k import build as build_e2e_bdd 18 | from .bdd100kcoco import build as build_e2e_bddcc 19 | 20 | 21 | def build_dataset(image_set, args): 22 | if args.dataset_file == 'e2e_joint': 23 | return build_e2e_joint(image_set, args) 24 | elif args.dataset_file == 'e2e_dance': 25 | return build_e2e_dance(image_set, args) 26 | elif args.dataset_file == 'e2e_dance_test': 27 | return build_e2e_dance_test(image_set, args) 28 | elif args.dataset_file == 'e2e_all': 29 | return build_e2e_all(image_set, args) 30 | elif args.dataset_file == 'e2e_bdd': 31 | return build_e2e_bdd(image_set, args) 32 | elif args.dataset_file == 'e2e_tao': 33 | return build_e2e_tao(image_set, args) 34 | elif args.dataset_file == 'e2e_bddcc': 35 | return build_e2e_bddcc(image_set, args) 36 | elif args.dataset_file == 'e2e_mot': 37 | return build_e2e_mot(image_set, args) 38 | raise ValueError(f'dataset {args.dataset_file} not supported') 39 | -------------------------------------------------------------------------------- /util/checkpoint.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Copyright (c) 2022 megvii-research. All Rights Reserved. 
3 | # ------------------------------------------------------------------------ 4 | # Modified from pytorch-checkpoint (https://github.com/csrhddlam/pytorch-checkpoint) 5 | # ------------------------------------------------------------------------ 6 | 7 | import torch 8 | 9 | 10 | def check_require_grad(t): 11 | return isinstance(t, torch.Tensor) and t.requires_grad 12 | 13 | 14 | class CheckpointFunction(torch.autograd.Function): 15 | @staticmethod 16 | def forward(ctx, run_function, length, *args): 17 | ctx.run_function = run_function 18 | ctx.input_tensors = list(args[:length]) 19 | ctx.input_params = list(args[length:]) 20 | with torch.no_grad(): 21 | output_tensors = ctx.run_function(*ctx.input_tensors) 22 | return output_tensors 23 | 24 | @staticmethod 25 | def backward(ctx, *output_grads): 26 | for i in range(len(ctx.input_tensors)): 27 | temp = ctx.input_tensors[i] 28 | if check_require_grad(temp): 29 | ctx.input_tensors[i] = temp.detach() 30 | ctx.input_tensors[i].requires_grad = temp.requires_grad 31 | with torch.enable_grad(): 32 | output_tensors = ctx.run_function(*ctx.input_tensors) 33 | to_autograd = list(filter(check_require_grad, ctx.input_tensors)) 34 | output_tensors, output_grads = zip(*filter(lambda t: t[0].requires_grad, zip(output_tensors, output_grads))) 35 | input_grads = torch.autograd.grad(output_tensors, to_autograd + ctx.input_params, output_grads, allow_unused=True) 36 | input_grads = list(input_grads) 37 | for i in range(len(ctx.input_tensors)): 38 | if not check_require_grad(ctx.input_tensors[i]): 39 | input_grads.insert(i, None) 40 | return (None, None) + tuple(input_grads) 41 | -------------------------------------------------------------------------------- /models/registry.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author: Yihao Chen 3 | # @Date: 2021-08-16 16:03:17 4 | # @Last Modified by: Shilong Liu 5 | # @Last Modified time: 2022-01-23 15:26 6 | # modified from mmcv 7 | 8 | import inspect 9 | from functools import partial 10 | 11 | 12 | class Registry(object): 13 | 14 | def __init__(self, name): 15 | self._name = name 16 | self._module_dict = dict() 17 | 18 | def __repr__(self): 19 | format_str = self.__class__.__name__ + '(name={}, items={})'.format( 20 | self._name, list(self._module_dict.keys())) 21 | return format_str 22 | 23 | def __len__(self): 24 | return len(self._module_dict) 25 | 26 | @property 27 | def name(self): 28 | return self._name 29 | 30 | @property 31 | def module_dict(self): 32 | return self._module_dict 33 | 34 | def get(self, key): 35 | return self._module_dict.get(key, None) 36 | 37 | def registe_with_name(self, module_name=None, force=False): 38 | return partial(self.register, module_name=module_name, force=force) 39 | 40 | def register(self, module_build_function, module_name=None, force=False): 41 | """Register a module build function. 42 | Args: 43 | module (:obj:`nn.Module`): Module to be registered. 
44 | """
45 | if not inspect.isfunction(module_build_function):
46 | raise TypeError('module_build_function must be a function, but got {}'.format(
47 | type(module_build_function)))
48 | if module_name is None:
49 | module_name = module_build_function.__name__
50 | if not force and module_name in self._module_dict:
51 | raise KeyError('{} is already registered in {}'.format(
52 | module_name, self.name))
53 | self._module_dict[module_name] = module_build_function
54 |
55 | return module_build_function
56 |
57 | MODULE_BUILD_FUNCS = Registry('model build functions')
58 |
59 |
--------------------------------------------------------------------------------
/models/ops/src/ms_deform_attn.h:
--------------------------------------------------------------------------------
1 | /*!
2 | **************************************************************************************************
3 | * Deformable DETR
4 | * Copyright (c) 2020 SenseTime. All Rights Reserved.
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details]
6 | **************************************************************************************************
7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
8 | **************************************************************************************************
9 | */
10 |
11 | #pragma once
12 |
13 | #include "cpu/ms_deform_attn_cpu.h"
14 |
15 | #ifdef WITH_CUDA
16 | #include "cuda/ms_deform_attn_cuda.h"
17 | #endif
18 |
19 |
20 | at::Tensor
21 | ms_deform_attn_forward(
22 | const at::Tensor &value,
23 | const at::Tensor &spatial_shapes,
24 | const at::Tensor &level_start_index,
25 | const at::Tensor &sampling_loc,
26 | const at::Tensor &attn_weight,
27 | const int im2col_step)
28 | {
29 | if (value.type().is_cuda())
30 | {
31 | #ifdef WITH_CUDA
32 | return ms_deform_attn_cuda_forward(
33 | value, spatial_shapes, level_start_index, sampling_loc, attn_weight, im2col_step);
34 | #else
35 | AT_ERROR("Not compiled with GPU support");
36 | #endif
37 | }
38 | AT_ERROR("Not implemented on the CPU");
39 | }
40 |
41 | std::vector<at::Tensor>
42 | ms_deform_attn_backward(
43 | const at::Tensor &value,
44 | const at::Tensor &spatial_shapes,
45 | const at::Tensor &level_start_index,
46 | const at::Tensor &sampling_loc,
47 | const at::Tensor &attn_weight,
48 | const at::Tensor &grad_output,
49 | const int im2col_step)
50 | {
51 | if (value.type().is_cuda())
52 | {
53 | #ifdef WITH_CUDA
54 | return ms_deform_attn_cuda_backward(
55 | value, spatial_shapes, level_start_index, sampling_loc, attn_weight, grad_output, im2col_step);
56 | #else
57 | AT_ERROR("Not compiled with GPU support");
58 | #endif
59 | }
60 | AT_ERROR("Not implemented on the CPU");
61 | }
62 |
63 |
--------------------------------------------------------------------------------
/models/dino/ops/src/ms_deform_attn.h:
--------------------------------------------------------------------------------
1 | /*!
2 | **************************************************************************************************
3 | * Deformable DETR
4 | * Copyright (c) 2020 SenseTime. All Rights Reserved.
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details]
6 | **************************************************************************************************
7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
8 | **************************************************************************************************
9 | */
10 |
11 | #pragma once
12 |
13 | #include "cpu/ms_deform_attn_cpu.h"
14 |
15 | #ifdef WITH_CUDA
16 | #include "cuda/ms_deform_attn_cuda.h"
17 | #endif
18 |
19 |
20 | at::Tensor
21 | ms_deform_attn_forward(
22 | const at::Tensor &value,
23 | const at::Tensor &spatial_shapes,
24 | const at::Tensor &level_start_index,
25 | const at::Tensor &sampling_loc,
26 | const at::Tensor &attn_weight,
27 | const int im2col_step)
28 | {
29 | if (value.type().is_cuda())
30 | {
31 | #ifdef WITH_CUDA
32 | return ms_deform_attn_cuda_forward(
33 | value, spatial_shapes, level_start_index, sampling_loc, attn_weight, im2col_step);
34 | #else
35 | AT_ERROR("Not compiled with GPU support");
36 | #endif
37 | }
38 | AT_ERROR("Not implemented on the CPU");
39 | }
40 |
41 | std::vector<at::Tensor>
42 | ms_deform_attn_backward(
43 | const at::Tensor &value,
44 | const at::Tensor &spatial_shapes,
45 | const at::Tensor &level_start_index,
46 | const at::Tensor &sampling_loc,
47 | const at::Tensor &attn_weight,
48 | const at::Tensor &grad_output,
49 | const int im2col_step)
50 | {
51 | if (value.type().is_cuda())
52 | {
53 | #ifdef WITH_CUDA
54 | return ms_deform_attn_cuda_backward(
55 | value, spatial_shapes, level_start_index, sampling_loc, attn_weight, grad_output, im2col_step);
56 | #else
57 | AT_ERROR("Not compiled with GPU support");
58 | #endif
59 | }
60 | AT_ERROR("Not implemented on the CPU");
61 | }
62 |
63 |
--------------------------------------------------------------------------------
/datasets/panoptic_eval.py:
--------------------------------------------------------------------------------
1 | # ------------------------------------------------------------------------
2 | # Copyright (c) 2022 megvii-research. All Rights Reserved.
3 | # ------------------------------------------------------------------------
4 | # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR)
5 | # Copyright (c) 2020 SenseTime. All Rights Reserved.
6 | # ------------------------------------------------------------------------
7 | # Modified from DETR (https://github.com/facebookresearch/detr)
8 | # Copyright (c) Facebook, Inc. and its affiliates.
All Rights Reserved 9 | # ------------------------------------------------------------------------ 10 | 11 | 12 | import json 13 | import os 14 | 15 | import util.misc as utils 16 | 17 | try: 18 | from panopticapi.evaluation import pq_compute 19 | except ImportError: 20 | pass 21 | 22 | 23 | class PanopticEvaluator(object): 24 | def __init__(self, ann_file, ann_folder, output_dir="panoptic_eval"): 25 | self.gt_json = ann_file 26 | self.gt_folder = ann_folder 27 | if utils.is_main_process(): 28 | if not os.path.exists(output_dir): 29 | os.mkdir(output_dir) 30 | self.output_dir = output_dir 31 | self.predictions = [] 32 | 33 | def update(self, predictions): 34 | for p in predictions: 35 | with open(os.path.join(self.output_dir, p["file_name"]), "wb") as f: 36 | f.write(p.pop("png_string")) 37 | 38 | self.predictions += predictions 39 | 40 | def synchronize_between_processes(self): 41 | all_predictions = utils.all_gather(self.predictions) 42 | merged_predictions = [] 43 | for p in all_predictions: 44 | merged_predictions += p 45 | self.predictions = merged_predictions 46 | 47 | def summarize(self): 48 | if utils.is_main_process(): 49 | json_data = {"annotations": self.predictions} 50 | predictions_json = os.path.join(self.output_dir, "predictions.json") 51 | with open(predictions_json, "w") as f: 52 | f.write(json.dumps(json_data)) 53 | return pq_compute(self.gt_json, predictions_json, gt_folder=self.gt_folder, pred_folder=self.output_dir) 54 | return None 55 | -------------------------------------------------------------------------------- /tools/merge_dance_tracklets.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Copyright (c) 2022 megvii-research. All Rights Reserved. 
3 | # ------------------------------------------------------------------------ 4 | 5 | 6 | import argparse 7 | from collections import defaultdict 8 | import os 9 | from pathlib import Path 10 | 11 | parser = argparse.ArgumentParser() 12 | parser.add_argument('input_dir', type=Path) 13 | parser.add_argument('output_dir', type=Path) 14 | parser.add_argument('--t_min', default=20) 15 | parser.add_argument('--t_max', default=100) 16 | args = parser.parse_args() 17 | 18 | 19 | class FindUnionSet(dict): 20 | def find(self, src): 21 | if src in self: 22 | return self.find(self[src]) 23 | return src 24 | 25 | def merge(self, dst, src): 26 | self[self.find(src)] = self.find(dst) 27 | 28 | 29 | for seq in os.listdir(args.input_dir): 30 | print(args.input_dir / seq) 31 | with open(args.input_dir / seq) as f: 32 | lines = f.readlines() 33 | instance_timestamps = defaultdict(list) 34 | for line in lines: 35 | f_id, id = map(int, line.split(',')[:2]) 36 | instance_timestamps[id].append(f_id) 37 | instances = list(instance_timestamps.keys()) 38 | fid_map = FindUnionSet() 39 | for i in instances: 40 | for j in instances: 41 | if fid_map.find(i) == fid_map.find(j): 42 | continue 43 | end_t = max(instance_timestamps[i]) 44 | start_t = min(instance_timestamps[j]) 45 | if sum([0 <= start_t - max(pts) < args.t_max for pts in instance_timestamps.values()]) > 1: 46 | continue 47 | if sum([0 <= min(pts) - end_t < args.t_max for pts in instance_timestamps.values()]) > 1: 48 | continue 49 | dt = start_t - end_t 50 | if args.t_min < dt < args.t_max: 51 | print(f"{i}<-{j}", end_t, start_t, start_t - end_t) 52 | fid_map.merge(i, j) 53 | 54 | os.makedirs(args.output_dir / 'tracker', exist_ok=True) 55 | with open(args.output_dir / 'tracker' / seq, 'w') as f: 56 | for line in lines: 57 | f_id, id, *info = line.split(',') 58 | id = str(fid_map.find(int(id))) 59 | f.write(','.join([f_id, id, *info])) 60 | -------------------------------------------------------------------------------- /models/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------ 6 | # Modified from DETR (https://github.com/facebookresearch/detr) 7 | # Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved 8 | # ------------------------------------------------------------------------ 9 | 10 | from .deformable_transformer_plus import DeformableTransformer 11 | from .deformable_transformer_cross import DeformableTransformer as DeformableTransformerCross 12 | from .ftransformer import DetrTransformerDecoder 13 | def build_deforamble_transformer(args): 14 | arch_catalog = { 15 | 'DeformableTransformer': DeformableTransformer, 16 | 'DeformableTransformerCross': DeformableTransformerCross, 17 | } 18 | assert args.trans_mode in arch_catalog, 'invalid arch: {}'.format(args.trans_mode) 19 | build_func = arch_catalog[args.trans_mode] 20 | 21 | return build_func( 22 | d_model=args.hidden_dim, 23 | nhead=args.nheads, 24 | num_encoder_layers=args.enc_layers, 25 | num_decoder_layers=args.dec_layers, 26 | dim_feedforward=args.dim_feedforward, 27 | dropout=args.dropout, 28 | activation="relu", 29 | return_intermediate_dec=True, 30 | num_feature_levels=args.num_feature_levels, 31 | dec_n_points=args.dec_n_points, 32 | enc_n_points=args.enc_n_points, 33 | two_stage=args.two_stage, 34 | two_stage_num_proposals=args.num_queries, 35 | decoder_self_cross=not args.decoder_cross_self, 36 | sigmoid_attn=args.sigmoid_attn, 37 | extra_track_attn=args.extra_track_attn, 38 | memory_bank=args.memory_bank_type == 'MemoryBankFeat' 39 | ) 40 | 41 | 42 | from .motr import build as build_motr 43 | from .motr_uninC import build as build_motr_uninC 44 | from .motr_uninCost import build as build_motr_uninCost 45 | 46 | 47 | from .tmotr_uni import build as build_tmotr_uni 48 | 49 | def build_model(args): 50 | arch_catalog = { 51 | 'motr': build_motr, 52 | 'motr_uninc': build_motr_uninC, 53 | 'motr_unincost': build_motr_uninCost, 54 | 'tmotr_uni': build_tmotr_uni, 55 | } 56 | assert args.meta_arch in arch_catalog, 'invalid arch: {}'.format(args.meta_arch) 57 | build_func = arch_catalog[args.meta_arch] 58 | return build_func(args) 59 | 60 | 61 | -------------------------------------------------------------------------------- /models/yolo_fpn.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # Copyright (c) 2014-2021 Megvii Inc. All rights reserved. 4 | 5 | import torch 6 | import torch.nn as nn 7 | 8 | from .darknet import Darknet 9 | from .network_blocks import BaseConv 10 | 11 | 12 | class YOLOFPN(nn.Module): 13 | """ 14 | YOLOFPN module. Darknet 53 is the default backbone of this model. 
15 | """ 16 | 17 | def __init__( 18 | self, 19 | depth=53, 20 | in_features=["dark3", "dark4", "dark5"], 21 | ): 22 | super().__init__() 23 | 24 | self.backbone = Darknet(depth) 25 | self.in_features = in_features 26 | 27 | # out 1 28 | self.out1_cbl = self._make_cbl(512, 256, 1) 29 | self.out1 = self._make_embedding([256, 512], 512 + 256) 30 | 31 | # out 2 32 | self.out2_cbl = self._make_cbl(256, 128, 1) 33 | self.out2 = self._make_embedding([128, 256], 256 + 128) 34 | 35 | # upsample 36 | self.upsample = nn.Upsample(scale_factor=2, mode="nearest") 37 | 38 | def _make_cbl(self, _in, _out, ks): 39 | return BaseConv(_in, _out, ks, stride=1, act="lrelu") 40 | 41 | def _make_embedding(self, filters_list, in_filters): 42 | m = nn.Sequential( 43 | *[ 44 | self._make_cbl(in_filters, filters_list[0], 1), 45 | self._make_cbl(filters_list[0], filters_list[1], 3), 46 | self._make_cbl(filters_list[1], filters_list[0], 1), 47 | self._make_cbl(filters_list[0], filters_list[1], 3), 48 | self._make_cbl(filters_list[1], filters_list[0], 1), 49 | ] 50 | ) 51 | return m 52 | 53 | def load_pretrained_model(self, filename="./weights/darknet53.mix.pth"): 54 | with open(filename, "rb") as f: 55 | state_dict = torch.load(f, map_location="cpu") 56 | print("loading pretrained weights...") 57 | self.backbone.load_state_dict(state_dict) 58 | 59 | def forward(self, inputs): 60 | """ 61 | Args: 62 | inputs (Tensor): input image. 63 | 64 | Returns: 65 | Tuple[Tensor]: FPN output features.. 66 | """ 67 | # backbone 68 | out_features = self.backbone(inputs) 69 | x2, x1, x0 = [out_features[f] for f in self.in_features] 70 | 71 | # yolo branch 1 72 | x1_in = self.out1_cbl(x0) 73 | x1_in = self.upsample(x1_in) 74 | x1_in = torch.cat([x1_in, x1], 1) 75 | out_dark4 = self.out1(x1_in) 76 | 77 | # yolo branch 2 78 | x2_in = self.out2_cbl(out_dark4) 79 | x2_in = self.upsample(x2_in) 80 | x2_in = torch.cat([x2_in, x2], 1) 81 | out_dark3 = self.out2(x2_in) 82 | 83 | outputs = (out_dark3, out_dark4, x0) 84 | return outputs 85 | -------------------------------------------------------------------------------- /models/ops/setup.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
5 | # ------------------------------------------------------------------------------------------------
6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
7 | # ------------------------------------------------------------------------------------------------
8 |
9 | import os
10 | import glob
11 |
12 | import torch
13 |
14 | from torch.utils.cpp_extension import CUDA_HOME
15 | from torch.utils.cpp_extension import CppExtension
16 | from torch.utils.cpp_extension import CUDAExtension
17 |
18 | from setuptools import find_packages
19 | from setuptools import setup
20 |
21 | requirements = ["torch", "torchvision"]
22 |
23 | def get_extensions():
24 | this_dir = os.path.dirname(os.path.abspath(__file__))
25 | extensions_dir = os.path.join(this_dir, "src")
26 |
27 | main_file = glob.glob(os.path.join(extensions_dir, "*.cpp"))
28 | source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp"))
29 | source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu"))
30 |
31 | sources = main_file + source_cpu
32 | extension = CppExtension
33 | extra_compile_args = {"cxx": []}
34 | define_macros = []
35 |
36 | if torch.cuda.is_available() and CUDA_HOME is not None:
37 | extension = CUDAExtension
38 | sources += source_cuda
39 | define_macros += [("WITH_CUDA", None)]
40 | extra_compile_args["nvcc"] = [
41 | "-DCUDA_HAS_FP16=1",
42 | "-D__CUDA_NO_HALF_OPERATORS__",
43 | "-D__CUDA_NO_HALF_CONVERSIONS__",
44 | "-D__CUDA_NO_HALF2_OPERATORS__",
45 | ]
46 | else:
47 | raise NotImplementedError('CUDA is not available')
48 |
49 | sources = [os.path.join(extensions_dir, s) for s in sources]
50 | include_dirs = [extensions_dir]
51 | ext_modules = [
52 | extension(
53 | "MultiScaleDeformableAttention",
54 | sources,
55 | include_dirs=include_dirs,
56 | define_macros=define_macros,
57 | extra_compile_args=extra_compile_args,
58 | )
59 | ]
60 | return ext_modules
61 |
62 | setup(
63 | name="MultiScaleDeformableAttention",
64 | version="1.0",
65 | author="Weijie Su",
66 | url="https://github.com/fundamentalvision/Deformable-DETR",
67 | description="PyTorch Wrapper for CUDA Functions of Multi-Scale Deformable Attention",
68 | packages=find_packages(exclude=("configs", "tests",)),
69 | ext_modules=get_extensions(),
70 | cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension},
71 | )
72 |
--------------------------------------------------------------------------------
/models/dino/ops/setup.py:
--------------------------------------------------------------------------------
1 | # ------------------------------------------------------------------------------------------------
2 | # Deformable DETR
3 | # Copyright (c) 2020 SenseTime. All Rights Reserved.
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
5 | # ------------------------------------------------------------------------------------------------
6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
7 | # ------------------------------------------------------------------------------------------------
8 |
9 | import os
10 | import glob
11 |
12 | import torch
13 |
14 | from torch.utils.cpp_extension import CUDA_HOME
15 | from torch.utils.cpp_extension import CppExtension
16 | from torch.utils.cpp_extension import CUDAExtension
17 |
18 | from setuptools import find_packages
19 | from setuptools import setup
20 |
21 | requirements = ["torch", "torchvision"]
22 |
23 | def get_extensions():
24 | this_dir = os.path.dirname(os.path.abspath(__file__))
25 | extensions_dir = os.path.join(this_dir, "src")
26 |
27 | main_file = glob.glob(os.path.join(extensions_dir, "*.cpp"))
28 | source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp"))
29 | source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu"))
30 |
31 | sources = main_file + source_cpu
32 | extension = CppExtension
33 | extra_compile_args = {"cxx": []}
34 | define_macros = []
35 |
36 |
37 |
38 | if torch.cuda.is_available() and CUDA_HOME is not None:
39 | extension = CUDAExtension
40 | sources += source_cuda
41 | define_macros += [("WITH_CUDA", None)]
42 | extra_compile_args["nvcc"] = [
43 | "-DCUDA_HAS_FP16=1",
44 | "-D__CUDA_NO_HALF_OPERATORS__",
45 | "-D__CUDA_NO_HALF_CONVERSIONS__",
46 | "-D__CUDA_NO_HALF2_OPERATORS__",
47 | ]
48 | else:
49 | raise NotImplementedError('CUDA is not available')
50 |
51 | sources = [os.path.join(extensions_dir, s) for s in sources]
52 | include_dirs = [extensions_dir]
53 | ext_modules = [
54 | extension(
55 | "MultiScaleDeformableAttention",
56 | sources,
57 | include_dirs=include_dirs,
58 | define_macros=define_macros,
59 | extra_compile_args=extra_compile_args,
60 | )
61 | ]
62 | return ext_modules
63 |
64 | setup(
65 | name="MultiScaleDeformableAttention",
66 | version="1.0",
67 | author="Weijie Su",
68 | url="https://github.com/fundamentalvision/Deformable-DETR",
69 | description="PyTorch Wrapper for CUDA Functions of Multi-Scale Deformable Attention",
70 | packages=find_packages(exclude=("configs", "tests",)),
71 | ext_modules=get_extensions(),
72 | cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension},
73 | )
74 |
--------------------------------------------------------------------------------
/.vscode/launch.json:
--------------------------------------------------------------------------------
1 | {
2 | // Use IntelliSense to learn about possible attributes.
3 | // Hover to view descriptions of existing attributes.
4 | // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
5 | "version": "0.2.0",
6 | "configurations": [
7 | {
8 | "name": "Python: Current File",
9 | "type": "python",
10 | "request": "launch",
11 | "program": "${file}",
12 | "console": "integratedTerminal",
13 | "justMyCode": true,
14 | "env": {"CUDA_VISIBLE_DEVICES":"0", "CUBLAS_WORKSPACE_CONFIG":":4096:8"},
15 | // "args": ["--meta_arch", "motr_unincost", "--dataset_file", "e2e_dance", "--epoch", "20", "--with_box_refine", "--lr_drop", "8", "--lr", "2e-4", "--lr_backbone", "2e-5", "--pretrained", "/home/hadoop-vacv/yanfeng/project/MOTRv2/MOTRv3/exps/motrv2ch_uni5cost6g/run2/checkpoint.pth", "--batch_size", "2", "--sample_mode", "random_interval", "--sample_interval", "10", "--sampler_lengths", "5", "--merger_dropout", "0", "--dropout", "0", "--random_drop", "0.1", "--fp_ratio",
"0.3", "--query_interaction_layer", "GQIM", "--num_queries", "60", "--append_crowd", "--use_checkpoint", "--mot_path", "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-vacv/yanfeng/data/", "--match_type", "gmatch", "--g_size", "3", "--resume", "/home/hadoop-vacv/yanfeng/project/MOTRv2/CO-MOT/exps/motrv2ch_uni5cost3ggoon/run1/checkpoint.pth"] 16 | 17 | // "args": ["--meta_arch", "dino", "--dataset_file", "e2e_dance", "--epoch", "20", "--with_box_refine", "--lr_drop", "8", "--lr", "2e-4", "--lr_backbone", "2e-5", "--pretrained", "/home/hadoop-vacv/yanfeng/project/MOTRv2/MOTRv3/exps/motrv2ch_uni5cost6g/run2/checkpoint.pth", "--batch_size", "2", "--sample_mode", "random_interval", "--sample_interval", "10", "--sampler_lengths", "5", "--merger_dropout", "0", "--dropout", "0", "--random_drop", "0.1", "--fp_ratio", "0.3", "--query_interaction_layer", "GQIM", "--num_queries", "900", "--append_crowd", "--use_checkpoint", "--mot_path", "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-vacv/yanfeng/data/", "--match_type", "HungarianMatcher", "--g_size", "1", "--num_feature_levels", "5", "--dim_feedforward", "2048", "--resume", "/home/hadoop-vacv/yanfeng/project/MOTRv2/CO-MOT/checkpoints/dino_0031_5scale.pth"] 18 | "args": ["--meta_arch", "mot_dino", "--dataset_file", "e2e_dance", "--epoch", "20", "--with_box_refine", "--lr_drop", "8", "--lr", "2e-4", "--lr_backbone", "2e-5", "--pretrained", "/home/hadoop-vacv/yanfeng/project/MOTRv2/CO-MOT/checkpoints/dino_0031_5scale.pth", "--batch_size", "2", "--sample_mode", "random_interval", "--sample_interval", "10", "--sampler_lengths", "5", "--merger_dropout", "0", "--dropout", "0", "--random_drop", "0.1", "--fp_ratio", "0.3", "--query_interaction_layer", "GQIM", "--num_queries", "900", "--append_crowd", "--use_checkpoint", "--mot_path", "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-vacv/yanfeng/data/", "--match_type", "HungarianMatcher", "--g_size", "1", "--num_feature_levels", "5", "--dim_feedforward", "2048"] 19 | } 20 | ] 21 | } -------------------------------------------------------------------------------- /models/losses.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # Copyright (c) 2014-2021 Megvii Inc. All rights reserved. 
4 | 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.functional as F 8 | 9 | 10 | class IOUloss(nn.Module): 11 | def __init__(self, reduction="none", loss_type="iou"): 12 | super(IOUloss, self).__init__() 13 | self.reduction = reduction 14 | self.loss_type = loss_type 15 | 16 | def forward(self, pred, target): 17 | assert pred.shape[0] == target.shape[0] 18 | 19 | pred = pred.view(-1, 4) 20 | target = target.view(-1, 4) 21 | tl = torch.max( 22 | (pred[:, :2] - pred[:, 2:] / 2), (target[:, :2] - target[:, 2:] / 2) 23 | ) 24 | br = torch.min( 25 | (pred[:, :2] + pred[:, 2:] / 2), (target[:, :2] + target[:, 2:] / 2) 26 | ) 27 | 28 | area_p = torch.prod(pred[:, 2:], 1) 29 | area_g = torch.prod(target[:, 2:], 1) 30 | 31 | en = (tl < br).type(tl.type()).prod(dim=1) 32 | area_i = torch.prod(br - tl, 1) * en 33 | iou = (area_i) / (area_p + area_g - area_i + 1e-16) 34 | 35 | if self.loss_type == "iou": 36 | loss = 1 - iou ** 2 37 | elif self.loss_type == "giou": 38 | c_tl = torch.min( 39 | (pred[:, :2] - pred[:, 2:] / 2), (target[:, :2] - target[:, 2:] / 2) 40 | ) 41 | c_br = torch.max( 42 | (pred[:, :2] + pred[:, 2:] / 2), (target[:, :2] + target[:, 2:] / 2) 43 | ) 44 | area_c = torch.prod(c_br - c_tl, 1) 45 | giou = iou - (area_c - area_i) / area_c.clamp(1e-16) 46 | loss = 1 - giou.clamp(min=-1.0, max=1.0) 47 | 48 | if self.reduction == "mean": 49 | loss = loss.mean() 50 | elif self.reduction == "sum": 51 | loss = loss.sum() 52 | 53 | return loss 54 | 55 | 56 | def sigmoid_focal_loss(inputs, targets, num_boxes, alpha: float = 0.25, gamma: float = 2): 57 | """ 58 | Loss used in RetinaNet for dense detection: https://arxiv.org/abs/1708.02002. 59 | Args: 60 | inputs: A float tensor of arbitrary shape. 61 | The predictions for each example. 62 | targets: A float tensor with the same shape as inputs. Stores the binary 63 | classification label for each element in inputs 64 | (0 for the negative class and 1 for the positive class). 65 | alpha: (optional) Weighting factor in range (0,1) to balance 66 | positive vs negative examples. Default = -1 (no weighting). 67 | gamma: Exponent of the modulating factor (1 - p_t) to 68 | balance easy vs hard examples. 69 | Returns: 70 | Loss tensor 71 | """ 72 | prob = inputs.sigmoid() 73 | ce_loss = F.binary_cross_entropy_with_logits(inputs, targets, reduction="none") 74 | p_t = prob * targets + (1 - prob) * (1 - targets) 75 | loss = ce_loss * ((1 - p_t) ** gamma) 76 | 77 | if alpha >= 0: 78 | alpha_t = alpha * targets + (1 - alpha) * (1 - targets) 79 | loss = alpha_t * loss 80 | #return loss.mean(0).sum() / num_boxes 81 | return loss.sum() / num_boxes -------------------------------------------------------------------------------- /util/box_ops.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Copyright (c) 2022 megvii-research. All Rights Reserved. 3 | # ------------------------------------------------------------------------ 4 | # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR) 5 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 6 | # ------------------------------------------------------------------------ 7 | # Modified from DETR (https://github.com/facebookresearch/detr) 8 | # Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved 9 | # ------------------------------------------------------------------------ 10 | 11 | 12 | """ 13 | Utilities for bounding box manipulation and GIoU. 14 | """ 15 | import torch 16 | from torchvision.ops.boxes import box_area 17 | 18 | 19 | def box_cxcywh_to_xyxy(x): 20 | x_c, y_c, w, h = x.unbind(-1) 21 | b = [(x_c - 0.5 * w), (y_c - 0.5 * h), 22 | (x_c + 0.5 * w), (y_c + 0.5 * h)] 23 | return torch.stack(b, dim=-1) 24 | 25 | 26 | def box_xyxy_to_cxcywh(x): 27 | x0, y0, x1, y1 = x.unbind(-1) 28 | b = [(x0 + x1) / 2, (y0 + y1) / 2, 29 | (x1 - x0), (y1 - y0)] 30 | return torch.stack(b, dim=-1) 31 | 32 | 33 | # modified from torchvision to also return the union 34 | def box_iou(boxes1, boxes2): 35 | area1 = box_area(boxes1) 36 | area2 = box_area(boxes2) 37 | 38 | lt = torch.max(boxes1[:, None, :2], boxes2[:, :2]) # [N,M,2] 39 | rb = torch.min(boxes1[:, None, 2:], boxes2[:, 2:]) # [N,M,2] 40 | 41 | wh = (rb - lt).clamp(min=0) # [N,M,2] 42 | inter = wh[:, :, 0] * wh[:, :, 1] # [N,M] 43 | 44 | union = area1[:, None] + area2 - inter 45 | 46 | iou = inter / union 47 | return iou, union 48 | 49 | 50 | def generalized_box_iou(boxes1, boxes2): 51 | """ 52 | Generalized IoU from https://giou.stanford.edu/ 53 | 54 | The boxes should be in [x0, y0, x1, y1] format 55 | 56 | Returns a [N, M] pairwise matrix, where N = len(boxes1) 57 | and M = len(boxes2) 58 | """ 59 | # degenerate boxes gives inf / nan results 60 | # so do an early check 61 | assert (boxes1[:, 2:] >= boxes1[:, :2]).all() 62 | assert (boxes2[:, 2:] >= boxes2[:, :2]).all() 63 | iou, union = box_iou(boxes1, boxes2) 64 | 65 | lt = torch.min(boxes1[:, None, :2], boxes2[:, :2]) 66 | rb = torch.max(boxes1[:, None, 2:], boxes2[:, 2:]) 67 | 68 | wh = (rb - lt).clamp(min=0) # [N,M,2] 69 | area = wh[:, :, 0] * wh[:, :, 1] 70 | 71 | return iou - (area - union) / area 72 | 73 | 74 | def masks_to_boxes(masks): 75 | """Compute the bounding boxes around the provided masks 76 | 77 | The masks should be in format [N, H, W] where N is the number of masks, (H, W) are the spatial dimensions. 78 | 79 | Returns a [N, 4] tensors, with the boxes in xyxy format 80 | """ 81 | if masks.numel() == 0: 82 | return torch.zeros((0, 4), device=masks.device) 83 | 84 | h, w = masks.shape[-2:] 85 | 86 | y = torch.arange(0, h, dtype=torch.float) 87 | x = torch.arange(0, w, dtype=torch.float) 88 | y, x = torch.meshgrid(y, x) 89 | 90 | x_mask = (masks * x.unsqueeze(0)) 91 | x_max = x_mask.flatten(1).max(-1)[0] 92 | x_min = x_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0] 93 | 94 | y_mask = (masks * y.unsqueeze(0)) 95 | y_max = y_mask.flatten(1).max(-1)[0] 96 | y_min = y_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0] 97 | 98 | return torch.stack([x_min, y_min, x_max, y_max], 1) 99 | -------------------------------------------------------------------------------- /models/dino/ops/functions/ms_deform_attn_func.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | from __future__ import absolute_import 10 | from __future__ import print_function 11 | from __future__ import division 12 | 13 | import torch 14 | import torch.nn.functional as F 15 | from torch.autograd import Function 16 | from torch.autograd.function import once_differentiable 17 | 18 | import MultiScaleDeformableAttention as MSDA 19 | 20 | 21 | class MSDeformAttnFunction(Function): 22 | @staticmethod 23 | def forward(ctx, value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, im2col_step): 24 | ctx.im2col_step = im2col_step 25 | output = MSDA.ms_deform_attn_forward( 26 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, ctx.im2col_step) 27 | ctx.save_for_backward(value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights) 28 | return output 29 | 30 | @staticmethod 31 | @once_differentiable 32 | def backward(ctx, grad_output): 33 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights = ctx.saved_tensors 34 | grad_value, grad_sampling_loc, grad_attn_weight = \ 35 | MSDA.ms_deform_attn_backward( 36 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, grad_output, ctx.im2col_step) 37 | 38 | return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None 39 | 40 | 41 | def ms_deform_attn_core_pytorch(value, value_spatial_shapes, sampling_locations, attention_weights): 42 | # for debug and test only, 43 | # need to use cuda version instead 44 | N_, S_, M_, D_ = value.shape 45 | _, Lq_, M_, L_, P_, _ = sampling_locations.shape 46 | value_list = value.split([H_ * W_ for H_, W_ in value_spatial_shapes], dim=1) 47 | sampling_grids = 2 * sampling_locations - 1 48 | sampling_value_list = [] 49 | for lid_, (H_, W_) in enumerate(value_spatial_shapes): 50 | # N_, H_*W_, M_, D_ -> N_, H_*W_, M_*D_ -> N_, M_*D_, H_*W_ -> N_*M_, D_, H_, W_ 51 | value_l_ = value_list[lid_].flatten(2).transpose(1, 2).reshape(N_*M_, D_, H_, W_) 52 | # N_, Lq_, M_, P_, 2 -> N_, M_, Lq_, P_, 2 -> N_*M_, Lq_, P_, 2 53 | sampling_grid_l_ = sampling_grids[:, :, :, lid_].transpose(1, 2).flatten(0, 1) 54 | # N_*M_, D_, Lq_, P_ 55 | sampling_value_l_ = F.grid_sample(value_l_, sampling_grid_l_, 56 | mode='bilinear', padding_mode='zeros', align_corners=False) 57 | sampling_value_list.append(sampling_value_l_) 58 | # (N_, Lq_, M_, L_, P_) -> (N_, M_, Lq_, L_, P_) -> (N_, M_, 1, Lq_, L_*P_) 59 | attention_weights = attention_weights.transpose(1, 2).reshape(N_*M_, 1, Lq_, L_*P_) 60 | output = (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights).sum(-1).view(N_, M_*D_, Lq_) 61 | return output.transpose(1, 2).contiguous() 62 | -------------------------------------------------------------------------------- /models/ops/functions/ms_deform_attn_func.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Copyright (c) 2022 megvii-research. All Rights Reserved. 
3 | # ------------------------------------------------------------------------
4 | # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR)
5 | # Copyright (c) 2020 SenseTime. All Rights Reserved.
6 | # ------------------------------------------------------------------------
7 | # Modified from DETR (https://github.com/facebookresearch/detr)
8 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
9 | # ------------------------------------------------------------------------
10 |
11 |
12 | from __future__ import absolute_import
13 | from __future__ import print_function
14 | from __future__ import division
15 |
16 | import torch
17 | import torch.nn.functional as F
18 | from torch.autograd import Function
19 | from torch.autograd.function import once_differentiable
20 |
21 | import MultiScaleDeformableAttention as MSDA
22 |
23 |
24 | class MSDeformAttnFunction(Function):
25 | @staticmethod
26 | def forward(ctx, value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, im2col_step):
27 | ctx.im2col_step = im2col_step
28 | output = MSDA.ms_deform_attn_forward(
29 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, ctx.im2col_step)
30 | ctx.save_for_backward(value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights)
31 | return output
32 |
33 | @staticmethod
34 | @once_differentiable
35 | def backward(ctx, grad_output):
36 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights = ctx.saved_tensors
37 | grad_value, grad_sampling_loc, grad_attn_weight = \
38 | MSDA.ms_deform_attn_backward(
39 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, grad_output, ctx.im2col_step)
40 |
41 | return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None
42 |
43 |
44 | def ms_deform_attn_core_pytorch(value, value_spatial_shapes, sampling_locations, attention_weights):
45 | # for debug and test only,
46 | # need to use cuda version instead
47 | N_, S_, M_, D_ = value.shape # batch, pixel, multi head, channel
48 | _, Lq_, M_, L_, P_, _ = sampling_locations.shape # batch, query, multi head, n_levels, n_points, 2
49 | value_list = value.split([H_ * W_ for H_, W_ in value_spatial_shapes], dim=1) # split value by feature level
50 | sampling_grids = 2 * sampling_locations - 1 # [-1,1]
51 | sampling_value_list = []
52 | for lid_, (H_, W_) in enumerate(value_spatial_shapes):
53 | # N_, H_*W_, M_, D_ -> N_, H_*W_, M_*D_ -> N_, M_*D_, H_*W_ -> N_*M_, D_, H_, W_
54 | value_l_ = value_list[lid_].flatten(2).transpose(1, 2).reshape(N_*M_, D_, H_, W_)
55 | # N_, Lq_, M_, P_, 2 -> N_, M_, Lq_, P_, 2 -> N_*M_, Lq_, P_, 2
56 | sampling_grid_l_ = sampling_grids[:, :, :, lid_].transpose(1, 2).flatten(0, 1)
57 | # N_*M_, D_, Lq_, P_
58 | sampling_value_l_ = F.grid_sample(value_l_, sampling_grid_l_,
59 | mode='bilinear', padding_mode='zeros', align_corners=False)
60 | sampling_value_list.append(sampling_value_l_)
61 | # (N_, Lq_, M_, L_, P_) -> (N_, M_, Lq_, L_, P_) -> (N_, M_, 1, Lq_, L_*P_)
62 | attention_weights = attention_weights.transpose(1, 2).reshape(N_*M_, 1, Lq_, L_*P_)
63 | output = (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights).sum(-1).view(N_, M_*D_, Lq_)
64 | return output.transpose(1, 2).contiguous()
65 |
--------------------------------------------------------------------------------
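Note: ms_deform_attn_core_pytorch above is the pure-PyTorch reference that models/dino/ops/test.py compares the compiled CUDA kernel against. Below is a minimal usage sketch, not a file in this repository; the tensor shapes are arbitrary illustrative values mirroring the conventions in test.py, and the import path assumes the snippet is run from the repository root.

# Sketch (assumption: executed from the repo root with the package importable).
import torch
from models.ops.functions.ms_deform_attn_func import ms_deform_attn_core_pytorch

N, M, D = 1, 2, 2        # batch size, attention heads, channels per head
Lq, L, P = 2, 2, 2       # queries, feature levels, sampling points per level
shapes = torch.as_tensor([(6, 4), (3, 2)], dtype=torch.long)  # (H, W) of each level
S = sum((H * W).item() for H, W in shapes)                    # total pixels across levels

value = torch.rand(N, S, M, D)                                # flattened multi-level features
sampling_locations = torch.rand(N, Lq, M, L, P, 2)            # normalized to [0, 1]
attention_weights = torch.rand(N, Lq, M, L, P) + 1e-5
attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True)

output = ms_deform_attn_core_pytorch(value, shapes, sampling_locations, attention_weights)
print(output.shape)      # torch.Size([1, 2, 4]) == (N, Lq, M * D)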
/models/ops/build/lib.linux-x86_64-3.8/functions/ms_deform_attn_func.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Copyright (c) 2022 megvii-research. All Rights Reserved. 3 | # ------------------------------------------------------------------------ 4 | # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR) 5 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 6 | # ------------------------------------------------------------------------ 7 | # Modified from DETR (https://github.com/facebookresearch/detr) 8 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 9 | # ------------------------------------------------------------------------ 10 | 11 | 12 | from __future__ import absolute_import 13 | from __future__ import print_function 14 | from __future__ import division 15 | 16 | import torch 17 | import torch.nn.functional as F 18 | from torch.autograd import Function 19 | from torch.autograd.function import once_differentiable 20 | 21 | import MultiScaleDeformableAttention as MSDA 22 | 23 | 24 | class MSDeformAttnFunction(Function): 25 | @staticmethod 26 | def forward(ctx, value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, im2col_step): 27 | ctx.im2col_step = im2col_step 28 | output = MSDA.ms_deform_attn_forward( 29 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, ctx.im2col_step) 30 | ctx.save_for_backward(value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights) 31 | return output 32 | 33 | @staticmethod 34 | @once_differentiable 35 | def backward(ctx, grad_output): 36 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights = ctx.saved_tensors 37 | grad_value, grad_sampling_loc, grad_attn_weight = \ 38 | MSDA.ms_deform_attn_backward( 39 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, grad_output, ctx.im2col_step) 40 | 41 | return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None 42 | 43 | 44 | def ms_deform_attn_core_pytorch(value, value_spatial_shapes, sampling_locations, attention_weights): 45 | # for debug and test only, 46 | # need to use cuda version instead 47 | N_, S_, M_, D_ = value.shape # batch, pixel, multi head, channel 48 | _, Lq_, M_, L_, P_, _ = sampling_locations.shape # batch, pixel, multi head, n_levels, n_points, 2 49 | value_list = value.split([H_ * W_ for H_, W_ in value_spatial_shapes], dim=1) # 按level拆分value 50 | sampling_grids = 2 * sampling_locations - 1 # [-1,1] 51 | sampling_value_list = [] 52 | for lid_, (H_, W_) in enumerate(value_spatial_shapes): 53 | # N_, H_*W_, M_, D_ -> N_, H_*W_, M_*D_ -> N_, M_*D_, H_*W_ -> N_*M_, D_, H_, W_ 54 | value_l_ = value_list[lid_].flatten(2).transpose(1, 2).reshape(N_*M_, D_, H_, W_) 55 | # N_, Lq_, M_, P_, 2 -> N_, M_, Lq_, P_, 2 -> N_*M_, Lq_, P_, 2 56 | sampling_grid_l_ = sampling_grids[:, :, :, lid_].transpose(1, 2).flatten(0, 1) 57 | # N_*M_, D_, Lq_, P_ 58 | sampling_value_l_ = F.grid_sample(value_l_, sampling_grid_l_, 59 | mode='bilinear', padding_mode='zeros', align_corners=False) 60 | sampling_value_list.append(sampling_value_l_) 61 | # (N_, Lq_, M_, L_, P_) -> (N_, M_, Lq_, L_, P_) -> (N_, M_, 1, Lq_, L_*P_) 62 | attention_weights = attention_weights.transpose(1, 2).reshape(N_*M_, 1, Lq_, L_*P_) 63 | output = 
(torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights).sum(-1).view(N_, M_*D_, Lq_) 64 | return output.transpose(1, 2).contiguous() 65 | -------------------------------------------------------------------------------- /models/ops/build/lib.linux-x86_64-cpython-37/functions/ms_deform_attn_func.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Copyright (c) 2022 megvii-research. All Rights Reserved. 3 | # ------------------------------------------------------------------------ 4 | # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR) 5 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 6 | # ------------------------------------------------------------------------ 7 | # Modified from DETR (https://github.com/facebookresearch/detr) 8 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 9 | # ------------------------------------------------------------------------ 10 | 11 | 12 | from __future__ import absolute_import 13 | from __future__ import print_function 14 | from __future__ import division 15 | 16 | import torch 17 | import torch.nn.functional as F 18 | from torch.autograd import Function 19 | from torch.autograd.function import once_differentiable 20 | 21 | import MultiScaleDeformableAttention as MSDA 22 | 23 | 24 | class MSDeformAttnFunction(Function): 25 | @staticmethod 26 | def forward(ctx, value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, im2col_step): 27 | ctx.im2col_step = im2col_step 28 | output = MSDA.ms_deform_attn_forward( 29 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, ctx.im2col_step) 30 | ctx.save_for_backward(value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights) 31 | return output 32 | 33 | @staticmethod 34 | @once_differentiable 35 | def backward(ctx, grad_output): 36 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights = ctx.saved_tensors 37 | grad_value, grad_sampling_loc, grad_attn_weight = \ 38 | MSDA.ms_deform_attn_backward( 39 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, grad_output, ctx.im2col_step) 40 | 41 | return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None 42 | 43 | 44 | def ms_deform_attn_core_pytorch(value, value_spatial_shapes, sampling_locations, attention_weights): 45 | # for debug and test only, 46 | # need to use cuda version instead 47 | N_, S_, M_, D_ = value.shape # batch, pixel, multi head, channel 48 | _, Lq_, M_, L_, P_, _ = sampling_locations.shape # batch, pixel, multi head, n_levels, n_points, 2 49 | value_list = value.split([H_ * W_ for H_, W_ in value_spatial_shapes], dim=1) # 按level拆分value 50 | sampling_grids = 2 * sampling_locations - 1 # [-1,1] 51 | sampling_value_list = [] 52 | for lid_, (H_, W_) in enumerate(value_spatial_shapes): 53 | # N_, H_*W_, M_, D_ -> N_, H_*W_, M_*D_ -> N_, M_*D_, H_*W_ -> N_*M_, D_, H_, W_ 54 | value_l_ = value_list[lid_].flatten(2).transpose(1, 2).reshape(N_*M_, D_, H_, W_) 55 | # N_, Lq_, M_, P_, 2 -> N_, M_, Lq_, P_, 2 -> N_*M_, Lq_, P_, 2 56 | sampling_grid_l_ = sampling_grids[:, :, :, lid_].transpose(1, 2).flatten(0, 1) 57 | # N_*M_, D_, Lq_, P_ 58 | sampling_value_l_ = F.grid_sample(value_l_, sampling_grid_l_, 59 | mode='bilinear', padding_mode='zeros', align_corners=False) 
60 | sampling_value_list.append(sampling_value_l_)
61 | # (N_, Lq_, M_, L_, P_) -> (N_, M_, Lq_, L_, P_) -> (N_, M_, 1, Lq_, L_*P_)
62 | attention_weights = attention_weights.transpose(1, 2).reshape(N_*M_, 1, Lq_, L_*P_)
63 | output = (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights).sum(-1).view(N_, M_*D_, Lq_)
64 | return output.transpose(1, 2).contiguous()
65 |
--------------------------------------------------------------------------------
/tools/train_ddp.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | # ------------------------------------------------------------------------
3 | # Copyright (c) 2022 megvii-research. All Rights Reserved.
4 | # ------------------------------------------------------------------------
5 |
6 | # print every command as it is executed
7 | set -x
8 |
9 | PY_ARGS=${@:2}
10 |
11 | # report an error if any command in a pipeline fails
12 | set -o pipefail
13 | #sed -e : apply the sed edit expression directly on the command line;
14 | OUTPUT_BASE=$(echo $1 | sed -e "s/configs/exps/g" | sed -e "s/.args$//g")
15 | mkdir -p $OUTPUT_BASE
16 |
17 |
18 | cluster_spec=${AFO_ENV_CLUSTER_SPEC//\"/\\\"}
19 | echo "cluster spec is $cluster_spec"
20 | worker_list_command="import util.json_parser as json_parser;print(json_parser.parse(\"$cluster_spec\", \"worker\"))"
21 | echo "worker list command is $worker_list_command"
22 | eval worker_list=`python -c "$worker_list_command"`
23 | echo "worker list is $worker_list"
24 | worker_strs=(${worker_list//,/ })
25 | master=${worker_strs[0]}
26 | echo "master is $master"
27 | master_strs=(${master//:/ })
28 | master_addr=${master_strs[0]}
29 | master_port=${master_strs[1]}
30 | echo "master address is $master_addr"
31 | echo "master port is $master_port"
32 | index_command="import util.json_parser as json_parser;print(json_parser.parse(\"$cluster_spec\", \"index\"))"
33 | eval node_rank=`python -c "$index_command"`
34 | echo "node rank is $node_rank"
35 | dist_url="tcp://$master_addr:$master_port"
36 | echo "dist url is $dist_url"
37 | PYTHONPATH=$PYTHONPATH:../ \
38 | # python tools/run_net.py \
39 | # --num_shards 8 \
40 | # --shard_id $node_rank \
41 | # --dist_url $dist_url \
42 | # --cfg configs/verb/MVIT_B_32x2_CONV.yaml
43 |
44 | MASTER_ADDR=${MASTER_ADDR:-$master_addr}
45 | MASTER_PORT=${MASTER_PORT:-$master_port}
46 | NODE_RANK=${NODE_RANK:-$node_rank}
47 | # let "NNODES=GPUS/GPUS_PER_NODE"
48 |
49 | NODE_NUM=${#worker_strs[@]}
50 | echo "node num is $NODE_NUM"
51 |
52 | if ((NODE_RANK == 0)); then
53 | for RUN in $(seq 100); do
54 | ls $OUTPUT_BASE | grep run$RUN && continue
55 | OUTPUT_DIR=$OUTPUT_BASE/run$RUN
56 | mkdir $OUTPUT_DIR && break
57 | done
58 |
59 | # clean up *.pyc files
60 | rmpyc() {
61 | rm -rf $(find -name __pycache__)
62 | rm -rf $(find -name "*.pyc")
63 | }
64 |
65 | # run backup
66 | echo "Backing up to log dir: $OUTPUT_DIR"
67 | rmpyc && cp -r models datasets util main.py engine.py eval_detr.py seqmap submit_dance.py $1 $OUTPUT_DIR
68 | echo " ...Done"
69 |
70 | # tar src to avoid future editing
71 | cleanup() {
72 | echo "Packing source code"
73 | rmpyc
74 | # tar -zcf models datasets util main.py engine.py eval.py submit.py --remove-files
75 | echo " ...Done"
76 | }
77 |
78 | pushd $OUTPUT_DIR
79 | trap cleanup EXIT
80 |
81 | # log git status
82 | echo "Logging git status"
83 | git status > git_status
84 | git rev-parse HEAD > git_tag
85 | git diff > git_diff
86 | echo $PY_ARGS > desc
87 | echo " ...Done"
88 |
89 | else
90 | # 3 minutes
91 | sleep 180
92 | for RUN in $(seq 100); do
93 | ls $OUTPUT_BASE | grep run$RUN
&& continue 94 | let "ITERRUN=$RUN-1" 95 | OUTPUT_DIR=$OUTPUT_BASE/run$ITERRUN 96 | break 97 | done 98 | fi 99 | 100 | args=$(cat $1) 101 | 102 | # python -m torch.distributed.launch --nproc_per_node=8 --master_port 29502 --use_env main.py ${args} --output_dir $OUTPUT_DIR 103 | 104 | # python ./util/launch.py \ 105 | # --nnodes 2 \ 106 | # --node_rank ${NODE_RANK} \ 107 | # --master_addr ${MASTER_ADDR} \ 108 | # --master_port 29502 \ 109 | # --nproc_per_node 8 \ 110 | # python main.py "${args} --output_dir $OUTPUT_DIR" 111 | python -m torch.distributed.launch --nproc_per_node=8 --nnodes ${NODE_NUM} --node_rank ${NODE_RANK} --master_addr=${MASTER_ADDR} --master_port 29502 --use_env main.py ${args} --output_dir $OUTPUT_DIR 112 | -------------------------------------------------------------------------------- /models/ops/build/temp.linux-x86_64-cpython-37/build.ninja: -------------------------------------------------------------------------------- 1 | ninja_required_version = 1.3 2 | cxx = c++ 3 | nvcc = /usr/local/cuda/bin/nvcc 4 | 5 | cflags = -pthread -B /mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/yanfeng/software/anaconda3/envs/detr/compiler_compat -Wl,--sysroot=/ -Wsign-compare -DNDEBUG -g -fwrapv -O3 -Wall -Wstrict-prototypes -fPIC -DWITH_CUDA -I/mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/yanfeng/project/MOTRv2/MOTRv3/models/ops/src -I/mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/ganyiyang/software/Anaconda/envs/detr_yf/lib/python3.7/site-packages/torch/include -I/mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/ganyiyang/software/Anaconda/envs/detr_yf/lib/python3.7/site-packages/torch/include/torch/csrc/api/include -I/mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/ganyiyang/software/Anaconda/envs/detr_yf/lib/python3.7/site-packages/torch/include/TH -I/mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/ganyiyang/software/Anaconda/envs/detr_yf/lib/python3.7/site-packages/torch/include/THC -I/usr/local/cuda/include -I/mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/ganyiyang/software/Anaconda/envs/detr_yf/include/python3.7m -c 6 | post_cflags = -DTORCH_API_INCLUDE_EXTENSION_H -DTORCH_EXTENSION_NAME=MultiScaleDeformableAttention -D_GLIBCXX_USE_CXX11_ABI=0 -std=c++14 7 | cuda_cflags = -DWITH_CUDA -I/mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/yanfeng/project/MOTRv2/MOTRv3/models/ops/src -I/mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/ganyiyang/software/Anaconda/envs/detr_yf/lib/python3.7/site-packages/torch/include -I/mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/ganyiyang/software/Anaconda/envs/detr_yf/lib/python3.7/site-packages/torch/include/torch/csrc/api/include -I/mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/ganyiyang/software/Anaconda/envs/detr_yf/lib/python3.7/site-packages/torch/include/TH -I/mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/ganyiyang/software/Anaconda/envs/detr_yf/lib/python3.7/site-packages/torch/include/THC -I/usr/local/cuda/include -I/mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/ganyiyang/software/Anaconda/envs/detr_yf/include/python3.7m -c 8 | cuda_post_cflags = -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DCUDA_HAS_FP16=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ -DTORCH_API_INCLUDE_EXTENSION_H -DTORCH_EXTENSION_NAME=MultiScaleDeformableAttention -D_GLIBCXX_USE_CXX11_ABI=0 -gencode=arch=compute_70,code=sm_70 -std=c++14 9 | ldflags = 10 | 11 | rule compile 12 | command = $cxx -MMD -MF $out.d 
$cflags -c $in -o $out $post_cflags 13 | depfile = $out.d 14 | deps = gcc 15 | 16 | rule cuda_compile 17 | command = $nvcc $cuda_cflags -c $in -o $out $cuda_post_cflags 18 | 19 | 20 | 21 | build /mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/yanfeng/project/MOTRv2/MOTRv3/models/ops/build/temp.linux-x86_64-cpython-37/mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/yanfeng/project/MOTRv2/MOTRv3/models/ops/src/cpu/ms_deform_attn_cpu.o: compile /mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/yanfeng/project/MOTRv2/MOTRv3/models/ops/src/cpu/ms_deform_attn_cpu.cpp 22 | build /mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/yanfeng/project/MOTRv2/MOTRv3/models/ops/build/temp.linux-x86_64-cpython-37/mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/yanfeng/project/MOTRv2/MOTRv3/models/ops/src/cuda/ms_deform_attn_cuda.o: cuda_compile /mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/yanfeng/project/MOTRv2/MOTRv3/models/ops/src/cuda/ms_deform_attn_cuda.cu 23 | build /mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/yanfeng/project/MOTRv2/MOTRv3/models/ops/build/temp.linux-x86_64-cpython-37/mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/yanfeng/project/MOTRv2/MOTRv3/models/ops/src/vision.o: compile /mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/yanfeng/project/MOTRv2/MOTRv3/models/ops/src/vision.cpp 24 | 25 | 26 | 27 | 28 | 29 | -------------------------------------------------------------------------------- /models/yolo_pafpn.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # Copyright (c) 2014-2021 Megvii Inc. All rights reserved. 4 | 5 | import torch 6 | import torch.nn as nn 7 | 8 | from .darknet import CSPDarknet 9 | from .network_blocks import BaseConv, CSPLayer, DWConv 10 | 11 | 12 | class YOLOPAFPN(nn.Module): 13 | """ 14 | YOLOv3 model. Darknet 53 is the default backbone of this model. 
15 | """ 16 | 17 | def __init__( 18 | self, 19 | depth=1.0, 20 | width=1.0, 21 | in_features=("dark3", "dark4", "dark5"), 22 | in_channels=[256, 512, 1024], 23 | depthwise=False, 24 | act="silu", 25 | ): 26 | super().__init__() 27 | self.backbone = CSPDarknet(depth, width, depthwise=depthwise, act=act) 28 | self.in_features = in_features 29 | self.in_channels = in_channels 30 | Conv = DWConv if depthwise else BaseConv 31 | 32 | self.upsample = nn.Upsample(scale_factor=2, mode="nearest") 33 | self.lateral_conv0 = BaseConv( 34 | int(in_channels[2] * width), int(in_channels[1] * width), 1, 1, act=act 35 | ) 36 | self.C3_p4 = CSPLayer( 37 | int(2 * in_channels[1] * width), 38 | int(in_channels[1] * width), 39 | round(3 * depth), 40 | False, 41 | depthwise=depthwise, 42 | act=act, 43 | ) # cat 44 | 45 | self.reduce_conv1 = BaseConv( 46 | int(in_channels[1] * width), int(in_channels[0] * width), 1, 1, act=act 47 | ) 48 | self.C3_p3 = CSPLayer( 49 | int(2 * in_channels[0] * width), 50 | int(in_channels[0] * width), 51 | round(3 * depth), 52 | False, 53 | depthwise=depthwise, 54 | act=act, 55 | ) 56 | 57 | # bottom-up conv 58 | self.bu_conv2 = Conv( 59 | int(in_channels[0] * width), int(in_channels[0] * width), 3, 2, act=act 60 | ) 61 | self.C3_n3 = CSPLayer( 62 | int(2 * in_channels[0] * width), 63 | int(in_channels[1] * width), 64 | round(3 * depth), 65 | False, 66 | depthwise=depthwise, 67 | act=act, 68 | ) 69 | 70 | # bottom-up conv 71 | self.bu_conv1 = Conv( 72 | int(in_channels[1] * width), int(in_channels[1] * width), 3, 2, act=act 73 | ) 74 | self.C3_n4 = CSPLayer( 75 | int(2 * in_channels[1] * width), 76 | int(in_channels[2] * width), 77 | round(3 * depth), 78 | False, 79 | depthwise=depthwise, 80 | act=act, 81 | ) 82 | 83 | def forward(self, input): 84 | """ 85 | Args: 86 | inputs: input images. 87 | 88 | Returns: 89 | Tuple[Tensor]: FPN feature. 90 | """ 91 | 92 | # backbone 93 | out_features = self.backbone(input) 94 | features = [out_features[f] for f in self.in_features] 95 | [x2, x1, x0] = features 96 | 97 | fpn_out0 = self.lateral_conv0(x0) # 1024->512/32 98 | f_out0 = self.upsample(fpn_out0) # 512/16 99 | f_out0 = torch.cat([f_out0, x1], 1) # 512->1024/16 100 | f_out0 = self.C3_p4(f_out0) # 1024->512/16 101 | 102 | fpn_out1 = self.reduce_conv1(f_out0) # 512->256/16 103 | f_out1 = self.upsample(fpn_out1) # 256/8 104 | f_out1 = torch.cat([f_out1, x2], 1) # 256->512/8 105 | pan_out2 = self.C3_p3(f_out1) # 512->256/8 106 | 107 | p_out1 = self.bu_conv2(pan_out2) # 256->256/16 108 | p_out1 = torch.cat([p_out1, fpn_out1], 1) # 256->512/16 109 | pan_out1 = self.C3_n3(p_out1) # 512->512/16 110 | 111 | p_out0 = self.bu_conv1(pan_out1) # 512->512/32 112 | p_out0 = torch.cat([p_out0, fpn_out0], 1) # 512->1024/32 113 | pan_out0 = self.C3_n4(p_out0) # 1024->1024/32 114 | 115 | outputs = (pan_out2, pan_out1, pan_out0) 116 | return outputs 117 | -------------------------------------------------------------------------------- /models/position_encoding.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Copyright (c) 2022 megvii-research. All Rights Reserved. 3 | # ------------------------------------------------------------------------ 4 | # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR) 5 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 
6 | # ------------------------------------------------------------------------ 7 | # Modified from DETR (https://github.com/facebookresearch/detr) 8 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 9 | # ------------------------------------------------------------------------ 10 | 11 | 12 | """ 13 | Various positional encodings for the transformer. 14 | """ 15 | import math 16 | import torch 17 | from torch import nn 18 | 19 | from util.misc import NestedTensor 20 | 21 | 22 | class PositionEmbeddingSine(nn.Module): 23 | """ 24 | This is a more standard version of the position embedding, very similar to the one 25 | used by the Attention is all you need paper, generalized to work on images. 26 | """ 27 | def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None): 28 | super().__init__() 29 | self.num_pos_feats = num_pos_feats 30 | self.temperature = temperature 31 | self.normalize = normalize 32 | if scale is not None and normalize is False: 33 | raise ValueError("normalize should be True if scale is passed") 34 | if scale is None: 35 | scale = 2 * math.pi 36 | self.scale = scale 37 | 38 | def forward(self, tensor_list: NestedTensor): 39 | x = tensor_list.tensors 40 | mask = tensor_list.mask 41 | assert mask is not None 42 | not_mask = ~mask 43 | y_embed = not_mask.cumsum(1, dtype=torch.float32) 44 | x_embed = not_mask.cumsum(2, dtype=torch.float32) 45 | if self.normalize: 46 | eps = 1e-6 47 | y_embed = (y_embed - 0.5) / (y_embed[:, -1:, :] + eps) * self.scale 48 | x_embed = (x_embed - 0.5) / (x_embed[:, :, -1:] + eps) * self.scale 49 | 50 | dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device) 51 | dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats) 52 | 53 | pos_x = x_embed[:, :, :, None] / dim_t 54 | pos_y = y_embed[:, :, :, None] / dim_t 55 | pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3) 56 | pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3) 57 | pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) 58 | return pos 59 | 60 | 61 | class PositionEmbeddingLearned(nn.Module): 62 | """ 63 | Absolute pos embedding, learned. 
64 | """ 65 | def __init__(self, num_pos_feats=256): 66 | super().__init__() 67 | self.row_embed = nn.Embedding(50, num_pos_feats) 68 | self.col_embed = nn.Embedding(50, num_pos_feats) 69 | self.reset_parameters() 70 | 71 | def reset_parameters(self): 72 | nn.init.uniform_(self.row_embed.weight) 73 | nn.init.uniform_(self.col_embed.weight) 74 | 75 | def forward(self, tensor_list: NestedTensor): 76 | x = tensor_list.tensors 77 | h, w = x.shape[-2:] 78 | i = torch.arange(w, device=x.device) 79 | j = torch.arange(h, device=x.device) 80 | x_emb = self.col_embed(i) 81 | y_emb = self.row_embed(j) 82 | pos = torch.cat([ 83 | x_emb.unsqueeze(0).repeat(h, 1, 1), 84 | y_emb.unsqueeze(1).repeat(1, w, 1), 85 | ], dim=-1).permute(2, 0, 1).unsqueeze(0).repeat(x.shape[0], 1, 1, 1) 86 | return pos 87 | 88 | 89 | def build_position_encoding(args): 90 | N_steps = args.hidden_dim // 2 91 | if args.position_embedding in ('v2', 'sine'): 92 | # TODO find a better way of exposing other arguments 93 | position_embedding = PositionEmbeddingSine(N_steps, normalize=True) 94 | elif args.position_embedding in ('v3', 'learned'): 95 | position_embedding = PositionEmbeddingLearned(N_steps) 96 | else: 97 | raise ValueError(f"not supported {args.position_embedding}") 98 | 99 | return position_embedding 100 | -------------------------------------------------------------------------------- /models/dino/ops/test.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | from __future__ import absolute_import 10 | from __future__ import print_function 11 | from __future__ import division 12 | 13 | import time 14 | import torch 15 | import torch.nn as nn 16 | from torch.autograd import gradcheck 17 | 18 | from functions.ms_deform_attn_func import MSDeformAttnFunction, ms_deform_attn_core_pytorch 19 | 20 | 21 | N, M, D = 1, 2, 2 22 | Lq, L, P = 2, 2, 2 23 | shapes = torch.as_tensor([(6, 4), (3, 2)], dtype=torch.long).cuda() 24 | level_start_index = torch.cat((shapes.new_zeros((1, )), shapes.prod(1).cumsum(0)[:-1])) 25 | S = sum([(H*W).item() for H, W in shapes]) 26 | 27 | 28 | torch.manual_seed(3) 29 | 30 | 31 | @torch.no_grad() 32 | def check_forward_equal_with_pytorch_double(): 33 | value = torch.rand(N, S, M, D).cuda() * 0.01 34 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() 35 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 36 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) 37 | im2col_step = 2 38 | output_pytorch = ms_deform_attn_core_pytorch(value.double(), shapes, sampling_locations.double(), attention_weights.double()).detach().cpu() 39 | output_cuda = MSDeformAttnFunction.apply(value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step).detach().cpu() 40 | fwdok = torch.allclose(output_cuda, output_pytorch) 41 | max_abs_err = (output_cuda - output_pytorch).abs().max() 42 | max_rel_err = ((output_cuda - 
output_pytorch).abs() / output_pytorch.abs()).max() 43 | 44 | print(f'* {fwdok} check_forward_equal_with_pytorch_double: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}') 45 | 46 | 47 | @torch.no_grad() 48 | def check_forward_equal_with_pytorch_float(): 49 | value = torch.rand(N, S, M, D).cuda() * 0.01 50 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() 51 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 52 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) 53 | im2col_step = 2 54 | output_pytorch = ms_deform_attn_core_pytorch(value, shapes, sampling_locations, attention_weights).detach().cpu() 55 | output_cuda = MSDeformAttnFunction.apply(value, shapes, level_start_index, sampling_locations, attention_weights, im2col_step).detach().cpu() 56 | fwdok = torch.allclose(output_cuda, output_pytorch, rtol=1e-2, atol=1e-3) 57 | max_abs_err = (output_cuda - output_pytorch).abs().max() 58 | max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max() 59 | 60 | print(f'* {fwdok} check_forward_equal_with_pytorch_float: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}') 61 | 62 | 63 | def check_gradient_numerical(channels=4, grad_value=True, grad_sampling_loc=True, grad_attn_weight=True): 64 | 65 | value = torch.rand(N, S, M, channels).cuda() * 0.01 66 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() 67 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 68 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) 69 | im2col_step = 2 70 | func = MSDeformAttnFunction.apply 71 | 72 | value.requires_grad = grad_value 73 | sampling_locations.requires_grad = grad_sampling_loc 74 | attention_weights.requires_grad = grad_attn_weight 75 | 76 | gradok = gradcheck(func, (value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step)) 77 | 78 | print(f'* {gradok} check_gradient_numerical(D={channels})') 79 | 80 | 81 | if __name__ == '__main__': 82 | check_forward_equal_with_pytorch_double() 83 | check_forward_equal_with_pytorch_float() 84 | 85 | for channels in [30, 32, 64, 71, 1025, 2048, 3096]: 86 | check_gradient_numerical(channels, True, True, True) 87 | 88 | 89 | 90 | -------------------------------------------------------------------------------- /models/ops/test.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | from __future__ import absolute_import 10 | from __future__ import print_function 11 | from __future__ import division 12 | 13 | import time 14 | import torch 15 | import torch.nn as nn 16 | from torch.autograd import gradcheck 17 | 18 | from functions.ms_deform_attn_func import MSDeformAttnFunction, ms_deform_attn_core_pytorch 19 | 20 | 21 | N, M, D = 1, 2, 2 22 | Lq, L, P = 2, 2, 2 23 | shapes = torch.as_tensor([(6, 4), (3, 2)], dtype=torch.long).cuda() 24 | level_start_index = torch.cat((shapes.new_zeros((1, )), shapes.prod(1).cumsum(0)[:-1])) 25 | S = sum([(H*W).item() for H, W in shapes]) 26 | 27 | 28 | torch.manual_seed(3) 29 | 30 | 31 | @torch.no_grad() 32 | def check_forward_equal_with_pytorch_double(): 33 | value = torch.rand(N, S, M, D).cuda() * 0.01 34 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() 35 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 36 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) 37 | im2col_step = 2 38 | output_pytorch = ms_deform_attn_core_pytorch(value.double(), shapes, sampling_locations.double(), attention_weights.double()).detach().cpu() 39 | output_cuda = MSDeformAttnFunction.apply(value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step).detach().cpu() 40 | fwdok = torch.allclose(output_cuda, output_pytorch) 41 | max_abs_err = (output_cuda - output_pytorch).abs().max() 42 | max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max() 43 | 44 | print(f'* {fwdok} check_forward_equal_with_pytorch_double: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}') 45 | 46 | 47 | @torch.no_grad() 48 | def check_forward_equal_with_pytorch_float(): 49 | value = torch.rand(N, S, M, D, requires_grad=True).cuda() * 0.01 50 | sampling_locations = torch.rand(N, Lq, M, L, P, 2, requires_grad=True).cuda() 51 | attention_weights = torch.rand(N, Lq, M, L, P, requires_grad=True).cuda() + 1e-5 52 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) 53 | im2col_step = 2 54 | 55 | value = torch.autograd.Variable(value.data, requires_grad=True) 56 | sampling_locations = torch.autograd.Variable(sampling_locations.data, requires_grad=True) 57 | attention_weights = torch.autograd.Variable(attention_weights.data, requires_grad=True) 58 | 59 | t0 = time.time() 60 | output_cuda = MSDeformAttnFunction.apply(value, shapes, level_start_index, sampling_locations, attention_weights, im2col_step).cpu() 61 | print( time.time()-t0) 62 | t0 = time.time() 63 | output_pytorch = ms_deform_attn_core_pytorch(value, shapes, sampling_locations, attention_weights).detach().cpu() 64 | print( time.time()-t0) 65 | fwdok = torch.allclose(output_cuda, output_pytorch, rtol=1e-2, atol=1e-3) 66 | max_abs_err = (output_cuda - output_pytorch).abs().max() 67 | max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max() 68 | 69 | print(f'* {fwdok} check_forward_equal_with_pytorch_float: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}') 70 | 71 | 72 | def check_gradient_numerical(channels=4, grad_value=True, 
grad_sampling_loc=True, grad_attn_weight=True): 73 | 74 | value = torch.rand(N, S, M, channels).cuda() * 0.01 75 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() 76 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 77 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) 78 | im2col_step = 2 79 | func = MSDeformAttnFunction.apply 80 | 81 | value.requires_grad = grad_value 82 | sampling_locations.requires_grad = grad_sampling_loc 83 | attention_weights.requires_grad = grad_attn_weight 84 | 85 | gradok = gradcheck(func, (value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step)) 86 | 87 | print(f'* {gradok} check_gradient_numerical(D={channels})') 88 | 89 | 90 | if __name__ == '__main__': 91 | check_forward_equal_with_pytorch_double() 92 | check_forward_equal_with_pytorch_float() 93 | 94 | for channels in [30, 32, 64, 71, 1025, 2048, 3096]: 95 | check_gradient_numerical(channels, True, True, True) 96 | 97 | 98 | 99 | -------------------------------------------------------------------------------- /datasets/data_prefetcher.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Copyright (c) 2022 megvii-research. All Rights Reserved. 3 | # ------------------------------------------------------------------------ 4 | # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR) 5 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 6 | # ------------------------------------------------------------------------ 7 | # Modified from DETR (https://github.com/facebookresearch/detr) 8 | # Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved 9 | # ------------------------------------------------------------------------ 10 | 11 | 12 | import torch 13 | from functools import partial 14 | from models.structures import Instances 15 | 16 | def to_cuda(samples, targets, device): 17 | samples = samples.to(device, non_blocking=True) 18 | targets = [{k: v.to(device, non_blocking=True) for k, v in t.items()} for t in targets] 19 | return samples, targets 20 | 21 | 22 | def tensor_to_cuda(tensor: torch.Tensor, device): 23 | return tensor.to(device) 24 | 25 | 26 | def is_tensor_or_instances(data): 27 | return isinstance(data, torch.Tensor) or isinstance(data, Instances) 28 | 29 | 30 | def data_apply(data, check_func, apply_func): 31 | if isinstance(data, dict): 32 | for k in data.keys(): 33 | if check_func(data[k]): 34 | data[k] = apply_func(data[k]) 35 | elif isinstance(data[k], dict) or isinstance(data[k], list): 36 | data_apply(data[k], check_func, apply_func) 37 | else: 38 | raise ValueError() 39 | elif isinstance(data, list): 40 | for i in range(len(data)): 41 | if check_func(data[i]): 42 | data[i] = apply_func(data[i]) 43 | elif isinstance(data[i], dict) or isinstance(data[i], list): 44 | data_apply(data[i], check_func, apply_func) 45 | else: 46 | raise ValueError("invalid type {}".format(type(data[i]))) 47 | else: 48 | raise ValueError("invalid type {}".format(type(data))) 49 | return data 50 | 51 | 52 | def data_dict_to_cuda(data_dict, device): 53 | return data_apply(data_dict, is_tensor_or_instances, partial(tensor_to_cuda, device=device)) 54 | 55 | 56 | class data_prefetcher(): 57 | def __init__(self, loader, device, prefetch=True): 58 | self.loader = iter(loader) 59 | self.prefetch = prefetch 60 | self.device = device 61 | if prefetch: 62 | self.stream = torch.cuda.Stream() 63 | self.preload() 64 | 65 | def preload(self): 66 | try: 67 | self.next_samples, self.next_targets = next(self.loader) 68 | except StopIteration: 69 | self.next_samples = None 70 | self.next_targets = None 71 | return 72 | # if record_stream() doesn't work, another option is to make sure device inputs are created 73 | # on the main stream. 74 | # self.next_input_gpu = torch.empty_like(self.next_input, device='cuda') 75 | # self.next_target_gpu = torch.empty_like(self.next_target, device='cuda') 76 | # Need to make sure the memory allocated for next_* is not still in use by the main stream 77 | # at the time we start copying to next_*: 78 | # self.stream.wait_stream(torch.cuda.current_stream()) 79 | with torch.cuda.stream(self.stream): 80 | self.next_samples, self.next_targets = to_cuda(self.next_samples, self.next_targets, self.device) 81 | # more code for the alternative if record_stream() doesn't work: 82 | # copy_ will record the use of the pinned source tensor in this side stream. 83 | # self.next_input_gpu.copy_(self.next_input, non_blocking=True) 84 | # self.next_target_gpu.copy_(self.next_target, non_blocking=True) 85 | # self.next_input = self.next_input_gpu 86 | # self.next_target = self.next_target_gpu 87 | 88 | # With Amp, it isn't necessary to manually convert data to half. 
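        # Usage sketch (illustration only, not part of this file): the prefetcher overlaps the
        # host-to-device copy with compute on a side CUDA stream, so a training loop can be
        # driven roughly as
        #   prefetcher = data_prefetcher(data_loader, device, prefetch=True)
        #   samples, targets = prefetcher.next()
        #   while samples is not None:
        #       ...  # forward/backward on (samples, targets)
        #       samples, targets = prefetcher.next()
        # where data_loader is any torch DataLoader yielding (samples, targets) pairs.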
89 | # if args.fp16: 90 | # self.next_input = self.next_input.half() 91 | # else: 92 | 93 | def next(self): 94 | if self.prefetch: 95 | torch.cuda.current_stream().wait_stream(self.stream) 96 | samples = self.next_samples 97 | targets = self.next_targets 98 | if samples is not None: 99 | samples.record_stream(torch.cuda.current_stream()) 100 | if targets is not None: 101 | for t in targets: 102 | for k, v in t.items(): 103 | v.record_stream(torch.cuda.current_stream()) 104 | self.preload() 105 | else: 106 | try: 107 | samples, targets = next(self.loader) 108 | samples, targets = to_cuda(samples, targets, self.device) 109 | except StopIteration: 110 | print("catch_stop_iter") 111 | samples = None 112 | targets = None 113 | 114 | return samples, targets 115 | -------------------------------------------------------------------------------- /util/tool.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Copyright (c) 2022 megvii-research. All Rights Reserved. 3 | # ------------------------------------------------------------------------ 4 | # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR) 5 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 6 | # ------------------------------------------------------------------------ 7 | # Modified from DETR (https://github.com/facebookresearch/detr) 8 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 9 | # ------------------------------------------------------------------------ 10 | 11 | import torch 12 | import copy 13 | import numpy as np 14 | import collections 15 | 16 | def load_model(model, model_path, optimizer=None, resume=False, 17 | lr=None, lr_step=None): 18 | start_epoch = 0 19 | checkpoint = torch.load(model_path, map_location=lambda storage, loc: storage) 20 | print(f'loaded {model_path}') 21 | state_dict = checkpoint['model'] 22 | model_state_dict = model.state_dict() 23 | 24 | # check loaded parameters and created model parameters 25 | msg = 'If you see this, your model does not fully load the ' + \ 26 | 'pre-trained weight. Please make sure ' + \ 27 | 'you set the correct --num_classes for your own dataset.' 28 | state_dict_old = copy.deepcopy(state_dict) 29 | for k in state_dict_old: 30 | if k in model_state_dict: 31 | if state_dict[k].shape != model_state_dict[k].shape: 32 | print('Skip loading parameter {}, required shape{}, ' \ 33 | 'loaded shape{}. 
{}'.format( 34 | k, model_state_dict[k].shape, state_dict[k].shape, msg)) 35 | if 'class_embed' in k: 36 | print("load class_embed: {} shape={}".format(k, state_dict[k].shape)) 37 | if model_state_dict[k].shape[0] == 1: 38 | state_dict[k] = state_dict[k][1:2] 39 | elif model_state_dict[k].shape[0] == 2: 40 | state_dict[k] = state_dict[k][1:3] 41 | elif model_state_dict[k].shape[0] == 3: 42 | state_dict[k] = state_dict[k][1:4] 43 | elif model_state_dict[k].shape[0] == 11: 44 | state_dict[k] = state_dict[k][1:12] 45 | elif model_state_dict[k].shape[0] == 100: 46 | state_dict[k] = state_dict[k].repeat_interleave(model_state_dict[k].shape[0]//state_dict[k].shape[0]+1, dim=0)[:model_state_dict[k].shape[0]] 47 | elif model_state_dict[k].shape[0] == 91 and state_dict[k].shape[0] == 1: 48 | state_dict[k] = state_dict[k].repeat_interleave(91, dim=0) 49 | elif model_state_dict[k].shape[0] == 2000: 50 | state_dict[k] = state_dict[k].repeat_interleave(model_state_dict[k].shape[0]//state_dict[k].shape[0]+1, dim=0)[:model_state_dict[k].shape[0]] 51 | else: 52 | raise NotImplementedError('invalid shape: {}'.format(model_state_dict[k].shape)) 53 | continue 54 | state_dict[k] = model_state_dict[k] 55 | elif k.replace('in_proj_weight', 'in_proj.weight') in model_state_dict: 56 | k_dst = k.replace('in_proj_weight', 'in_proj.weight') 57 | print('{}->{}'.format(k, k_dst)) 58 | state_dict = collections.OrderedDict([(k_dst, v) if k_ == k else (k_, v) for k_, v in state_dict.items()]) 59 | elif k.replace('in_proj_bias', 'in_proj.bias') in model_state_dict: 60 | k_dst = k.replace('in_proj_bias', 'in_proj.bias') 61 | print('{}->{}'.format(k, k_dst)) 62 | state_dict = collections.OrderedDict([(k_dst, v) if k_ == k else (k_, v) for k_, v in state_dict.items()]) 63 | elif 'transformer.decoder.layers' in k and 'self_attn.in_proj' in k: 64 | k_dst_q = k.replace('in_proj_', 'in_proj_q.') 65 | k_dst_k = k.replace('in_proj_', 'in_proj_k.') 66 | k_dst_v = k.replace('in_proj_', 'in_proj_v.') 67 | print('{}->({},{},{})'.format(k, k_dst_q, k_dst_k, k_dst_v)) 68 | state_dict[k_dst_q], state_dict[k_dst_k], state_dict[k_dst_v] = torch.chunk(state_dict[k], 3, dim=0) 69 | else: 70 | print('Drop parameter {}.'.format(k) + msg) 71 | for k in model_state_dict: 72 | if not (k in state_dict): # pretrain model 73 | if 'decoder_two' in k: 74 | state_dict[k] = state_dict[k.replace('.decoder_two.', '.decoder.')] 75 | elif '_embed_two' in k: 76 | state_dict[k] = state_dict[k.replace('_embed_two.', '_embed.')] 77 | else: 78 | print('No param {}.'.format(k) + msg) 79 | state_dict[k] = model_state_dict[k] 80 | model.load_state_dict(state_dict, strict=False) 81 | 82 | # resume optimizer parameters 83 | if optimizer is not None and resume: 84 | if 'optimizer' in checkpoint: 85 | optimizer.load_state_dict(checkpoint['optimizer']) 86 | start_epoch = checkpoint['epoch'] 87 | start_lr = lr 88 | for step in lr_step: 89 | if start_epoch >= step: 90 | start_lr *= 0.1 91 | for param_group in optimizer.param_groups: 92 | param_group['lr'] = start_lr 93 | print('Resumed optimizer with start lr', start_lr) 94 | else: 95 | print('No optimizer parameters in checkpoint.') 96 | if optimizer is not None: 97 | return model, optimizer, start_epoch 98 | else: 99 | return model 100 | 101 | 102 | 103 | -------------------------------------------------------------------------------- /tools/similarity_analysis.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | from collections import defaultdict 
4 | from sklearn.decomposition import PCA 5 | 6 | 7 | # Compute the IoU between two sets of boxes 8 | def bboxes_iou(bboxes1,bboxes2): 9 | bboxes1 = np.transpose(bboxes1) 10 | bboxes2 = np.transpose(bboxes2) 11 | 12 | # Intersection of the two boxes: the top-left corner takes the max of the two boxes, the bottom-right corner takes the min 13 | int_ymin = np.maximum(bboxes1[0][:, None], bboxes2[0]) 14 | int_xmin = np.maximum(bboxes1[1][:, None], bboxes2[1]) 15 | int_ymax = np.minimum(bboxes1[2][:, None], bboxes2[2]) 16 | int_xmax = np.minimum(bboxes1[3][:, None], bboxes2[3]) 17 | 18 | # Width/height of the intersection: if the two boxes do not overlap, w/h is 0 (the raw values are negative, so take the max against 0) 19 | int_h = np.maximum(int_ymax-int_ymin,0.) 20 | int_w = np.maximum(int_xmax-int_xmin,0.) 21 | 22 | # Compute IoU 23 | int_vol = int_h * int_w # intersection area 24 | vol1 = (bboxes1[2] - bboxes1[0]) * (bboxes1[3] - bboxes1[1]) # area of bboxes1 25 | vol2 = (bboxes2[2] - bboxes2[0]) * (bboxes2[3] - bboxes2[1]) # area of bboxes2 26 | IOU = int_vol / (vol1[:, None] + vol2 - int_vol) # IoU = intersection / union 27 | return IOU 28 | 29 | 30 | 31 | root_data = 'tmp' 32 | 33 | # det2trk_weight = defaultdict(list) 34 | # trk2trk_weight = defaultdict(list) 35 | # detall2trk_weight = defaultdict(list) 36 | # for i in range(703): 37 | # print(i) 38 | # for j in range(6): 39 | 40 | # bboxes = np.load(os.path.join(root_data, 'box_%08d_%d.txt.npy'%(i,j)))[0] 41 | # classes = np.load(os.path.join(root_data, 'class_%08d_%d.txt.npy'%(i,j)))[0, :, 0] 42 | # weights = np.load(os.path.join(root_data, 'weight_%08d_%d.txt.npy'%(i,j))) 43 | 44 | # bboxes[:, [0,1]] -= bboxes[:, [2,3]]/2 45 | # bboxes[:, [2,3]] += bboxes[:, [0,1]] 46 | 47 | # indexes = np.where(classes>0)[0] 48 | 49 | # det_indexes = indexes[indexes<60] 50 | # trk_indexes = indexes[indexes>=60] 51 | 52 | # iou = bboxes_iou(bboxes[trk_indexes], bboxes[det_indexes]) 53 | # if len(trk_indexes) and len(det_indexes): 54 | # pair_idx = iou.argmax(-1) 55 | # pair_val = iou.max(-1) 56 | # pair_trk_idx = trk_indexes[pair_val>0.7] 57 | # pair_det_idx = det_indexes[pair_idx[pair_val>0.7]] 58 | # if len(pair_trk_idx) and len(pair_det_idx): 59 | # if weights[pair_trk_idx, pair_det_idx].mean() < 1: 60 | # det2trk_weight[j].append(weights[pair_trk_idx, pair_det_idx].mean()) 61 | # else: 62 | # print("1") 63 | # if weights[pair_trk_idx, pair_trk_idx].mean() < 1: 64 | # trk2trk_weight[j].append(weights[pair_trk_idx, pair_trk_idx].mean()) 65 | # else: 66 | # print("1") 67 | # if weights[pair_trk_idx, :60].sum(-1).mean() < 1: 68 | # detall2trk_weight[j].append(weights[pair_trk_idx, :60].sum(-1).mean()) 69 | # else: 70 | # print("1") 71 | 72 | # print(np.array(det2trk_weight[0]).mean(), np.array(det2trk_weight[1]).mean(), np.array(det2trk_weight[2]).mean(), np.array(det2trk_weight[3]).mean(), np.array(det2trk_weight[4]).mean(), np.array(det2trk_weight[5]).mean()) 73 | # print(np.array(trk2trk_weight[0]).mean(), np.array(trk2trk_weight[1]).mean(), np.array(trk2trk_weight[2]).mean(), np.array(trk2trk_weight[3]).mean(), np.array(trk2trk_weight[4]).mean(), np.array(trk2trk_weight[5]).mean()) 74 | # print(np.array(detall2trk_weight[0]).mean(), np.array(detall2trk_weight[1]).mean(), np.array(detall2trk_weight[2]).mean(), np.array(detall2trk_weight[3]).mean(), np.array(detall2trk_weight[4]).mean(), np.array(detall2trk_weight[5]).mean()) 75 | 76 | hs_all = defaultdict(list) 77 | hs_all_flatten = [] 78 | for i in range(703): 79 | scores = np.load(os.path.join(root_data, 'class_%08d_%d.txt.npy'%(i,5)))[0, :, 0] 80 | ids = np.load(os.path.join(root_data, 'ids_%08d.txt.npy'%(i)))[scores>0] 81 | hs = np.load(os.path.join(root_data, 'hs_%08d.txt.npy'%(i)))[scores>0] 82 | 83 | for id, h in
zip(ids, hs): 84 | hs_all[id].append(h) 85 | hs_all_flatten.append(h) 86 | 87 | pca = PCA(n_components=2) 88 | # newX = pca.fit_transform(X) 89 | pca.fit(hs_all_flatten) 90 | pca.transform(hs_all_flatten) 91 | 92 | 93 | 94 | 95 | 96 | stat_scores_det = defaultdict(lambda: defaultdict(int)) 97 | for line in np.loadtxt('tmp_det.txt'): 98 | stat_scores_det[int(line[0])][int(line[1])] = line[2] 99 | stat_scores_trk = defaultdict(lambda: defaultdict(int)) 100 | for line in np.loadtxt('tmp_trk.txt'): 101 | stat_scores_trk[int(line[0])][int(line[1])] = line[2] 102 | stat_scores_uni_det = defaultdict(lambda: defaultdict(int)) 103 | for line in np.loadtxt('tmp_uni_det.txt'): 104 | stat_scores_uni_det[int(line[0])][int(line[1])] = line[2] 105 | stat_scores_uni_trk = defaultdict(lambda: defaultdict(int)) 106 | for line in np.loadtxt('tmp_uni_trk.txt'): 107 | stat_scores_uni_trk[int(line[0])][int(line[1])] = line[2] 108 | 109 | 110 | count_bin_all = defaultdict(list) 111 | count_bin = defaultdict(int) 112 | for framid in stat_scores_trk: 113 | for obj_id in stat_scores_trk[framid]: 114 | if framid in stat_scores_uni_trk and obj_id in stat_scores_uni_trk[framid]: 115 | count_bin_all[int(stat_scores_trk[framid][obj_id]*10)].append(stat_scores_uni_trk[framid][obj_id]-stat_scores_trk[framid][obj_id]) 116 | if stat_scores_trk[framid][obj_id] > stat_scores_uni_trk[framid][obj_id]: 117 | count_bin[int(stat_scores_trk[framid][obj_id]*10)] -= 1 118 | else: 119 | count_bin[int(stat_scores_trk[framid][obj_id]*10)] += 1 120 | for i in range(10): 121 | print(np.array(count_bin_all[i]).mean(), np.array(count_bin_all[i]).std()) 122 | 123 | 124 | with open('tmp.txt', 'w') as fp: 125 | for framid in stat_scores_trk: 126 | for obj_id in stat_scores_trk[framid]: 127 | if framid in stat_scores_uni_trk and obj_id in stat_scores_uni_trk[framid]: 128 | # print(stat_scores_trk[framid][obj_id], stat_scores_uni_trk[framid][obj_id]) 129 | fp.write('%f %f\n'%(stat_scores_trk[framid][obj_id], stat_scores_uni_trk[framid][obj_id])) 130 | -------------------------------------------------------------------------------- /tools/visualize_tao.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Copyright (c) 2022 megvii-research. All Rights Reserved. 3 | # ------------------------------------------------------------------------ 4 | 5 | 6 | from collections import defaultdict 7 | from glob import glob 8 | import json 9 | import os 10 | import cv2 11 | import numpy as np 12 | import subprocess 13 | import random 14 | from tqdm import tqdm 15 | from PIL import Image, ImageDraw 16 | 17 | from scipy.optimize import linear_sum_assignment as linear_assignment 18 | 19 | # Compute the IoU between two sets of boxes 20 | def bboxes_iou(bboxes1,bboxes2): 21 | bboxes1 = np.transpose(bboxes1) 22 | bboxes2 = np.transpose(bboxes2) 23 | 24 | # Intersection of the two boxes: the top-left corner takes the max of the two boxes, the bottom-right corner takes the min 25 | int_ymin = np.maximum(bboxes1[0][:, None], bboxes2[0]) 26 | int_xmin = np.maximum(bboxes1[1][:, None], bboxes2[1]) 27 | int_ymax = np.minimum(bboxes1[2][:, None], bboxes2[2]) 28 | int_xmax = np.minimum(bboxes1[3][:, None], bboxes2[3]) 29 | 30 | # Width/height of the intersection: if the two boxes do not overlap, w/h is 0 (the raw values are negative, so take the max against 0) 31 | int_h = np.maximum(int_ymax-int_ymin,0.) 32 | int_w = np.maximum(int_xmax-int_xmin,0.)
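    # Worked example (for illustration): with a = [0, 0, 10, 10] and b = [5, 5, 15, 15]
    # the intersection corners above become [5, 5, 10, 10], so int_h = int_w = 5 and the
    # intersection area is 25; each box covers 100, giving IoU = 25 / (100 + 100 - 25) ≈ 0.14.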
33 |  34 | # Compute IoU 35 | int_vol = int_h * int_w # intersection area 36 | vol1 = (bboxes1[2] - bboxes1[0]) * (bboxes1[3] - bboxes1[1]) # area of bboxes1 37 | vol2 = (bboxes2[2] - bboxes2[0]) * (bboxes2[3] - bboxes2[1]) # area of bboxes2 38 | IOU = int_vol / (vol1[:, None] + vol2 - int_vol) # IoU = intersection / union 39 | return IOU 40 | 41 | def get_color(i): 42 | return [(i * 23 * j + 43) % 255 for j in range(3)] 43 | 44 | 45 | def show_gt(img_list, output="output.mp4"): 46 | h, w, _ = cv2.imread(img_list[0]).shape 47 | command = [ 48 | "anaconda3/envs/detrex/bin/ffmpeg", 49 | '-y', # overwrite output file if it exists 50 | '-f', 'rawvideo', 51 | '-vcodec','rawvideo', 52 | '-s', f'{w}x{h}', # size of one frame 53 | '-pix_fmt', 'bgr24', 54 | '-r', '20', # frames per second 55 | '-i', '-', # The input comes from a pipe 56 | '-s', f'{w//2*2}x{h//2*2}', 57 | '-an', # Tells FFMPEG not to expect any audio 58 | '-loglevel', 'error', 59 | # '-crf', '26', 60 | '-b:v', '0', 61 | '-pix_fmt', 'yuv420p' 62 | ] 63 | # writing_process = subprocess.Popen(command + [output], stdin=subprocess.PIPE) 64 | fps = 16 65 | size = (w,h) 66 | videowriter = cv2.VideoWriter(output,cv2.VideoWriter_fourcc('M','J','P','G'), fps, size) 67 | 68 | 69 | for i, path in enumerate(tqdm(sorted(img_list))): 70 | im = cv2.imread(path) 71 | det_bboxes = [] 72 | motr_bboxes = [] 73 | for det in det_db[path.replace('data/', '').replace('.jpg', '.txt').replace('dancetrack/', 'DanceTrack/')]: 74 | x1, y1, w, h, s = map(float, det.strip().split(',')) 75 | x1, y1, w, h = map(int, [x1, y1, w, h]) 76 | im = cv2.rectangle(im, (x1, y1), (x1+w, y1+h), (255, 255, 255), 2) 77 | im = cv2.putText(im, '%0.2f'%s, (x1, y1-5), cv2.FONT_HERSHEY_SIMPLEX, 0.3, (255, 255, 255), 1) 78 | det_bboxes.append([x1, y1, x1+w, y1+h]) 79 | 80 | det_bboxes = np.array(det_bboxes) 81 | motr_bboxes = np.array(motr_bboxes) 82 | ious = bboxes_iou(det_bboxes, motr_bboxes) 83 | matching = linear_assignment(-ious) 84 | matched = sum(ious[matching[0], matching[1]] > 0.5) 85 | im = cv2.putText(im, f"{matched}/{len(det_bboxes)}/{len(motr_bboxes)}", (10, 50), cv2.FONT_HERSHEY_SIMPLEX, 2, get_color(i), 3) 86 | cv2.putText(im, "{}".format(os.path.basename(path)[:-4]), (120,120), cv2.FONT_HERSHEY_SIMPLEX, 2, (255,255,255), 6) 87 | # writing_process.stdin.write(im.tobytes()) 88 | videowriter.write(im) 89 | 90 | videowriter.release() 91 | 92 | 93 | if __name__ == '__main__': 94 | 95 | labels_full = defaultdict(lambda : defaultdict(list)) 96 | imgid2name = defaultdict() 97 | def _add_mot_folder(mot_path, split_dir): 98 | print("Adding", split_dir) 99 | labels = json.load(open(os.path.join(mot_path, split_dir))) 100 | for ann in labels['images']: 101 | imgid2name[ann['id']] = ann['file_name'] 102 | for ann in labels['annotations']: 103 | vid = ann['video_id'] 104 | t = ann['image_id'] 105 | x, y, w, h = ann['bbox'] 106 | i = ann['track_id'] 107 | crowd = ann['iscrowd'] 108 | cl = ann['category_id'] 109 | labels_full[vid][t].append([x, y, w, h, i, crowd, cl]) 110 | return labels_full, imgid2name 111 | 112 | mot_path = 'data/' 113 | labels_full, imgid2name = _add_mot_folder(mot_path, 'tao/annotations/train.json') 114 | indices = [] 115 | vid_files = list(labels_full.keys()) 116 | for vid in vid_files: 117 | t_min = min(labels_full[vid].keys()) 118 | t_max = max(labels_full[vid].keys()) + 1 119 | for t in range(t_min, t_max): 120 | indices.append((vid, t)) 121 | 122 | vid_old = None 123 | random.shuffle(vid_files) 124 | videowriter = None 125 | for vid in vid_files: 126 | print(vid) 127 | t_min = min(labels_full[vid].keys())
128 | t_max = max(labels_full[vid].keys()) + 1 129 | for idx in range(t_min, t_max): 130 | # vid, idx = indices[idx] 131 | img_path = os.path.join(mot_path, 'tao/frames', imgid2name[idx]) 132 | img = Image.open(img_path) 133 | if vid != vid_old: 134 | vid_old = vid 135 | w, h = img._size 136 | fps = 1 137 | size = (w,h) 138 | if videowriter is not None: 139 | videowriter.release() 140 | videowriter = cv2.VideoWriter('tmp/'+imgid2name[idx].split('/')[-2]+'.avi',cv2.VideoWriter_fourcc('M','J','P','G'), fps, size) 141 | im = np.array(img) 142 | for *xywh, id, crowd, cl in labels_full[vid][idx]: 143 | x1, y1, w, h = xywh 144 | x1, y1, w, h = map(int, [x1, y1, w, h]) 145 | im = cv2.rectangle(im, (x1, y1), (x1+w, y1+h), (255, 255, 255), 2) 146 | im = cv2.putText(im, '%d'%id, (x1, y1-5), cv2.FONT_HERSHEY_SIMPLEX, 0.3, (255, 255, 255), 1) 147 | videowriter.write(im) 148 | 149 | videowriter.release() 150 | -------------------------------------------------------------------------------- /models/memory_bank.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Copyright (c) 2021 megvii-model. All Rights Reserved. 3 | # ------------------------------------------------------------------------ 4 | 5 | import torch 6 | import torch.nn.functional as F 7 | from torch import nn, Tensor 8 | 9 | from typing import List 10 | 11 | from models.structures import Instances 12 | 13 | 14 | class MemoryBank(nn.Module): 15 | def __init__(self, args, dim_in, hidden_dim, dim_out): 16 | super().__init__() 17 | self._build_layers(args, dim_in, hidden_dim, dim_out) 18 | for p in self.parameters(): 19 | if p.dim() > 1: 20 | nn.init.xavier_uniform_(p) 21 | 22 | def _build_layers(self, args, dim_in, hidden_dim, dim_out): 23 | self.save_thresh = args.memory_bank_score_thresh 24 | self.save_period = 3 25 | self.max_his_length = args.memory_bank_len 26 | 27 | self.save_proj = nn.Linear(dim_in, dim_in) 28 | 29 | self.temporal_attn = nn.MultiheadAttention(dim_in, 8, dropout=0) 30 | self.temporal_fc1 = nn.Linear(dim_in, hidden_dim) 31 | self.temporal_fc2 = nn.Linear(hidden_dim, dim_in) 32 | self.temporal_norm1 = nn.LayerNorm(dim_in) 33 | self.temporal_norm2 = nn.LayerNorm(dim_in) 34 | 35 | self.track_cls = nn.Linear(dim_in, 1) 36 | 37 | self.self_attn = None 38 | if args.memory_bank_with_self_attn: 39 | self.spatial_attn = nn.MultiheadAttention(dim_in, 8, dropout=0) 40 | self.spatial_fc1 = nn.Linear(dim_in, hidden_dim) 41 | self.spatial_fc2 = nn.Linear(hidden_dim, dim_in) 42 | self.spatial_norm1 = nn.LayerNorm(dim_in) 43 | self.spatial_norm2 = nn.LayerNorm(dim_in) 44 | else: 45 | self.spatial_attn = None 46 | 47 | def update(self, track_instances): 48 | embed = track_instances.output_embedding[:, None] #( N, 1, 256) 49 | scores = track_instances.scores 50 | mem_padding_mask = track_instances.mem_padding_mask 51 | device = embed.device 52 | 53 | save_period = track_instances.save_period 54 | if self.training: 55 | saved_idxes = scores > 0 56 | else: 57 | saved_idxes = (save_period == 0) & (scores > self.save_thresh) 58 | # saved_idxes = (save_period == 0) 59 | save_period[save_period > 0] -= 1 60 | save_period[saved_idxes] = self.save_period 61 | 62 | saved_embed = embed[saved_idxes] 63 | if len(saved_embed) > 0: 64 | prev_embed = track_instances.mem_bank[saved_idxes] 65 | save_embed = self.save_proj(saved_embed) 66 | mem_padding_mask[saved_idxes] = torch.cat([mem_padding_mask[saved_idxes, 1:], 
torch.zeros((len(saved_embed), 1), dtype=torch.bool, device=device)], dim=1) 67 | track_instances.mem_bank = track_instances.mem_bank.clone() 68 | track_instances.mem_bank[saved_idxes] = torch.cat([prev_embed[:, 1:], save_embed], dim=1) 69 | 70 | def _forward_spatial_attn(self, track_instances): 71 | if len(track_instances) == 0: 72 | return track_instances 73 | 74 | embed = track_instances.output_embedding 75 | dim = embed.shape[-1] 76 | query_pos = track_instances.query_pos[:, :dim] # 应该为query_pos = pos2posemb(track_instances.ref_pts) 77 | k = q = (embed + query_pos) 78 | v = embed 79 | embed2 = self.spatial_attn( 80 | q[:, None], 81 | k[:, None], 82 | v[:, None] 83 | )[0][:, 0] 84 | embed = self.spatial_norm1(embed + embed2) 85 | embed2 = self.spatial_fc2(F.relu(self.spatial_fc1(embed))) 86 | embed = self.spatial_norm2(embed + embed2) 87 | track_instances.output_embedding = embed 88 | return track_instances 89 | 90 | def _forward_track_cls(self, track_instances): 91 | track_instances.track_scores = self.track_cls(track_instances.output_embedding)[..., 0] 92 | return track_instances 93 | 94 | def _forward_temporal_attn(self, track_instances): 95 | if len(track_instances) == 0: 96 | return track_instances 97 | 98 | dim = track_instances.query_pos.shape[1] 99 | key_padding_mask = track_instances.mem_padding_mask 100 | 101 | valid_idxes = key_padding_mask[:, -1] == 0 102 | embed = track_instances.output_embedding[valid_idxes] # (n, 256) 103 | 104 | if len(embed) > 0: 105 | prev_embed = track_instances.mem_bank[valid_idxes] 106 | key_padding_mask = key_padding_mask[valid_idxes] 107 | embed2 = self.temporal_attn( 108 | embed[None], # (num_track, dim) to (1, num_track, dim) 109 | prev_embed.transpose(0, 1), # (num_track, mem_len, dim) to (mem_len, num_track, dim) 110 | prev_embed.transpose(0, 1), 111 | key_padding_mask=key_padding_mask, 112 | )[0][0] 113 | 114 | embed = self.temporal_norm1(embed + embed2) 115 | embed2 = self.temporal_fc2(F.relu(self.temporal_fc1(embed))) 116 | embed = self.temporal_norm2(embed + embed2) 117 | track_instances.output_embedding = track_instances.output_embedding.clone() 118 | track_instances.output_embedding[valid_idxes] = embed 119 | 120 | return track_instances 121 | 122 | def forward_temporal_attn(self, track_instances): 123 | return self._forward_temporal_attn(track_instances) 124 | 125 | def forward(self, track_instances: Instances, update_bank=True) -> Instances: 126 | track_instances = self._forward_temporal_attn(track_instances) 127 | if update_bank: 128 | self.update(track_instances) 129 | if self.spatial_attn is not None: 130 | track_instances = self._forward_spatial_attn(track_instances) 131 | if self.track_cls is not None: 132 | track_instances = self._forward_track_cls(track_instances) 133 | return track_instances 134 | 135 | 136 | def build_memory_bank(args, dim_in, hidden_dim, dim_out): 137 | name = args.memory_bank_type 138 | memory_banks = { 139 | 'MemoryBank': MemoryBank, 140 | } 141 | assert name in memory_banks 142 | return memory_banks[name](args, dim_in, hidden_dim, dim_out) 143 | -------------------------------------------------------------------------------- /datasets/samplers.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Copyright (c) 2022 megvii-research. All Rights Reserved. 
3 | # ------------------------------------------------------------------------ 4 | # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR) 5 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 6 | # ------------------------------------------------------------------------ 7 | # Modified from DETR (https://github.com/facebookresearch/detr) 8 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 9 | # ------------------------------------------------------------------------ 10 | 11 | 12 | import os 13 | import math 14 | import torch 15 | import torch.distributed as dist 16 | from torch.utils.data.sampler import Sampler 17 | 18 | 19 | class DistributedSampler(Sampler): 20 | """Sampler that restricts data loading to a subset of the dataset. 21 | It is especially useful in conjunction with 22 | :class:`torch.nn.parallel.DistributedDataParallel`. In such case, each 23 | process can pass a DistributedSampler instance as a DataLoader sampler, 24 | and load a subset of the original dataset that is exclusive to it. 25 | .. note:: 26 | Dataset is assumed to be of constant size. 27 | Arguments: 28 | dataset: Dataset used for sampling. 29 | num_replicas (optional): Number of processes participating in 30 | distributed training. 31 | rank (optional): Rank of the current process within num_replicas. 32 | """ 33 | 34 | def __init__(self, dataset, num_replicas=None, rank=None, local_rank=None, local_size=None, shuffle=True): 35 | if num_replicas is None: 36 | if not dist.is_available(): 37 | raise RuntimeError("Requires distributed package to be available") 38 | num_replicas = dist.get_world_size() 39 | if rank is None: 40 | if not dist.is_available(): 41 | raise RuntimeError("Requires distributed package to be available") 42 | rank = dist.get_rank() 43 | self.dataset = dataset 44 | self.num_replicas = num_replicas 45 | self.rank = rank 46 | self.epoch = 0 47 | self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.num_replicas)) 48 | self.total_size = self.num_samples * self.num_replicas 49 | self.shuffle = shuffle 50 | 51 | def __iter__(self): 52 | if self.shuffle: 53 | # deterministically shuffle based on epoch 54 | g = torch.Generator() 55 | g.manual_seed(self.epoch) 56 | indices = torch.randperm(len(self.dataset), generator=g).tolist() 57 | else: 58 | indices = torch.arange(len(self.dataset)).tolist() 59 | 60 | # add extra samples to make it evenly divisible 61 | if len(indices) * 2 < self.total_size: 62 | tmp = indices * self.total_size 63 | indices += tmp[: (self.total_size - len(indices))] 64 | else: 65 | indices += indices[: (self.total_size - len(indices))] 66 | assert len(indices) == self.total_size 67 | 68 | # subsample 69 | offset = self.num_samples * self.rank 70 | indices = indices[offset : offset + self.num_samples] 71 | assert len(indices) == self.num_samples 72 | 73 | return iter(indices) 74 | 75 | def __len__(self): 76 | return self.num_samples 77 | 78 | def set_epoch(self, epoch): 79 | self.epoch = epoch 80 | 81 | 82 | class NodeDistributedSampler(Sampler): 83 | """Sampler that restricts data loading to a subset of the dataset. 84 | It is especially useful in conjunction with 85 | :class:`torch.nn.parallel.DistributedDataParallel`. In such case, each 86 | process can pass a DistributedSampler instance as a DataLoader sampler, 87 | and load a subset of the original dataset that is exclusive to it. 88 | .. note:: 89 | Dataset is assumed to be of constant size. 90 | Arguments: 91 | dataset: Dataset used for sampling. 
92 | num_replicas (optional): Number of processes participating in 93 | distributed training. 94 | rank (optional): Rank of the current process within num_replicas. 95 | """ 96 | 97 | def __init__(self, dataset, num_replicas=None, rank=None, local_rank=None, local_size=None, shuffle=True): 98 | if num_replicas is None: 99 | if not dist.is_available(): 100 | raise RuntimeError("Requires distributed package to be available") 101 | num_replicas = dist.get_world_size() 102 | if rank is None: 103 | if not dist.is_available(): 104 | raise RuntimeError("Requires distributed package to be available") 105 | rank = dist.get_rank() 106 | if local_rank is None: 107 | local_rank = int(os.environ.get('LOCAL_RANK', 0)) 108 | if local_size is None: 109 | local_size = int(os.environ.get('LOCAL_SIZE', 1)) 110 | self.dataset = dataset 111 | self.shuffle = shuffle 112 | self.num_replicas = num_replicas 113 | self.num_parts = local_size 114 | self.rank = rank 115 | self.local_rank = local_rank 116 | self.epoch = 0 117 | self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.num_replicas)) 118 | self.total_size = self.num_samples * self.num_replicas 119 | 120 | self.total_size_parts = self.num_samples * self.num_replicas // self.num_parts 121 | 122 | def __iter__(self): 123 | if self.shuffle: 124 | # deterministically shuffle based on epoch 125 | g = torch.Generator() 126 | g.manual_seed(self.epoch) 127 | indices = torch.randperm(len(self.dataset), generator=g).tolist() 128 | else: 129 | indices = torch.arange(len(self.dataset)).tolist() 130 | indices = [i for i in indices if i % self.num_parts == self.local_rank] 131 | 132 | # add extra samples to make it evenly divisible 133 | indices += indices[:(self.total_size_parts - len(indices))] 134 | assert len(indices) == self.total_size_parts 135 | 136 | # subsample 137 | indices = indices[self.rank // self.num_parts:self.total_size_parts:self.num_replicas // self.num_parts] 138 | assert len(indices) == self.num_samples 139 | 140 | return iter(indices) 141 | 142 | def __len__(self): 143 | return self.num_samples 144 | 145 | def set_epoch(self, epoch): 146 | self.epoch = epoch 147 | -------------------------------------------------------------------------------- /models/yolox.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # Copyright (c) 2014-2021 Megvii Inc. All rights reserved. 
4 | 5 | import torch 6 | from torch import nn, Tensor 7 | from typing import List 8 | 9 | from .yolo_head import YOLOXHead 10 | from .yolo_pafpn import YOLOPAFPN 11 | from util.misc import (NestedTensor, nested_tensor_from_tensor_list, 12 | accuracy, get_world_size, interpolate, get_rank, 13 | is_dist_avail_and_initialized, inverse_sigmoid) 14 | 15 | 16 | def _max_by_axis(the_list): 17 | # type: (List[List[int]]) -> List[int] 18 | maxes = the_list[0] 19 | for sublist in the_list[1:]: 20 | for index, item in enumerate(sublist): 21 | maxes[index] = max(maxes[index], item) 22 | return maxes 23 | 24 | def nested_tensor_from_tensor_list(tensor_list: List[Tensor], size_divisibility: int = 0): 25 | # TODO make this more general 26 | if tensor_list[0].ndim == 3: 27 | # TODO make it support different-sized images 28 | 29 | max_size = _max_by_axis([list(img.shape) for img in tensor_list]) 30 | if size_divisibility > 0: 31 | stride = size_divisibility 32 | # the last two dims are H,W, both subject to divisibility requirement 33 | max_size[-1] = (max_size[-1] + (stride - 1)) // stride * stride 34 | max_size[-2] = (max_size[-2] + (stride - 1)) // stride * stride 35 | 36 | # min_size = tuple(min(s) for s in zip(*[img.shape for img in tensor_list])) 37 | batch_shape = [len(tensor_list)] + max_size 38 | b, c, h, w = batch_shape 39 | dtype = tensor_list[0].dtype 40 | device = tensor_list[0].device 41 | tensor = torch.zeros(batch_shape, dtype=dtype, device=device) 42 | for img, pad_img in zip(tensor_list, tensor): 43 | pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) 44 | else: 45 | raise ValueError('not supported') 46 | return tensor 47 | 48 | 49 | 50 | class YOLOX(nn.Module): 51 | """ 52 | YOLOX model module. The module list is defined by create_yolov3_modules function. 53 | The network returns loss values from three YOLO layers during training 54 | and detection results during test. 
55 | """ 56 | 57 | def __init__(self, backbone=None, head=None): 58 | super().__init__() 59 | if backbone is None: 60 | backbone = YOLOPAFPN() 61 | if head is None: 62 | head = YOLOXHead(80) 63 | 64 | self.backbone = backbone 65 | self.head = head 66 | 67 | def forward(self, x, targets=None): 68 | # fpn output content features of [dark3, dark4, dark5] 69 | fpn_outs = self.backbone(x) 70 | 71 | if self.training: 72 | assert targets is not None 73 | loss, iou_loss, conf_loss, cls_loss, l1_loss, num_fg = self.head( 74 | fpn_outs, targets, x 75 | ) 76 | outputs = { 77 | "total_loss": loss, 78 | "iou_loss": iou_loss, 79 | "l1_loss": l1_loss, 80 | "conf_loss": conf_loss, 81 | "cls_loss": cls_loss, 82 | "num_fg": num_fg, 83 | } 84 | else: 85 | outputs = self.head(fpn_outs) 86 | 87 | return outputs 88 | 89 | @torch.no_grad() 90 | def inference_single_image(self, img, ori_img_size, track_instances=None): 91 | if not isinstance(img, NestedTensor): 92 | img = nested_tensor_from_tensor_list(img, size_divisibility=32) 93 | output = self.forward(img) 94 | 95 | out = {'pred_logits': outputs_class[-1], 'pred_boxes': outputs_coord[-1]} 96 | 97 | # _, _, img_h, img_w = img.shape 98 | # scale = max(ori_img_size[0]/img_h, ori_img_size[1]/img_w) 99 | # output[..., :4] *= scale 100 | # output = output[output[..., 4]>0.5] 101 | 102 | # import cv2 103 | # res[..., [0,1]] -= res[..., [2,3]]/2 104 | # res[..., [2,3]] += res[..., [0,1]] 105 | # ori_img = ori_img.cpu().numpy() 106 | # for o in res.cpu().numpy(): 107 | # cv2.rectangle(ori_img, pt1 = (int(o[0]), int(0[1])), pt2 =(int(o[2]), int(0[3])), color = (0, 0, 255), thickness = 2) 108 | # cv2.imwrite('tmp.png', ori_img) 109 | return output 110 | 111 | 112 | class PostProcess(nn.Module): 113 | """ This module converts the model's output into the format expected by the coco api""" 114 | 115 | @torch.no_grad() 116 | def forward(self, outputs, target_sizes): 117 | """ Perform the computation 118 | Parameters: 119 | outputs: raw outputs of the model 120 | target_sizes: tensor of dimension [batch_size x 2] containing the size of each images of the batch 121 | For evaluation, this must be the original image size (before any data augmentation) 122 | For visualization, this should be the image size after data augment, but before padding 123 | """ 124 | out_logits, out_bbox = outputs['pred_logits'], outputs['pred_boxes'] 125 | 126 | assert len(out_logits) == len(target_sizes) 127 | assert target_sizes.shape[1] == 2 128 | 129 | prob = out_logits.sigmoid() 130 | topk_values, topk_indexes = torch.topk(prob.view(out_logits.shape[0], -1), 100, dim=1) 131 | scores = topk_values 132 | topk_boxes = topk_indexes // out_logits.shape[2] 133 | labels = topk_indexes % out_logits.shape[2] 134 | boxes = box_ops.box_cxcywh_to_xyxy(out_bbox) 135 | boxes = torch.gather(boxes, 1, topk_boxes.unsqueeze(-1).repeat(1,1,4)) 136 | 137 | # and from relative [0, 1] to absolute [0, height] coordinates 138 | img_h, img_w = target_sizes.unbind(1) 139 | scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1) 140 | boxes = boxes * scale_fct[:, None, :] 141 | 142 | results = [{'scores': s, 'labels': l, 'boxes': b} for s, l, b in zip(scores, labels, boxes)] 143 | 144 | return results 145 | 146 | 147 | def build(args): 148 | 149 | def init_yolo(M): 150 | for m in M.modules(): 151 | if isinstance(m, nn.BatchNorm2d): 152 | m.eps = 1e-3 153 | m.momentum = 0.03 154 | 155 | in_channels = [256, 512, 1024] 156 | depth = 1.33 157 | width = 1.25 158 | num_classes = 1 159 | backbone = YOLOPAFPN(depth, width, 
in_channels=in_channels) 160 | head = YOLOXHead(num_classes, width, in_channels=in_channels) 161 | model = YOLOX(backbone, head) 162 | 163 | model.apply(init_yolo) 164 | model.head.initialize_biases(1e-2) 165 | 166 | return model, None, None -------------------------------------------------------------------------------- /models/dino/position_encoding.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # DINO 3 | # Copyright (c) 2022 IDEA. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------ 6 | # Conditional DETR 7 | # Copyright (c) 2021 Microsoft. All Rights Reserved. 8 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 9 | # ------------------------------------------------------------------------ 10 | # Copied from DETR (https://github.com/facebookresearch/detr) 11 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 12 | # ------------------------------------------------------------------------ 13 | 14 | """ 15 | Various positional encodings for the transformer. 16 | """ 17 | import math 18 | import torch 19 | from torch import nn 20 | 21 | from util.misc import NestedTensor 22 | 23 | 24 | class PositionEmbeddingSine(nn.Module): 25 | """ 26 | This is a more standard version of the position embedding, very similar to the one 27 | used by the Attention is all you need paper, generalized to work on images. 28 | """ 29 | def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None): 30 | super().__init__() 31 | self.num_pos_feats = num_pos_feats 32 | self.temperature = temperature 33 | self.normalize = normalize 34 | if scale is not None and normalize is False: 35 | raise ValueError("normalize should be True if scale is passed") 36 | if scale is None: 37 | scale = 2 * math.pi 38 | self.scale = scale 39 | 40 | def forward(self, tensor_list: NestedTensor): 41 | x = tensor_list.tensors 42 | mask = tensor_list.mask 43 | assert mask is not None 44 | not_mask = ~mask 45 | y_embed = not_mask.cumsum(1, dtype=torch.float32) 46 | x_embed = not_mask.cumsum(2, dtype=torch.float32) 47 | if self.normalize: 48 | eps = 1e-6 49 | y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale 50 | x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale 51 | 52 | dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device) 53 | dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats) 54 | 55 | pos_x = x_embed[:, :, :, None] / dim_t 56 | pos_y = y_embed[:, :, :, None] / dim_t 57 | pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3) 58 | pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3) 59 | pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) 60 | return pos 61 | 62 | class PositionEmbeddingSineHW(nn.Module): 63 | """ 64 | This is a more standard version of the position embedding, very similar to the one 65 | used by the Attention is all you need paper, generalized to work on images. 
66 | """ 67 | def __init__(self, num_pos_feats=64, temperatureH=10000, temperatureW=10000, normalize=False, scale=None): 68 | super().__init__() 69 | self.num_pos_feats = num_pos_feats 70 | self.temperatureH = temperatureH 71 | self.temperatureW = temperatureW 72 | self.normalize = normalize 73 | if scale is not None and normalize is False: 74 | raise ValueError("normalize should be True if scale is passed") 75 | if scale is None: 76 | scale = 2 * math.pi 77 | self.scale = scale 78 | 79 | def forward(self, tensor_list: NestedTensor): 80 | x = tensor_list.tensors 81 | mask = tensor_list.mask 82 | assert mask is not None 83 | not_mask = ~mask 84 | y_embed = not_mask.cumsum(1, dtype=torch.float32) 85 | x_embed = not_mask.cumsum(2, dtype=torch.float32) 86 | 87 | 88 | 89 | if self.normalize: 90 | eps = 1e-6 91 | y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale 92 | x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale 93 | 94 | dim_tx = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device) 95 | dim_tx = self.temperatureW ** (2 * torch.div(dim_tx, 2, rounding_mode="floor") / self.num_pos_feats) 96 | pos_x = x_embed[:, :, :, None] / dim_tx 97 | 98 | dim_ty = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device) 99 | dim_ty = self.temperatureH ** (2 * torch.div(dim_ty, 2, rounding_mode="floor") / self.num_pos_feats) 100 | pos_y = y_embed[:, :, :, None] / dim_ty 101 | 102 | pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3) 103 | pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3) 104 | pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) 105 | 106 | 107 | 108 | return pos 109 | 110 | class PositionEmbeddingLearned(nn.Module): 111 | """ 112 | Absolute pos embedding, learned. 
113 | """ 114 | def __init__(self, num_pos_feats=256): 115 | super().__init__() 116 | self.row_embed = nn.Embedding(50, num_pos_feats) 117 | self.col_embed = nn.Embedding(50, num_pos_feats) 118 | self.reset_parameters() 119 | 120 | def reset_parameters(self): 121 | nn.init.uniform_(self.row_embed.weight) 122 | nn.init.uniform_(self.col_embed.weight) 123 | 124 | def forward(self, tensor_list: NestedTensor): 125 | x = tensor_list.tensors 126 | h, w = x.shape[-2:] 127 | i = torch.arange(w, device=x.device) 128 | j = torch.arange(h, device=x.device) 129 | x_emb = self.col_embed(i) 130 | y_emb = self.row_embed(j) 131 | pos = torch.cat([ 132 | x_emb.unsqueeze(0).repeat(h, 1, 1), 133 | y_emb.unsqueeze(1).repeat(1, w, 1), 134 | ], dim=-1).permute(2, 0, 1).unsqueeze(0).repeat(x.shape[0], 1, 1, 1) 135 | return pos 136 | 137 | 138 | def build_position_encoding(args): 139 | N_steps = args.hidden_dim // 2 140 | if args.position_embedding in ('v2', 'sine'): 141 | # TODO find a better way of exposing other arguments 142 | position_embedding = PositionEmbeddingSineHW( 143 | N_steps, 144 | temperatureH=args.pe_temperatureH, 145 | temperatureW=args.pe_temperatureW, 146 | normalize=True 147 | ) 148 | elif args.position_embedding in ('v3', 'learned'): 149 | position_embedding = PositionEmbeddingLearned(N_steps) 150 | else: 151 | raise ValueError(f"not supported {args.position_embedding}") 152 | 153 | return position_embedding 154 | -------------------------------------------------------------------------------- /models/darknet.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # Copyright (c) 2014-2021 Megvii Inc. All rights reserved. 4 | 5 | from torch import nn 6 | 7 | from .network_blocks import BaseConv, CSPLayer, DWConv, Focus, ResLayer, SPPBottleneck 8 | 9 | 10 | class Darknet(nn.Module): 11 | # number of blocks from dark2 to dark5. 12 | depth2blocks = {21: [1, 2, 2, 1], 53: [2, 8, 8, 4]} 13 | 14 | def __init__( 15 | self, 16 | depth, 17 | in_channels=3, 18 | stem_out_channels=32, 19 | out_features=("dark3", "dark4", "dark5"), 20 | ): 21 | """ 22 | Args: 23 | depth (int): depth of darknet used in model, usually use [21, 53] for this param. 24 | in_channels (int): number of input channels, for example, use 3 for RGB image. 25 | stem_out_channels (int): number of output chanels of darknet stem. 26 | It decides channels of darknet layer2 to layer5. 27 | out_features (Tuple[str]): desired output layer name. 28 | """ 29 | super().__init__() 30 | assert out_features, "please provide output features of Darknet" 31 | self.out_features = out_features 32 | self.stem = nn.Sequential( 33 | BaseConv(in_channels, stem_out_channels, ksize=3, stride=1, act="lrelu"), 34 | *self.make_group_layer(stem_out_channels, num_blocks=1, stride=2), 35 | ) 36 | in_channels = stem_out_channels * 2 # 64 37 | 38 | num_blocks = Darknet.depth2blocks[depth] 39 | # create darknet with `stem_out_channels` and `num_blocks` layers. 40 | # to make model structure more clear, we don't use `for` statement in python. 
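        # Each make_group_layer(c, n, stride=2) below expands to
        #   BaseConv(c, 2 * c, ksize=3, stride=2) followed by n ResLayer(2 * c) blocks,
        # i.e. every dark stage halves the resolution and doubles the channel count.
        # With depth=53 this uses num_blocks = [2, 8, 8, 4] residual blocks for dark2..dark5
        # (a reading of the definitions above, for illustration).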
41 | self.dark2 = nn.Sequential( 42 | *self.make_group_layer(in_channels, num_blocks[0], stride=2) 43 | ) 44 | in_channels *= 2 # 128 45 | self.dark3 = nn.Sequential( 46 | *self.make_group_layer(in_channels, num_blocks[1], stride=2) 47 | ) 48 | in_channels *= 2 # 256 49 | self.dark4 = nn.Sequential( 50 | *self.make_group_layer(in_channels, num_blocks[2], stride=2) 51 | ) 52 | in_channels *= 2 # 512 53 | 54 | self.dark5 = nn.Sequential( 55 | *self.make_group_layer(in_channels, num_blocks[3], stride=2), 56 | *self.make_spp_block([in_channels, in_channels * 2], in_channels * 2), 57 | ) 58 | 59 | def make_group_layer(self, in_channels: int, num_blocks: int, stride: int = 1): 60 | "starts with conv layer then has `num_blocks` `ResLayer`" 61 | return [ 62 | BaseConv(in_channels, in_channels * 2, ksize=3, stride=stride, act="lrelu"), 63 | *[(ResLayer(in_channels * 2)) for _ in range(num_blocks)], 64 | ] 65 | 66 | def make_spp_block(self, filters_list, in_filters): 67 | m = nn.Sequential( 68 | *[ 69 | BaseConv(in_filters, filters_list[0], 1, stride=1, act="lrelu"), 70 | BaseConv(filters_list[0], filters_list[1], 3, stride=1, act="lrelu"), 71 | SPPBottleneck( 72 | in_channels=filters_list[1], 73 | out_channels=filters_list[0], 74 | activation="lrelu", 75 | ), 76 | BaseConv(filters_list[0], filters_list[1], 3, stride=1, act="lrelu"), 77 | BaseConv(filters_list[1], filters_list[0], 1, stride=1, act="lrelu"), 78 | ] 79 | ) 80 | return m 81 | 82 | def forward(self, x): 83 | outputs = {} 84 | x = self.stem(x) 85 | outputs["stem"] = x 86 | x = self.dark2(x) 87 | outputs["dark2"] = x 88 | x = self.dark3(x) 89 | outputs["dark3"] = x 90 | x = self.dark4(x) 91 | outputs["dark4"] = x 92 | x = self.dark5(x) 93 | outputs["dark5"] = x 94 | return {k: v for k, v in outputs.items() if k in self.out_features} 95 | 96 | 97 | class CSPDarknet(nn.Module): 98 | def __init__( 99 | self, 100 | dep_mul, 101 | wid_mul, 102 | out_features=("dark3", "dark4", "dark5"), 103 | depthwise=False, 104 | act="silu", 105 | ): 106 | super().__init__() 107 | assert out_features, "please provide output features of Darknet" 108 | self.out_features = out_features 109 | Conv = DWConv if depthwise else BaseConv 110 | 111 | base_channels = int(wid_mul * 64) # 64 112 | base_depth = max(round(dep_mul * 3), 1) # 3 113 | 114 | # stem 115 | self.stem = Focus(3, base_channels, ksize=3, act=act) 116 | 117 | # dark2 118 | self.dark2 = nn.Sequential( 119 | Conv(base_channels, base_channels * 2, 3, 2, act=act), 120 | CSPLayer( 121 | base_channels * 2, 122 | base_channels * 2, 123 | n=base_depth, 124 | depthwise=depthwise, 125 | act=act, 126 | ), 127 | ) 128 | 129 | # dark3 130 | self.dark3 = nn.Sequential( 131 | Conv(base_channels * 2, base_channels * 4, 3, 2, act=act), 132 | CSPLayer( 133 | base_channels * 4, 134 | base_channels * 4, 135 | n=base_depth * 3, 136 | depthwise=depthwise, 137 | act=act, 138 | ), 139 | ) 140 | 141 | # dark4 142 | self.dark4 = nn.Sequential( 143 | Conv(base_channels * 4, base_channels * 8, 3, 2, act=act), 144 | CSPLayer( 145 | base_channels * 8, 146 | base_channels * 8, 147 | n=base_depth * 3, 148 | depthwise=depthwise, 149 | act=act, 150 | ), 151 | ) 152 | 153 | # dark5 154 | self.dark5 = nn.Sequential( 155 | Conv(base_channels * 8, base_channels * 16, 3, 2, act=act), 156 | SPPBottleneck(base_channels * 16, base_channels * 16, activation=act), 157 | CSPLayer( 158 | base_channels * 16, 159 | base_channels * 16, 160 | n=base_depth, 161 | shortcut=False, 162 | depthwise=depthwise, 163 | act=act, 164 | ), 165 | ) 166 | 167 | 
def forward(self, x): 168 | outputs = {} 169 | x = self.stem(x) 170 | outputs["stem"] = x 171 | x = self.dark2(x) 172 | outputs["dark2"] = x 173 | x = self.dark3(x) 174 | outputs["dark3"] = x 175 | x = self.dark4(x) 176 | outputs["dark4"] = x 177 | x = self.dark5(x) 178 | outputs["dark5"] = x 179 | return {k: v for k, v in outputs.items() if k in self.out_features} 180 | -------------------------------------------------------------------------------- /util/plot_utils.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Copyright (c) 2022 megvii-research. All Rights Reserved. 3 | # ------------------------------------------------------------------------ 4 | # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR) 5 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 6 | # ------------------------------------------------------------------------ 7 | # Modified from DETR (https://github.com/facebookresearch/detr) 8 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 9 | # ------------------------------------------------------------------------ 10 | 11 | 12 | """ 13 | Plotting utilities to visualize training logs. 14 | """ 15 | import cv2 16 | import torch 17 | import pandas as pd 18 | import numpy as np 19 | import seaborn as sns 20 | import matplotlib.pyplot as plt 21 | 22 | from torch import Tensor 23 | 24 | from pathlib import Path, PurePath 25 | 26 | 27 | def plot_logs(logs, fields=('class_error', 'loss_bbox_unscaled', 'mAP'), ewm_col=0, log_name='log.txt'): 28 | ''' 29 | Function to plot specific fields from training log(s). Plots both training and test results. 30 | 31 | :: Inputs - logs = list containing Path objects, each pointing to individual dir with a log file 32 | - fields = which results to plot from each log file - plots both training and test for each field. 33 | - ewm_col = optional, which column to use as the exponential weighted smoothing of the plots 34 | - log_name = optional, name of log file if different than default 'log.txt'. 35 | 36 | :: Outputs - matplotlib plots of results in fields, color coded for each log file. 37 | - solid lines are training results, dashed lines are test results. 
38 | 39 | ''' 40 | func_name = "plot_utils.py::plot_logs" 41 | 42 | # verify logs is a list of Paths (list[Paths]) or single Pathlib object Path, 43 | # convert single Path to list to avoid 'not iterable' error 44 | 45 | if not isinstance(logs, list): 46 | if isinstance(logs, PurePath): 47 | logs = [logs] 48 | print(f"{func_name} info: logs param expects a list argument, converted to list[Path].") 49 | else: 50 | raise ValueError(f"{func_name} - invalid argument for logs parameter.\n \ 51 | Expect list[Path] or single Path obj, received {type(logs)}") 52 | 53 | # verify valid dir(s) and that every item in list is Path object 54 | for i, dir in enumerate(logs): 55 | if not isinstance(dir, PurePath): 56 | raise ValueError(f"{func_name} - non-Path object in logs argument of {type(dir)}: \n{dir}") 57 | if dir.exists(): 58 | continue 59 | raise ValueError(f"{func_name} - invalid directory in logs argument:\n{dir}") 60 | 61 | # load log file(s) and plot 62 | dfs = [pd.read_json(Path(p) / log_name, lines=True) for p in logs] 63 | 64 | fig, axs = plt.subplots(ncols=len(fields), figsize=(16, 5)) 65 | 66 | for df, color in zip(dfs, sns.color_palette(n_colors=len(logs))): 67 | for j, field in enumerate(fields): 68 | if field == 'mAP': 69 | coco_eval = pd.DataFrame(pd.np.stack(df.test_coco_eval.dropna().values)[:, 1]).ewm(com=ewm_col).mean() 70 | axs[j].plot(coco_eval, c=color) 71 | else: 72 | df.interpolate().ewm(com=ewm_col).mean().plot( 73 | y=[f'train_{field}', f'test_{field}'], 74 | ax=axs[j], 75 | color=[color] * 2, 76 | style=['-', '--'] 77 | ) 78 | for ax, field in zip(axs, fields): 79 | ax.legend([Path(p).name for p in logs]) 80 | ax.set_title(field) 81 | 82 | 83 | def plot_precision_recall(files, naming_scheme='iter'): 84 | if naming_scheme == 'exp_id': 85 | # name becomes exp_id 86 | names = [f.parts[-3] for f in files] 87 | elif naming_scheme == 'iter': 88 | names = [f.stem for f in files] 89 | else: 90 | raise ValueError(f'not supported {naming_scheme}') 91 | fig, axs = plt.subplots(ncols=2, figsize=(16, 5)) 92 | for f, color, name in zip(files, sns.color_palette("Blues", n_colors=len(files)), names): 93 | data = torch.load(f) 94 | # precision is n_iou, n_points, n_cat, n_area, max_det 95 | precision = data['precision'] 96 | recall = data['params'].recThrs 97 | scores = data['scores'] 98 | # take precision for all classes, all areas and 100 detections 99 | precision = precision[0, :, :, 0, -1].mean(1) 100 | scores = scores[0, :, :, 0, -1].mean(1) 101 | prec = precision.mean() 102 | rec = data['recall'][0, :, 0, -1].mean() 103 | print(f'{naming_scheme} {name}: mAP@50={prec * 100: 05.1f}, ' + 104 | f'score={scores.mean():0.3f}, ' + 105 | f'f1={2 * prec * rec / (prec + rec + 1e-8):0.3f}' 106 | ) 107 | axs[0].plot(recall, precision, c=color) 108 | axs[1].plot(recall, scores, c=color) 109 | 110 | axs[0].set_title('Precision / Recall') 111 | axs[0].legend(names) 112 | axs[1].set_title('Scores / Recall') 113 | axs[1].legend(names) 114 | return fig, axs 115 | 116 | 117 | def draw_boxes(image: Tensor, boxes: Tensor, color=(0, 255, 0), texts=None) -> np.ndarray: 118 | if isinstance(image, Tensor): 119 | cv_image = image.detach().cpu().numpy() 120 | else: 121 | cv_image = image 122 | if isinstance(boxes, Tensor): 123 | cv_boxes = boxes.detach().cpu().numpy() 124 | else: 125 | cv_boxes = boxes 126 | 127 | tl = round(0.002 * max(image.shape[0:2])) + 1 # line thickness 128 | tf = max(tl - 1, 1) 129 | for i in range(len(boxes)): 130 | box = cv_boxes[i] 131 | x1, y1 = box[0:2] 132 | x2, y2 = box[2:4] 133 
| cv2.rectangle(cv_image, (int(x1), int(y1)), (int(x2), int(y2)), color=color) 134 | if texts is not None: 135 | cv2.putText(cv_image, texts[i], (int(x1), int(y1+10)), 0, tl/3, [225, 255, 255], 136 | thickness=tf, 137 | lineType=cv2.LINE_AA) 138 | return cv_image 139 | 140 | 141 | def draw_ref_pts(image: Tensor, ref_pts: Tensor) -> np.ndarray: 142 | if isinstance(image, Tensor): 143 | cv_image = image.detach().cpu().numpy() 144 | else: 145 | cv_image = image 146 | if isinstance(ref_pts, Tensor): 147 | cv_pts = ref_pts.detach().cpu().numpy() 148 | else: 149 | cv_pts = ref_pts 150 | for i in range(len(cv_pts)): 151 | x, y, is_pos = cv_pts[i] 152 | color = (0, 1, 0) if is_pos else (1, 1, 1) 153 | cv2.circle(cv_image, (int(x), int(y)), 2, color) 154 | return cv_image 155 | 156 | 157 | def image_hwc2chw(image: np.ndarray): 158 | image = np.ascontiguousarray(image.transpose(2, 0, 1)) 159 | return image 160 | -------------------------------------------------------------------------------- /models/network_blocks.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # Copyright (c) 2014-2021 Megvii Inc. All rights reserved. 4 | 5 | import torch 6 | import torch.nn as nn 7 | 8 | 9 | class SiLU(nn.Module): 10 | """export-friendly version of nn.SiLU()""" 11 | 12 | @staticmethod 13 | def forward(x): 14 | return x * torch.sigmoid(x) 15 | 16 | 17 | def get_activation(name="silu", inplace=True): 18 | if name == "silu": 19 | # module = nn.SiLU(inplace=inplace) 20 | module = SiLU() 21 | elif name == "relu": 22 | module = nn.ReLU(inplace=inplace) 23 | elif name == "lrelu": 24 | module = nn.LeakyReLU(0.1, inplace=inplace) 25 | else: 26 | raise AttributeError("Unsupported act type: {}".format(name)) 27 | return module 28 | 29 | 30 | class BaseConv(nn.Module): 31 | """A Conv2d -> Batchnorm -> silu/leaky relu block""" 32 | 33 | def __init__( 34 | self, in_channels, out_channels, ksize, stride, groups=1, bias=False, act="silu" 35 | ): 36 | super().__init__() 37 | # same padding 38 | pad = (ksize - 1) // 2 39 | self.conv = nn.Conv2d( 40 | in_channels, 41 | out_channels, 42 | kernel_size=ksize, 43 | stride=stride, 44 | padding=pad, 45 | groups=groups, 46 | bias=bias, 47 | ) 48 | self.bn = nn.BatchNorm2d(out_channels) 49 | self.act = get_activation(act, inplace=True) 50 | 51 | def forward(self, x): 52 | return self.act(self.bn(self.conv(x))) 53 | 54 | def fuseforward(self, x): 55 | return self.act(self.conv(x)) 56 | 57 | 58 | class DWConv(nn.Module): 59 | """Depthwise Conv + Conv""" 60 | 61 | def __init__(self, in_channels, out_channels, ksize, stride=1, act="silu"): 62 | super().__init__() 63 | self.dconv = BaseConv( 64 | in_channels, 65 | in_channels, 66 | ksize=ksize, 67 | stride=stride, 68 | groups=in_channels, 69 | act=act, 70 | ) 71 | self.pconv = BaseConv( 72 | in_channels, out_channels, ksize=1, stride=1, groups=1, act=act 73 | ) 74 | 75 | def forward(self, x): 76 | x = self.dconv(x) 77 | return self.pconv(x) 78 | 79 | 80 | class Bottleneck(nn.Module): 81 | # Standard bottleneck 82 | def __init__( 83 | self, 84 | in_channels, 85 | out_channels, 86 | shortcut=True, 87 | expansion=0.5, 88 | depthwise=False, 89 | act="silu", 90 | ): 91 | super().__init__() 92 | hidden_channels = int(out_channels * expansion) 93 | Conv = DWConv if depthwise else BaseConv 94 | self.conv1 = BaseConv(in_channels, hidden_channels, 1, stride=1, act=act) 95 | self.conv2 = Conv(hidden_channels, out_channels, 3, stride=1, act=act) 96 | 
self.use_add = shortcut and in_channels == out_channels 97 | 98 | def forward(self, x): 99 | y = self.conv2(self.conv1(x)) 100 | if self.use_add: 101 | y = y + x 102 | return y 103 | 104 | 105 | class ResLayer(nn.Module): 106 | "Residual layer with `in_channels` inputs." 107 | 108 | def __init__(self, in_channels: int): 109 | super().__init__() 110 | mid_channels = in_channels // 2 111 | self.layer1 = BaseConv( 112 | in_channels, mid_channels, ksize=1, stride=1, act="lrelu" 113 | ) 114 | self.layer2 = BaseConv( 115 | mid_channels, in_channels, ksize=3, stride=1, act="lrelu" 116 | ) 117 | 118 | def forward(self, x): 119 | out = self.layer2(self.layer1(x)) 120 | return x + out 121 | 122 | 123 | class SPPBottleneck(nn.Module): 124 | """Spatial pyramid pooling layer used in YOLOv3-SPP""" 125 | 126 | def __init__( 127 | self, in_channels, out_channels, kernel_sizes=(5, 9, 13), activation="silu" 128 | ): 129 | super().__init__() 130 | hidden_channels = in_channels // 2 131 | self.conv1 = BaseConv(in_channels, hidden_channels, 1, stride=1, act=activation) 132 | self.m = nn.ModuleList( 133 | [ 134 | nn.MaxPool2d(kernel_size=ks, stride=1, padding=ks // 2) 135 | for ks in kernel_sizes 136 | ] 137 | ) 138 | conv2_channels = hidden_channels * (len(kernel_sizes) + 1) 139 | self.conv2 = BaseConv(conv2_channels, out_channels, 1, stride=1, act=activation) 140 | 141 | def forward(self, x): 142 | x = self.conv1(x) 143 | x = torch.cat([x] + [m(x) for m in self.m], dim=1) 144 | x = self.conv2(x) 145 | return x 146 | 147 | 148 | class CSPLayer(nn.Module): 149 | """C3 in yolov5, CSP Bottleneck with 3 convolutions""" 150 | 151 | def __init__( 152 | self, 153 | in_channels, 154 | out_channels, 155 | n=1, 156 | shortcut=True, 157 | expansion=0.5, 158 | depthwise=False, 159 | act="silu", 160 | ): 161 | """ 162 | Args: 163 | in_channels (int): input channels. 164 | out_channels (int): output channels. 165 | n (int): number of Bottlenecks. Default value: 1. 
166 | """ 167 | # ch_in, ch_out, number, shortcut, groups, expansion 168 | super().__init__() 169 | hidden_channels = int(out_channels * expansion) # hidden channels 170 | self.conv1 = BaseConv(in_channels, hidden_channels, 1, stride=1, act=act) 171 | self.conv2 = BaseConv(in_channels, hidden_channels, 1, stride=1, act=act) 172 | self.conv3 = BaseConv(2 * hidden_channels, out_channels, 1, stride=1, act=act) 173 | module_list = [ 174 | Bottleneck( 175 | hidden_channels, hidden_channels, shortcut, 1.0, depthwise, act=act 176 | ) 177 | for _ in range(n) 178 | ] 179 | self.m = nn.Sequential(*module_list) 180 | 181 | def forward(self, x): 182 | x_1 = self.conv1(x) 183 | x_2 = self.conv2(x) 184 | x_1 = self.m(x_1) 185 | x = torch.cat((x_1, x_2), dim=1) 186 | return self.conv3(x) 187 | 188 | 189 | class Focus(nn.Module): 190 | """Focus width and height information into channel space.""" 191 | 192 | def __init__(self, in_channels, out_channels, ksize=1, stride=1, act="silu"): 193 | super().__init__() 194 | self.conv = BaseConv(in_channels * 4, out_channels, ksize, stride, act=act) 195 | 196 | def forward(self, x): 197 | # shape of x (b,c,w,h) -> y(b,4c,w/2,h/2) 198 | patch_top_left = x[..., ::2, ::2] 199 | patch_top_right = x[..., ::2, 1::2] 200 | patch_bot_left = x[..., 1::2, ::2] 201 | patch_bot_right = x[..., 1::2, 1::2] 202 | x = torch.cat( 203 | ( 204 | patch_top_left, 205 | patch_bot_left, 206 | patch_top_right, 207 | patch_bot_right, 208 | ), 209 | dim=1, 210 | ) 211 | return self.conv(x) 212 | -------------------------------------------------------------------------------- /models/dino/ops/modules/ms_deform_attn.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | from __future__ import absolute_import 10 | from __future__ import print_function 11 | from __future__ import division 12 | 13 | import warnings 14 | import math 15 | 16 | import torch 17 | from torch import nn 18 | import torch.nn.functional as F 19 | from torch.nn.init import xavier_uniform_, constant_ 20 | 21 | from ..functions import MSDeformAttnFunction 22 | 23 | 24 | def _is_power_of_2(n): 25 | if (not isinstance(n, int)) or (n < 0): 26 | raise ValueError("invalid input for _is_power_of_2: {} (type: {})".format(n, type(n))) 27 | return (n & (n-1) == 0) and n != 0 28 | 29 | 30 | class MSDeformAttn(nn.Module): 31 | def __init__(self, d_model=256, n_levels=4, n_heads=8, n_points=4): 32 | """ 33 | Multi-Scale Deformable Attention Module 34 | :param d_model hidden dimension 35 | :param n_levels number of feature levels 36 | :param n_heads number of attention heads 37 | :param n_points number of sampling points per attention head per feature level 38 | """ 39 | super().__init__() 40 | if d_model % n_heads != 0: 41 | raise ValueError('d_model must be divisible by n_heads, but got {} and {}'.format(d_model, n_heads)) 42 | _d_per_head = d_model // n_heads 43 | # you'd better set _d_per_head to a power of 2 which is more efficient in our CUDA implementation 44 | if not _is_power_of_2(_d_per_head): 45 | warnings.warn("You'd better set d_model in MSDeformAttn to make the dimension of each attention head a power of 2 " 46 | "which is more efficient in our CUDA implementation.") 47 | 48 | self.im2col_step = 64 49 | 50 | self.d_model = d_model 51 | self.n_levels = n_levels 52 | self.n_heads = n_heads 53 | self.n_points = n_points 54 | 55 | self.sampling_offsets = nn.Linear(d_model, n_heads * n_levels * n_points * 2) 56 | self.attention_weights = nn.Linear(d_model, n_heads * n_levels * n_points) 57 | self.value_proj = nn.Linear(d_model, d_model) 58 | self.output_proj = nn.Linear(d_model, d_model) 59 | 60 | self._reset_parameters() 61 | 62 | def _reset_parameters(self): 63 | constant_(self.sampling_offsets.weight.data, 0.) 64 | thetas = torch.arange(self.n_heads, dtype=torch.float32) * (2.0 * math.pi / self.n_heads) 65 | grid_init = torch.stack([thetas.cos(), thetas.sin()], -1) 66 | grid_init = (grid_init / grid_init.abs().max(-1, keepdim=True)[0]).view(self.n_heads, 1, 1, 2).repeat(1, self.n_levels, self.n_points, 1) 67 | for i in range(self.n_points): 68 | grid_init[:, :, i, :] *= i + 1 69 | with torch.no_grad(): 70 | self.sampling_offsets.bias = nn.Parameter(grid_init.view(-1)) 71 | constant_(self.attention_weights.weight.data, 0.) 72 | constant_(self.attention_weights.bias.data, 0.) 73 | xavier_uniform_(self.value_proj.weight.data) 74 | constant_(self.value_proj.bias.data, 0.) 75 | xavier_uniform_(self.output_proj.weight.data) 76 | constant_(self.output_proj.bias.data, 0.) 
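    # Note added for clarity (not part of the original source): the bias initialization above
    # spreads the initial sampling offsets around each reference point, one direction per head
    # (angle 2*pi*h/n_heads), normalized to the unit square and scaled by (i+1) for the i-th
    # sampling point, so at the start of training each head already samples a widening spread
    # of locations instead of collapsing every sample onto the reference point itself.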
77 | 78 | def forward(self, query, reference_points, input_flatten, input_spatial_shapes, input_level_start_index, input_padding_mask=None): 79 | """ 80 | :param query (N, Length_{query}, C) 81 | :param reference_points (N, Length_{query}, n_levels, 2), range in [0, 1], top-left (0,0), bottom-right (1, 1), including padding area 82 | or (N, Length_{query}, n_levels, 4), add additional (w, h) to form reference boxes 83 | :param input_flatten (N, \sum_{l=0}^{L-1} H_l \cdot W_l, C) 84 | :param input_spatial_shapes (n_levels, 2), [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})] 85 | :param input_level_start_index (n_levels, ), [0, H_0*W_0, H_0*W_0+H_1*W_1, H_0*W_0+H_1*W_1+H_2*W_2, ..., H_0*W_0+H_1*W_1+...+H_{L-1}*W_{L-1}] 86 | :param input_padding_mask (N, \sum_{l=0}^{L-1} H_l \cdot W_l), True for padding elements, False for non-padding elements 87 | 88 | :return output (N, Length_{query}, C) 89 | """ 90 | N, Len_q, _ = query.shape 91 | N, Len_in, _ = input_flatten.shape 92 | assert (input_spatial_shapes[:, 0] * input_spatial_shapes[:, 1]).sum() == Len_in 93 | 94 | value = self.value_proj(input_flatten) 95 | if input_padding_mask is not None: 96 | value = value.masked_fill(input_padding_mask[..., None], float(0)) 97 | value = value.view(N, Len_in, self.n_heads, self.d_model // self.n_heads) 98 | sampling_offsets = self.sampling_offsets(query).view(N, Len_q, self.n_heads, self.n_levels, self.n_points, 2) 99 | attention_weights = self.attention_weights(query).view(N, Len_q, self.n_heads, self.n_levels * self.n_points) 100 | attention_weights = F.softmax(attention_weights, -1).view(N, Len_q, self.n_heads, self.n_levels, self.n_points) 101 | # N, Len_q, n_heads, n_levels, n_points, 2 102 | if reference_points.shape[-1] == 2: 103 | offset_normalizer = torch.stack([input_spatial_shapes[..., 1], input_spatial_shapes[..., 0]], -1) 104 | sampling_locations = reference_points[:, :, None, :, None, :] \ 105 | + sampling_offsets / offset_normalizer[None, None, None, :, None, :] 106 | elif reference_points.shape[-1] == 4: 107 | sampling_locations = reference_points[:, :, None, :, None, :2] \ 108 | + sampling_offsets / self.n_points * reference_points[:, :, None, :, None, 2:] * 0.5 109 | else: 110 | raise ValueError( 111 | 'Last dim of reference_points must be 2 or 4, but get {} instead.'.format(reference_points.shape[-1])) 112 | 113 | # for amp 114 | if value.dtype == torch.float16: 115 | # for mixed precision 116 | output = MSDeformAttnFunction.apply( 117 | value.to(torch.float32), input_spatial_shapes, input_level_start_index, sampling_locations.to(torch.float32), attention_weights, self.im2col_step) 118 | output = output.to(torch.float16) 119 | output = self.output_proj(output) 120 | return output 121 | 122 | 123 | output = MSDeformAttnFunction.apply( 124 | value, input_spatial_shapes, input_level_start_index, sampling_locations, attention_weights, self.im2col_step) 125 | output = self.output_proj(output) 126 | return output 127 | -------------------------------------------------------------------------------- /models/dino/utils.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # DINO 3 | # Copyright (c) 2022 IDEA. All Rights Reserved. 
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------ 6 | 7 | import torch 8 | from torch import nn, Tensor 9 | 10 | import math 11 | import torch.nn.functional as F 12 | from torch import nn 13 | 14 | 15 | def gen_encoder_output_proposals(memory:Tensor, memory_padding_mask:Tensor, spatial_shapes:Tensor, learnedwh=None): 16 | """ 17 | Input: 18 | - memory: bs, \sum{hw}, d_model 19 | - memory_padding_mask: bs, \sum{hw} 20 | - spatial_shapes: nlevel, 2 21 | - learnedwh: 2 22 | Output: 23 | - output_memory: bs, \sum{hw}, d_model 24 | - output_proposals: bs, \sum{hw}, 4 25 | """ 26 | N_, S_, C_ = memory.shape 27 | base_scale = 4.0 28 | proposals = [] 29 | _cur = 0 30 | for lvl, (H_, W_) in enumerate(spatial_shapes): 31 | mask_flatten_ = memory_padding_mask[:, _cur:(_cur + H_ * W_)].view(N_, H_, W_, 1) 32 | valid_H = torch.sum(~mask_flatten_[:, :, 0, 0], 1) 33 | valid_W = torch.sum(~mask_flatten_[:, 0, :, 0], 1) 34 | 35 | grid_y, grid_x = torch.meshgrid(torch.linspace(0, H_ - 1, H_, dtype=torch.float32, device=memory.device), 36 | torch.linspace(0, W_ - 1, W_, dtype=torch.float32, device=memory.device)) 37 | grid = torch.cat([grid_x.unsqueeze(-1), grid_y.unsqueeze(-1)], -1) # H_, W_, 2 38 | 39 | scale = torch.cat([valid_W.unsqueeze(-1), valid_H.unsqueeze(-1)], 1).view(N_, 1, 1, 2) 40 | grid = (grid.unsqueeze(0).expand(N_, -1, -1, -1) + 0.5) / scale 41 | 42 | if learnedwh is not None: 43 | wh = torch.ones_like(grid) * learnedwh.sigmoid() * (2.0 ** lvl) 44 | else: 45 | wh = torch.ones_like(grid) * 0.05 * (2.0 ** lvl) 46 | 47 | proposal = torch.cat((grid, wh), -1).view(N_, -1, 4) 48 | proposals.append(proposal) 49 | _cur += (H_ * W_) 50 | 51 | output_proposals = torch.cat(proposals, 1) 52 | output_proposals_valid = ((output_proposals > 0.01) & (output_proposals < 0.99)).all(-1, keepdim=True) 53 | output_proposals = torch.log(output_proposals / (1 - output_proposals)) # unsigmoid 54 | output_proposals = output_proposals.masked_fill(memory_padding_mask.unsqueeze(-1), float('inf')) 55 | output_proposals = output_proposals.masked_fill(~output_proposals_valid, float('inf')) 56 | 57 | output_memory = memory 58 | output_memory = output_memory.masked_fill(memory_padding_mask.unsqueeze(-1), float(0)) 59 | output_memory = output_memory.masked_fill(~output_proposals_valid, float(0)) 60 | 61 | return output_memory, output_proposals 62 | 63 | 64 | class RandomBoxPerturber(): 65 | def __init__(self, x_noise_scale=0.2, y_noise_scale=0.2, w_noise_scale=0.2, h_noise_scale=0.2) -> None: 66 | self.noise_scale = torch.Tensor([x_noise_scale, y_noise_scale, w_noise_scale, h_noise_scale]) 67 | 68 | def __call__(self, refanchors: Tensor) -> Tensor: 69 | nq, bs, query_dim = refanchors.shape 70 | device = refanchors.device 71 | 72 | noise_raw = torch.rand_like(refanchors) 73 | noise_scale = self.noise_scale.to(device)[:query_dim] 74 | 75 | new_refanchors = refanchors * (1 + (noise_raw - 0.5) * noise_scale) 76 | return new_refanchors.clamp_(0, 1) 77 | 78 | 79 | def sigmoid_focal_loss(inputs, targets, num_boxes, alpha: float = 0.25, gamma: float = 2): 80 | """ 81 | Loss used in RetinaNet for dense detection: https://arxiv.org/abs/1708.02002. 82 | Args: 83 | inputs: A float tensor of arbitrary shape. 84 | The predictions for each example. 85 | targets: A float tensor with the same shape as inputs. 
Stores the binary 86 | classification label for each element in inputs 87 | (0 for the negative class and 1 for the positive class). 88 | alpha: (optional) Weighting factor in range (0,1) to balance 89 | positive vs negative examples. Default = -1 (no weighting). 90 | gamma: Exponent of the modulating factor (1 - p_t) to 91 | balance easy vs hard examples. 92 | Returns: 93 | Loss tensor 94 | """ 95 | prob = inputs.sigmoid() 96 | ce_loss = F.binary_cross_entropy_with_logits(inputs, targets, reduction="none") 97 | p_t = prob * targets + (1 - prob) * (1 - targets) 98 | loss = ce_loss * ((1 - p_t) ** gamma) 99 | 100 | if alpha >= 0: 101 | alpha_t = alpha * targets + (1 - alpha) * (1 - targets) 102 | loss = alpha_t * loss 103 | 104 | return loss.mean(1).sum() / num_boxes 105 | 106 | 107 | class MLP(nn.Module): 108 | """ Very simple multi-layer perceptron (also called FFN)""" 109 | 110 | def __init__(self, input_dim, hidden_dim, output_dim, num_layers): 111 | super().__init__() 112 | self.num_layers = num_layers 113 | h = [hidden_dim] * (num_layers - 1) 114 | self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim])) 115 | 116 | def forward(self, x): 117 | for i, layer in enumerate(self.layers): 118 | x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x) 119 | return x 120 | 121 | 122 | def _get_activation_fn(activation, d_model=256, batch_dim=0): 123 | """Return an activation function given a string""" 124 | if activation == "relu": 125 | return F.relu 126 | if activation == "gelu": 127 | return F.gelu 128 | if activation == "glu": 129 | return F.glu 130 | if activation == "prelu": 131 | return nn.PReLU() 132 | if activation == "selu": 133 | return F.selu 134 | 135 | raise RuntimeError(F"activation should be relu/gelu, not {activation}.") 136 | 137 | 138 | def gen_sineembed_for_position(pos_tensor): 139 | # n_query, bs, _ = pos_tensor.size() 140 | # sineembed_tensor = torch.zeros(n_query, bs, 256) 141 | scale = 2 * math.pi 142 | dim_t = torch.arange(128, dtype=torch.float32, device=pos_tensor.device) 143 | dim_t = 10000 ** (2 * torch.div(dim_t, 2, rounding_mode="floor") / 128) 144 | x_embed = pos_tensor[:, :, 0] * scale 145 | y_embed = pos_tensor[:, :, 1] * scale 146 | pos_x = x_embed[:, :, None] / dim_t 147 | pos_y = y_embed[:, :, None] / dim_t 148 | pos_x = torch.stack((pos_x[:, :, 0::2].sin(), pos_x[:, :, 1::2].cos()), dim=3).flatten(2) 149 | pos_y = torch.stack((pos_y[:, :, 0::2].sin(), pos_y[:, :, 1::2].cos()), dim=3).flatten(2) 150 | if pos_tensor.size(-1) == 2: 151 | pos = torch.cat((pos_y, pos_x), dim=2) 152 | elif pos_tensor.size(-1) == 4: 153 | w_embed = pos_tensor[:, :, 2] * scale 154 | pos_w = w_embed[:, :, None] / dim_t 155 | pos_w = torch.stack((pos_w[:, :, 0::2].sin(), pos_w[:, :, 1::2].cos()), dim=3).flatten(2) 156 | 157 | h_embed = pos_tensor[:, :, 3] * scale 158 | pos_h = h_embed[:, :, None] / dim_t 159 | pos_h = torch.stack((pos_h[:, :, 0::2].sin(), pos_h[:, :, 1::2].cos()), dim=3).flatten(2) 160 | 161 | pos = torch.cat((pos_y, pos_x, pos_w, pos_h), dim=2) 162 | else: 163 | raise ValueError("Unknown pos_tensor shape(-1):{}".format(pos_tensor.size(-1))) 164 | return pos -------------------------------------------------------------------------------- /models/ops/modules/ms_deform_attn.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Copyright (c) 2022 megvii-research. All Rights Reserved. 
3 | # ------------------------------------------------------------------------ 4 | # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR) 5 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 6 | # ------------------------------------------------------------------------ 7 | # Modified from DETR (https://github.com/facebookresearch/detr) 8 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 9 | # ------------------------------------------------------------------------ 10 | 11 | 12 | from __future__ import absolute_import 13 | from __future__ import print_function 14 | from __future__ import division 15 | 16 | import warnings 17 | import math 18 | 19 | import torch 20 | from torch import nn 21 | import torch.nn.functional as F 22 | from torch.nn.init import xavier_uniform_, constant_ 23 | 24 | from ..functions import MSDeformAttnFunction, ms_deform_attn_core_pytorch 25 | features_grad=0.0 26 | 27 | def _is_power_of_2(n): 28 | if (not isinstance(n, int)) or (n < 0): 29 | raise ValueError("invalid input for _is_power_of_2: {} (type: {})".format(n, type(n))) 30 | return (n & (n-1) == 0) and n != 0 31 | 32 | 33 | class MSDeformAttn(nn.Module): 34 | def __init__(self, d_model=256, n_levels=4, n_heads=8, n_points=4, sigmoid_attn=False, im2col_step=64): 35 | """ 36 | Multi-Scale Deformable Attention Module 37 | :param d_model hidden dimension 38 | :param n_levels number of feature levels 39 | :param n_heads number of attention heads 40 | :param n_points number of sampling points per attention head per feature level 41 | """ 42 | super().__init__() 43 | if d_model % n_heads != 0: 44 | raise ValueError('d_model must be divisible by n_heads, but got {} and {}'.format(d_model, n_heads)) 45 | _d_per_head = d_model // n_heads 46 | # you'd better set _d_per_head to a power of 2 which is more efficient in our CUDA implementation 47 | if not _is_power_of_2(_d_per_head): 48 | warnings.warn("You'd better set d_model in MSDeformAttn to make the dimension of each attention head a power of 2 " 49 | "which is more efficient in our CUDA implementation.") 50 | 51 | self.im2col_step = im2col_step 52 | self.sigmoid_attn = sigmoid_attn 53 | 54 | self.d_model = d_model 55 | self.n_levels = n_levels 56 | self.n_heads = n_heads 57 | self.n_points = n_points 58 | 59 | self.sampling_offsets = nn.Linear(d_model, n_heads * n_levels * n_points * 2) 60 | self.attention_weights = nn.Linear(d_model, n_heads * n_levels * n_points) 61 | self.value_proj = nn.Linear(d_model, d_model) 62 | self.output_proj = nn.Linear(d_model, d_model) 63 | 64 | self._reset_parameters() 65 | 66 | def _reset_parameters(self): 67 | constant_(self.sampling_offsets.weight.data, 0.) 68 | thetas = torch.arange(self.n_heads, dtype=torch.float32) * (2.0 * math.pi / self.n_heads) 69 | grid_init = torch.stack([thetas.cos(), thetas.sin()], -1) 70 | grid_init = (grid_init / grid_init.abs().max(-1, keepdim=True)[0]).view(self.n_heads, 1, 1, 2).repeat(1, self.n_levels, self.n_points, 1) 71 | for i in range(self.n_points): 72 | grid_init[:, :, i, :] *= i + 1 73 | with torch.no_grad(): 74 | self.sampling_offsets.bias = nn.Parameter(grid_init.view(-1)) 75 | constant_(self.attention_weights.weight.data, 0.) 76 | constant_(self.attention_weights.bias.data, 0.) 77 | xavier_uniform_(self.value_proj.weight.data) 78 | constant_(self.value_proj.bias.data, 0.) 79 | xavier_uniform_(self.output_proj.weight.data) 80 | constant_(self.output_proj.bias.data, 0.) 
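    # Worked example, added for illustration (the sizes are hypothetical, not taken from the repo):
    # for N=1 and n_levels=4 feature maps of (100, 167), (50, 84), (25, 42), (13, 21),
    #     Len_in = 100*167 + 50*84 + 25*42 + 13*21 = 16700 + 4200 + 1050 + 273 = 22223,
    # input_spatial_shapes = [[100, 167], [50, 84], [25, 42], [13, 21]], and
    # input_level_start_index = [0, 16700, 20900, 21950]; these are the cumulative offsets that
    # the forward pass below asserts against and passes to the CUDA kernel.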
81 | 82 | def forward(self, query, reference_points, input_flatten, input_spatial_shapes, input_level_start_index, input_padding_mask=None): 83 | """ 84 | :param query (N, Length_{query}, C) 85 | :param reference_points (N, Length_{query}, n_levels, 2), range in [0, 1], top-left (0,0), bottom-right (1, 1), including padding area 86 | or (N, Length_{query}, n_levels, 4), add additional (w, h) to form reference boxes 87 | :param input_flatten (N, \sum_{l=0}^{L-1} H_l \cdot W_l, C) 88 | :param input_spatial_shapes (n_levels, 2), [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})] 89 | :param input_level_start_index (n_levels, ), [0, H_0*W_0, H_0*W_0+H_1*W_1, H_0*W_0+H_1*W_1+H_2*W_2, ..., H_0*W_0+H_1*W_1+...+H_{L-1}*W_{L-1}] 90 | :param input_padding_mask (N, \sum_{l=0}^{L-1} H_l \cdot W_l), True for padding elements, False for non-padding elements 91 | 92 | :return output (N, Length_{query}, C) 93 | """ 94 | N, Len_q, _ = query.shape 95 | N, Len_in, _ = input_flatten.shape 96 | assert (input_spatial_shapes[:, 0] * input_spatial_shapes[:, 1]).sum() == Len_in 97 | 98 | value = self.value_proj(input_flatten) 99 | if input_padding_mask is not None: 100 | value.masked_fill_(input_padding_mask[..., None], float(0)) 101 | value = value.view(N, Len_in, self.n_heads, self.d_model // self.n_heads) 102 | sampling_offsets = self.sampling_offsets(query).view(N, Len_q, self.n_heads, self.n_levels, self.n_points, 2) 103 | attention_weights = self.attention_weights(query).view(N, Len_q, self.n_heads, self.n_levels * self.n_points) 104 | if self.sigmoid_attn: 105 | attention_weights = attention_weights.sigmoid().view(N, Len_q, self.n_heads, self.n_levels, self.n_points) 106 | else: 107 | attention_weights = F.softmax(attention_weights, -1).view(N, Len_q, self.n_heads, self.n_levels, self.n_points) 108 | # N, Len_q, n_heads, n_levels, n_points, 2 109 | if reference_points.shape[-1] == 2: 110 | sampling_locations = reference_points[:, :, None, :, None, :] \ 111 | + sampling_offsets / input_spatial_shapes[None, None, None, :, None, (1, 0)] 112 | elif reference_points.shape[-1] == 4: 113 | sampling_locations = reference_points[:, :, None, :, None, :2] \ 114 | + sampling_offsets / self.n_points * reference_points[:, :, None, :, None, 2:] * 0.5 115 | else: 116 | raise ValueError( 117 | 'Last dim of reference_points must be 2 or 4, but get {} instead.'.format(reference_points.shape[-1])) 118 | 119 | # def extract(g): 120 | # global features_grad 121 | # features_grad = g 122 | # value.requires_grad=True 123 | # value.register_hook(extract) 124 | 125 | output = MSDeformAttnFunction.apply(value, input_spatial_shapes, input_level_start_index, sampling_locations, attention_weights, self.im2col_step) 126 | # output = MSDeformAttnFunction.apply(value.double(), input_spatial_shapes, input_level_start_index, sampling_locations.double(), attention_weights.double(), self.im2col_step).float() 127 | # output = ms_deform_attn_core_pytorch(value, input_spatial_shapes, sampling_locations, attention_weights) 128 | 129 | output = self.output_proj(output) 130 | return output 131 | -------------------------------------------------------------------------------- /models/ops/build/lib.linux-x86_64-3.8/modules/ms_deform_attn.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Copyright (c) 2022 megvii-research. All Rights Reserved. 
3 | # ------------------------------------------------------------------------ 4 | # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR) 5 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 6 | # ------------------------------------------------------------------------ 7 | # Modified from DETR (https://github.com/facebookresearch/detr) 8 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 9 | # ------------------------------------------------------------------------ 10 | 11 | 12 | from __future__ import absolute_import 13 | from __future__ import print_function 14 | from __future__ import division 15 | 16 | import warnings 17 | import math 18 | 19 | import torch 20 | from torch import nn 21 | import torch.nn.functional as F 22 | from torch.nn.init import xavier_uniform_, constant_ 23 | 24 | from ..functions import MSDeformAttnFunction, ms_deform_attn_core_pytorch 25 | features_grad=0.0 26 | 27 | def _is_power_of_2(n): 28 | if (not isinstance(n, int)) or (n < 0): 29 | raise ValueError("invalid input for _is_power_of_2: {} (type: {})".format(n, type(n))) 30 | return (n & (n-1) == 0) and n != 0 31 | 32 | 33 | class MSDeformAttn(nn.Module): 34 | def __init__(self, d_model=256, n_levels=4, n_heads=8, n_points=4, sigmoid_attn=False, im2col_step=64): 35 | """ 36 | Multi-Scale Deformable Attention Module 37 | :param d_model hidden dimension 38 | :param n_levels number of feature levels 39 | :param n_heads number of attention heads 40 | :param n_points number of sampling points per attention head per feature level 41 | """ 42 | super().__init__() 43 | if d_model % n_heads != 0: 44 | raise ValueError('d_model must be divisible by n_heads, but got {} and {}'.format(d_model, n_heads)) 45 | _d_per_head = d_model // n_heads 46 | # you'd better set _d_per_head to a power of 2 which is more efficient in our CUDA implementation 47 | if not _is_power_of_2(_d_per_head): 48 | warnings.warn("You'd better set d_model in MSDeformAttn to make the dimension of each attention head a power of 2 " 49 | "which is more efficient in our CUDA implementation.") 50 | 51 | self.im2col_step = im2col_step 52 | self.sigmoid_attn = sigmoid_attn 53 | 54 | self.d_model = d_model 55 | self.n_levels = n_levels 56 | self.n_heads = n_heads 57 | self.n_points = n_points 58 | 59 | self.sampling_offsets = nn.Linear(d_model, n_heads * n_levels * n_points * 2) 60 | self.attention_weights = nn.Linear(d_model, n_heads * n_levels * n_points) 61 | self.value_proj = nn.Linear(d_model, d_model) 62 | self.output_proj = nn.Linear(d_model, d_model) 63 | 64 | self._reset_parameters() 65 | 66 | def _reset_parameters(self): 67 | constant_(self.sampling_offsets.weight.data, 0.) 68 | thetas = torch.arange(self.n_heads, dtype=torch.float32) * (2.0 * math.pi / self.n_heads) 69 | grid_init = torch.stack([thetas.cos(), thetas.sin()], -1) 70 | grid_init = (grid_init / grid_init.abs().max(-1, keepdim=True)[0]).view(self.n_heads, 1, 1, 2).repeat(1, self.n_levels, self.n_points, 1) 71 | for i in range(self.n_points): 72 | grid_init[:, :, i, :] *= i + 1 73 | with torch.no_grad(): 74 | self.sampling_offsets.bias = nn.Parameter(grid_init.view(-1)) 75 | constant_(self.attention_weights.weight.data, 0.) 76 | constant_(self.attention_weights.bias.data, 0.) 77 | xavier_uniform_(self.value_proj.weight.data) 78 | constant_(self.value_proj.bias.data, 0.) 79 | xavier_uniform_(self.output_proj.weight.data) 80 | constant_(self.output_proj.bias.data, 0.) 
81 | 82 | def forward(self, query, reference_points, input_flatten, input_spatial_shapes, input_level_start_index, input_padding_mask=None): 83 | """ 84 | :param query (N, Length_{query}, C) 85 | :param reference_points (N, Length_{query}, n_levels, 2), range in [0, 1], top-left (0,0), bottom-right (1, 1), including padding area 86 | or (N, Length_{query}, n_levels, 4), add additional (w, h) to form reference boxes 87 | :param input_flatten (N, \sum_{l=0}^{L-1} H_l \cdot W_l, C) 88 | :param input_spatial_shapes (n_levels, 2), [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})] 89 | :param input_level_start_index (n_levels, ), [0, H_0*W_0, H_0*W_0+H_1*W_1, H_0*W_0+H_1*W_1+H_2*W_2, ..., H_0*W_0+H_1*W_1+...+H_{L-1}*W_{L-1}] 90 | :param input_padding_mask (N, \sum_{l=0}^{L-1} H_l \cdot W_l), True for padding elements, False for non-padding elements 91 | 92 | :return output (N, Length_{query}, C) 93 | """ 94 | N, Len_q, _ = query.shape 95 | N, Len_in, _ = input_flatten.shape 96 | assert (input_spatial_shapes[:, 0] * input_spatial_shapes[:, 1]).sum() == Len_in 97 | 98 | value = self.value_proj(input_flatten) 99 | if input_padding_mask is not None: 100 | value.masked_fill_(input_padding_mask[..., None], float(0)) 101 | value = value.view(N, Len_in, self.n_heads, self.d_model // self.n_heads) 102 | sampling_offsets = self.sampling_offsets(query).view(N, Len_q, self.n_heads, self.n_levels, self.n_points, 2) 103 | attention_weights = self.attention_weights(query).view(N, Len_q, self.n_heads, self.n_levels * self.n_points) 104 | if self.sigmoid_attn: 105 | attention_weights = attention_weights.sigmoid().view(N, Len_q, self.n_heads, self.n_levels, self.n_points) 106 | else: 107 | attention_weights = F.softmax(attention_weights, -1).view(N, Len_q, self.n_heads, self.n_levels, self.n_points) 108 | # N, Len_q, n_heads, n_levels, n_points, 2 109 | if reference_points.shape[-1] == 2: 110 | sampling_locations = reference_points[:, :, None, :, None, :] \ 111 | + sampling_offsets / input_spatial_shapes[None, None, None, :, None, (1, 0)] 112 | elif reference_points.shape[-1] == 4: 113 | sampling_locations = reference_points[:, :, None, :, None, :2] \ 114 | + sampling_offsets / self.n_points * reference_points[:, :, None, :, None, 2:] * 0.5 115 | else: 116 | raise ValueError( 117 | 'Last dim of reference_points must be 2 or 4, but get {} instead.'.format(reference_points.shape[-1])) 118 | 119 | # def extract(g): 120 | # global features_grad 121 | # features_grad = g 122 | # value.requires_grad=True 123 | # value.register_hook(extract) 124 | 125 | output = MSDeformAttnFunction.apply(value, input_spatial_shapes, input_level_start_index, sampling_locations, attention_weights, self.im2col_step) 126 | # output = MSDeformAttnFunction.apply(value.double(), input_spatial_shapes, input_level_start_index, sampling_locations.double(), attention_weights.double(), self.im2col_step).float() 127 | # output = ms_deform_attn_core_pytorch(value, input_spatial_shapes, sampling_locations, attention_weights) 128 | 129 | output = self.output_proj(output) 130 | return output 131 | --------------------------------------------------------------------------------
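To make the interface above concrete, here is a minimal usage sketch of the MSDeformAttn module from models/ops/modules/ms_deform_attn.py. It is illustrative only: it assumes the MultiScaleDeformableAttention CUDA extension has been compiled, a GPU is available, and the repository root is on PYTHONPATH; the feature-map sizes, query count, and variable names are invented for the example.

import torch
from models.ops.modules.ms_deform_attn import MSDeformAttn

device = torch.device("cuda")
attn = MSDeformAttn(d_model=256, n_levels=2, n_heads=8, n_points=4).to(device)

# Two feature levels (32x32 and 16x16), flattened and concatenated along the length dim.
spatial_shapes = torch.as_tensor([[32, 32], [16, 16]], dtype=torch.long, device=device)
level_start_index = torch.cat(
    (spatial_shapes.new_zeros(1), (spatial_shapes[:, 0] * spatial_shapes[:, 1]).cumsum(0)[:-1])
)                                                                   # [0, 1024]
len_in = int((spatial_shapes[:, 0] * spatial_shapes[:, 1]).sum())   # 32*32 + 16*16 = 1280

input_flatten = torch.rand(1, len_in, 256, device=device)       # (N, sum(H_l*W_l), C)
query = torch.rand(1, 300, 256, device=device)                   # (N, Len_q, C)
reference_points = torch.rand(1, 300, 2, 2, device=device)       # (N, Len_q, n_levels, 2) in [0, 1]

output = attn(query, reference_points, input_flatten, spatial_shapes, level_start_index)
print(output.shape)  # expected: torch.Size([1, 300, 256])

A reference_points tensor with a trailing dimension of 4 (reference boxes) is also accepted, in which case the sampling offsets are scaled by the box width and height, as handled in the forward method above.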