├── INSTALL.md
├── LICENSE
├── README.md
├── USAGE.md
├── __init__.py
├── cocoapi_hq
│   ├── .travis.yml
│   ├── LuaAPI
│   │   ├── CocoApi.lua
│   │   ├── MaskApi.lua
│   │   ├── cocoDemo.lua
│   │   ├── env.lua
│   │   ├── init.lua
│   │   └── rocks
│   │       └── coco-scm-1.rockspec
│   ├── MatlabAPI
│   │   ├── CocoApi.m
│   │   ├── CocoEval.m
│   │   ├── CocoUtils.m
│   │   ├── MaskApi.m
│   │   ├── cocoDemo.m
│   │   ├── evalDemo.m
│   │   ├── gason.m
│   │   └── private
│   │       ├── gasonMex.cpp
│   │       ├── gasonMex.mexa64
│   │       ├── gasonMex.mexmaci64
│   │       ├── getPrmDflt.m
│   │       └── maskApiMex.c
│   ├── PythonAPI
│   │   ├── Makefile
│   │   ├── pycocoDemo.ipynb
│   │   ├── pycocoEvalDemo.ipynb
│   │   ├── pycocotools.egg-info
│   │   │   ├── PKG-INFO
│   │   │   ├── SOURCES.txt
│   │   │   ├── dependency_links.txt
│   │   │   ├── requires.txt
│   │   │   └── top_level.txt
│   │   ├── pycocotools
│   │   │   ├── __init__.py
│   │   │   ├── _mask.c
│   │   │   ├── _mask.pyx
│   │   │   ├── boundary_utils.py
│   │   │   ├── coco.py
│   │   │   ├── cocoeval.py
│   │   │   ├── mask.py
│   │   │   ├── ytvos.py
│   │   │   └── ytvoseval.py
│   │   └── setup.py
│   ├── README.md
│   ├── README.txt
│   ├── common
│   │   ├── gason.cpp
│   │   ├── gason.h
│   │   ├── maskApi.c
│   │   └── maskApi.h
│   ├── license.txt
│   └── results
│       ├── captions_val2014_fakecap_results.json
│       ├── instances_val2014_fakebbox100_results.json
│       ├── instances_val2014_fakesegm100_results.json
│       ├── person_keypoints_val2014_fakekeypoints100_results.json
│       └── val2014_fake_eval_res.txt
├── datasets
│   ├── __init__.py
│   ├── coco.py
│   ├── coco2seq.py
│   ├── coco_eval.py
│   ├── coco_panoptic.py
│   ├── concat_dataset.py
│   ├── data_prefetcher.py
│   ├── image_to_seq_augmenter.py
│   ├── panoptic_eval.py
│   ├── samplers.py
│   ├── torchvision_datasets
│   │   ├── __init__.py
│   │   └── coco.py
│   ├── transforms.py
│   ├── transforms_clip.py
│   └── ytvos.py
├── eval_hqvis.py
├── figures
│   ├── data1_new.gif
│   ├── dataset_compare_s.png
│   ├── result_demo1.gif
│   └── vmt_banner_img.png
├── models
│   ├── __init__.py
│   ├── backbone.py
│   ├── deformable_transformer.py
│   ├── matcher.py
│   ├── ops
│   │   ├── MultiScaleDeformableAttention.egg-info
│   │   │   ├── PKG-INFO
│   │   │   ├── SOURCES.txt
│   │   │   ├── dependency_links.txt
│   │   │   └── top_level.txt
│   │   ├── functions
│   │   │   ├── __init__.py
│   │   │   └── ms_deform_attn_func.py
│   │   ├── make.sh
│   │   ├── modules
│   │   │   ├── __init__.py
│   │   │   └── ms_deform_attn.py
│   │   ├── setup.py
│   │   ├── src
│   │   │   ├── cpu
│   │   │   │   ├── ms_deform_attn_cpu.cpp
│   │   │   │   └── ms_deform_attn_cpu.h
│   │   │   ├── cuda
│   │   │   │   ├── ms_deform_attn_cuda.cu
│   │   │   │   ├── ms_deform_attn_cuda.h
│   │   │   │   └── ms_deform_im2col_cuda.cuh
│   │   │   ├── ms_deform_attn.h
│   │   │   └── vision.cpp
│   │   └── test.py
│   ├── position_encoding.py
│   ├── segmentation.py
│   ├── swin_transformer.py
│   ├── vmt.py
│   └── x101_64d.py
├── models_swin
│   ├── __init__.py
│   ├── backbone.py
│   ├── deformable_transformer.py
│   ├── matcher.py
│   ├── ops
│   │   ├── MultiScaleDeformableAttention.egg-info
│   │   │   ├── PKG-INFO
│   │   │   ├── SOURCES.txt
│   │   │   ├── dependency_links.txt
│   │   │   └── top_level.txt
│   │   ├── functions
│   │   │   ├── __init__.py
│   │   │   └── ms_deform_attn_func.py
│   │   ├── make.sh
│   │   ├── modules
│   │   │   ├── __init__.py
│   │   │   └── ms_deform_attn.py
│   │   ├── setup.py
│   │   ├── src
│   │   │   ├── cpu
│   │   │   │   ├── ms_deform_attn_cpu.cpp
│   │   │   │   └── ms_deform_attn_cpu.h
│   │   │   ├── cuda
│   │   │   │   ├── ms_deform_attn_cuda.cu
│   │   │   │   ├── ms_deform_attn_cuda.h
│   │   │   │   └── ms_deform_im2col_cuda.cuh
│   │   │   ├── ms_deform_attn.h
│   │   │   └── vision.cpp
│   │   └── test.py
│   ├── position_encoding.py
│   ├── segmentation.py
│   ├── swin_transformer.py
│   ├── vmt.py
│   └── x101_64d.py
├── requirements.txt
├── scripts
│   ├── eval_r101_test.sh
│   ├── eval_r101_val.sh
│   ├── eval_r50_test.sh
│   ├── eval_r50_val.sh
│   ├── eval_swin_test.sh
│   ├── eval_swin_val.sh
│   └── eval_swin_val_vis.sh
├── tools
│   ├── __init__.py
│   ├── inference.py
│   ├── inference_swin.py
│   ├── inference_swin_test.py
│   ├── inference_swin_with_vis.py
│   ├── inference_test.py
│   ├── inference_with_vis.py
│   └── visualizer.py
└── util
    ├── __init__.py
    ├── box_ops.py
    ├── misc.py
    └── plot_utils.py
/INSTALL.md:
--------------------------------------------------------------------------------
1 | ### Installation
2 |
3 | First, create a conda environment, install PyTorch, and clone the repository locally:
4 |
5 | ```bash
6 | conda create -n vmt python=3.7 -y
7 |
8 | conda activate vmt
9 |
10 | conda install pytorch==1.7.1 torchvision==0.8.2 torchaudio==0.7.2 -c pytorch
11 |
12 | git clone --recursive https://github.com/SysCV/vmt.git
13 | ```
14 |
15 | Install detectron2 for visualization under your working directory:
16 | ```bash
17 | git clone https://github.com/facebookresearch/detectron2.git
18 | cd detectron2
19 | pip install -e .
20 | ```
21 |
22 | Install dependencies and pycocotools for VIS and HQ-YTVIS:
23 | ```bash
24 | pip install -r requirements.txt
25 |
26 | cd cocoapi_hq/PythonAPI
27 | # To compile and install locally
28 | python setup.py build_ext --inplace
29 | # To install library to Python site-packages
30 | python setup.py build_ext install
31 | ```
32 |
33 | Compiling CUDA operators:
34 |
35 | ```bash
36 | cd ./models/ops
37 | sh ./make.sh
38 | # unit test (should see all checking is True)
39 | python test.py
40 |
41 | cd ./models_swin/ops
42 | sh ./make.sh
43 | ```
44 |
45 | ### Data Preparation
46 |
47 | Download and extract the 2019 version of the YouTube-VIS train and val images with annotations from [YouTubeVIS](https://youtube-vos.org/dataset/vis/), and download the [HQ-YTVIS annotations](https://www.vis.xyz/data/hqvis/) and the COCO 2017 dataset. We expect the directory structure to be the following:
48 |
49 |
50 | ```
51 | vmt
52 | ├── datasets
53 | │ ├── coco_keepfor_ytvis19_new.json
54 | ...
55 | ytvis
56 | ├── train
57 | ├── val
58 | ├── annotations
59 | │ ├── instances_train_sub.json
60 | │ ├── instances_val_sub.json
61 | │ ├── ytvis_hq-train.json
62 | │ ├── ytvis_hq-val.json
63 | │ ├── ytvis_hq-test.json
64 | coco
65 | ├── train2017
66 | ├── val2017
67 | ├── annotations
68 | │ ├── instances_train2017.json
69 | │ ├── instances_val2017.json
70 | ```
71 |
72 | The modified coco annotations 'coco_keepfor_ytvis19_new.json' for joint training can be downloaded from [[google]](https://drive.google.com/file/d/18yKpc8wt7xJK26QFpR5Xa0vjM5HN6ieg/view?usp=sharing). The HQ-YTVIS annotations can be downloaded from [[google]](https://drive.google.com/drive/folders/1ZU8_qO8HnJ_-vvxIAn8-_kJ4xtOdkefh?usp=sharing).
73 |
75 |
76 |
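
A quick post-install sanity check (a minimal sketch; the import names `YTVOS` and `MultiScaleDeformableAttention` are assumptions based on the upstream youtubevos API and Deformable-DETR ops that this repository builds on):

```python
# Post-install import check (sketch). Assumes the cocoapi_hq build and the
# CUDA operator build above completed successfully.
import torch
import pycocotools.mask as mask_utils          # built from cocoapi_hq/PythonAPI
from pycocotools.ytvos import YTVOS            # HQ-YTVIS data loading (assumed class name)
import MultiScaleDeformableAttention as MSDA   # compiled CUDA operator (assumed module name)

print("torch", torch.__version__, "| cuda available:", torch.cuda.is_available())
print("loaded:", mask_utils.__name__, YTVOS.__name__, MSDA.__name__)
```
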
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Video Mask Transfiner
2 | Video Mask Transfiner for High-Quality Video Instance Segmentation [ECCV 2022]
3 |
4 | [[Project Page](https://www.vis.xyz/pub/vmt/) | [Dataset Page](https://www.vis.xyz/data/hqvis/) | [Paper](https://arxiv.org/abs/2207.14012)\]
5 |
6 | > [**Video Mask Transfiner for High-Quality Video Instance Segmentation**](http://arxiv.org/abs/2207.14012),
7 | > Lei Ke, Henghui Ding, Martin Danelljan, Yu-Wing Tai, Chi-Keung Tang, Fisher Yu
8 | > *ECCV 2022 ([arXiv 2207.14012](https://arxiv.org/abs/2207.14012))*
9 |
10 |
11 |
12 | ## HQ-YTVIS: High-Quality Video Instance Segmentation Dataset
13 | Mask annotation comparison between **Youtube-VIS** and **HQ-YTVIS**. HQ-YTVIS serves as a new benchmark to facilitate future development (training & evaluation) of VIS methods aiming at higher mask quality.
14 |
15 |
16 |
17 | https://user-images.githubusercontent.com/17427852/181796696-bfe9a9dd-2d39-42a2-b218-283c210e5ffd.mp4
18 |
19 | Mask annotations in **Youtube-VIS** (Left Video) vs. mask annotations in **HQ-YTVIS** (Right Video). Please visit our [Dataset Page](https://www.vis.xyz/data/hqvis/) for a detailed description of how to use the HQ-YTVIS benchmark.
20 |
21 | **Dataset Download:** [HQ-YTVIS Annotation Link](https://drive.google.com/drive/folders/1ZU8_qO8HnJ_-vvxIAn8-_kJ4xtOdkefh?usp=sharing)\
22 | **Dataset Usage:** replace the original YTVIS annotation files with our annotation JSON files.
23 |
24 | ## HQ-YTVIS Evaluation API
25 | Please refer to our [Installation Guidance](cocoapi_hq/) and [Tube-Mask AP & Tube-Boundary AP Usage Example](eval_hqvis.py).
26 |
27 | ```
28 | python eval_hqvis.py --save-path prediction_results.json
29 | ```
30 |
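
For reference, the `--save-path` file holds one entry per predicted instance tube. A minimal sketch of a single entry, assuming the standard YTVIS-style results format (the exact fields consumed are defined in [eval_hqvis.py](eval_hqvis.py)):

```python
# Illustrative sketch of one YTVIS-style result entry (field names assumed).
example_entry = {
    "video_id": 1,            # video id from the annotation file
    "category_id": 3,         # predicted class id
    "score": 0.92,            # confidence for the whole tube
    "segmentations": [        # one COCO RLE (or None) per frame
        {"size": [720, 1280], "counts": "..."},
        None,                 # frames where the instance is absent
    ],
}
```
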
31 | ## VMT Code (under construction)
32 |
33 | ### Install
34 |
35 | Please refer to [INSTALL.md](INSTALL.md) for installation instructions and dataset preparation.
36 |
37 | ### Usages
38 |
39 | Please refer to [USAGE.md](USAGE.md) for dataset preparation and detailed running (including testing, visualization, etc.) instructions.
40 |
41 | https://user-images.githubusercontent.com/17427852/181796768-3e79ee74-2465-4af8-ba89-b5c837098e00.mp4
42 |
43 | ### Model zoo on HQ-YTVIS
44 |
45 | Models are trained on the [HQ-YTVIS](https://www.vis.xyz/data/hqvis/) **train** set and COCO, and evaluated on the [HQ-YTVIS](https://www.vis.xyz/data/hqvis/) **test** set.
46 |
47 | APB: Tube-Boundary AP (proposed in Eq.1 of the paper)
48 |
49 | APM: Tube-Mask AP (proposed in the YTVIS paper)
50 |
51 | | Model | APB | APB75 | ARB1 | APM | ARM75 | download |
52 | | ------------------------------------------------------------ | ---- | ---- | ---- | ---- | ---- | ------------------------------------------------------------ |
53 | | VMT_r50 | 30.7 | 24.2 | 31.5 | 50.5 | 54.5 | [weight](https://drive.google.com/file/d/1e9hKCC-pAGB-wSO0_qyUNoEe-5XzRocz/view?usp=sharing) |
54 | | VMT_r101 | 33.0 | 29.3 | 33.3 | 51.6 | 55.8 | [weight](https://drive.google.com/file/d/1TQs_meDaomLz56xCjAZKT1BNtS3K3sla/view?usp=sharing) |
55 | | VMT_swin_L | 44.8 | 43.4 | 43.0 | 64.8 | 70.1 | [weight](https://drive.google.com/file/d/13cDni9olYd6-xdURQMWstsW0VLbkgIKt/view?usp=sharing) |
56 |
57 | ### Citation
58 |
59 | ```bibtex
60 | @inproceedings{vmt,
61 | title = {Video Mask Transfiner for High-Quality Video Instance Segmentation},
62 | author = {Ke, Lei and Ding, Henghui and Danelljan, Martin and Tai, Yu-Wing and Tang, Chi-Keung and Yu, Fisher},
63 | booktitle = {European Conference on Computer Vision (ECCV)},
64 | year = {2022}
65 | }
66 |
67 | @inproceedings{transfiner,
68 | title={Mask Transfiner for High-Quality Instance Segmentation},
69 | author={Ke, Lei and Danelljan, Martin and Li, Xia and Tai, Yu-Wing and Tang, Chi-Keung and Yu, Fisher},
70 | booktitle = {CVPR},
71 | year = {2022}
72 | }
73 | ```
74 |
75 | ## Acknowledgement
76 | We thank [Mask Transfiner](https://github.com/SysCV/transfiner) and [SeqFormer](https://github.com/wjf5203/SeqFormer) for their open-source code.
77 |
--------------------------------------------------------------------------------
/USAGE.md:
--------------------------------------------------------------------------------
1 | ### Pretrained Models
2 |
3 | Download the pretrained models from the Model zoo table:
4 | ```
5 | mkdir pretrained_model
6 | # Put the downloaded pretrained models in this directory.
7 | ```
8 |
9 | ### Inference & Evaluation on HQ-YTVIS
10 |
11 | Refer to our [scripts folder](./scripts) for more commands:
12 |
13 | Evaluating on HQ-YTVIS test:
14 | ```
15 | bash scripts/eval_swin_test.sh
16 | ```
17 | or
18 | ```
19 | bash scripts/eval_r101_test.sh
20 | ```
21 |
22 | ### Results Visualization
23 |
24 | ```
25 | bash scripts/eval_swin_val_vis.sh
26 | ```
27 | or
28 | ```
29 | python3 -m tools.inference_swin_with_vis --masks --backbone swin_l_p4w12 --output vis_output_swin_vmt --model_path ./pretrained_model/checkpoint_swinl_final.pth --save_path exp_swin_hq_val_result.json --save-frames True
30 | ```
31 |
32 |
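
Before launching the scripts above, a quick load check can confirm that a downloaded checkpoint is intact (a minimal sketch; the file name matches the Swin-L command above, and the top-level dict layout is an assumption):

```python
# Sanity-check that a downloaded checkpoint deserializes (sketch).
import torch

ckpt = torch.load("./pretrained_model/checkpoint_swinl_final.pth", map_location="cpu")
# Checkpoints are usually dicts; printing the top-level keys confirms the file is readable.
print(type(ckpt), list(ckpt.keys()) if isinstance(ckpt, dict) else "")
```
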
--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SysCV/vmt/b0973e8609885d9ac83f803a6b51b00a30a7f564/__init__.py
--------------------------------------------------------------------------------
/cocoapi_hq/.travis.yml:
--------------------------------------------------------------------------------
1 | group: travis_latest
2 | language: python
3 | cache: pip
4 | python:
5 | - 2.7
6 | - 3.6
7 | install:
8 | - pip install --upgrade pip
9 | - pip install pycocotools
10 | script:
11 | - true
12 |
--------------------------------------------------------------------------------
/cocoapi_hq/LuaAPI/cocoDemo.lua:
--------------------------------------------------------------------------------
1 | -- Demo for the CocoApi (see CocoApi.lua)
2 | coco = require 'coco'
3 | image = require 'image'
4 |
5 | -- initialize COCO api (please specify dataType/annType below)
6 | annTypes = { 'instances', 'captions', 'person_keypoints' }
7 | dataType, annType = 'val2014', annTypes[1]; -- specify dataType/annType
8 | annFile = '../annotations/'..annType..'_'..dataType..'.json'
9 | cocoApi=coco.CocoApi(annFile)
10 |
11 | -- get all image ids, select one at random
12 | imgIds = cocoApi:getImgIds()
13 | imgId = imgIds[torch.random(imgIds:numel())]
14 |
15 | -- load image
16 | img = cocoApi:loadImgs(imgId)[1]
17 | I = image.load('../images/'..dataType..'/'..img.file_name,3)
18 |
19 | -- load and display instance annotations
20 | annIds = cocoApi:getAnnIds({imgId=imgId})
21 | anns = cocoApi:loadAnns(annIds)
22 | J = cocoApi:showAnns(I,anns)
23 | image.save('RES_'..img.file_name,J:double())
24 |
--------------------------------------------------------------------------------
/cocoapi_hq/LuaAPI/env.lua:
--------------------------------------------------------------------------------
1 | --[[----------------------------------------------------------------------------
2 |
3 | Common Objects in COntext (COCO) Toolbox. version 3.0
4 | Data, paper, and tutorials available at: http://mscoco.org/
5 | Code written by Pedro O. Pinheiro and Piotr Dollar, 2016.
6 | Licensed under the Simplified BSD License [see coco/license.txt]
7 |
8 | ------------------------------------------------------------------------------]]
9 |
10 | local coco = {}
11 | return coco
12 |
--------------------------------------------------------------------------------
/cocoapi_hq/LuaAPI/init.lua:
--------------------------------------------------------------------------------
1 | --[[----------------------------------------------------------------------------
2 |
3 | Common Objects in COntext (COCO) Toolbox. version 3.0
4 | Data, paper, and tutorials available at: http://mscoco.org/
5 | Code written by Pedro O. Pinheiro and Piotr Dollar, 2016.
6 | Licensed under the Simplified BSD License [see coco/license.txt]
7 |
8 | ------------------------------------------------------------------------------]]
9 |
10 | local coco = require 'coco.env'
11 | require 'coco.CocoApi'
12 | require 'coco.MaskApi'
13 | return coco
14 |
--------------------------------------------------------------------------------
/cocoapi_hq/LuaAPI/rocks/coco-scm-1.rockspec:
--------------------------------------------------------------------------------
1 | package = "coco"
2 | version = "scm-1"
3 |
4 | source = {
5 | url = "git://github.com/pdollar/coco.git"
6 | }
7 |
8 | description = {
9 | summary = "Interface for accessing the Microsoft COCO dataset",
10 | detailed = "See http://mscoco.org/ for more details",
11 | homepage = "https://github.com/pdollar/coco",
12 | license = "Simplified BSD"
13 | }
14 |
15 | dependencies = {
16 | "lua >= 5.1",
17 | "torch >= 7.0",
18 | "lua-cjson"
19 | }
20 |
21 | build = {
22 | type = "builtin",
23 | modules = {
24 | ["coco.env"] = "LuaAPI/env.lua",
25 | ["coco.init"] = "LuaAPI/init.lua",
26 | ["coco.MaskApi"] = "LuaAPI/MaskApi.lua",
27 | ["coco.CocoApi"] = "LuaAPI/CocoApi.lua",
28 | libmaskapi = {
29 | sources = { "common/maskApi.c" },
30 | incdirs = { "common/" }
31 | }
32 | }
33 | }
34 |
35 | -- luarocks make LuaAPI/rocks/coco-scm-1.rockspec
36 | -- https://github.com/pdollar/coco/raw/master/LuaAPI/rocks/coco-scm-1.rockspec
37 |
--------------------------------------------------------------------------------
/cocoapi_hq/MatlabAPI/MaskApi.m:
--------------------------------------------------------------------------------
1 | classdef MaskApi
2 | % Interface for manipulating masks stored in RLE format.
3 | %
4 | % RLE is a simple yet efficient format for storing binary masks. RLE
5 | % first divides a vector (or vectorized image) into a series of piecewise
6 | % constant regions and then for each piece simply stores the length of
7 | % that piece. For example, given M=[0 0 1 1 1 0 1] the RLE counts would
8 | % be [2 3 1 1], or for M=[1 1 1 1 1 1 0] the counts would be [0 6 1]
9 | % (note that the odd counts are always the numbers of zeros). Instead of
10 | % storing the counts directly, additional compression is achieved with a
11 | % variable bitrate representation based on a common scheme called LEB128.
12 | %
13 | % Compression is greatest given large piecewise constant regions.
14 | % Specifically, the size of the RLE is proportional to the number of
15 | % *boundaries* in M (or for an image the number of boundaries in the y
16 | % direction). Assuming fairly simple shapes, the RLE representation is
17 | % O(sqrt(n)) where n is number of pixels in the object. Hence space usage
18 | % is substantially lower, especially for large simple objects (large n).
19 | %
20 | % Many common operations on masks can be computed directly using the RLE
21 | % (without need for decoding). This includes computations such as area,
22 | % union, intersection, etc. All of these operations are linear in the
23 | % size of the RLE, in other words they are O(sqrt(n)) where n is the area
24 | % of the object. Computing these operations on the original mask is O(n).
25 | % Thus, using the RLE can result in substantial computational savings.
26 | %
27 | % The following API functions are defined:
28 | % encode - Encode binary masks using RLE.
29 | % decode - Decode binary masks encoded via RLE.
30 | % merge - Compute union or intersection of encoded masks.
31 | % iou - Compute intersection over union between masks.
32 | % nms - Compute non-maximum suppression between ordered masks.
33 | % area - Compute area of encoded masks.
34 | % toBbox - Get bounding boxes surrounding encoded masks.
35 | % frBbox - Convert bounding boxes to encoded masks.
36 | % frPoly - Convert polygon to encoded mask.
37 | %
38 | % Usage:
39 | % Rs = MaskApi.encode( masks )
40 | % masks = MaskApi.decode( Rs )
41 | % R = MaskApi.merge( Rs, [intersect=false] )
42 | % o = MaskApi.iou( dt, gt, [iscrowd=false] )
43 | % keep = MaskApi.nms( dt, thr )
44 | % a = MaskApi.area( Rs )
45 | % bbs = MaskApi.toBbox( Rs )
46 | % Rs = MaskApi.frBbox( bbs, h, w )
47 | % R = MaskApi.frPoly( poly, h, w )
48 | %
49 | % In the API the following formats are used:
50 | % R,Rs - [struct] Run-length encoding of binary mask(s)
51 | % masks - [hxwxn] Binary mask(s) (must have type uint8)
52 | % bbs - [nx4] Bounding box(es) stored as [x y w h]
53 | % poly - Polygon stored as {[x1 y1 x2 y2...],[x1 y1 ...],...}
54 | % dt,gt - May be either bounding boxes or encoded masks
55 | % Both poly and bbs are 0-indexed (bbox=[0 0 1 1] encloses first pixel).
56 | %
57 | % Finally, a note about the intersection over union (iou) computation.
58 | % The standard iou of a ground truth (gt) and detected (dt) object is
59 | % iou(gt,dt) = area(intersect(gt,dt)) / area(union(gt,dt))
60 | % For "crowd" regions, we use a modified criteria. If a gt object is
61 | % marked as "iscrowd", we allow a dt to match any subregion of the gt.
62 | % Choosing gt' in the crowd gt that best matches the dt can be done using
63 | % gt'=intersect(dt,gt). Since by definition union(gt',dt)=dt, computing
64 | % iou(gt,dt,iscrowd) = iou(gt',dt) = area(intersect(gt,dt)) / area(dt)
65 | % For crowd gt regions we use this modified criteria above for the iou.
66 | %
67 | % To compile use the following (some precompiled binaries are included):
68 | % mex('CFLAGS=\$CFLAGS -Wall -std=c99','-largeArrayDims',...
69 | % 'private/maskApiMex.c','../common/maskApi.c',...
70 | % '-I../common/','-outdir','private');
71 | % Please do not contact us for help with compiling.
72 | %
73 | % Microsoft COCO Toolbox. version 2.0
74 | % Data, paper, and tutorials available at: http://mscoco.org/
75 | % Code written by Piotr Dollar and Tsung-Yi Lin, 2015.
76 | % Licensed under the Simplified BSD License [see coco/license.txt]
77 |
78 | methods( Static )
79 | function Rs = encode( masks )
80 | Rs = maskApiMex( 'encode', masks );
81 | end
82 |
83 | function masks = decode( Rs )
84 | masks = maskApiMex( 'decode', Rs );
85 | end
86 |
87 | function R = merge( Rs, varargin )
88 | R = maskApiMex( 'merge', Rs, varargin{:} );
89 | end
90 |
91 | function o = iou( dt, gt, varargin )
92 | o = maskApiMex( 'iou', dt', gt', varargin{:} );
93 | end
94 |
95 | function keep = nms( dt, thr )
96 | keep = maskApiMex('nms',dt',thr);
97 | end
98 |
99 | function a = area( Rs )
100 | a = maskApiMex( 'area', Rs );
101 | end
102 |
103 | function bbs = toBbox( Rs )
104 | bbs = maskApiMex( 'toBbox', Rs )';
105 | end
106 |
107 | function Rs = frBbox( bbs, h, w )
108 | Rs = maskApiMex( 'frBbox', bbs', h, w );
109 | end
110 |
111 | function R = frPoly( poly, h, w )
112 | R = maskApiMex( 'frPoly', poly, h , w );
113 | end
114 | end
115 |
116 | end
117 |
--------------------------------------------------------------------------------
/cocoapi_hq/MatlabAPI/cocoDemo.m:
--------------------------------------------------------------------------------
1 | %% Demo for the CocoApi (see CocoApi.m)
2 |
3 | %% initialize COCO api (please specify dataType/annType below)
4 | annTypes = { 'instances', 'captions', 'person_keypoints' };
5 | dataType='val2014'; annType=annTypes{1}; % specify dataType/annType
6 | annFile=sprintf('../annotations/%s_%s.json',annType,dataType);
7 | coco=CocoApi(annFile);
8 |
9 | %% display COCO categories and supercategories
10 | if( ~strcmp(annType,'captions') )
11 | cats = coco.loadCats(coco.getCatIds());
12 | nms={cats.name}; fprintf('COCO categories: ');
13 | fprintf('%s, ',nms{:}); fprintf('\n');
14 | nms=unique({cats.supercategory}); fprintf('COCO supercategories: ');
15 | fprintf('%s, ',nms{:}); fprintf('\n');
16 | end
17 |
18 | %% get all images containing given categories, select one at random
19 | catIds = coco.getCatIds('catNms',{'person','dog','skateboard'});
20 | imgIds = coco.getImgIds('catIds',catIds);
21 | imgId = imgIds(randi(length(imgIds)));
22 |
23 | %% load and display image
24 | img = coco.loadImgs(imgId);
25 | I = imread(sprintf('../images/%s/%s',dataType,img.file_name));
26 | figure(1); imagesc(I); axis('image'); set(gca,'XTick',[],'YTick',[])
27 |
28 | %% load and display annotations
29 | annIds = coco.getAnnIds('imgIds',imgId,'catIds',catIds,'iscrowd',[]);
30 | anns = coco.loadAnns(annIds); coco.showAnns(anns);
31 |
--------------------------------------------------------------------------------
/cocoapi_hq/MatlabAPI/evalDemo.m:
--------------------------------------------------------------------------------
1 | %% Demo demonstrating the algorithm result formats for COCO
2 |
3 | %% select results type for demo (either bbox or segm)
4 | type = {'segm','bbox','keypoints'}; type = type{1}; % specify type here
5 | fprintf('Running demo for *%s* results.\n\n',type);
6 |
7 | %% initialize COCO ground truth api
8 | dataDir='../'; prefix='instances'; dataType='val2014';
9 | if(strcmp(type,'keypoints')), prefix='person_keypoints'; end
10 | annFile=sprintf('%s/annotations/%s_%s.json',dataDir,prefix,dataType);
11 | cocoGt=CocoApi(annFile);
12 |
13 | %% initialize COCO detections api
14 | resFile='%s/results/%s_%s_fake%s100_results.json';
15 | resFile=sprintf(resFile,dataDir,prefix,dataType,type);
16 | cocoDt=cocoGt.loadRes(resFile);
17 |
18 | %% visualize gt and dt side by side
19 | imgIds=sort(cocoGt.getImgIds()); imgIds=imgIds(1:100);
20 | imgId = imgIds(randi(100)); img = cocoGt.loadImgs(imgId);
21 | I = imread(sprintf('%s/images/val2014/%s',dataDir,img.file_name));
22 | figure(1); subplot(1,2,1); imagesc(I); axis('image'); axis off;
23 | annIds = cocoGt.getAnnIds('imgIds',imgId); title('ground truth')
24 | anns = cocoGt.loadAnns(annIds); cocoGt.showAnns(anns);
25 | figure(1); subplot(1,2,2); imagesc(I); axis('image'); axis off;
26 | annIds = cocoDt.getAnnIds('imgIds',imgId); title('results')
27 | anns = cocoDt.loadAnns(annIds); cocoDt.showAnns(anns);
28 |
29 | %% load raw JSON and show exact format for results
30 | fprintf('results structure have the following format:\n');
31 | res = gason(fileread(resFile)); disp(res)
32 |
33 | %% the following command can be used to save the results back to disk
34 | if(0), f=fopen(resFile,'w'); fwrite(f,gason(res)); fclose(f); end
35 |
36 | %% run COCO evaluation code (see CocoEval.m)
37 | cocoEval=CocoEval(cocoGt,cocoDt,type);
38 | cocoEval.params.imgIds=imgIds;
39 | cocoEval.evaluate();
40 | cocoEval.accumulate();
41 | cocoEval.summarize();
42 |
43 | %% generate Derek Hoiem style analysis of false positives (slow)
44 | if(0), cocoEval.analyze(); end
45 |
--------------------------------------------------------------------------------
/cocoapi_hq/MatlabAPI/gason.m:
--------------------------------------------------------------------------------
1 | function out = gason( in )
2 | % Convert between JSON strings and corresponding JSON objects.
3 | %
4 | % This parser is based on Gason written and maintained by Ivan Vashchaev:
5 | % https://github.com/vivkin/gason
6 | % Gason is a "lightweight and fast JSON parser for C++". Please see the
7 | % above link for license information and additional details about Gason.
8 | %
9 | % Given a JSON string, gason calls the C++ parser and converts the output
10 | % into an appropriate Matlab structure. As the parsing is performed in mex
11 | % the resulting parser is blazingly fast. Large JSON structs (100MB+) take
12 | % only a few seconds to parse (compared to hours for pure Matlab parsers).
13 | %
14 | % Given a JSON object, gason calls the C++ encoder to convert the object
15 | % back into a JSON string representation. Nearly any Matlab struct, cell
16 | % array, or numeric array represent a valid JSON object. Note that gason()
17 | % can be used to go both from JSON string to JSON object and back.
18 | %
19 | % Gason requires C++11 to compile (for GCC this requires version 4.7 or
20 | % later). The following command compiles the parser (may require tweaking):
21 | % mex('CXXFLAGS=\$CXXFLAGS -std=c++11 -Wall','-largeArrayDims',...
22 | % 'private/gasonMex.cpp','../common/gason.cpp',...
23 | % '-I../common/','-outdir','private');
24 | % Note the use of the "-std=c++11" flag. A number of precompiled binaries
25 | % are included, please do not contact us for help with compiling. If needed
26 | % you can specify a compiler by adding the option 'CXX="/usr/bin/g++"'.
27 | %
28 | % Note that by default JSON arrays that contain only numbers are stored as
29 | % regular Matlab arrays. Likewise, JSON arrays that contain only objects of
30 | % the same type are stored as Matlab struct arrays. This is much faster and
31 | % can use considerably less memory than always using Matlab cell arrays.
32 | %
33 | % USAGE
34 | % object = gason( string )
35 | % string = gason( object )
36 | %
37 | % INPUTS/OUTPUTS
38 | % string - JSON string
39 | % object - JSON object
40 | %
41 | % EXAMPLE
42 | % o = struct('first',{'piotr','ty'},'last',{'dollar','lin'})
43 | % s = gason( o ) % convert JSON object -> JSON string
44 | % p = gason( s ) % convert JSON string -> JSON object
45 | %
46 | % See also
47 | %
48 | % Microsoft COCO Toolbox. version 2.0
49 | % Data, paper, and tutorials available at: http://mscoco.org/
50 | % Code written by Piotr Dollar and Tsung-Yi Lin, 2015.
51 | % Licensed under the Simplified BSD License [see coco/license.txt]
52 |
53 | out = gasonMex( 'convert', in );
54 |
--------------------------------------------------------------------------------
/cocoapi_hq/MatlabAPI/private/gasonMex.mexa64:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SysCV/vmt/b0973e8609885d9ac83f803a6b51b00a30a7f564/cocoapi_hq/MatlabAPI/private/gasonMex.mexa64
--------------------------------------------------------------------------------
/cocoapi_hq/MatlabAPI/private/gasonMex.mexmaci64:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SysCV/vmt/b0973e8609885d9ac83f803a6b51b00a30a7f564/cocoapi_hq/MatlabAPI/private/gasonMex.mexmaci64
--------------------------------------------------------------------------------
/cocoapi_hq/MatlabAPI/private/getPrmDflt.m:
--------------------------------------------------------------------------------
1 | function varargout = getPrmDflt( prm, dfs, checkExtra )
2 | % Helper to set default values (if not already set) of parameter struct.
3 | %
4 | % Takes input parameters and a list of 'name'/default pairs, and for each
5 | % 'name' for which prm has no value (prm.(name) is not a field or 'name'
6 | % does not appear in prm list), getPrmDflt assigns the given default
7 | % value. If default value for variable 'name' is 'REQ', and value for
8 | % 'name' is not given, an error is thrown. See below for usage details.
9 | %
10 | % USAGE (nargout==1)
11 | % prm = getPrmDflt( prm, dfs, [checkExtra] )
12 | %
13 | % USAGE (nargout>1)
14 | % [ param1 ... paramN ] = getPrmDflt( prm, dfs, [checkExtra] )
15 | %
16 | % INPUTS
17 | % prm - param struct or cell of form {'name1' v1 'name2' v2 ...}
18 | % dfs - cell of form {'name1' def1 'name2' def2 ...}
19 | % checkExtra - [0] if 1 throw error if prm contains params not in dfs
20 | % if -1 if prm contains params not in dfs adds them
21 | %
22 | % OUTPUTS (nargout==1)
23 | % prm - parameter struct with fields 'name1' through 'nameN' assigned
24 | %
25 | % OUTPUTS (nargout>1)
26 | % param1 - value assigned to parameter with 'name1'
27 | % ...
28 | % paramN - value assigned to parameter with 'nameN'
29 | %
30 | % EXAMPLE
31 | % dfs = { 'x','REQ', 'y',0, 'z',[], 'eps',1e-3 };
32 | % prm = getPrmDflt( struct('x',1,'y',1), dfs )
33 | % [ x y z eps ] = getPrmDflt( {'x',2,'y',1}, dfs )
34 | %
35 | % See also INPUTPARSER
36 | %
37 | % Piotr's Computer Vision Matlab Toolbox Version 2.60
38 | % Copyright 2014 Piotr Dollar. [pdollar-at-gmail.com]
39 | % Licensed under the Simplified BSD License [see external/bsd.txt]
40 |
41 | if( mod(length(dfs),2) ), error('odd number of default parameters'); end
42 | if nargin<=2, checkExtra = 0; end
43 |
44 | % get the input parameters as two cell arrays: prmVal and prmField
45 | if iscell(prm) && length(prm)==1, prm=prm{1}; end
46 | if iscell(prm)
47 | if(mod(length(prm),2)), error('odd number of parameters in prm'); end
48 | prmField = prm(1:2:end); prmVal = prm(2:2:end);
49 | else
50 | if(~isstruct(prm)), error('prm must be a struct or a cell'); end
51 | prmVal = struct2cell(prm); prmField = fieldnames(prm);
52 | end
53 |
54 | % get and update default values using quick for loop
55 | dfsField = dfs(1:2:end); dfsVal = dfs(2:2:end);
56 | if checkExtra>0
57 | for i=1:length(prmField)
58 | j = find(strcmp(prmField{i},dfsField));
59 | if isempty(j), error('parameter %s is not valid', prmField{i}); end
60 | dfsVal(j) = prmVal(i);
61 | end
62 | elseif checkExtra<0
63 | for i=1:length(prmField)
64 | j = find(strcmp(prmField{i},dfsField));
65 | if isempty(j), j=length(dfsVal)+1; dfsField{j}=prmField{i}; end
66 | dfsVal(j) = prmVal(i);
67 | end
68 | else
69 | for i=1:length(prmField)
70 | dfsVal(strcmp(prmField{i},dfsField)) = prmVal(i);
71 | end
72 | end
73 |
74 | % check for missing values
75 | if any(strcmp('REQ',dfsVal))
76 | cmpArray = find(strcmp('REQ',dfsVal));
77 | error(['Required field ''' dfsField{cmpArray(1)} ''' not specified.'] );
78 | end
79 |
80 | % set output
81 | if nargout==1
82 | varargout{1} = cell2struct( dfsVal, dfsField, 2 );
83 | else
84 | varargout = dfsVal;
85 | end
86 |
--------------------------------------------------------------------------------
/cocoapi_hq/MatlabAPI/private/maskApiMex.c:
--------------------------------------------------------------------------------
1 | /**************************************************************************
2 | * Microsoft COCO Toolbox. version 2.0
3 | * Data, paper, and tutorials available at: http://mscoco.org/
4 | * Code written by Piotr Dollar and Tsung-Yi Lin, 2015.
5 | * Licensed under the Simplified BSD License [see coco/license.txt]
6 | **************************************************************************/
7 | #include "mex.h"
8 | #include "maskApi.h"
9 | #include <string.h>
10 |
11 | void checkType( const mxArray *M, mxClassID id ) {
12 | if(mxGetClassID(M)!=id) mexErrMsgTxt("Invalid type.");
13 | }
14 |
15 | mxArray* toMxArray( const RLE *R, siz n ) {
16 | const char *fs[] = {"size", "counts"};
17 | mxArray *M=mxCreateStructMatrix(1,n,2,fs);
18 | for( siz i=0; i1) mexErrMsgTxt(err);
35 | for( i=0; i<*n; i++ ) {
36 | mxArray *S, *C; double *s; void *c;
37 | S=mxGetFieldByNumber(M,i,O[0]); checkType(S,mxDOUBLE_CLASS);
38 | C=mxGetFieldByNumber(M,i,O[1]); s=mxGetPr(S); c=mxGetData(C);
39 | h=(siz)s[0]; w=(siz)s[1]; m=mxGetNumberOfElements(C);
40 | if(same && i>0 && (h!=R[0].h || w!=R[0].w)) mexErrMsgTxt(err);
41 | if( mxGetClassID(C)==mxDOUBLE_CLASS ) {
42 | rleInit(R+i,h,w,m,0);
43 | for(j=0; j=2) ? (mxGetScalar(pr[1])>0) : false;
74 | rleMerge(R,&M,n,intersect); pl[0]=toMxArray(&M,1); rleFree(&M);
75 |
76 | } else if(!strcmp(action,"area")) {
77 | R=frMxArray(pr[0],&n,0);
78 | pl[0]=mxCreateNumericMatrix(1,n,mxUINT32_CLASS,mxREAL);
79 | uint *a=(uint*) mxGetPr(pl[0]); rleArea(R,n,a);
80 |
81 | } else if(!strcmp(action,"iou")) {
82 | if(nr>2) checkType(pr[2],mxUINT8_CLASS); siz nDt, nGt;
83 | byte *iscrowd = nr>2 ? (byte*) mxGetPr(pr[2]) : NULL;
84 | if(mxIsStruct(pr[0]) || mxIsStruct(pr[1])) {
85 | RLE *dt=frMxArray(pr[0],&nDt,1), *gt=frMxArray(pr[1],&nGt,1);
86 | pl[0]=mxCreateNumericMatrix(nDt,nGt,mxDOUBLE_CLASS,mxREAL);
87 | double *o=mxGetPr(pl[0]); rleIou(dt,gt,nDt,nGt,iscrowd,o);
88 | rlesFree(&dt,nDt); rlesFree(>,nGt);
89 | } else {
90 | checkType(pr[0],mxDOUBLE_CLASS); checkType(pr[1],mxDOUBLE_CLASS);
91 | double *dt=mxGetPr(pr[0]); nDt=mxGetN(pr[0]);
92 | double *gt=mxGetPr(pr[1]); nGt=mxGetN(pr[1]);
93 | pl[0]=mxCreateNumericMatrix(nDt,nGt,mxDOUBLE_CLASS,mxREAL);
94 | double *o=mxGetPr(pl[0]); bbIou(dt,gt,nDt,nGt,iscrowd,o);
95 | }
96 |
97 | } else if(!strcmp(action,"nms")) {
98 | siz n; uint *keep; double thr=(double) mxGetScalar(pr[1]);
99 | if(mxIsStruct(pr[0])) {
100 | RLE *dt=frMxArray(pr[0],&n,1);
101 | pl[0]=mxCreateNumericMatrix(1,n,mxUINT32_CLASS,mxREAL);
102 | keep=(uint*) mxGetPr(pl[0]); rleNms(dt,n,keep,thr);
103 | rlesFree(&dt,n);
104 | } else {
105 | checkType(pr[0],mxDOUBLE_CLASS);
106 | double *dt=mxGetPr(pr[0]); n=mxGetN(pr[0]);
107 | pl[0]=mxCreateNumericMatrix(1,n,mxUINT32_CLASS,mxREAL);
108 | keep=(uint*) mxGetPr(pl[0]); bbNms(dt,n,keep,thr);
109 | }
110 |
111 | } else if(!strcmp(action,"toBbox")) {
112 | R=frMxArray(pr[0],&n,0);
113 | pl[0]=mxCreateNumericMatrix(4,n,mxDOUBLE_CLASS,mxREAL);
114 | BB bb=mxGetPr(pl[0]); rleToBbox(R,bb,n);
115 |
116 | } else if(!strcmp(action,"frBbox")) {
117 | checkType(pr[0],mxDOUBLE_CLASS);
118 | double *bb=mxGetPr(pr[0]); n=mxGetN(pr[0]);
119 | h=(siz)mxGetScalar(pr[1]); w=(siz)mxGetScalar(pr[2]);
120 | rlesInit(&R,n); rleFrBbox(R,bb,h,w,n); pl[0]=toMxArray(R,n);
121 |
122 | } else if(!strcmp(action,"frPoly")) {
123 | checkType(pr[0],mxCELL_CLASS); n=mxGetNumberOfElements(pr[0]);
124 | h=(siz)mxGetScalar(pr[1]); w=(siz)mxGetScalar(pr[2]); rlesInit(&R,n);
125 | for(siz i=0; i<n; i++) {
--------------------------------------------------------------------------------
/cocoapi_hq/PythonAPI/pycocotools.egg-info/requires.txt:
--------------------------------------------------------------------------------
1 | setuptools>=18.0
2 | cython>=0.27.3
3 | matplotlib>=2.1.0
4 |
--------------------------------------------------------------------------------
/cocoapi_hq/PythonAPI/pycocotools.egg-info/top_level.txt:
--------------------------------------------------------------------------------
1 | pycocotools
2 |
--------------------------------------------------------------------------------
/cocoapi_hq/PythonAPI/pycocotools/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'tylin'
2 |
--------------------------------------------------------------------------------
/cocoapi_hq/PythonAPI/pycocotools/boundary_utils.py:
--------------------------------------------------------------------------------
1 | from collections import defaultdict
2 | import multiprocessing
3 | import math
4 |
5 | import cv2
6 | import numpy as np
7 |
8 | import pycocotools.mask as mask_utils
9 |
10 |
11 | # General util function to get the boundary of a binary mask.
12 | def mask_to_boundary(mask, dilation_ratio=0.02):
13 | """
14 | Convert binary mask to boundary mask.
15 | :param mask (numpy array, uint8): binary mask
16 | :param dilation_ratio (float): ratio to calculate dilation = dilation_ratio * image_diagonal
17 | :return: boundary mask (numpy array)
18 | """
19 | h, w = mask.shape
20 | img_diag = np.sqrt(h ** 2 + w ** 2)
21 | dilation = int(round(dilation_ratio * img_diag))
22 | if dilation < 1:
23 | dilation = 1
24 | # Pad image so mask truncated by the image border is also considered as boundary.
25 | new_mask = cv2.copyMakeBorder(mask, 1, 1, 1, 1, cv2.BORDER_CONSTANT, value=0)
26 | kernel = np.ones((3, 3), dtype=np.uint8)
27 | new_mask_erode = cv2.erode(new_mask, kernel, iterations=dilation)
28 | mask_erode = new_mask_erode[1 : h + 1, 1 : w + 1]
29 | # G_d intersects G in the paper.
30 | return mask - mask_erode
31 |
32 |
33 | # COCO/LVIS related util functions, to get the boundary for every annotations.
34 | def augment_annotations_with_boundary_single_core(proc_id, annotations, ann_to_mask, dilation_ratio=0.02):
35 | new_annotations = []
36 |
37 | for ann in annotations:
38 | mask_list = ann_to_mask(ann)
39 | # Find mask boundary.
40 | bound_list = []
41 | for mask in mask_list:
42 | boundary = mask_to_boundary(mask, dilation_ratio)
43 | bound_list.append(mask_utils.encode(np.array(boundary[:, :, None], order="F", dtype="uint8"))[0])
44 |
45 | # Add boundary to annotation in RLE format.
46 | ann['boundary_list'] = bound_list
47 | # print('ann keys:', ann.keys())
48 | new_annotations.append(ann)
49 |
50 | return new_annotations
51 |
52 |
53 | def augment_annotations_with_boundary_multi_core(annotations, ann_to_mask, dilation_ratio=0.02):
54 | cpu_num = multiprocessing.cpu_count()
55 | annotations_split = np.array_split(annotations, cpu_num)
56 | print("Number of cores: {}, annotations per core: {}".format(cpu_num, len(annotations_split[0])))
57 | workers = multiprocessing.Pool(processes=cpu_num)
58 | processes = []
59 |
60 | for proc_id, annotation_set in enumerate(annotations_split):
61 | p = workers.apply_async(augment_annotations_with_boundary_single_core,
62 | (proc_id, annotation_set, ann_to_mask, dilation_ratio))
63 | processes.append(p)
64 |
65 | new_annotations = []
66 | for p in processes:
67 | new_annotations.extend(p.get())
68 |
69 | workers.close()
70 | workers.join()
71 |
72 | return new_annotations
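
A minimal usage sketch of `mask_to_boundary` on a toy mask (assuming the package built from this directory is importable as `pycocotools`):

```python
import numpy as np
from pycocotools.boundary_utils import mask_to_boundary

# A filled 32x32 square inside a 64x64 frame.
mask = np.zeros((64, 64), dtype=np.uint8)
mask[16:48, 16:48] = 1

# Boundary band whose width scales with the image diagonal via dilation_ratio.
boundary = mask_to_boundary(mask, dilation_ratio=0.02)
print(int(mask.sum()), int(boundary.sum()))  # full area vs. thin boundary band
```
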
--------------------------------------------------------------------------------
/cocoapi_hq/PythonAPI/pycocotools/mask.py:
--------------------------------------------------------------------------------
1 | __author__ = 'tsungyi'
2 |
3 | import pycocotools._mask as _mask
4 |
5 | # Interface for manipulating masks stored in RLE format.
6 | #
7 | # RLE is a simple yet efficient format for storing binary masks. RLE
8 | # first divides a vector (or vectorized image) into a series of piecewise
9 | # constant regions and then for each piece simply stores the length of
10 | # that piece. For example, given M=[0 0 1 1 1 0 1] the RLE counts would
11 | # be [2 3 1 1], or for M=[1 1 1 1 1 1 0] the counts would be [0 6 1]
12 | # (note that the odd counts are always the numbers of zeros). Instead of
13 | # storing the counts directly, additional compression is achieved with a
14 | # variable bitrate representation based on a common scheme called LEB128.
15 | #
16 | # Compression is greatest given large piecewise constant regions.
17 | # Specifically, the size of the RLE is proportional to the number of
18 | # *boundaries* in M (or for an image the number of boundaries in the y
19 | # direction). Assuming fairly simple shapes, the RLE representation is
20 | # O(sqrt(n)) where n is number of pixels in the object. Hence space usage
21 | # is substantially lower, especially for large simple objects (large n).
22 | #
23 | # Many common operations on masks can be computed directly using the RLE
24 | # (without need for decoding). This includes computations such as area,
25 | # union, intersection, etc. All of these operations are linear in the
26 | # size of the RLE, in other words they are O(sqrt(n)) where n is the area
27 | # of the object. Computing these operations on the original mask is O(n).
28 | # Thus, using the RLE can result in substantial computational savings.
29 | #
30 | # The following API functions are defined:
31 | # encode - Encode binary masks using RLE.
32 | # decode - Decode binary masks encoded via RLE.
33 | # merge - Compute union or intersection of encoded masks.
34 | # iou - Compute intersection over union between masks.
35 | # area - Compute area of encoded masks.
36 | # toBbox - Get bounding boxes surrounding encoded masks.
37 | # frPyObjects - Convert polygon, bbox, and uncompressed RLE to encoded RLE mask.
38 | #
39 | # Usage:
40 | # Rs = encode( masks )
41 | # masks = decode( Rs )
42 | # R = merge( Rs, intersect=false )
43 | # o = iou( dt, gt, iscrowd )
44 | # a = area( Rs )
45 | # bbs = toBbox( Rs )
46 | # Rs = frPyObjects( [pyObjects], h, w )
47 | #
48 | # In the API the following formats are used:
49 | # Rs - [dict] Run-length encoding of binary masks
50 | # R - dict Run-length encoding of binary mask
51 | # masks - [hxwxn] Binary mask(s) (must have type np.ndarray(dtype=uint8) in column-major order)
52 | # iscrowd - [nx1] list of np.ndarray. 1 indicates corresponding gt image has crowd region to ignore
53 | # bbs - [nx4] Bounding box(es) stored as [x y w h]
54 | # poly - Polygon stored as [[x1 y1 x2 y2...],[x1 y1 ...],...] (2D list)
55 | # dt,gt - May be either bounding boxes or encoded masks
56 | # Both poly and bbs are 0-indexed (bbox=[0 0 1 1] encloses first pixel).
57 | #
58 | # Finally, a note about the intersection over union (iou) computation.
59 | # The standard iou of a ground truth (gt) and detected (dt) object is
60 | # iou(gt,dt) = area(intersect(gt,dt)) / area(union(gt,dt))
61 | # For "crowd" regions, we use a modified criteria. If a gt object is
62 | # marked as "iscrowd", we allow a dt to match any subregion of the gt.
63 | # Choosing gt' in the crowd gt that best matches the dt can be done using
64 | # gt'=intersect(dt,gt). Since by definition union(gt',dt)=dt, computing
65 | # iou(gt,dt,iscrowd) = iou(gt',dt) = area(intersect(gt,dt)) / area(dt)
66 | # For crowd gt regions we use this modified criteria above for the iou.
67 | #
68 | # To compile run "python setup.py build_ext --inplace"
69 | # Please do not contact us for help with compiling.
70 | #
71 | # Microsoft COCO Toolbox. version 2.0
72 | # Data, paper, and tutorials available at: http://mscoco.org/
73 | # Code written by Piotr Dollar and Tsung-Yi Lin, 2015.
74 | # Licensed under the Simplified BSD License [see coco/license.txt]
75 |
76 | iou = _mask.iou
77 | merge = _mask.merge
78 | frPyObjects = _mask.frPyObjects
79 |
80 | def encode(bimask):
81 | if len(bimask.shape) == 3:
82 | return _mask.encode(bimask)
83 | elif len(bimask.shape) == 2:
84 | h, w = bimask.shape
85 | return _mask.encode(bimask.reshape((h, w, 1), order='F'))[0]
86 |
87 | def decode(rleObjs):
88 | if type(rleObjs) == list:
89 | return _mask.decode(rleObjs)
90 | else:
91 | return _mask.decode([rleObjs])[:,:,0]
92 |
93 | def area(rleObjs):
94 | if type(rleObjs) == list:
95 | return _mask.area(rleObjs)
96 | else:
97 | return _mask.area([rleObjs])[0]
98 |
99 | def toBbox(rleObjs):
100 | if type(rleObjs) == list:
101 | return _mask.toBbox(rleObjs)
102 | else:
103 | return _mask.toBbox([rleObjs])[0]
--------------------------------------------------------------------------------
/cocoapi_hq/PythonAPI/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup, Extension
2 | import numpy as np
3 |
4 | # To compile and install locally run "python setup.py build_ext --inplace"
5 | # To install library to Python site-packages run "python setup.py build_ext install"
6 |
7 | ext_modules = [
8 | Extension(
9 | 'pycocotools._mask',
10 | sources=['../common/maskApi.c', 'pycocotools/_mask.pyx'],
11 | include_dirs = [np.get_include(), '../common'],
12 | extra_compile_args=['-Wno-cpp', '-Wno-unused-function', '-std=c99'],
13 | )
14 | ]
15 |
16 | setup(
17 | name='pycocotools',
18 | packages=['pycocotools'],
19 | package_dir = {'pycocotools': 'pycocotools'},
20 | install_requires=[
21 | 'setuptools>=18.0',
22 | 'cython>=0.27.3',
23 | 'matplotlib>=2.1.0'
24 | ],
25 | version='2.0',
26 | ext_modules= ext_modules
27 | )
28 |
--------------------------------------------------------------------------------
/cocoapi_hq/README.md:
--------------------------------------------------------------------------------
1 | # HQ-YTVIS data loading and evaluation
2 |
3 | It supports both the tube-boundary AP evaluation proposed in Video Mask Transfiner for High-Quality Video Instance Segmentation [ECCV 2022] and the traditional tube-mask AP evaluation.
4 | ## Introduction
5 |
6 | This package provides data loading and evaluation functionalities for high-quality video instance segmentation on HQ-YTVIS. It is built on the [youtubevos API](https://github.com/youtubevos/cocoapi/) designed for the YouTube-VOS dataset (https://youtube-vos.org/dataset/vis/). For the evaluation metrics, please refer to Video Mask Transfiner for High-Quality Video Instance Segmentation [ECCV 2022].
7 |
8 | We have only implemented the Python API for HQ-YTVIS.
9 |
10 | ## Installation
11 | To install:
12 | ```
13 | cd PythonAPI
14 | # To compile and install locally
15 | python setup.py build_ext --inplace
16 | # To install library to Python site-packages
17 | python setup.py build_ext install
18 | ```
19 |
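
A minimal evaluation sketch, assuming the `YTVOS`/`YTVOSeval` classes in `pycocotools/ytvos.py` and `pycocotools/ytvoseval.py` follow the upstream youtubevos API (class names and the `iouType` value are assumptions; see [eval_hqvis.py](../eval_hqvis.py) for the exact invocation used in this repository):

```python
# Tube evaluation sketch (names assumed from the upstream youtubevos API).
from pycocotools.ytvos import YTVOS
from pycocotools.ytvoseval import YTVOSeval

ytvos_gt = YTVOS("ytvis_hq-test.json")                  # HQ-YTVIS ground-truth annotations
ytvos_dt = ytvos_gt.loadRes("prediction_results.json")  # predictions to score

evaluator = YTVOSeval(ytvos_gt, ytvos_dt, iouType="segm")  # tube-mask AP
evaluator.evaluate()
evaluator.accumulate()
evaluator.summarize()
```
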
20 | ## Contact
21 | If you have any questions regarding the repo, please create an issue.
22 |
--------------------------------------------------------------------------------
/cocoapi_hq/README.txt:
--------------------------------------------------------------------------------
1 | COCO API - http://cocodataset.org/
2 |
3 | COCO is a large image dataset designed for object detection, segmentation, person keypoints detection, stuff segmentation, and caption generation. This package provides Matlab, Python, and Lua APIs that assist in loading, parsing, and visualizing the annotations in COCO. Please visit http://cocodataset.org/ for more information on COCO, including for the data, paper, and tutorials. The exact format of the annotations is also described on the COCO website. The Matlab and Python APIs are complete; the Lua API provides only basic functionality.
4 |
5 | In addition to this API, please download both the COCO images and annotations in order to run the demos and use the API. Both are available on the project website.
6 | -Please download, unzip, and place the images in: coco/images/
7 | -Please download and place the annotations in: coco/annotations/
8 | For substantially more details on the API please see http://cocodataset.org/#download.
9 |
10 | After downloading the images and annotations, run the Matlab, Python, or Lua demos for example usage.
11 |
12 | To install:
13 | -For Matlab, add coco/MatlabApi to the Matlab path (OSX/Linux binaries provided)
14 | -For Python, run "make" under coco/PythonAPI
15 | -For Lua, run "luarocks make LuaAPI/rocks/coco-scm-1.rockspec" under coco/
16 |
--------------------------------------------------------------------------------
/cocoapi_hq/common/gason.h:
--------------------------------------------------------------------------------
1 | // https://github.com/vivkin/gason - pulled January 10, 2016
2 | #pragma once
3 |
4 | #include <stdint.h>
5 | #include <stddef.h>
6 | #include <assert.h>
7 |
8 | enum JsonTag {
9 | JSON_NUMBER = 0,
10 | JSON_STRING,
11 | JSON_ARRAY,
12 | JSON_OBJECT,
13 | JSON_TRUE,
14 | JSON_FALSE,
15 | JSON_NULL = 0xF
16 | };
17 |
18 | struct JsonNode;
19 |
20 | #define JSON_VALUE_PAYLOAD_MASK 0x00007FFFFFFFFFFFULL
21 | #define JSON_VALUE_NAN_MASK 0x7FF8000000000000ULL
22 | #define JSON_VALUE_TAG_MASK 0xF
23 | #define JSON_VALUE_TAG_SHIFT 47
24 |
25 | union JsonValue {
26 | uint64_t ival;
27 | double fval;
28 |
29 | JsonValue(double x)
30 | : fval(x) {
31 | }
32 | JsonValue(JsonTag tag = JSON_NULL, void *payload = nullptr) {
33 | assert((uintptr_t)payload <= JSON_VALUE_PAYLOAD_MASK);
34 | ival = JSON_VALUE_NAN_MASK | ((uint64_t)tag << JSON_VALUE_TAG_SHIFT) | (uintptr_t)payload;
35 | }
36 | bool isDouble() const {
37 | return (int64_t)ival <= (int64_t)JSON_VALUE_NAN_MASK;
38 | }
39 | JsonTag getTag() const {
40 | return isDouble() ? JSON_NUMBER : JsonTag((ival >> JSON_VALUE_TAG_SHIFT) & JSON_VALUE_TAG_MASK);
41 | }
42 | uint64_t getPayload() const {
43 | assert(!isDouble());
44 | return ival & JSON_VALUE_PAYLOAD_MASK;
45 | }
46 | double toNumber() const {
47 | assert(getTag() == JSON_NUMBER);
48 | return fval;
49 | }
50 | char *toString() const {
51 | assert(getTag() == JSON_STRING);
52 | return (char *)getPayload();
53 | }
54 | JsonNode *toNode() const {
55 | assert(getTag() == JSON_ARRAY || getTag() == JSON_OBJECT);
56 | return (JsonNode *)getPayload();
57 | }
58 | };
59 |
60 | struct JsonNode {
61 | JsonValue value;
62 | JsonNode *next;
63 | char *key;
64 | };
65 |
66 | struct JsonIterator {
67 | JsonNode *p;
68 |
69 | void operator++() {
70 | p = p->next;
71 | }
72 | bool operator!=(const JsonIterator &x) const {
73 | return p != x.p;
74 | }
75 | JsonNode *operator*() const {
76 | return p;
77 | }
78 | JsonNode *operator->() const {
79 | return p;
80 | }
81 | };
82 |
83 | inline JsonIterator begin(JsonValue o) {
84 | return JsonIterator{o.toNode()};
85 | }
86 | inline JsonIterator end(JsonValue) {
87 | return JsonIterator{nullptr};
88 | }
89 |
90 | #define JSON_ERRNO_MAP(XX) \
91 | XX(OK, "ok") \
92 | XX(BAD_NUMBER, "bad number") \
93 | XX(BAD_STRING, "bad string") \
94 | XX(BAD_IDENTIFIER, "bad identifier") \
95 | XX(STACK_OVERFLOW, "stack overflow") \
96 | XX(STACK_UNDERFLOW, "stack underflow") \
97 | XX(MISMATCH_BRACKET, "mismatch bracket") \
98 | XX(UNEXPECTED_CHARACTER, "unexpected character") \
99 | XX(UNQUOTED_KEY, "unquoted key") \
100 | XX(BREAKING_BAD, "breaking bad") \
101 | XX(ALLOCATION_FAILURE, "allocation failure")
102 |
103 | enum JsonErrno {
104 | #define XX(no, str) JSON_##no,
105 | JSON_ERRNO_MAP(XX)
106 | #undef XX
107 | };
108 |
109 | const char *jsonStrError(int err);
110 |
111 | class JsonAllocator {
112 | struct Zone {
113 | Zone *next;
114 | size_t used;
115 | } *head = nullptr;
116 |
117 | public:
118 | JsonAllocator() = default;
119 | JsonAllocator(const JsonAllocator &) = delete;
120 | JsonAllocator &operator=(const JsonAllocator &) = delete;
121 | JsonAllocator(JsonAllocator &&x) : head(x.head) {
122 | x.head = nullptr;
123 | }
124 | JsonAllocator &operator=(JsonAllocator &&x) {
125 | head = x.head;
126 | x.head = nullptr;
127 | return *this;
128 | }
129 | ~JsonAllocator() {
130 | deallocate();
131 | }
132 | void *allocate(size_t size);
133 | void deallocate();
134 | };
135 |
136 | int jsonParse(char *str, char **endptr, JsonValue *value, JsonAllocator &allocator);
137 |
--------------------------------------------------------------------------------
/cocoapi_hq/common/maskApi.c:
--------------------------------------------------------------------------------
1 | /**************************************************************************
2 | * Microsoft COCO Toolbox. version 2.0
3 | * Data, paper, and tutorials available at: http://mscoco.org/
4 | * Code written by Piotr Dollar and Tsung-Yi Lin, 2015.
5 | * Licensed under the Simplified BSD License [see coco/license.txt]
6 | **************************************************************************/
7 | #include "maskApi.h"
8 | #include <math.h>
9 | #include <stdlib.h>
10 |
11 | uint umin( uint a, uint b ) { return (a<b) ? a : b; }
12 | uint umax( uint a, uint b ) { return (a>b) ? a : b; }
13 |
14 | void rleInit( RLE *R, siz h, siz w, siz m, uint *cnts ) {
15 | R->h=h; R->w=w; R->m=m; R->cnts=(m==0)?0:malloc(sizeof(uint)*m);
16 | siz j; if(cnts) for(j=0; j<m; j++) R->cnts[j]=cnts[j];
17 | }
18 |
19 | void rleFree( RLE *R ) {
20 | free(R->cnts); R->cnts=0;
21 | }
22 |
23 | void rlesInit( RLE **R, siz n ) {
24 | siz i; *R = (RLE*) malloc(sizeof(RLE)*n);
25 | for(i=0; i0 ) {
61 | c=umin(ca,cb); cc+=c; ct=0;
62 | ca-=c; if(!ca && a0) {
83 | crowd=iscrowd!=NULL && iscrowd[g];
84 | if(dt[d].h!=gt[g].h || dt[d].w!=gt[g].w) { o[g*m+d]=-1; continue; }
85 | siz ka, kb, a, b; uint c, ca, cb, ct, i, u; int va, vb;
86 | ca=dt[d].cnts[0]; ka=dt[d].m; va=vb=0;
87 | cb=gt[g].cnts[0]; kb=gt[g].m; a=b=1; i=u=0; ct=1;
88 | while( ct>0 ) {
89 | c=umin(ca,cb); if(va||vb) { u+=c; if(va&&vb) i+=c; } ct=0;
90 | ca-=c; if(!ca && athr) keep[j]=0;
105 | }
106 | }
107 | }
108 |
109 | void bbIou( BB dt, BB gt, siz m, siz n, byte *iscrowd, double *o ) {
110 | double h, w, i, u, ga, da; siz g, d; int crowd;
111 | for( g=0; gthr) keep[j]=0;
129 | }
130 | }
131 | }
132 |
133 | void rleToBbox( const RLE *R, BB bb, siz n ) {
134 | siz i; for( i=0; id?1:c=dy && xs>xe) || (dxye);
174 | if(flip) { t=xs; xs=xe; xe=t; t=ys; ys=ye; ye=t; }
175 | s = dx>=dy ? (double)(ye-ys)/dx : (double)(xe-xs)/dy;
176 | if(dx>=dy) for( d=0; d<=dx; d++ ) {
177 | t=flip?dx-d:d; u[m]=t+xs; v[m]=(int)(ys+s*t+.5); m++;
178 | } else for( d=0; d<=dy; d++ ) {
179 | t=flip?dy-d:d; v[m]=t+ys; u[m]=(int)(xs+s*t+.5); m++;
180 | }
181 | }
182 | /* get points along y-boundary and downsample */
183 | free(x); free(y); k=m; m=0; double xd, yd;
184 | x=malloc(sizeof(int)*k); y=malloc(sizeof(int)*k);
185 | for( j=1; jw-1 ) continue;
188 | yd=(double)(v[j]h) yd=h; yd=ceil(yd);
190 | x[m]=(int) xd; y[m]=(int) yd; m++;
191 | }
192 | /* compute rle encoding given y-boundary points */
193 | k=m; a=malloc(sizeof(uint)*(k+1));
194 | for( j=0; j0) b[m++]=a[j++]; else {
200 | j++; if(jm, p=0; long x; int more;
207 | char *s=malloc(sizeof(char)*m*6);
208 | for( i=0; icnts[i]; if(i>2) x-=(long) R->cnts[i-2]; more=1;
210 | while( more ) {
211 | char c=x & 0x1f; x >>= 5; more=(c & 0x10) ? x!=-1 : x!=0;
212 | if(more) c |= 0x20; c+=48; s[p++]=c;
213 | }
214 | }
215 | s[p]=0; return s;
216 | }
217 |
218 | void rleFrString( RLE *R, char *s, siz h, siz w ) {
219 | siz m=0, p=0, k; long x; int more; uint *cnts;
220 | while( s[m] ) m++; cnts=malloc(sizeof(uint)*m); m=0;
221 | while( s[p] ) {
222 | x=0; k=0; more=1;
223 | while( more ) {
224 | char c=s[p]-48; x |= (c & 0x1f) << 5*k;
225 | more = c & 0x20; p++; k++;
226 | if(!more && (c & 0x10)) x |= -1 << 5*k;
227 | }
228 | if(m>2) x+=(long) cnts[m-2]; cnts[m++]=(uint) x;
229 | }
230 | rleInit(R,h,w,m,cnts); free(cnts);
231 | }
232 |
--------------------------------------------------------------------------------
/cocoapi_hq/common/maskApi.h:
--------------------------------------------------------------------------------
1 | /**************************************************************************
2 | * Microsoft COCO Toolbox. version 2.0
3 | * Data, paper, and tutorials available at: http://mscoco.org/
4 | * Code written by Piotr Dollar and Tsung-Yi Lin, 2015.
5 | * Licensed under the Simplified BSD License [see coco/license.txt]
6 | **************************************************************************/
7 | #pragma once
8 |
9 | typedef unsigned int uint;
10 | typedef unsigned long siz;
11 | typedef unsigned char byte;
12 | typedef double* BB;
13 | typedef struct { siz h, w, m; uint *cnts; } RLE;
14 |
15 | /* Initialize/destroy RLE. */
16 | void rleInit( RLE *R, siz h, siz w, siz m, uint *cnts );
17 | void rleFree( RLE *R );
18 |
19 | /* Initialize/destroy RLE array. */
20 | void rlesInit( RLE **R, siz n );
21 | void rlesFree( RLE **R, siz n );
22 |
23 | /* Encode binary masks using RLE. */
24 | void rleEncode( RLE *R, const byte *mask, siz h, siz w, siz n );
25 |
26 | /* Decode binary masks encoded via RLE. */
27 | void rleDecode( const RLE *R, byte *mask, siz n );
28 |
29 | /* Compute union or intersection of encoded masks. */
30 | void rleMerge( const RLE *R, RLE *M, siz n, int intersect );
31 |
32 | /* Compute area of encoded masks. */
33 | void rleArea( const RLE *R, siz n, uint *a );
34 |
35 | /* Compute intersection over union between masks. */
36 | void rleIou( RLE *dt, RLE *gt, siz m, siz n, byte *iscrowd, double *o );
37 |
38 | /* Compute non-maximum suppression between bounding masks */
39 | void rleNms( RLE *dt, siz n, uint *keep, double thr );
40 |
41 | /* Compute intersection over union between bounding boxes. */
42 | void bbIou( BB dt, BB gt, siz m, siz n, byte *iscrowd, double *o );
43 |
44 | /* Compute non-maximum suppression between bounding boxes */
45 | void bbNms( BB dt, siz n, uint *keep, double thr );
46 |
47 | /* Get bounding boxes surrounding encoded masks. */
48 | void rleToBbox( const RLE *R, BB bb, siz n );
49 |
50 | /* Convert bounding boxes to encoded masks. */
51 | void rleFrBbox( RLE *R, const BB bb, siz h, siz w, siz n );
52 |
53 | /* Convert polygon to encoded mask. */
54 | void rleFrPoly( RLE *R, const double *xy, siz k, siz h, siz w );
55 |
56 | /* Get compressed string representation of encoded mask. */
57 | char* rleToString( const RLE *R );
58 |
59 | /* Convert from compressed string representation of encoded mask. */
60 | void rleFrString( RLE *R, char *s, siz h, siz w );
61 |
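For orientation, the C entry points declared above are what PythonAPI/pycocotools exposes through pycocotools.mask. A small usage sketch (the toy mask and values are mine):

```python
import numpy as np
from pycocotools import mask as mask_utils

m = np.zeros((240, 320), dtype=np.uint8)
m[60:180, 80:200] = 1                              # one square object

rle = mask_utils.encode(np.asfortranarray(m))      # rleEncode (+ rleToString)
print(mask_utils.area(rle))                        # rleArea
print(mask_utils.toBbox(rle))                      # rleToBbox -> [x, y, w, h]

m2 = mask_utils.decode(rle)                        # rleFrString + rleDecode
assert (m2 == m).all()

# IoU against a shifted copy (one iscrowd flag per ground-truth mask) -> rleIou
rle2 = mask_utils.encode(np.asfortranarray(np.roll(m, 20, axis=1)))
print(mask_utils.iou([rle], [rle2], [0]))
```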
--------------------------------------------------------------------------------
/cocoapi_hq/license.txt:
--------------------------------------------------------------------------------
1 | Copyright (c) 2014, Piotr Dollar and Tsung-Yi Lin
2 | All rights reserved.
3 |
4 | Redistribution and use in source and binary forms, with or without
5 | modification, are permitted provided that the following conditions are met:
6 |
7 | 1. Redistributions of source code must retain the above copyright notice, this
8 | list of conditions and the following disclaimer.
9 | 2. Redistributions in binary form must reproduce the above copyright notice,
10 | this list of conditions and the following disclaimer in the documentation
11 | and/or other materials provided with the distribution.
12 |
13 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
14 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
15 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
16 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
17 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
18 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
19 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
20 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
21 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
22 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
23 |
24 | The views and conclusions contained in the software and documentation are those
25 | of the authors and should not be interpreted as representing official policies,
26 | either expressed or implied, of the FreeBSD Project.
27 |
--------------------------------------------------------------------------------
/cocoapi_hq/results/val2014_fake_eval_res.txt:
--------------------------------------------------------------------------------
1 | ------------------------------------------------------------------------------
2 | type=segm
3 | Running per image evaluation... DONE (t=0.45s).
4 | Accumulating evaluation results... DONE (t=0.08s).
5 | Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.320
6 | Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.562
7 | Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.299
8 | Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.387
9 | Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.310
10 | Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.327
11 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.268
12 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.415
13 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.417
14 | Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.469
15 | Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.377
16 | Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.381
17 |
18 | ------------------------------------------------------------------------------
19 | type=bbox
20 | Running per image evaluation... DONE (t=0.34s).
21 | Accumulating evaluation results... DONE (t=0.08s).
22 | Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.505
23 | Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.697
24 | Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.573
25 | Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.586
26 | Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.519
27 | Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.501
28 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.387
29 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.594
30 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.595
31 | Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.640
32 | Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.566
33 | Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.564
34 |
35 | ------------------------------------------------------------------------------
36 | type=keypoints
37 | Running per image evaluation... DONE (t=0.06s).
38 | Accumulating evaluation results... DONE (t=0.00s).
39 | Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets= 20 ] = 0.372
40 | Average Precision (AP) @[ IoU=0.50 | area= all | maxDets= 20 ] = 0.636
41 | Average Precision (AP) @[ IoU=0.75 | area= all | maxDets= 20 ] = 0.348
42 | Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets= 20 ] = 0.384
43 | Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets= 20 ] = 0.386
44 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 20 ] = 0.514
45 | Average Recall (AR) @[ IoU=0.50 | area= all | maxDets= 20 ] = 0.734
46 | Average Recall (AR) @[ IoU=0.75 | area= all | maxDets= 20 ] = 0.504
47 | Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets= 20 ] = 0.508
48 | Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets= 20 ] = 0.522
49 |
--------------------------------------------------------------------------------
/datasets/__init__.py:
--------------------------------------------------------------------------------
1 | import torch.utils.data
2 | from .torchvision_datasets import CocoDetection
3 | from datasets.ytvos import YTVOSDataset as YTVOSDataset
4 |
5 | from .coco import build as build_coco
6 | from .coco2seq import build as build_seq_coco
7 | from .concat_dataset import build as build_joint
8 | from .ytvos import build as build_ytvs
9 |
10 |
11 |
12 | def get_coco_api_from_dataset(dataset):
13 | for _ in range(10):
14 | if isinstance(dataset, torch.utils.data.Subset):
15 | dataset = dataset.dataset
16 | if isinstance(dataset, CocoDetection):
17 | return dataset.coco
18 | if isinstance(dataset, YTVOSDataset):
19 | return dataset.ytvos
20 |
21 |
22 | ### build_type only works for YoutubeVIS ###
23 | def build_dataset(image_set, args):
24 | if args.dataset_file == 'YoutubeVIS':
25 | return build_ytvs(image_set, args)
26 |
27 | if args.dataset_file == 'coco':
28 | return build_coco(image_set, args)
29 | if args.dataset_file == 'Seq_coco':
30 | return build_seq_coco(image_set, args)
31 | if args.dataset_file == 'jointcoco':
32 | return build_joint(image_set, args)
33 |
34 |
35 | raise ValueError(f'dataset {args.dataset_file} not supported')
36 |
37 |
38 |
--------------------------------------------------------------------------------
/datasets/coco.py:
--------------------------------------------------------------------------------
1 | """
2 | COCO dataset which returns image_id for evaluation.
3 |
4 | Mostly copy-paste from https://github.com/pytorch/vision/blob/13b35ff/references/detection/coco_utils.py
5 | """
6 | from pathlib import Path
7 |
8 | import torch
9 | import torch.utils.data
10 | from pycocotools import mask as coco_mask
11 |
12 | from .torchvision_datasets import CocoDetection as TvCocoDetection
13 | from util.misc import get_local_rank, get_local_size
14 | import datasets.transforms as T
15 | import random
16 |
17 |
18 | class CocoDetection(TvCocoDetection):
19 | def __init__(self, img_folder, ann_file, transforms, return_masks, cache_mode=False, local_rank=0, local_size=1):
20 | super(CocoDetection, self).__init__(img_folder, ann_file,
21 | cache_mode=cache_mode, local_rank=local_rank, local_size=local_size)
22 | self._transforms = transforms
23 | self.prepare = ConvertCocoPolysToMask(return_masks)
24 |
25 | def __getitem__(self, idx):
26 |
27 | instance_check = False
28 | while not instance_check:
29 | img, target = super(CocoDetection, self).__getitem__(idx)
30 | image_id = self.ids[idx]
31 | target = {'image_id': image_id, 'annotations': target}
32 | img, target = self.prepare(img, target)
33 | if self._transforms is not None:
34 | img, target = self._transforms(img, target)
35 |
36 | if len(target['labels']) == 0: # None instance
37 | idx = random.randint(0,self.__len__()-1)
38 | else:
39 | instance_check=True
40 |
41 | return img, target
42 |
43 |
44 | def convert_coco_poly_to_mask(segmentations, height, width):
45 | masks = []
46 | for polygons in segmentations:
47 | rles = coco_mask.frPyObjects(polygons, height, width)
48 | mask = coco_mask.decode(rles)
49 | if len(mask.shape) < 3:
50 | mask = mask[..., None]
51 | mask = torch.as_tensor(mask, dtype=torch.uint8)
52 | mask = mask.any(dim=2)
53 | masks.append(mask)
54 | if masks:
55 | masks = torch.stack(masks, dim=0)
56 | else:
57 | masks = torch.zeros((0, height, width), dtype=torch.uint8)
58 | return masks
59 |
60 |
61 | class ConvertCocoPolysToMask(object):
62 | def __init__(self, return_masks=False):
63 | self.return_masks = return_masks
64 |
65 | def __call__(self, image, target):
66 | w, h = image.size
67 |
68 | image_id = target["image_id"]
69 | image_id = torch.tensor([image_id])
70 |
71 | anno = target["annotations"]
72 |
73 | anno = [obj for obj in anno if 'iscrowd' not in obj or obj['iscrowd'] == 0]
74 |
75 | boxes = [obj["bbox"] for obj in anno]
76 | # guard against no boxes via resizing
77 | boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4)
78 | boxes[:, 2:] += boxes[:, :2]
79 | boxes[:, 0::2].clamp_(min=0, max=w)
80 | boxes[:, 1::2].clamp_(min=0, max=h)
81 |
82 | classes = [obj["category_id"] for obj in anno]
83 | classes = torch.tensor(classes, dtype=torch.int64)
84 |
85 | if self.return_masks:
86 | segmentations = [obj["segmentation_refined"] for obj in anno]
87 | masks = convert_coco_poly_to_mask(segmentations, h, w)
88 |
89 | keypoints = None
90 | if anno and "keypoints" in anno[0]:
91 | keypoints = [obj["keypoints"] for obj in anno]
92 | keypoints = torch.as_tensor(keypoints, dtype=torch.float32)
93 | num_keypoints = keypoints.shape[0]
94 | if num_keypoints:
95 | keypoints = keypoints.view(num_keypoints, -1, 3)
96 |
97 | keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0])
98 | boxes = boxes[keep]
99 | classes = classes[keep]
100 | if self.return_masks:
101 | masks = masks[keep]
102 | if keypoints is not None:
103 | keypoints = keypoints[keep]
104 |
105 | target = {}
106 | target["boxes"] = boxes
107 | target["labels"] = classes
108 | if self.return_masks:
109 | target["masks"] = masks
110 | target["image_id"] = image_id
111 | if keypoints is not None:
112 | target["keypoints"] = keypoints
113 |
114 | # for conversion to coco api
115 | area = torch.tensor([obj["area"] for obj in anno])
116 | iscrowd = torch.tensor([obj["iscrowd"] if "iscrowd" in obj else 0 for obj in anno])
117 | target["area"] = area[keep]
118 | target["iscrowd"] = iscrowd[keep]
119 |
120 | target["orig_size"] = torch.as_tensor([int(h), int(w)])
121 | target["size"] = torch.as_tensor([int(h), int(w)])
122 |
123 | return image, target
124 |
125 |
126 | def make_coco_transforms(image_set):
127 |
128 | normalize = T.Compose([
129 | T.ToTensor(),
130 | T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
131 | ])
132 |
133 | scales = [480, 512, 544, 576, 608, 640, 672, 704, 736, 768]
134 | # scales = [296, 328, 360, 392]
135 |
136 | if image_set == 'train':
137 | return T.Compose([
138 | T.RandomHorizontalFlip(),
139 | T.RandomSelect(
140 | T.RandomResize(scales, max_size=1333),
141 | T.Compose([
142 | T.RandomResize([400, 500, 600]),
143 | T.RandomSizeCrop(384, 600),
144 | T.RandomResize(scales, max_size=1333),
145 | ])
146 | ),
147 | normalize,
148 | ])
149 |
150 | if image_set == 'val':
151 | return T.Compose([
152 | T.RandomResize([800], max_size=1333),
153 | normalize,
154 | ])
155 |
156 | raise ValueError(f'unknown {image_set}')
157 |
158 |
159 | def build(image_set, args):
160 | root = Path(args.coco_path)
161 | assert root.exists(), f'provided COCO path {root} does not exist'
162 | mode = 'instances'
163 | dataset_type = args.dataset_type
164 | if args.dataset_file == 'coco':
165 | PATHS = {
166 | "train": (root / "train2017", root / "annotations" / f'{mode}_train2017.json'),
167 | "val": (root / "val2017", root / "annotations" / f'{mode}_val2017.json'),
168 | }
169 |
170 |
171 | img_folder, ann_file = PATHS[image_set]
172 | dataset = CocoDetection(img_folder, ann_file, transforms=make_coco_transforms(image_set), return_masks=args.masks,
173 | cache_mode=args.cache_mode, local_rank=get_local_rank(), local_size=get_local_size())
174 | return dataset
175 |
176 |
177 |
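A quick self-contained check (mine, assuming the repo root is on PYTHONPATH) of convert_coco_poly_to_mask: one square polygon given in COCO [x0, y0, x1, y1, ...] order, rasterized onto a 100x100 canvas.

```python
from datasets.coco import convert_coco_poly_to_mask

segmentations = [[[10.0, 10.0, 60.0, 10.0, 60.0, 60.0, 10.0, 60.0]]]
masks = convert_coco_poly_to_mask(segmentations, 100, 100)
print(masks.shape)           # torch.Size([1, 100, 100])
print(int(masks[0].sum()))   # roughly 50*50 pixels inside the square
```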
--------------------------------------------------------------------------------
/datasets/coco_panoptic.py:
--------------------------------------------------------------------------------
1 | # ------------------------------------------------------------------------
2 | # Deformable DETR
3 | # Copyright (c) 2020 SenseTime. All Rights Reserved.
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
5 | # ------------------------------------------------------------------------
6 | # Modified from DETR (https://github.com/facebookresearch/detr)
7 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
8 | # ------------------------------------------------------------------------
9 |
10 | import json
11 | from pathlib import Path
12 |
13 | import numpy as np
14 | import torch
15 | from PIL import Image
16 |
17 | from panopticapi.utils import rgb2id
18 | from util.box_ops import masks_to_boxes
19 |
20 | from .coco import make_coco_transforms
21 |
22 |
23 | class CocoPanoptic:
24 | def __init__(self, img_folder, ann_folder, ann_file, transforms=None, return_masks=True):
25 | with open(ann_file, 'r') as f:
26 | self.coco = json.load(f)
27 |
28 | # sort 'images' field so that they are aligned with 'annotations'
29 | # i.e., in alphabetical order
30 | self.coco['images'] = sorted(self.coco['images'], key=lambda x: x['id'])
31 | # sanity check
32 | if "annotations" in self.coco:
33 | for img, ann in zip(self.coco['images'], self.coco['annotations']):
34 | assert img['file_name'][:-4] == ann['file_name'][:-4]
35 |
36 | self.img_folder = img_folder
37 | self.ann_folder = ann_folder
38 | self.ann_file = ann_file
39 | self.transforms = transforms
40 | self.return_masks = return_masks
41 |
42 | def __getitem__(self, idx):
43 | ann_info = self.coco['annotations'][idx] if "annotations" in self.coco else self.coco['images'][idx]
44 | img_path = Path(self.img_folder) / ann_info['file_name'].replace('.png', '.jpg')
45 | ann_path = Path(self.ann_folder) / ann_info['file_name']
46 |
47 | img = Image.open(img_path).convert('RGB')
48 | w, h = img.size
49 | if "segments_info" in ann_info:
50 | masks = np.asarray(Image.open(ann_path), dtype=np.uint32)
51 | masks = rgb2id(masks)
52 |
53 | ids = np.array([ann['id'] for ann in ann_info['segments_info']])
54 | masks = masks == ids[:, None, None]
55 |
56 | masks = torch.as_tensor(masks, dtype=torch.uint8)
57 | labels = torch.tensor([ann['category_id'] for ann in ann_info['segments_info']], dtype=torch.int64)
58 |
59 | target = {}
60 | target['image_id'] = torch.tensor([ann_info['image_id'] if "image_id" in ann_info else ann_info["id"]])
61 | if self.return_masks:
62 | target['masks'] = masks
63 | target['labels'] = labels
64 |
65 | target["boxes"] = masks_to_boxes(masks)
66 |
67 | target['size'] = torch.as_tensor([int(h), int(w)])
68 | target['orig_size'] = torch.as_tensor([int(h), int(w)])
69 | if "segments_info" in ann_info:
70 | for name in ['iscrowd', 'area']:
71 | target[name] = torch.tensor([ann[name] for ann in ann_info['segments_info']])
72 |
73 | if self.transforms is not None:
74 | img, target = self.transforms(img, target)
75 |
76 | return img, target
77 |
78 | def __len__(self):
79 | return len(self.coco['images'])
80 |
81 | def get_height_and_width(self, idx):
82 | img_info = self.coco['images'][idx]
83 | height = img_info['height']
84 | width = img_info['width']
85 | return height, width
86 |
87 |
88 | def build(image_set, args):
89 | img_folder_root = Path(args.coco_path)
90 | ann_folder_root = Path(args.coco_panoptic_path)
91 | assert img_folder_root.exists(), f'provided COCO path {img_folder_root} does not exist'
92 | assert ann_folder_root.exists(), f'provided COCO path {ann_folder_root} does not exist'
93 | mode = 'panoptic'
94 | PATHS = {
95 | "train": ("train2017", Path("annotations") / f'{mode}_train2017.json'),
96 | "val": ("val2017", Path("annotations") / f'{mode}_val2017.json'),
97 | }
98 |
99 | img_folder, ann_file = PATHS[image_set]
100 | img_folder_path = img_folder_root / img_folder
101 | ann_folder = ann_folder_root / f'{mode}_{img_folder}'
102 | ann_file = ann_folder_root / ann_file
103 |
104 | dataset = CocoPanoptic(img_folder_path, ann_folder, ann_file,
105 | transforms=make_coco_transforms(image_set), return_masks=args.masks)
106 |
107 | return dataset
108 |
--------------------------------------------------------------------------------
/datasets/concat_dataset.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 |
3 | import torch
4 | import torch.utils.data
5 |
6 | from util.misc import get_local_rank, get_local_size
7 | import datasets.transforms_clip as T
8 | from torch.utils.data import Dataset, ConcatDataset
9 | from .coco2seq import build as build_seq_coco
10 | from .ytvos import build as build_ytvs
11 |
12 |
13 |
14 | def build(image_set, args):
15 | print('preparing coco2seq dataset ....')
16 | coco_seq = build_seq_coco(image_set, args)
17 | print('preparing hq ytvis dataset .... ')
18 | ytvis_dataset = build_ytvs(image_set, args)
19 |
20 | concat_data = ConcatDataset([ytvis_dataset, coco_seq])
21 |
22 | return concat_data
23 |
24 |
25 |
--------------------------------------------------------------------------------
/datasets/data_prefetcher.py:
--------------------------------------------------------------------------------
1 | # ------------------------------------------------------------------------
2 | # Deformable DETR
3 | # Copyright (c) 2020 SenseTime. All Rights Reserved.
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
5 | # ------------------------------------------------------------------------
6 |
7 | import torch
8 |
9 | def to_cuda(samples, targets, device):
10 | samples = samples.to(device, non_blocking=True)
11 | targets = [{k: v.to(device, non_blocking=True) for k, v in t.items()} for t in targets]
12 | # targets_n = []
13 | # for t in targets:
14 | # t_d = {}
15 | # for k, v in t.items():
16 | # if k != 'path':
17 | # t_d[k] = v.to(device, non_blocking=True)
18 | # else:
19 | # t_d[k] = v
20 | # #targets_n.append({k: v.to(device, non_blocking=True) for k, v in t.items()})
21 | # targets_n.append(t_d)
22 |
23 | return samples, targets
24 |
25 | class data_prefetcher():
26 | def __init__(self, loader, device, prefetch=True):
27 | self.loader = iter(loader)
28 | self.prefetch = prefetch
29 | self.device = device
30 | if prefetch:
31 | self.stream = torch.cuda.Stream()
32 | self.preload()
33 |
34 | def preload(self):
35 | try:
36 | self.next_samples, self.next_targets = next(self.loader)
37 | except StopIteration:
38 | self.next_samples = None
39 | self.next_targets = None
40 | return
41 | # if record_stream() doesn't work, another option is to make sure device inputs are created
42 | # on the main stream.
43 | # self.next_input_gpu = torch.empty_like(self.next_input, device='cuda')
44 | # self.next_target_gpu = torch.empty_like(self.next_target, device='cuda')
45 | # Need to make sure the memory allocated for next_* is not still in use by the main stream
46 | # at the time we start copying to next_*:
47 | # self.stream.wait_stream(torch.cuda.current_stream())
48 | with torch.cuda.stream(self.stream):
49 | self.next_samples, self.next_targets = to_cuda(self.next_samples, self.next_targets, self.device)
50 | # more code for the alternative if record_stream() doesn't work:
51 | # copy_ will record the use of the pinned source tensor in this side stream.
52 | # self.next_input_gpu.copy_(self.next_input, non_blocking=True)
53 | # self.next_target_gpu.copy_(self.next_target, non_blocking=True)
54 | # self.next_input = self.next_input_gpu
55 | # self.next_target = self.next_target_gpu
56 |
57 | # With Amp, it isn't necessary to manually convert data to half.
58 | # if args.fp16:
59 | # self.next_input = self.next_input.half()
60 | # else:
61 |
62 | def next(self):
63 | if self.prefetch:
64 | torch.cuda.current_stream().wait_stream(self.stream)
65 | samples = self.next_samples
66 | targets = self.next_targets
67 | if samples is not None:
68 | samples.record_stream(torch.cuda.current_stream())
69 | if targets is not None:
70 | for t in targets:
71 | for k, v in t.items():
72 | v.record_stream(torch.cuda.current_stream())
73 | self.preload()
74 | else:
75 | try:
76 | samples, targets = next(self.loader)
77 | samples, targets = to_cuda(samples, targets, self.device)
78 | except StopIteration:
79 | samples = None
80 | targets = None
81 | return samples, targets
82 |
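Usage sketch (mine) of the prefetcher protocol: next() hands back batches already moved to the device and returns (None, None) once the loader is exhausted. The synthetic collate function below only exists to produce the (samples, list-of-target-dicts) structure that to_cuda expects, and prefetch=True needs a CUDA device.

```python
import torch
from torch.utils.data import DataLoader, TensorDataset
from datasets.data_prefetcher import data_prefetcher

# Synthetic collate (mine): real training feeds NestedTensor batches instead.
def collate(batch):
    imgs = torch.stack([b[0] for b in batch])
    targets = [{'labels': b[1]} for b in batch]
    return imgs, targets

ds = TensorDataset(torch.randn(8, 3, 32, 32), torch.randint(0, 5, (8,)))
loader = DataLoader(ds, batch_size=2, collate_fn=collate)

prefetcher = data_prefetcher(loader, torch.device('cuda'), prefetch=True)
samples, targets = prefetcher.next()
while samples is not None:               # next() returns (None, None) at the end
    print(samples.shape, targets[0]['labels'].device)
    samples, targets = prefetcher.next()
```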
--------------------------------------------------------------------------------
/datasets/image_to_seq_augmenter.py:
--------------------------------------------------------------------------------
1 | import imgaug
2 | import imgaug.augmenters as iaa
3 | import numpy as np
4 |
5 | from datetime import datetime
6 |
7 | from imgaug.augmentables.segmaps import SegmentationMapsOnImage
8 | from imgaug.augmentables.bbs import BoundingBox, BoundingBoxesOnImage
9 |
10 |
11 | class ImageToSeqAugmenter(object):
12 | def __init__(self, perspective=True, affine=True, motion_blur=True,
13 | brightness_range=(-50, 50), hue_saturation_range=(-15, 15), perspective_magnitude=0.12,
14 | scale_range=1.0, translate_range={"x": (-0.15, 0.15), "y": (-0.15, 0.15)}, rotation_range=(-20, 20),
15 | motion_blur_kernel_sizes=(7, 9), motion_blur_prob=0.5):
16 |
17 | self.basic_augmenter = iaa.SomeOf((1, None), [
18 | iaa.Add(brightness_range),
19 | iaa.AddToHueAndSaturation(hue_saturation_range)
20 | ]
21 | )
22 |
23 | transforms = []
24 | if perspective:
25 | transforms.append(iaa.PerspectiveTransform(perspective_magnitude))
26 | if affine:
27 | transforms.append(iaa.Affine(scale=scale_range,
28 | translate_percent=translate_range,
29 | rotate=rotation_range,
30 | order=1, # cv2.INTER_LINEAR
31 | backend='auto'))
32 | transforms = iaa.Sequential(transforms)
33 | transforms = [transforms]
34 |
35 | if motion_blur:
36 | blur = iaa.Sometimes(motion_blur_prob, iaa.OneOf(
37 | [
38 | iaa.MotionBlur(ksize)
39 | for ksize in motion_blur_kernel_sizes
40 | ]
41 | ))
42 | transforms.append(blur)
43 |
44 | self.frame_shift_augmenter = iaa.Sequential(transforms)
45 |
46 | @staticmethod
47 | def condense_masks(instance_masks):
48 | condensed_mask = np.zeros_like(instance_masks[0], dtype=np.int8)
49 | for instance_id, mask in enumerate(instance_masks, 1):
50 | condensed_mask = np.where(mask, instance_id, condensed_mask)
51 |
52 | return condensed_mask
53 |
54 | @staticmethod
55 | def expand_masks(condensed_mask, num_instances):
56 | return [(condensed_mask == instance_id).astype(np.uint8) for instance_id in range(1, num_instances + 1)]
57 |
58 | def __call__(self, image, masks=None, boxes=None):
59 | det_augmenter = self.frame_shift_augmenter.to_deterministic()
60 |
61 |
62 | if masks is not None:
63 | masks_np, is_binary_mask = [], []
64 | boxs_np = []
65 |
66 | for mask in masks:
67 |
68 | if isinstance(mask, np.ndarray):
69 | masks_np.append(mask.astype(np.bool))
70 | is_binary_mask.append(False)
71 | else:
72 | raise ValueError("Invalid mask type: {}".format(type(mask)))
73 |
74 | num_instances = len(masks_np)
75 | masks_np = SegmentationMapsOnImage(self.condense_masks(masks_np), shape=image.shape[:2])
76 | # boxs_np = BoundingBoxesOnImage(boxs_np, shape=image.shape[:2])
77 |
78 | seed = int(datetime.now().strftime('%M%S%f')[-8:])
79 | imgaug.seed(seed)
80 | aug_image, aug_masks = det_augmenter(image=self.basic_augmenter(image=image) , segmentation_maps=masks_np)
81 | imgaug.seed(seed)
82 | invalid_pts_mask = det_augmenter(image=np.ones(image.shape[:2] + (1,), np.uint8)).squeeze(2)
83 | aug_masks = self.expand_masks(aug_masks.get_arr(), num_instances)
84 | # aug_boxes = aug_boxes.remove_out_of_image().clip_out_of_image()
85 | aug_masks = [mask for mask, is_bm in zip(aug_masks, is_binary_mask)]
86 | return aug_image, aug_masks #, aug_boxes.to_xyxy_array()
87 |
88 | else:
89 | masks = [SegmentationMapsOnImage(np.ones(image.shape[:2], np.bool), shape=image.shape[:2])]
90 | aug_image, invalid_pts_mask = det_augmenter(image=image, segmentation_maps=masks)
91 | return aug_image, invalid_pts_mask.get_arr() == 0
92 |
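Usage sketch (mine) of ImageToSeqAugmenter: given one image and its instance masks, a call returns an augmented "next frame" plus the correspondingly warped masks, which is how static images can be turned into short pseudo-clips. The np.bool usage above assumes the older numpy version this repo pins.

```python
import numpy as np
from datasets.image_to_seq_augmenter import ImageToSeqAugmenter

augmenter = ImageToSeqAugmenter(perspective=True, affine=True, motion_blur=True)

image = np.random.randint(0, 255, (240, 320, 3), dtype=np.uint8)
mask = np.zeros((240, 320), dtype=np.uint8)
mask[50:150, 80:200] = 1

# One call produces one augmented frame plus the warped instance masks.
aug_image, aug_masks = augmenter(image, masks=[mask])
print(aug_image.shape, len(aug_masks), aug_masks[0].shape)
```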
--------------------------------------------------------------------------------
/datasets/panoptic_eval.py:
--------------------------------------------------------------------------------
1 | # ------------------------------------------------------------------------
2 | # Deformable DETR
3 | # Copyright (c) 2020 SenseTime. All Rights Reserved.
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
5 | # ------------------------------------------------------------------------
6 | # Modified from DETR (https://github.com/facebookresearch/detr)
7 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
8 | # ------------------------------------------------------------------------
9 |
10 | import json
11 | import os
12 |
13 | import util.misc as utils
14 |
15 | try:
16 | from panopticapi.evaluation import pq_compute
17 | except ImportError:
18 | pass
19 |
20 |
21 | class PanopticEvaluator(object):
22 | def __init__(self, ann_file, ann_folder, output_dir="panoptic_eval"):
23 | self.gt_json = ann_file
24 | self.gt_folder = ann_folder
25 | if utils.is_main_process():
26 | if not os.path.exists(output_dir):
27 | os.mkdir(output_dir)
28 | self.output_dir = output_dir
29 | self.predictions = []
30 |
31 | def update(self, predictions):
32 | for p in predictions:
33 | with open(os.path.join(self.output_dir, p["file_name"]), "wb") as f:
34 | f.write(p.pop("png_string"))
35 |
36 | self.predictions += predictions
37 |
38 | def synchronize_between_processes(self):
39 | all_predictions = utils.all_gather(self.predictions)
40 | merged_predictions = []
41 | for p in all_predictions:
42 | merged_predictions += p
43 | self.predictions = merged_predictions
44 |
45 | def summarize(self):
46 | if utils.is_main_process():
47 | json_data = {"annotations": self.predictions}
48 | predictions_json = os.path.join(self.output_dir, "predictions.json")
49 | with open(predictions_json, "w") as f:
50 | f.write(json.dumps(json_data))
51 | return pq_compute(self.gt_json, predictions_json, gt_folder=self.gt_folder, pred_folder=self.output_dir)
52 | return None
53 |
--------------------------------------------------------------------------------
/datasets/samplers.py:
--------------------------------------------------------------------------------
1 | # ------------------------------------------------------------------------
2 | # Deformable DETR
3 | # Copyright (c) 2020 SenseTime. All Rights Reserved.
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
5 | # ------------------------------------------------------------------------
6 | # Modified from codes in torch.utils.data.distributed
7 | # ------------------------------------------------------------------------
8 |
9 | import os
10 | import math
11 | import torch
12 | import torch.distributed as dist
13 | from torch.utils.data.sampler import Sampler
14 |
15 |
16 | class DistributedSampler(Sampler):
17 | """Sampler that restricts data loading to a subset of the dataset.
18 | It is especially useful in conjunction with
19 | :class:`torch.nn.parallel.DistributedDataParallel`. In such case, each
20 | process can pass a DistributedSampler instance as a DataLoader sampler,
21 | and load a subset of the original dataset that is exclusive to it.
22 | .. note::
23 | Dataset is assumed to be of constant size.
24 | Arguments:
25 | dataset: Dataset used for sampling.
26 | num_replicas (optional): Number of processes participating in
27 | distributed training.
28 | rank (optional): Rank of the current process within num_replicas.
29 | """
30 |
31 | def __init__(self, dataset, num_replicas=None, rank=None, local_rank=None, local_size=None, shuffle=True):
32 | if num_replicas is None:
33 | if not dist.is_available():
34 | raise RuntimeError("Requires distributed package to be available")
35 | num_replicas = dist.get_world_size()
36 | if rank is None:
37 | if not dist.is_available():
38 | raise RuntimeError("Requires distributed package to be available")
39 | rank = dist.get_rank()
40 | self.dataset = dataset
41 | self.num_replicas = num_replicas
42 | self.rank = rank
43 | self.epoch = 0
44 | self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.num_replicas))
45 | self.total_size = self.num_samples * self.num_replicas
46 | self.shuffle = shuffle
47 |
48 | def __iter__(self):
49 | if self.shuffle:
50 | # deterministically shuffle based on epoch
51 | g = torch.Generator()
52 | g.manual_seed(self.epoch)
53 | indices = torch.randperm(len(self.dataset), generator=g).tolist()
54 | else:
55 | indices = torch.arange(len(self.dataset)).tolist()
56 |
57 | # add extra samples to make it evenly divisible
58 | indices += indices[: (self.total_size - len(indices))]
59 | assert len(indices) == self.total_size
60 |
61 | # subsample
62 | offset = self.num_samples * self.rank
63 | indices = indices[offset : offset + self.num_samples]
64 | assert len(indices) == self.num_samples
65 |
66 | return iter(indices)
67 |
68 | def __len__(self):
69 | return self.num_samples
70 |
71 | def set_epoch(self, epoch):
72 | self.epoch = epoch
73 |
74 |
75 | class NodeDistributedSampler(Sampler):
76 | """Sampler that restricts data loading to a subset of the dataset.
77 | It is especially useful in conjunction with
78 | :class:`torch.nn.parallel.DistributedDataParallel`. In such case, each
79 | process can pass a DistributedSampler instance as a DataLoader sampler,
80 | and load a subset of the original dataset that is exclusive to it.
81 | .. note::
82 | Dataset is assumed to be of constant size.
83 | Arguments:
84 | dataset: Dataset used for sampling.
85 | num_replicas (optional): Number of processes participating in
86 | distributed training.
87 | rank (optional): Rank of the current process within num_replicas.
88 | """
89 |
90 | def __init__(self, dataset, num_replicas=None, rank=None, local_rank=None, local_size=None, shuffle=True):
91 | if num_replicas is None:
92 | if not dist.is_available():
93 | raise RuntimeError("Requires distributed package to be available")
94 | num_replicas = dist.get_world_size()
95 | if rank is None:
96 | if not dist.is_available():
97 | raise RuntimeError("Requires distributed package to be available")
98 | rank = dist.get_rank()
99 | if local_rank is None:
100 | local_rank = int(os.environ.get('LOCAL_RANK', 0))
101 | if local_size is None:
102 | local_size = int(os.environ.get('LOCAL_SIZE', 1))
103 | self.dataset = dataset
104 | self.shuffle = shuffle
105 | self.num_replicas = num_replicas
106 | self.num_parts = local_size
107 | self.rank = rank
108 | self.local_rank = local_rank
109 | self.epoch = 0
110 | self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.num_replicas))
111 | self.total_size = self.num_samples * self.num_replicas
112 |
113 | self.total_size_parts = self.num_samples * self.num_replicas // self.num_parts
114 |
115 | def __iter__(self):
116 | if self.shuffle:
117 | # deterministically shuffle based on epoch
118 | g = torch.Generator()
119 | g.manual_seed(self.epoch)
120 | indices = torch.randperm(len(self.dataset), generator=g).tolist()
121 | else:
122 | indices = torch.arange(len(self.dataset)).tolist()
123 | indices = [i for i in indices if i % self.num_parts == self.local_rank]
124 |
125 | # add extra samples to make it evenly divisible
126 | indices += indices[:(self.total_size_parts - len(indices))]
127 | assert len(indices) == self.total_size_parts
128 |
129 | # subsample
130 | indices = indices[self.rank // self.num_parts:self.total_size_parts:self.num_replicas // self.num_parts]
131 | assert len(indices) == self.num_samples
132 |
133 | return iter(indices)
134 |
135 | def __len__(self):
136 | return self.num_samples
137 |
138 | def set_epoch(self, epoch):
139 | self.epoch = epoch
140 |
141 |
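A small runnable sketch (mine) of how DistributedSampler partitions indices; passing num_replicas and rank explicitly avoids needing torch.distributed to be initialized.

```python
from datasets.samplers import DistributedSampler

dataset = list(range(10))          # only len(dataset) matters to the sampler
for rank in range(3):
    sampler = DistributedSampler(dataset, num_replicas=3, rank=rank, shuffle=False)
    sampler.set_epoch(0)
    print(rank, list(iter(sampler)))
# rank 0: [0, 1, 2, 3]   rank 1: [4, 5, 6, 7]   rank 2: [8, 9, 0, 1]
# each rank gets ceil(10/3) = 4 indices, padded by wrapping around
```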
--------------------------------------------------------------------------------
/datasets/torchvision_datasets/__init__.py:
--------------------------------------------------------------------------------
1 | # ------------------------------------------------------------------------
2 | # Deformable DETR
3 | # Copyright (c) 2020 SenseTime. All Rights Reserved.
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
5 | # ------------------------------------------------------------------------
6 |
7 | from .coco import CocoDetection
8 |
--------------------------------------------------------------------------------
/datasets/torchvision_datasets/coco.py:
--------------------------------------------------------------------------------
1 | # ------------------------------------------------------------------------
2 | # Deformable DETR
3 | # Copyright (c) 2020 SenseTime. All Rights Reserved.
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
5 | # ------------------------------------------------------------------------
6 | # Modified from torchvision
7 | # ------------------------------------------------------------------------
8 |
9 | """
10 | Copy-Paste from torchvision, but add utility of caching images on memory
11 | """
12 | from torchvision.datasets.vision import VisionDataset
13 | from PIL import Image
14 | import os
15 | import os.path
16 | import tqdm
17 | from io import BytesIO
18 |
19 |
20 | class CocoDetection(VisionDataset):
21 |     """`MS Coco Detection <http://mscoco.org/>`_ Dataset.
22 | Args:
23 | root (string): Root directory where images are downloaded to.
24 | annFile (string): Path to json annotation file.
25 | transform (callable, optional): A function/transform that takes in an PIL image
26 | and returns a transformed version. E.g, ``transforms.ToTensor``
27 | target_transform (callable, optional): A function/transform that takes in the
28 | target and transforms it.
29 | transforms (callable, optional): A function/transform that takes input sample and its target as entry
30 | and returns a transformed version.
31 | """
32 |
33 | def __init__(self, root, annFile, transform=None, target_transform=None, transforms=None,
34 | cache_mode=False, local_rank=0, local_size=1):
35 | super(CocoDetection, self).__init__(root, transforms, transform, target_transform)
36 | from pycocotools.coco import COCO
37 | self.coco = COCO(annFile)
38 | self.ids = list(sorted(self.coco.imgs.keys()))
39 | self.cache_mode = cache_mode
40 | self.local_rank = local_rank
41 | self.local_size = local_size
42 | if cache_mode:
43 | self.cache = {}
44 | self.cache_images()
45 |
46 | def cache_images(self):
47 | self.cache = {}
48 | for index, img_id in zip(tqdm.trange(len(self.ids)), self.ids):
49 | if index % self.local_size != self.local_rank:
50 | continue
51 | path = self.coco.loadImgs(img_id)[0]['file_name']
52 | with open(os.path.join(self.root, path), 'rb') as f:
53 | self.cache[path] = f.read()
54 |
55 | def get_image(self, path):
56 | if self.cache_mode:
57 | if path not in self.cache.keys():
58 | with open(os.path.join(self.root, path), 'rb') as f:
59 | self.cache[path] = f.read()
60 | return Image.open(BytesIO(self.cache[path])).convert('RGB')
61 | return Image.open(os.path.join(self.root, path)).convert('RGB')
62 |
63 | def __getitem__(self, index):
64 | """
65 | Args:
66 | index (int): Index
67 | Returns:
68 | tuple: Tuple (image, target). target is the object returned by ``coco.loadAnns``.
69 | """
70 | coco = self.coco
71 | img_id = self.ids[index]
72 | ann_ids = coco.getAnnIds(imgIds=img_id)
73 | target = coco.loadAnns(ann_ids)
74 |
75 | path = coco.loadImgs(img_id)[0]['file_name']
76 |
77 | img = self.get_image(path)
78 | if self.transforms is not None:
79 | img, target = self.transforms(img, target)
80 |
81 | return img, target, path
82 |
83 | def __len__(self):
84 | return len(self.ids)
85 |
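Usage sketch (paths below are placeholders, not part of the repo): with cache_mode=True, cache_images() reads every local_size-th image into memory up front on the process whose local_rank matches, so later __getitem__ calls skip disk I/O. Note that this variant also returns the image path.

```python
from datasets.torchvision_datasets import CocoDetection

dataset = CocoDetection(root='data/coco/val2017',
                        annFile='data/coco/annotations/instances_val2017.json',
                        cache_mode=True, local_rank=0, local_size=1)
img, target, path = dataset[0]     # extra `path` returned by this variant
```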
--------------------------------------------------------------------------------
/eval_hqvis.py:
--------------------------------------------------------------------------------
1 | import argparse
2 |
3 |
4 | from pycocotools.ytvos import YTVOS
5 | from pycocotools.ytvoseval import YTVOSeval
6 |
7 | import warnings
8 | warnings.filterwarnings("ignore", category=DeprecationWarning)
9 |
10 |
11 | def ytvos_eval(result_file, result_types, ytvos, get_boundary_out, max_dets=(100, 300, 1000)):
12 |
13 | ytvos = YTVOS(ytvos, get_boundary=get_boundary_out)
14 | assert isinstance(ytvos, YTVOS)
15 |
16 | if len(ytvos.anns) == 0:
17 | print("Annotations does not exist")
18 | return
19 |
20 | assert result_file.endswith('.json')
21 | ytvos_dets = ytvos.loadRes(result_file)
22 |
23 | vid_ids = ytvos.getVidIds()
24 | for res_type in result_types:
25 | iou_type = res_type
26 | ytvosEval = YTVOSeval(ytvos, ytvos_dets, iou_type)
27 | ytvosEval.params.vidIds = vid_ids
28 | if res_type == 'proposal':
29 | ytvosEval.params.useCats = 0
30 | ytvosEval.params.maxDets = list(max_dets)
31 | ytvosEval.evaluate()
32 | ytvosEval.accumulate()
33 | ytvosEval.summarize()
34 |
35 | def main(args):
36 | result_file = args.save_path
37 | ytvos = 'ytvos'
38 | ytvos_eval(result_file, ['boundary'], 'ytvis/annotations/ytvis_hq-test.json', True, max_dets=(100, 300, 1000))
39 | ytvos_eval(result_file, ['segm'], 'ytvis/annotations/ytvis_hq-test.json', False, max_dets=(100, 300, 1000))
40 |
41 | if __name__ == '__main__':
42 | parser = argparse.ArgumentParser('inference script')
43 | parser.add_argument('--save-path')
44 | args = parser.parse_args()
45 | main(args)
46 |
--------------------------------------------------------------------------------
/figures/data1_new.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SysCV/vmt/b0973e8609885d9ac83f803a6b51b00a30a7f564/figures/data1_new.gif
--------------------------------------------------------------------------------
/figures/dataset_compare_s.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SysCV/vmt/b0973e8609885d9ac83f803a6b51b00a30a7f564/figures/dataset_compare_s.png
--------------------------------------------------------------------------------
/figures/result_demo1.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SysCV/vmt/b0973e8609885d9ac83f803a6b51b00a30a7f564/figures/result_demo1.gif
--------------------------------------------------------------------------------
/figures/vmt_banner_img.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SysCV/vmt/b0973e8609885d9ac83f803a6b51b00a30a7f564/figures/vmt_banner_img.png
--------------------------------------------------------------------------------
/models/__init__.py:
--------------------------------------------------------------------------------
1 | from .vmt import build
2 |
3 |
4 | def build_model(args):
5 | return build(args)
6 |
7 |
--------------------------------------------------------------------------------
/models/backbone.py:
--------------------------------------------------------------------------------
1 |
2 | """
3 | Backbone modules.
4 | """
5 | from collections import OrderedDict
6 |
7 | import torch
8 | import torch.nn.functional as F
9 | import torchvision
10 | from torch import nn
11 | from torchvision.models._utils import IntermediateLayerGetter
12 | from typing import Dict, List
13 |
14 | from util.misc import NestedTensor, is_main_process
15 |
16 | from .position_encoding import build_position_encoding
17 | from .x101_64d import resnext101_64x4d
18 |
19 | class FrozenBatchNorm2d(torch.nn.Module):
20 | """
21 | BatchNorm2d where the batch statistics and the affine parameters are fixed.
22 |
23 | Copy-paste from torchvision.misc.ops with added eps before rqsrt,
24 | without which any other models than torchvision.models.resnet[18,34,50,101]
25 | produce nans.
26 | """
27 |
28 | def __init__(self, n, eps=1e-5):
29 | super(FrozenBatchNorm2d, self).__init__()
30 | self.register_buffer("weight", torch.ones(n))
31 | self.register_buffer("bias", torch.zeros(n))
32 | self.register_buffer("running_mean", torch.zeros(n))
33 | self.register_buffer("running_var", torch.ones(n))
34 | self.eps = eps
35 |
36 | def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict,
37 | missing_keys, unexpected_keys, error_msgs):
38 | num_batches_tracked_key = prefix + 'num_batches_tracked'
39 | if num_batches_tracked_key in state_dict:
40 | del state_dict[num_batches_tracked_key]
41 |
42 | super(FrozenBatchNorm2d, self)._load_from_state_dict(
43 | state_dict, prefix, local_metadata, strict,
44 | missing_keys, unexpected_keys, error_msgs)
45 |
46 | def forward(self, x):
47 | # move reshapes to the beginning
48 | # to make it fuser-friendly
49 | w = self.weight.reshape(1, -1, 1, 1)
50 | b = self.bias.reshape(1, -1, 1, 1)
51 | rv = self.running_var.reshape(1, -1, 1, 1)
52 | rm = self.running_mean.reshape(1, -1, 1, 1)
53 | eps = self.eps
54 | scale = w * (rv + eps).rsqrt()
55 | bias = b - rm * scale
56 | return x * scale + bias
57 |
58 |
59 | class BackboneBase(nn.Module):
60 |
61 | def __init__(self, backbone: nn.Module, train_backbone: bool, return_interm_layers: bool):
62 | super().__init__()
63 | for name, parameter in backbone.named_parameters():
64 | if not train_backbone or 'layer2' not in name and 'layer3' not in name and 'layer4' not in name:
65 | parameter.requires_grad_(False)
66 | if return_interm_layers:
67 | return_layers = {"layer1": "0", "layer2": "1", "layer3": "2", "layer4": "3", "conv1": "4"}
68 | self.strides = [8, 16, 32]
69 | self.num_channels = [512, 1024, 2048]
70 | else:
71 | return_layers = {'layer4': "0"}
72 | self.strides = [32]
73 | self.num_channels = [2048]
74 | self.body = IntermediateLayerGetter(backbone, return_layers=return_layers)
75 |
76 | def forward(self, tensor_list: NestedTensor):
77 | xs = self.body(tensor_list.tensors)
78 | out: Dict[str, NestedTensor] = {}
79 | for name, x in xs.items():
80 | m = tensor_list.mask
81 | assert m is not None
82 | mask = F.interpolate(m[None].float(), size=x.shape[-2:]).to(torch.bool)[0]
83 | out[name] = NestedTensor(x, mask)
84 | return out
85 |
86 |
87 | class Backbone(BackboneBase):
88 | """ResNet backbone with frozen BatchNorm."""
89 | def __init__(self, name: str,
90 | train_backbone: bool,
91 | return_interm_layers: bool,
92 | dilation: bool):
93 | norm_layer = FrozenBatchNorm2d
94 | if name == 'resnext101_64x4d':
95 | backbone = resnext101_64x4d(replace_stride_with_dilation=[False, False, dilation],
96 | pretrained=is_main_process(), norm_layer=norm_layer)
97 | else:
98 | backbone = getattr(torchvision.models, name)(
99 | replace_stride_with_dilation=[False, False, dilation],
100 | pretrained=is_main_process(), norm_layer=norm_layer) #pretrained=is_main_process()
101 | assert name not in ('resnet18', 'resnet34'), "number of channels are hard coded"
102 | super().__init__(backbone, train_backbone, return_interm_layers)
103 | if dilation:
104 | self.strides[-1] = self.strides[-1] // 2
105 |
106 |
107 | class Joiner(nn.Sequential):
108 | def __init__(self, backbone, position_embedding):
109 | super().__init__(backbone, position_embedding)
110 | self.strides = backbone.strides
111 | self.num_channels = backbone.num_channels
112 |
113 | def forward(self, tensor_list: NestedTensor):
114 | xs = self[0](tensor_list)
115 | out: List[NestedTensor] = []
116 | pos = []
117 | for name, x in sorted(xs.items()):
118 | out.append(x)
119 |
120 | # position encoding
121 | for x in out:
122 | pos.append(self[1](x).to(x.tensors.dtype))
123 |
124 | return out, pos
125 |
126 |
127 | def build_backbone(args):
128 | position_embedding = build_position_encoding(args)
129 | train_backbone = args.lr_backbone > 0
130 | return_interm_layers = args.masks or (args.num_feature_levels > 1)
131 | backbone = Backbone(args.backbone, train_backbone, return_interm_layers, args.dilation)
132 | model = Joiner(backbone, position_embedding)
133 | return model
134 |
135 |
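Quick numerical check (mine): FrozenBatchNorm2d loaded with the statistics of an eval-mode nn.BatchNorm2d produces the same output, since scale = weight * rsqrt(running_var + eps) and bias = bias - running_mean * scale.

```python
import torch
from torch import nn
from models.backbone import FrozenBatchNorm2d

bn = nn.BatchNorm2d(8).eval()
bn.running_mean.normal_()
bn.running_var.uniform_(0.5, 2.0)

fbn = FrozenBatchNorm2d(8)
fbn.load_state_dict(bn.state_dict(), strict=False)   # num_batches_tracked is dropped

x = torch.randn(2, 8, 16, 16)
print((fbn(x) - bn(x)).abs().max())   # on the order of 1e-6
```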
--------------------------------------------------------------------------------
/models/matcher.py:
--------------------------------------------------------------------------------
1 | """
2 | Modules to compute the matching cost and solve the corresponding LSAP.
3 | """
4 | import torch
5 | from scipy.optimize import linear_sum_assignment
6 | from torch import nn
7 | import torch.nn.functional as F
8 |
9 | from util.box_ops import box_cxcywh_to_xyxy, generalized_box_iou, generalized_multi_box_iou
10 |
11 |
12 | class HungarianMatcher(nn.Module):
13 |
14 |
15 | def __init__(self,
16 | multi_frame: bool,
17 | cost_class: float = 1,
18 | cost_bbox: float = 1,
19 | cost_giou: float = 1,
20 | cost_mask: float = 1):
21 | """Creates the matcher
22 | Params:
23 | cost_class: This is the relative weight of the classification error in the matching cost
24 | cost_bbox: This is the relative weight of the L1 error of the bounding box coordinates in the matching cost
25 | cost_giou: This is the relative weight of the giou loss of the bounding box in the matching cost
26 | """
27 | super().__init__()
28 | self.multi_frame = multi_frame
29 | self.cost_class = cost_class
30 | self.cost_bbox = cost_bbox
31 | self.cost_giou = cost_giou
32 | self.cost_mask = cost_mask
33 | assert cost_class != 0 or cost_bbox != 0 or cost_giou != 0 or cost_mask != 0, "all costs cant be 0"
34 |
35 |
36 | def forward(self, outputs, targets, nf, valid_ratios):
37 |
38 | with torch.no_grad():
39 | bs, num_queries = outputs["pred_logits"].shape[:2]
40 |
41 | # We flatten to compute the cost matrices in a batch
42 | out_prob = outputs["pred_logits"].flatten(0, 1).sigmoid()
43 | # Also concat the target labels and boxes
44 | tgt_ids = torch.cat([v["labels"] for v in targets])
45 | tgt_bbox = torch.cat([v["boxes"] for v in targets])
46 | num_insts = len(tgt_ids)
47 |
48 |
49 | out_bbox = outputs["pred_boxes"].permute(0,2,1,3).flatten(0, 1) # [batch_size * num_queries,nf, 4]
50 | num_insts = len(tgt_ids)
51 | tgt_bbox = tgt_bbox.reshape(num_insts,nf,4)
52 |
53 | cost_bbox = torch.cdist(out_bbox.flatten(1,2), tgt_bbox.flatten(1,2))
54 | cost_giou = 0
55 | for i in range(nf):
56 | cost_giou += -generalized_box_iou(box_cxcywh_to_xyxy(out_bbox[:,i]),
57 | box_cxcywh_to_xyxy(tgt_bbox[:,i]))
58 | cost_giou = cost_giou/nf
59 |
60 | # Compute the classification cost.
61 | alpha = 0.25
62 | gamma = 2.0
63 | neg_cost_class = (1 - alpha) * (out_prob ** gamma) * (-(1 - out_prob + 1e-8).log())
64 | pos_cost_class = alpha * ((1 - out_prob) ** gamma) * (-(out_prob + 1e-8).log())
65 | cost_class = pos_cost_class[:, tgt_ids] - neg_cost_class[:, tgt_ids]
66 |
67 | # Final cost matrix
68 | C = self.cost_bbox * cost_bbox + self.cost_class * cost_class + self.cost_giou * cost_giou
69 | C = C.view(bs, num_queries, -1).cpu()
70 |
71 | sizes = [len(v["labels"]) for v in targets]
72 | indices = [linear_sum_assignment(c[i]) for i, c in enumerate(C.split(sizes, -1))]
73 | return [(torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) for i, j in indices]
74 |
75 |
76 | def build_matcher(args):
77 | # output single frame, multi frame
78 | return HungarianMatcher(multi_frame=True, # True, False
79 | cost_class=args.set_cost_class,
80 | cost_bbox=args.set_cost_bbox,
81 | cost_giou=args.set_cost_giou)
82 |
83 |
84 |
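Worked toy example (mine) of the final assignment step: given the [num_queries, num_targets] cost matrix C for one batch element, linear_sum_assignment picks the query/target pairing with minimal total cost.

```python
import numpy as np
from scipy.optimize import linear_sum_assignment

# 4 queries x 3 ground-truth instances; lower cost means a better match.
C = np.array([[0.9, 0.1, 0.5],
              [0.4, 0.8, 0.2],
              [0.3, 0.7, 0.6],
              [0.5, 0.5, 0.1]])
row, col = linear_sum_assignment(C)
print(list(zip(row.tolist(), col.tolist())))   # [(0, 1), (2, 0), (3, 2)], total cost 0.5
```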
--------------------------------------------------------------------------------
/models/ops/MultiScaleDeformableAttention.egg-info/PKG-INFO:
--------------------------------------------------------------------------------
1 | Metadata-Version: 2.1
2 | Name: MultiScaleDeformableAttention
3 | Version: 1.0
4 | Summary: PyTorch Wrapper for CUDA Functions of Multi-Scale Deformable Attention
5 | Home-page: https://github.com/fundamentalvision/Deformable-DETR
6 | Author: Weijie Su
7 | License: UNKNOWN
8 | Platform: UNKNOWN
9 |
10 | UNKNOWN
11 |
12 |
--------------------------------------------------------------------------------
/models/ops/MultiScaleDeformableAttention.egg-info/SOURCES.txt:
--------------------------------------------------------------------------------
1 | setup.py
2 | /scratch/work/vmt_organized_code/models/ops/src/vision.cpp
3 | /scratch/work/vmt_organized_code/models/ops/src/cpu/ms_deform_attn_cpu.cpp
4 | /scratch/work/vmt_organized_code/models/ops/src/cuda/ms_deform_attn_cuda.cu
5 | MultiScaleDeformableAttention.egg-info/PKG-INFO
6 | MultiScaleDeformableAttention.egg-info/SOURCES.txt
7 | MultiScaleDeformableAttention.egg-info/dependency_links.txt
8 | MultiScaleDeformableAttention.egg-info/top_level.txt
9 | functions/__init__.py
10 | functions/ms_deform_attn_func.py
11 | modules/__init__.py
12 | modules/ms_deform_attn.py
--------------------------------------------------------------------------------
/models/ops/MultiScaleDeformableAttention.egg-info/dependency_links.txt:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/models/ops/MultiScaleDeformableAttention.egg-info/top_level.txt:
--------------------------------------------------------------------------------
1 | MultiScaleDeformableAttention
2 | functions
3 | modules
4 |
--------------------------------------------------------------------------------
/models/ops/functions/__init__.py:
--------------------------------------------------------------------------------
1 | # ------------------------------------------------------------------------------------------------
2 | # Deformable DETR
3 | # Copyright (c) 2020 SenseTime. All Rights Reserved.
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
5 | # ------------------------------------------------------------------------------------------------
6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
7 | # ------------------------------------------------------------------------------------------------
8 |
9 | from .ms_deform_attn_func import MSDeformAttnFunction
10 |
11 |
--------------------------------------------------------------------------------
/models/ops/functions/ms_deform_attn_func.py:
--------------------------------------------------------------------------------
1 | # ------------------------------------------------------------------------------------------------
2 | # Deformable DETR
3 | # Copyright (c) 2020 SenseTime. All Rights Reserved.
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
5 | # ------------------------------------------------------------------------------------------------
6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
7 | # ------------------------------------------------------------------------------------------------
8 |
9 | from __future__ import absolute_import
10 | from __future__ import print_function
11 | from __future__ import division
12 |
13 | import torch
14 | import torch.nn.functional as F
15 | from torch.autograd import Function
16 | from torch.autograd.function import once_differentiable
17 |
18 | import MultiScaleDeformableAttention as MSDA
19 |
20 |
21 | class MSDeformAttnFunction(Function):
22 | @staticmethod
23 | def forward(ctx, value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, im2col_step):
24 | ctx.im2col_step = im2col_step
25 | output = MSDA.ms_deform_attn_forward(
26 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, ctx.im2col_step)
27 | ctx.save_for_backward(value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights)
28 | return output
29 |
30 | @staticmethod
31 | @once_differentiable
32 | def backward(ctx, grad_output):
33 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights = ctx.saved_tensors
34 | grad_output = grad_output.contiguous()
35 | grad_value, grad_sampling_loc, grad_attn_weight = \
36 | MSDA.ms_deform_attn_backward(
37 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, grad_output, ctx.im2col_step)
38 |
39 | return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None
40 |
41 |
42 | def ms_deform_attn_core_pytorch(value, value_spatial_shapes, sampling_locations, attention_weights):
43 | # for debug and test only,
44 | # need to use cuda version instead
45 | N_, S_, M_, D_ = value.shape
46 | _, Lq_, M_, L_, P_, _ = sampling_locations.shape
47 | value_list = value.split([H_ * W_ for H_, W_ in value_spatial_shapes], dim=1)
48 | sampling_grids = 2 * sampling_locations - 1
49 | sampling_value_list = []
50 | for lid_, (H_, W_) in enumerate(value_spatial_shapes):
51 | # N_, H_*W_, M_, D_ -> N_, H_*W_, M_*D_ -> N_, M_*D_, H_*W_ -> N_*M_, D_, H_, W_
52 | value_l_ = value_list[lid_].flatten(2).transpose(1, 2).reshape(N_*M_, D_, H_, W_)
53 | # N_, Lq_, M_, P_, 2 -> N_, M_, Lq_, P_, 2 -> N_*M_, Lq_, P_, 2
54 | sampling_grid_l_ = sampling_grids[:, :, :, lid_].transpose(1, 2).flatten(0, 1)
55 | # N_*M_, D_, Lq_, P_
56 | sampling_value_l_ = F.grid_sample(value_l_, sampling_grid_l_,
57 | mode='bilinear', padding_mode='zeros', align_corners=False)
58 | sampling_value_list.append(sampling_value_l_)
59 | # (N_, Lq_, M_, L_, P_) -> (N_, M_, Lq_, L_, P_) -> (N_, M_, 1, Lq_, L_*P_)
60 | attention_weights = attention_weights.transpose(1, 2).reshape(N_*M_, 1, Lq_, L_*P_)
61 | output = (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights).sum(-1).view(N_, M_*D_, Lq_)
62 | return output.transpose(1, 2).contiguous()
63 |
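Shape sketch (mine) for the pure-PyTorch reference ms_deform_attn_core_pytorch above. Importing this module still pulls in the compiled MultiScaleDeformableAttention extension (built via models/ops/make.sh), but the reference function itself runs without CUDA.

```python
import torch
from models.ops.functions.ms_deform_attn_func import ms_deform_attn_core_pytorch

N, M, D, Lq, P = 2, 8, 32, 100, 4                 # batch, heads, head dim, queries, points
spatial_shapes = torch.as_tensor([[32, 32], [16, 16]], dtype=torch.long)
L = spatial_shapes.shape[0]                       # number of feature levels
S = int((spatial_shapes[:, 0] * spatial_shapes[:, 1]).sum())   # total tokens over levels

value = torch.rand(N, S, M, D)
sampling_locations = torch.rand(N, Lq, M, L, P, 2)             # normalized to [0, 1]
attention_weights = torch.rand(N, Lq, M, L * P).softmax(-1).view(N, Lq, M, L, P)

out = ms_deform_attn_core_pytorch(value, spatial_shapes, sampling_locations, attention_weights)
print(out.shape)                                  # torch.Size([2, 100, 256]) = (N, Lq, M*D)
```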
--------------------------------------------------------------------------------
/models/ops/make.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | # ------------------------------------------------------------------------------------------------
3 | # Deformable DETR
4 | # Copyright (c) 2020 SenseTime. All Rights Reserved.
5 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
6 | # ------------------------------------------------------------------------------------------------
7 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
8 | # ------------------------------------------------------------------------------------------------
9 |
10 | python3 setup.py build install --user
11 |
--------------------------------------------------------------------------------
/models/ops/modules/__init__.py:
--------------------------------------------------------------------------------
1 | # ------------------------------------------------------------------------
2 | # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR)
3 | # Copyright (c) 2020 SenseTime. All Rights Reserved.
4 | # ------------------------------------------------------------------------
5 | # Modified from DETR (https://github.com/facebookresearch/detr)
6 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
7 | # ------------------------------------------------------------------------
8 |
9 | from .ms_deform_attn import MSDeformAttn
--------------------------------------------------------------------------------
/models/ops/setup.py:
--------------------------------------------------------------------------------
1 | # ------------------------------------------------------------------------------------------------
2 | # Deformable DETR
3 | # Copyright (c) 2020 SenseTime. All Rights Reserved.
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
5 | # ------------------------------------------------------------------------------------------------
6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
7 | # ------------------------------------------------------------------------------------------------
8 |
9 | import os
10 | import glob
11 |
12 | import torch
13 |
14 | from torch.utils.cpp_extension import CUDA_HOME
15 | from torch.utils.cpp_extension import CppExtension
16 | from torch.utils.cpp_extension import CUDAExtension
17 |
18 | from setuptools import find_packages
19 | from setuptools import setup
20 |
21 | requirements = ["torch", "torchvision"]
22 |
23 | def get_extensions():
24 | this_dir = os.path.dirname(os.path.abspath(__file__))
25 | extensions_dir = os.path.join(this_dir, "src")
26 |
27 | main_file = glob.glob(os.path.join(extensions_dir, "*.cpp"))
28 | source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp"))
29 | source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu"))
30 |
31 | sources = main_file + source_cpu
32 | extension = CppExtension
33 | extra_compile_args = {"cxx": []}
34 | define_macros = []
35 |
36 | if torch.cuda.is_available() and CUDA_HOME is not None:
37 | extension = CUDAExtension
38 | sources += source_cuda
39 | define_macros += [("WITH_CUDA", None)]
40 | extra_compile_args["nvcc"] = [
41 | "-DCUDA_HAS_FP16=1",
42 | "-D__CUDA_NO_HALF_OPERATORS__",
43 | "-D__CUDA_NO_HALF_CONVERSIONS__",
44 | "-D__CUDA_NO_HALF2_OPERATORS__",
45 | ]
46 | else:
47 | raise NotImplementedError('CUDA is not available')
48 |
49 | sources = [os.path.join(extensions_dir, s) for s in sources]
50 | include_dirs = [extensions_dir]
51 | ext_modules = [
52 | extension(
53 | "MultiScaleDeformableAttention",
54 | sources,
55 | include_dirs=include_dirs,
56 | define_macros=define_macros,
57 | extra_compile_args=extra_compile_args,
58 | )
59 | ]
60 | return ext_modules
61 |
62 | setup(
63 | name="MultiScaleDeformableAttention",
64 | version="1.0",
65 | author="Weijie Su",
66 | url="https://github.com/fundamentalvision/Deformable-DETR",
67 | description="PyTorch Wrapper for CUDA Functions of Multi-Scale Deformable Attention",
68 | packages=find_packages(exclude=("configs", "tests",)),
69 | ext_modules=get_extensions(),
70 | cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension},
71 | )
72 |
--------------------------------------------------------------------------------
/models/ops/src/cpu/ms_deform_attn_cpu.cpp:
--------------------------------------------------------------------------------
1 | /*!
2 | **************************************************************************************************
3 | * Deformable DETR
4 | * Copyright (c) 2020 SenseTime. All Rights Reserved.
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details]
6 | **************************************************************************************************
7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
8 | **************************************************************************************************
9 | */
10 |
11 | #include <vector>
12 |
13 | #include <ATen/ATen.h>
14 | #include <ATen/cuda/CUDAContext.h>
15 |
16 |
17 | at::Tensor
18 | ms_deform_attn_cpu_forward(
19 | const at::Tensor &value,
20 | const at::Tensor &spatial_shapes,
21 | const at::Tensor &level_start_index,
22 | const at::Tensor &sampling_loc,
23 | const at::Tensor &attn_weight,
24 | const int im2col_step)
25 | {
26 | AT_ERROR("Not implemented on the CPU");
27 | }
28 |
29 | std::vector<at::Tensor>
30 | ms_deform_attn_cpu_backward(
31 | const at::Tensor &value,
32 | const at::Tensor &spatial_shapes,
33 | const at::Tensor &level_start_index,
34 | const at::Tensor &sampling_loc,
35 | const at::Tensor &attn_weight,
36 | const at::Tensor &grad_output,
37 | const int im2col_step)
38 | {
39 | AT_ERROR("Not implemented on the CPU");
40 | }
41 |
42 |
--------------------------------------------------------------------------------
/models/ops/src/cpu/ms_deform_attn_cpu.h:
--------------------------------------------------------------------------------
1 | /*!
2 | **************************************************************************************************
3 | * Deformable DETR
4 | * Copyright (c) 2020 SenseTime. All Rights Reserved.
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details]
6 | **************************************************************************************************
7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
8 | **************************************************************************************************
9 | */
10 |
11 | #pragma once
12 | #include <torch/extension.h>
13 |
14 | at::Tensor
15 | ms_deform_attn_cpu_forward(
16 | const at::Tensor &value,
17 | const at::Tensor &spatial_shapes,
18 | const at::Tensor &level_start_index,
19 | const at::Tensor &sampling_loc,
20 | const at::Tensor &attn_weight,
21 | const int im2col_step);
22 |
23 | std::vector<at::Tensor>
24 | ms_deform_attn_cpu_backward(
25 | const at::Tensor &value,
26 | const at::Tensor &spatial_shapes,
27 | const at::Tensor &level_start_index,
28 | const at::Tensor &sampling_loc,
29 | const at::Tensor &attn_weight,
30 | const at::Tensor &grad_output,
31 | const int im2col_step);
32 |
33 |
34 |
--------------------------------------------------------------------------------
/models/ops/src/cuda/ms_deform_attn_cuda.cu:
--------------------------------------------------------------------------------
1 | /*!
2 | **************************************************************************************************
3 | * Deformable DETR
4 | * Copyright (c) 2020 SenseTime. All Rights Reserved.
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details]
6 | **************************************************************************************************
7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
8 | **************************************************************************************************
9 | */
10 |
11 | #include <vector>
12 | #include "cuda/ms_deform_im2col_cuda.cuh"
13 |
14 | #include <ATen/ATen.h>
15 | #include <ATen/cuda/CUDAContext.h>
16 | #include <cuda.h>
17 | #include <cuda_runtime.h>
18 |
19 |
20 | at::Tensor ms_deform_attn_cuda_forward(
21 | const at::Tensor &value,
22 | const at::Tensor &spatial_shapes,
23 | const at::Tensor &level_start_index,
24 | const at::Tensor &sampling_loc,
25 | const at::Tensor &attn_weight,
26 | const int im2col_step)
27 | {
28 | AT_ASSERTM(value.is_contiguous(), "value tensor has to be contiguous");
29 | AT_ASSERTM(spatial_shapes.is_contiguous(), "spatial_shapes tensor has to be contiguous");
30 | AT_ASSERTM(level_start_index.is_contiguous(), "level_start_index tensor has to be contiguous");
31 | AT_ASSERTM(sampling_loc.is_contiguous(), "sampling_loc tensor has to be contiguous");
32 | AT_ASSERTM(attn_weight.is_contiguous(), "attn_weight tensor has to be contiguous");
33 |
34 | AT_ASSERTM(value.type().is_cuda(), "value must be a CUDA tensor");
35 | AT_ASSERTM(spatial_shapes.type().is_cuda(), "spatial_shapes must be a CUDA tensor");
36 | AT_ASSERTM(level_start_index.type().is_cuda(), "level_start_index must be a CUDA tensor");
37 | AT_ASSERTM(sampling_loc.type().is_cuda(), "sampling_loc must be a CUDA tensor");
38 | AT_ASSERTM(attn_weight.type().is_cuda(), "attn_weight must be a CUDA tensor");
39 |
40 | const int batch = value.size(0);
41 | const int spatial_size = value.size(1);
42 | const int num_heads = value.size(2);
43 | const int channels = value.size(3);
44 |
45 | const int num_levels = spatial_shapes.size(0);
46 |
47 | const int num_query = sampling_loc.size(1);
48 | const int num_point = sampling_loc.size(4);
49 |
50 | const int im2col_step_ = std::min(batch, im2col_step);
51 |
52 | AT_ASSERTM(batch % im2col_step_ == 0, "batch(%d) must divide im2col_step(%d)", batch, im2col_step_);
53 |
54 | auto output = at::zeros({batch, num_query, num_heads, channels}, value.options());
55 |
56 | const int batch_n = im2col_step_;
57 | auto output_n = output.view({batch/im2col_step_, batch_n, num_query, num_heads, channels});
58 | auto per_value_size = spatial_size * num_heads * channels;
59 | auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2;
60 | auto per_attn_weight_size = num_query * num_heads * num_levels * num_point;
61 | for (int n = 0; n < batch/im2col_step_; ++n)
62 | {
63 | auto columns = output_n.select(0, n);
64 | AT_DISPATCH_FLOATING_TYPES(value.type(), "ms_deform_attn_forward_cuda", ([&] {
65 | ms_deformable_im2col_cuda(at::cuda::getCurrentCUDAStream(),
66 | value.data<scalar_t>() + n * im2col_step_ * per_value_size,
67 | spatial_shapes.data<int64_t>(),
68 | level_start_index.data<int64_t>(),
69 | sampling_loc.data<scalar_t>() + n * im2col_step_ * per_sample_loc_size,
70 | attn_weight.data<scalar_t>() + n * im2col_step_ * per_attn_weight_size,
71 | batch_n, spatial_size, num_heads, channels, num_levels, num_query, num_point,
72 | columns.data<scalar_t>());
73 |
74 | }));
75 | }
76 |
77 | output = output.view({batch, num_query, num_heads*channels});
78 |
79 | return output;
80 | }
81 |
82 |
83 | std::vector<at::Tensor> ms_deform_attn_cuda_backward(
84 | const at::Tensor &value,
85 | const at::Tensor &spatial_shapes,
86 | const at::Tensor &level_start_index,
87 | const at::Tensor &sampling_loc,
88 | const at::Tensor &attn_weight,
89 | const at::Tensor &grad_output,
90 | const int im2col_step)
91 | {
92 |
93 | AT_ASSERTM(value.is_contiguous(), "value tensor has to be contiguous");
94 | AT_ASSERTM(spatial_shapes.is_contiguous(), "spatial_shapes tensor has to be contiguous");
95 | AT_ASSERTM(level_start_index.is_contiguous(), "level_start_index tensor has to be contiguous");
96 | AT_ASSERTM(sampling_loc.is_contiguous(), "sampling_loc tensor has to be contiguous");
97 | AT_ASSERTM(attn_weight.is_contiguous(), "attn_weight tensor has to be contiguous");
98 | AT_ASSERTM(grad_output.is_contiguous(), "grad_output tensor has to be contiguous");
99 |
100 | AT_ASSERTM(value.type().is_cuda(), "value must be a CUDA tensor");
101 | AT_ASSERTM(spatial_shapes.type().is_cuda(), "spatial_shapes must be a CUDA tensor");
102 | AT_ASSERTM(level_start_index.type().is_cuda(), "level_start_index must be a CUDA tensor");
103 | AT_ASSERTM(sampling_loc.type().is_cuda(), "sampling_loc must be a CUDA tensor");
104 | AT_ASSERTM(attn_weight.type().is_cuda(), "attn_weight must be a CUDA tensor");
105 | AT_ASSERTM(grad_output.type().is_cuda(), "grad_output must be a CUDA tensor");
106 |
107 | const int batch = value.size(0);
108 | const int spatial_size = value.size(1);
109 | const int num_heads = value.size(2);
110 | const int channels = value.size(3);
111 |
112 | const int num_levels = spatial_shapes.size(0);
113 |
114 | const int num_query = sampling_loc.size(1);
115 | const int num_point = sampling_loc.size(4);
116 |
117 | const int im2col_step_ = std::min(batch, im2col_step);
118 |
119 | AT_ASSERTM(batch % im2col_step_ == 0, "batch(%d) must divide im2col_step(%d)", batch, im2col_step_);
120 |
121 | auto grad_value = at::zeros_like(value);
122 | auto grad_sampling_loc = at::zeros_like(sampling_loc);
123 | auto grad_attn_weight = at::zeros_like(attn_weight);
124 |
125 | const int batch_n = im2col_step_;
126 | auto per_value_size = spatial_size * num_heads * channels;
127 | auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2;
128 | auto per_attn_weight_size = num_query * num_heads * num_levels * num_point;
129 | auto grad_output_n = grad_output.view({batch/im2col_step_, batch_n, num_query, num_heads, channels});
130 |
131 | for (int n = 0; n < batch/im2col_step_; ++n)
132 | {
133 | auto grad_output_g = grad_output_n.select(0, n);
134 | AT_DISPATCH_FLOATING_TYPES(value.type(), "ms_deform_attn_backward_cuda", ([&] {
135 | ms_deformable_col2im_cuda(at::cuda::getCurrentCUDAStream(),
136 | grad_output_g.data<scalar_t>(),
137 | value.data<scalar_t>() + n * im2col_step_ * per_value_size,
138 | spatial_shapes.data<int64_t>(),
139 | level_start_index.data<int64_t>(),
140 | sampling_loc.data<scalar_t>() + n * im2col_step_ * per_sample_loc_size,
141 | attn_weight.data<scalar_t>() + n * im2col_step_ * per_attn_weight_size,
142 | batch_n, spatial_size, num_heads, channels, num_levels, num_query, num_point,
143 | grad_value.data<scalar_t>() + n * im2col_step_ * per_value_size,
144 | grad_sampling_loc.data<scalar_t>() + n * im2col_step_ * per_sample_loc_size,
145 | grad_attn_weight.data<scalar_t>() + n * im2col_step_ * per_attn_weight_size);
146 |
147 | }));
148 | }
149 |
150 | return {
151 | grad_value, grad_sampling_loc, grad_attn_weight
152 | };
153 | }
--------------------------------------------------------------------------------
/models/ops/src/cuda/ms_deform_attn_cuda.h:
--------------------------------------------------------------------------------
1 | /*!
2 | **************************************************************************************************
3 | * Deformable DETR
4 | * Copyright (c) 2020 SenseTime. All Rights Reserved.
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details]
6 | **************************************************************************************************
7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
8 | **************************************************************************************************
9 | */
10 |
11 | #pragma once
12 | #include <torch/extension.h>
13 |
14 | at::Tensor ms_deform_attn_cuda_forward(
15 | const at::Tensor &value,
16 | const at::Tensor &spatial_shapes,
17 | const at::Tensor &level_start_index,
18 | const at::Tensor &sampling_loc,
19 | const at::Tensor &attn_weight,
20 | const int im2col_step);
21 |
22 | std::vector<at::Tensor> ms_deform_attn_cuda_backward(
23 | const at::Tensor &value,
24 | const at::Tensor &spatial_shapes,
25 | const at::Tensor &level_start_index,
26 | const at::Tensor &sampling_loc,
27 | const at::Tensor &attn_weight,
28 | const at::Tensor &grad_output,
29 | const int im2col_step);
30 |
31 |
--------------------------------------------------------------------------------
/models/ops/src/ms_deform_attn.h:
--------------------------------------------------------------------------------
1 | /*!
2 | **************************************************************************************************
3 | * Deformable DETR
4 | * Copyright (c) 2020 SenseTime. All Rights Reserved.
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details]
6 | **************************************************************************************************
7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
8 | **************************************************************************************************
9 | */
10 |
11 | #pragma once
12 |
13 | #include "cpu/ms_deform_attn_cpu.h"
14 |
15 | #ifdef WITH_CUDA
16 | #include "cuda/ms_deform_attn_cuda.h"
17 | #endif
18 |
19 |
20 | at::Tensor
21 | ms_deform_attn_forward(
22 | const at::Tensor &value,
23 | const at::Tensor &spatial_shapes,
24 | const at::Tensor &level_start_index,
25 | const at::Tensor &sampling_loc,
26 | const at::Tensor &attn_weight,
27 | const int im2col_step)
28 | {
29 | if (value.type().is_cuda())
30 | {
31 | #ifdef WITH_CUDA
32 | return ms_deform_attn_cuda_forward(
33 | value, spatial_shapes, level_start_index, sampling_loc, attn_weight, im2col_step);
34 | #else
35 | AT_ERROR("Not compiled with GPU support");
36 | #endif
37 | }
38 | AT_ERROR("Not implemented on the CPU");
39 | }
40 |
41 | std::vector<at::Tensor>
42 | ms_deform_attn_backward(
43 | const at::Tensor &value,
44 | const at::Tensor &spatial_shapes,
45 | const at::Tensor &level_start_index,
46 | const at::Tensor &sampling_loc,
47 | const at::Tensor &attn_weight,
48 | const at::Tensor &grad_output,
49 | const int im2col_step)
50 | {
51 | if (value.type().is_cuda())
52 | {
53 | #ifdef WITH_CUDA
54 | return ms_deform_attn_cuda_backward(
55 | value, spatial_shapes, level_start_index, sampling_loc, attn_weight, grad_output, im2col_step);
56 | #else
57 | AT_ERROR("Not compiled with GPU support");
58 | #endif
59 | }
60 | AT_ERROR("Not implemented on the CPU");
61 | }
62 |
63 |
--------------------------------------------------------------------------------
/models/ops/src/vision.cpp:
--------------------------------------------------------------------------------
1 | /*!
2 | **************************************************************************************************
3 | * Deformable DETR
4 | * Copyright (c) 2020 SenseTime. All Rights Reserved.
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details]
6 | **************************************************************************************************
7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
8 | **************************************************************************************************
9 | */
10 |
11 | #include "ms_deform_attn.h"
12 |
13 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
14 | m.def("ms_deform_attn_forward", &ms_deform_attn_forward, "ms_deform_attn_forward");
15 | m.def("ms_deform_attn_backward", &ms_deform_attn_backward, "ms_deform_attn_backward");
16 | }
17 |
--------------------------------------------------------------------------------
/models/ops/test.py:
--------------------------------------------------------------------------------
1 | # ------------------------------------------------------------------------------------------------
2 | # Deformable DETR
3 | # Copyright (c) 2020 SenseTime. All Rights Reserved.
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
5 | # ------------------------------------------------------------------------------------------------
6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
7 | # ------------------------------------------------------------------------------------------------
8 |
9 | from __future__ import absolute_import
10 | from __future__ import print_function
11 | from __future__ import division
12 |
13 | import time
14 | import torch
15 | import torch.nn as nn
16 | from torch.autograd import gradcheck
17 |
18 | from functions.ms_deform_attn_func import MSDeformAttnFunction, ms_deform_attn_core_pytorch
19 |
20 |
21 | N, M, D = 1, 2, 2
22 | Lq, L, P = 2, 2, 2
23 | shapes = torch.as_tensor([(6, 4), (3, 2)], dtype=torch.long).cuda()
24 | level_start_index = torch.cat((shapes.new_zeros((1, )), shapes.prod(1).cumsum(0)[:-1]))
25 | S = sum([(H*W).item() for H, W in shapes])
26 |
27 |
28 | torch.manual_seed(3)
29 |
30 |
31 | @torch.no_grad()
32 | def check_forward_equal_with_pytorch_double():
33 | value = torch.rand(N, S, M, D).cuda() * 0.01
34 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda()
35 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5
36 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True)
37 | im2col_step = 2
38 | output_pytorch = ms_deform_attn_core_pytorch(value.double(), shapes, sampling_locations.double(), attention_weights.double()).detach().cpu()
39 | output_cuda = MSDeformAttnFunction.apply(value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step).detach().cpu()
40 | fwdok = torch.allclose(output_cuda, output_pytorch)
41 | max_abs_err = (output_cuda - output_pytorch).abs().max()
42 | max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max()
43 |
44 | print(f'* {fwdok} check_forward_equal_with_pytorch_double: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}')
45 |
46 |
47 | @torch.no_grad()
48 | def check_forward_equal_with_pytorch_float():
49 | value = torch.rand(N, S, M, D).cuda() * 0.01
50 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda()
51 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5
52 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True)
53 | im2col_step = 2
54 | output_pytorch = ms_deform_attn_core_pytorch(value, shapes, sampling_locations, attention_weights).detach().cpu()
55 | output_cuda = MSDeformAttnFunction.apply(value, shapes, level_start_index, sampling_locations, attention_weights, im2col_step).detach().cpu()
56 | fwdok = torch.allclose(output_cuda, output_pytorch, rtol=1e-2, atol=1e-3)
57 | max_abs_err = (output_cuda - output_pytorch).abs().max()
58 | max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max()
59 |
60 | print(f'* {fwdok} check_forward_equal_with_pytorch_float: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}')
61 |
62 |
63 | def check_gradient_numerical(channels=4, grad_value=True, grad_sampling_loc=True, grad_attn_weight=True):
64 |
65 | value = torch.rand(N, S, M, channels).cuda() * 0.01
66 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda()
67 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5
68 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True)
69 | im2col_step = 2
70 | func = MSDeformAttnFunction.apply
71 |
72 | value.requires_grad = grad_value
73 | sampling_locations.requires_grad = grad_sampling_loc
74 | attention_weights.requires_grad = grad_attn_weight
75 |
76 | gradok = gradcheck(func, (value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step))
77 |
78 | print(f'* {gradok} check_gradient_numerical(D={channels})')
79 |
80 |
81 | if __name__ == '__main__':
82 | check_forward_equal_with_pytorch_double()
83 | check_forward_equal_with_pytorch_float()
84 |
85 | for channels in [30, 32, 64, 71, 1025, 2048, 3096]:
86 | check_gradient_numerical(channels, True, True, True)
87 |
88 |
89 |
90 |
--------------------------------------------------------------------------------
/models/position_encoding.py:
--------------------------------------------------------------------------------
1 | # ------------------------------------------------------------------------
2 | # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR)
3 | # Copyright (c) 2020 SenseTime. All Rights Reserved.
4 | # ------------------------------------------------------------------------
5 | # Modified from DETR (https://github.com/facebookresearch/detr)
6 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
7 | # ------------------------------------------------------------------------
8 |
9 |
10 | """
11 | Various positional encodings for the transformer.
12 | """
13 | import math
14 | import torch
15 | from torch import nn
16 |
17 | from util.misc import NestedTensor
18 | import numpy as np
19 |
20 |
21 | class PositionalEncoding3D(nn.Module):
22 | def __init__(self, channels):
23 | """
24 | :param channels: The last dimension of the tensor you want to apply pos emb to.
25 | """
26 | super(PositionalEncoding3D, self).__init__()
27 | channels = int(np.ceil(channels/6)*2)
28 | if channels % 2:
29 | channels += 1
30 | self.channels = channels
31 | inv_freq = 1. / (10000 ** (torch.arange(0, channels, 2).float() / channels))
32 | self.register_buffer('inv_freq', inv_freq)
33 |
34 | def forward(self, tensor):
35 | """
36 | :param tensor: A 5d tensor of size (batch_size, x, y, z, ch)
37 | :return: Positional Encoding Matrix of size (batch_size, x, y, z, ch)
38 | """
39 | if len(tensor.shape) != 5:
40 | raise RuntimeError("The input tensor has to be 5d!")
41 | batch_size, x, y, z, orig_ch = tensor.shape
42 | # print('tensor.shape shape:', tensor.shape)
43 | pos_x = torch.arange(x, device=tensor.device).type(self.inv_freq.type())
44 | pos_y = torch.arange(y, device=tensor.device).type(self.inv_freq.type())
45 | pos_z = torch.arange(z, device=tensor.device).type(self.inv_freq.type())
46 | sin_inp_x = torch.einsum("i,j->ij", pos_x, self.inv_freq)
47 | sin_inp_y = torch.einsum("i,j->ij", pos_y, self.inv_freq)
48 | sin_inp_z = torch.einsum("i,j->ij", pos_z, self.inv_freq)
49 | emb_x = torch.cat((sin_inp_x.sin(), sin_inp_x.cos()), dim=-1).unsqueeze(1).unsqueeze(1)
50 | emb_y = torch.cat((sin_inp_y.sin(), sin_inp_y.cos()), dim=-1).unsqueeze(1)
51 | emb_z = torch.cat((sin_inp_z.sin(), sin_inp_z.cos()), dim=-1)
52 | emb = torch.zeros((x,y,z,self.channels*3),device=tensor.device).type(tensor.type())
53 | emb[:,:,:,:self.channels] = emb_x
54 | emb[:,:,:,self.channels:2*self.channels] = emb_y
55 | emb[:,:,:,2*self.channels:] = emb_z
56 |
57 | return emb[None,:,:,:,:orig_ch].repeat(batch_size, 1, 1, 1, 1)
58 |
59 |
60 | class PositionEmbeddingSine(nn.Module):
61 | """
62 | This is a more standard version of the position embedding, very similar to the one
63 | used by the Attention is all you need paper, generalized to work on images.
64 | """
65 | def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None):
66 | super().__init__()
67 | self.num_pos_feats = num_pos_feats
68 | self.temperature = temperature
69 | self.normalize = normalize
70 | if scale is not None and normalize is False:
71 | raise ValueError("normalize should be True if scale is passed")
72 | if scale is None:
73 | scale = 2 * math.pi
74 | self.scale = scale
75 |
76 | def forward(self, tensor_list: NestedTensor):
77 | x = tensor_list.tensors
78 | mask = tensor_list.mask
79 | assert mask is not None
80 | not_mask = ~mask
81 | y_embed = not_mask.cumsum(1, dtype=torch.float32)
82 | x_embed = not_mask.cumsum(2, dtype=torch.float32)
83 | if self.normalize:
84 | eps = 1e-6
85 | y_embed = (y_embed - 0.5) / (y_embed[:, -1:, :] + eps) * self.scale
86 | x_embed = (x_embed - 0.5) / (x_embed[:, :, -1:] + eps) * self.scale
87 |
88 | dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device)
89 | dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats)
90 |
91 | pos_x = x_embed[:, :, :, None] / dim_t
92 | pos_y = y_embed[:, :, :, None] / dim_t
93 | pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3)
94 | pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3)
95 | pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2)
96 | return pos
97 |
98 |
99 | class PositionEmbeddingLearned(nn.Module):
100 | """
101 | Absolute pos embedding, learned.
102 | """
103 | def __init__(self, num_pos_feats=256):
104 | super().__init__()
105 | self.row_embed = nn.Embedding(50, num_pos_feats)
106 | self.col_embed = nn.Embedding(50, num_pos_feats)
107 | self.reset_parameters()
108 |
109 | def reset_parameters(self):
110 | nn.init.uniform_(self.row_embed.weight)
111 | nn.init.uniform_(self.col_embed.weight)
112 |
113 | def forward(self, tensor_list: NestedTensor):
114 | x = tensor_list.tensors
115 | h, w = x.shape[-2:]
116 | i = torch.arange(w, device=x.device)
117 | j = torch.arange(h, device=x.device)
118 | x_emb = self.col_embed(i)
119 | y_emb = self.row_embed(j)
120 | pos = torch.cat([
121 | x_emb.unsqueeze(0).repeat(h, 1, 1),
122 | y_emb.unsqueeze(1).repeat(1, w, 1),
123 | ], dim=-1).permute(2, 0, 1).unsqueeze(0).repeat(x.shape[0], 1, 1, 1)
124 | return pos
125 |
126 |
127 | def build_position_encoding(args):
128 | N_steps = args.hidden_dim // 2
129 | if args.position_embedding in ('v2', 'sine'):
130 | # TODO find a better way of exposing other arguments
131 | position_embedding = PositionEmbeddingSine(N_steps, normalize=True)
132 | elif args.position_embedding in ('v3', 'learned'):
133 | position_embedding = PositionEmbeddingLearned(N_steps)
134 | else:
135 | raise ValueError(f"not supported {args.position_embedding}")
136 |
137 | return position_embedding
138 |
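
Note: a rough usage sketch of the two encodings above (sizes are illustrative; NestedTensor(tensors, mask) is assumed to follow the DETR convention used throughout this repo, with False marking valid pixels):

import torch
from util.misc import NestedTensor
from models.position_encoding import PositionEmbeddingSine, PositionalEncoding3D

feat = torch.rand(2, 256, 32, 40)                    # (batch, C, H, W) feature map
mask = torch.zeros(2, 32, 40, dtype=torch.bool)      # no padding
pos2d = PositionEmbeddingSine(num_pos_feats=128, normalize=True)(NestedTensor(feat, mask))
print(pos2d.shape)                                   # (2, 256, 32, 40): 2 * num_pos_feats channels

clip = torch.rand(2, 5, 16, 20, 256)                 # (batch, frames, H, W, ch)
pos3d = PositionalEncoding3D(channels=256)(clip)
print(pos3d.shape)                                   # (2, 5, 16, 20, 256)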
--------------------------------------------------------------------------------
/models/x101_64d.py:
--------------------------------------------------------------------------------
1 | from torchvision.models.resnet import _resnet, Bottleneck
2 |
3 |
4 | def resnext101_64x4d(pretrained=False, progress=True, **kwargs):
5 | r"""ResNeXt-101 64x4d model from
6 | `"Aggregated Residual Transformation for Deep Neural Networks" <https://arxiv.org/abs/1611.05431>`_
7 |
8 | Args:
9 | pretrained (bool): If True, returns a model pre-trained on ImageNet
10 | progress (bool): If True, displays a progress bar of the download to stderr
11 | """
12 | kwargs['groups'] = 64
13 | kwargs['width_per_group'] = 4
14 | return _resnet('resnext101_64x4d', Bottleneck, [3, 4, 23, 3],
15 | False, progress, **kwargs)
16 |
--------------------------------------------------------------------------------
/models_swin/__init__.py:
--------------------------------------------------------------------------------
1 | from .vmt import build
2 |
3 |
4 | def build_model(args):
5 | return build(args)
6 |
7 |
--------------------------------------------------------------------------------
/models_swin/backbone.py:
--------------------------------------------------------------------------------
1 | """
2 | Backbone modules.
3 | """
4 | from collections import OrderedDict
5 |
6 | import torch
7 | import torch.nn.functional as F
8 | import torchvision
9 | from torch import nn
10 | from torchvision.models._utils import IntermediateLayerGetter
11 | from typing import Dict, List
12 |
13 | from util.misc import NestedTensor, is_main_process
14 |
15 | from .position_encoding import build_position_encoding
16 | from .x101_64d import resnext101_64x4d
17 |
18 | class FrozenBatchNorm2d(torch.nn.Module):
19 | """
20 | BatchNorm2d where the batch statistics and the affine parameters are fixed.
21 |
22 | Copy-paste from torchvision.misc.ops with added eps before rsqrt,
23 | without which any other models than torchvision.models.resnet[18,34,50,101]
24 | produce nans.
25 | """
26 |
27 | def __init__(self, n, eps=1e-5):
28 | super(FrozenBatchNorm2d, self).__init__()
29 | self.register_buffer("weight", torch.ones(n))
30 | self.register_buffer("bias", torch.zeros(n))
31 | self.register_buffer("running_mean", torch.zeros(n))
32 | self.register_buffer("running_var", torch.ones(n))
33 | self.eps = eps
34 |
35 | def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict,
36 | missing_keys, unexpected_keys, error_msgs):
37 | num_batches_tracked_key = prefix + 'num_batches_tracked'
38 | if num_batches_tracked_key in state_dict:
39 | del state_dict[num_batches_tracked_key]
40 |
41 | super(FrozenBatchNorm2d, self)._load_from_state_dict(
42 | state_dict, prefix, local_metadata, strict,
43 | missing_keys, unexpected_keys, error_msgs)
44 |
45 | def forward(self, x):
46 | # move reshapes to the beginning
47 | # to make it fuser-friendly
48 | w = self.weight.reshape(1, -1, 1, 1)
49 | b = self.bias.reshape(1, -1, 1, 1)
50 | rv = self.running_var.reshape(1, -1, 1, 1)
51 | rm = self.running_mean.reshape(1, -1, 1, 1)
52 | eps = self.eps
53 | scale = w * (rv + eps).rsqrt()
54 | bias = b - rm * scale
55 | return x * scale + bias
56 |
57 |
58 | class BackboneBase(nn.Module):
59 |
60 | def __init__(self, backbone: nn.Module, train_backbone: bool, return_interm_layers: bool):
61 | super().__init__()
62 | for name, parameter in backbone.named_parameters():
63 | if not train_backbone or 'layer2' not in name and 'layer3' not in name and 'layer4' not in name:
64 | parameter.requires_grad_(False)
65 | if return_interm_layers:
66 | return_layers = {"layer1": "0", "layer2": "1", "layer3": "2", "layer4": "3", "conv1": "4"}
67 | self.strides = [8, 16, 32]
68 | self.num_channels = [512, 1024, 2048]
69 | else:
70 | return_layers = {'layer4': "0"}
71 | self.strides = [32]
72 | self.num_channels = [2048]
73 | self.body = IntermediateLayerGetter(backbone, return_layers=return_layers)
74 |
75 | def forward(self, tensor_list: NestedTensor):
76 | xs = self.body(tensor_list.tensors)
77 | out: Dict[str, NestedTensor] = {}
78 | for name, x in xs.items():
79 | m = tensor_list.mask
80 | assert m is not None
81 | mask = F.interpolate(m[None].float(), size=x.shape[-2:]).to(torch.bool)[0]
82 | out[name] = NestedTensor(x, mask)
83 | return out
84 |
85 |
86 | class Backbone(BackboneBase):
87 | """ResNet backbone with frozen BatchNorm."""
88 | def __init__(self, name: str,
89 | train_backbone: bool,
90 | return_interm_layers: bool,
91 | dilation: bool):
92 | norm_layer = FrozenBatchNorm2d
93 | if name == 'resnext101_64x4d':
94 | backbone = resnext101_64x4d(replace_stride_with_dilation=[False, False, dilation],
95 | pretrained=is_main_process(), norm_layer=norm_layer)
96 | else:
97 | backbone = getattr(torchvision.models, name)(
98 | replace_stride_with_dilation=[False, False, dilation],
99 | pretrained=is_main_process(), norm_layer=norm_layer) #pretrained=is_main_process()
100 | assert name not in ('resnet18', 'resnet34'), "number of channels is hard-coded"
101 | super().__init__(backbone, train_backbone, return_interm_layers)
102 | if dilation:
103 | self.strides[-1] = self.strides[-1] // 2
104 |
105 |
106 | class Joiner(nn.Sequential):
107 | def __init__(self, backbone, position_embedding):
108 | super().__init__(backbone, position_embedding)
109 | self.strides = backbone.strides
110 | self.num_channels = backbone.num_channels
111 |
112 | def forward(self, tensor_list: NestedTensor):
113 | xs = self[0](tensor_list)
114 | out: List[NestedTensor] = []
115 | pos = []
116 | for name, x in sorted(xs.items()):
117 | out.append(x)
118 |
119 | # position encoding
120 | for x in out:
121 | pos.append(self[1](x).to(x.tensors.dtype))
122 |
123 | return out, pos
124 |
125 |
126 | def build_backbone(args):
127 | position_embedding = build_position_encoding(args)
128 | train_backbone = args.lr_backbone > 0
129 | return_interm_layers = args.masks or (args.num_feature_levels > 1)
130 | backbone = Backbone(args.backbone, train_backbone, return_interm_layers, args.dilation)
131 | model = Joiner(backbone, position_embedding)
132 | return model
133 |
134 |
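
Note: a small numerical check of FrozenBatchNorm2d as defined above. With its default buffers (running mean 0, running var 1, weight 1, bias 0) it acts as an identity up to the eps term:

import torch
from models_swin.backbone import FrozenBatchNorm2d

bn = FrozenBatchNorm2d(8)
x = torch.randn(2, 8, 4, 4)
print(torch.allclose(bn(x), x, atol=1e-4))           # True: scale ~ 1, bias ~ 0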
--------------------------------------------------------------------------------
/models_swin/matcher.py:
--------------------------------------------------------------------------------
1 | """
2 | Modules to compute the matching cost and solve the corresponding LSAP.
3 | """
4 | import torch
5 | from scipy.optimize import linear_sum_assignment
6 | from torch import nn
7 | import torch.nn.functional as F
8 |
9 | from util.box_ops import box_cxcywh_to_xyxy, generalized_box_iou, generalized_multi_box_iou
10 |
11 |
12 | class HungarianMatcher(nn.Module):
13 |
14 |
15 | def __init__(self,
16 | multi_frame: bool,
17 | cost_class: float = 1,
18 | cost_bbox: float = 1,
19 | cost_giou: float = 1,
20 | cost_mask: float = 1):
21 | """Creates the matcher
22 | Params:
23 | cost_class: This is the relative weight of the classification error in the matching cost
24 | cost_bbox: This is the relative weight of the L1 error of the bounding box coordinates in the matching cost
25 | cost_giou: This is the relative weight of the giou loss of the bounding box in the matching cost
26 | """
27 | super().__init__()
28 | self.multi_frame = multi_frame
29 | self.cost_class = cost_class
30 | self.cost_bbox = cost_bbox
31 | self.cost_giou = cost_giou
32 | self.cost_mask = cost_mask
33 | assert cost_class != 0 or cost_bbox != 0 or cost_giou != 0 or cost_mask != 0, "all costs can't be 0"
34 |
35 |
36 | def forward(self, outputs, targets, nf, valid_ratios):
37 |
38 | with torch.no_grad():
39 | bs, num_queries = outputs["pred_logits"].shape[:2]
40 |
41 | # We flatten to compute the cost matrices in a batch
42 | out_prob = outputs["pred_logits"].flatten(0, 1).sigmoid()
43 | # Also concat the target labels and boxes
44 | tgt_ids = torch.cat([v["labels"] for v in targets])
45 | tgt_bbox = torch.cat([v["boxes"] for v in targets])
46 | num_insts = len(tgt_ids)
47 |
48 |
49 | out_bbox = outputs["pred_boxes"].permute(0,2,1,3).flatten(0, 1) # [batch_size * num_queries,nf, 4]
50 | num_insts = len(tgt_ids)
51 | tgt_bbox = tgt_bbox.reshape(num_insts,nf,4)
52 |
53 | cost_bbox = torch.cdist(out_bbox.flatten(1,2), tgt_bbox.flatten(1,2))
54 | cost_giou = 0
55 | for i in range(nf):
56 | cost_giou += -generalized_box_iou(box_cxcywh_to_xyxy(out_bbox[:,i]),
57 | box_cxcywh_to_xyxy(tgt_bbox[:,i]))
58 | cost_giou = cost_giou/nf
59 |
60 | # Compute the classification cost.
61 | alpha = 0.25
62 | gamma = 2.0
63 | neg_cost_class = (1 - alpha) * (out_prob ** gamma) * (-(1 - out_prob + 1e-8).log())
64 | pos_cost_class = alpha * ((1 - out_prob) ** gamma) * (-(out_prob + 1e-8).log())
65 | cost_class = pos_cost_class[:, tgt_ids] - neg_cost_class[:, tgt_ids]
66 |
67 | # Final cost matrix
68 | C = self.cost_bbox * cost_bbox + self.cost_class * cost_class + self.cost_giou * cost_giou
69 | C = C.view(bs, num_queries, -1).cpu()
70 |
71 | sizes = [len(v["labels"]) for v in targets]
72 | indices = [linear_sum_assignment(c[i]) for i, c in enumerate(C.split(sizes, -1))]
73 | return [(torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) for i, j in indices]
74 |
75 |
76 | def build_matcher(args):
77 | # output single frame, multi frame
78 | return HungarianMatcher(multi_frame=True, # True, False
79 | cost_class=args.set_cost_class,
80 | cost_bbox=args.set_cost_bbox,
81 | cost_giou=args.set_cost_giou)
82 |
83 |
84 |
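
Note: the final step of HungarianMatcher.forward is an ordinary linear assignment per batch element. A toy example with a made-up (num_queries x num_targets) cost matrix shows the index format it returns:

import torch
from scipy.optimize import linear_sum_assignment

C = torch.tensor([[0.9, 0.1, 0.5],
                  [0.4, 0.8, 0.2],
                  [0.3, 0.7, 0.6],
                  [0.5, 0.5, 0.5]])                  # 4 queries, 3 targets
row, col = linear_sum_assignment(C)
indices = (torch.as_tensor(row, dtype=torch.int64), torch.as_tensor(col, dtype=torch.int64))
print(indices)                                       # (tensor([0, 1, 2]), tensor([1, 2, 0]))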
--------------------------------------------------------------------------------
/models_swin/ops/MultiScaleDeformableAttention.egg-info/PKG-INFO:
--------------------------------------------------------------------------------
1 | Metadata-Version: 1.0
2 | Name: MultiScaleDeformableAttention
3 | Version: 1.0
4 | Summary: PyTorch Wrapper for CUDA Functions of Multi-Scale Deformable Attention
5 | Home-page: https://github.com/fundamentalvision/Deformable-DETR
6 | Author: Weijie Su
7 | Author-email: UNKNOWN
8 | License: UNKNOWN
9 | Description: UNKNOWN
10 | Platform: UNKNOWN
11 |
--------------------------------------------------------------------------------
/models_swin/ops/MultiScaleDeformableAttention.egg-info/SOURCES.txt:
--------------------------------------------------------------------------------
1 | setup.py
2 | MultiScaleDeformableAttention.egg-info/PKG-INFO
3 | MultiScaleDeformableAttention.egg-info/SOURCES.txt
4 | MultiScaleDeformableAttention.egg-info/dependency_links.txt
5 | MultiScaleDeformableAttention.egg-info/top_level.txt
6 | functions/__init__.py
7 | functions/ms_deform_attn_func.py
8 | modules/__init__.py
9 | modules/ms_deform_attn.py
--------------------------------------------------------------------------------
/models_swin/ops/MultiScaleDeformableAttention.egg-info/dependency_links.txt:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/models_swin/ops/MultiScaleDeformableAttention.egg-info/top_level.txt:
--------------------------------------------------------------------------------
1 | MultiScaleDeformableAttention
2 | functions
3 | modules
4 |
--------------------------------------------------------------------------------
/models_swin/ops/functions/__init__.py:
--------------------------------------------------------------------------------
1 | # ------------------------------------------------------------------------------------------------
2 | # Deformable DETR
3 | # Copyright (c) 2020 SenseTime. All Rights Reserved.
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
5 | # ------------------------------------------------------------------------------------------------
6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
7 | # ------------------------------------------------------------------------------------------------
8 |
9 | from .ms_deform_attn_func import MSDeformAttnFunction
10 |
11 |
--------------------------------------------------------------------------------
/models_swin/ops/functions/ms_deform_attn_func.py:
--------------------------------------------------------------------------------
1 | # ------------------------------------------------------------------------------------------------
2 | # Deformable DETR
3 | # Copyright (c) 2020 SenseTime. All Rights Reserved.
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
5 | # ------------------------------------------------------------------------------------------------
6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
7 | # ------------------------------------------------------------------------------------------------
8 |
9 | from __future__ import absolute_import
10 | from __future__ import print_function
11 | from __future__ import division
12 |
13 | import torch
14 | import torch.nn.functional as F
15 | from torch.autograd import Function
16 | from torch.autograd.function import once_differentiable
17 |
18 | import MultiScaleDeformableAttention as MSDA
19 |
20 |
21 | class MSDeformAttnFunction(Function):
22 | @staticmethod
23 | def forward(ctx, value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, im2col_step):
24 | ctx.im2col_step = im2col_step
25 | output = MSDA.ms_deform_attn_forward(
26 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, ctx.im2col_step)
27 | ctx.save_for_backward(value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights)
28 | return output
29 |
30 | @staticmethod
31 | @once_differentiable
32 | def backward(ctx, grad_output):
33 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights = ctx.saved_tensors
34 | grad_output = grad_output.contiguous()
35 | grad_value, grad_sampling_loc, grad_attn_weight = \
36 | MSDA.ms_deform_attn_backward(
37 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, grad_output, ctx.im2col_step)
38 |
39 | return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None
40 |
41 |
42 | def ms_deform_attn_core_pytorch(value, value_spatial_shapes, sampling_locations, attention_weights):
43 | # for debug and test only,
44 | # need to use cuda version instead
45 | N_, S_, M_, D_ = value.shape
46 | _, Lq_, M_, L_, P_, _ = sampling_locations.shape
47 | value_list = value.split([H_ * W_ for H_, W_ in value_spatial_shapes], dim=1)
48 | sampling_grids = 2 * sampling_locations - 1
49 | sampling_value_list = []
50 | for lid_, (H_, W_) in enumerate(value_spatial_shapes):
51 | # N_, H_*W_, M_, D_ -> N_, H_*W_, M_*D_ -> N_, M_*D_, H_*W_ -> N_*M_, D_, H_, W_
52 | value_l_ = value_list[lid_].flatten(2).transpose(1, 2).reshape(N_*M_, D_, H_, W_)
53 | # N_, Lq_, M_, P_, 2 -> N_, M_, Lq_, P_, 2 -> N_*M_, Lq_, P_, 2
54 | sampling_grid_l_ = sampling_grids[:, :, :, lid_].transpose(1, 2).flatten(0, 1)
55 | # N_*M_, D_, Lq_, P_
56 | sampling_value_l_ = F.grid_sample(value_l_, sampling_grid_l_,
57 | mode='bilinear', padding_mode='zeros', align_corners=False)
58 | sampling_value_list.append(sampling_value_l_)
59 | # (N_, Lq_, M_, L_, P_) -> (N_, M_, Lq_, L_, P_) -> (N_, M_, 1, Lq_, L_*P_)
60 | attention_weights = attention_weights.transpose(1, 2).reshape(N_*M_, 1, Lq_, L_*P_)
61 | output = (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights).sum(-1).view(N_, M_*D_, Lq_)
62 | return output.transpose(1, 2).contiguous()
63 |
--------------------------------------------------------------------------------
/models_swin/ops/make.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | # ------------------------------------------------------------------------------------------------
3 | # Deformable DETR
4 | # Copyright (c) 2020 SenseTime. All Rights Reserved.
5 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
6 | # ------------------------------------------------------------------------------------------------
7 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
8 | # ------------------------------------------------------------------------------------------------
9 |
10 | python3 setup.py build install --user
11 |
--------------------------------------------------------------------------------
/models_swin/ops/modules/__init__.py:
--------------------------------------------------------------------------------
1 | # ------------------------------------------------------------------------
2 | # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR)
3 | # Copyright (c) 2020 SenseTime. All Rights Reserved.
4 | # ------------------------------------------------------------------------
5 | # Modified from DETR (https://github.com/facebookresearch/detr)
6 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
7 | # ------------------------------------------------------------------------
8 |
9 | from .ms_deform_attn import MSDeformAttn
--------------------------------------------------------------------------------
/models_swin/ops/setup.py:
--------------------------------------------------------------------------------
1 | # ------------------------------------------------------------------------------------------------
2 | # Deformable DETR
3 | # Copyright (c) 2020 SenseTime. All Rights Reserved.
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
5 | # ------------------------------------------------------------------------------------------------
6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
7 | # ------------------------------------------------------------------------------------------------
8 |
9 | import os
10 | import glob
11 |
12 | import torch
13 |
14 | from torch.utils.cpp_extension import CUDA_HOME
15 | from torch.utils.cpp_extension import CppExtension
16 | from torch.utils.cpp_extension import CUDAExtension
17 |
18 | from setuptools import find_packages
19 | from setuptools import setup
20 |
21 | requirements = ["torch", "torchvision"]
22 |
23 | def get_extensions():
24 | this_dir = os.path.dirname(os.path.abspath(__file__))
25 | extensions_dir = os.path.join(this_dir, "src")
26 |
27 | main_file = glob.glob(os.path.join(extensions_dir, "*.cpp"))
28 | source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp"))
29 | source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu"))
30 |
31 | sources = main_file + source_cpu
32 | extension = CppExtension
33 | extra_compile_args = {"cxx": []}
34 | define_macros = []
35 |
36 | if torch.cuda.is_available() and CUDA_HOME is not None:
37 | extension = CUDAExtension
38 | sources += source_cuda
39 | define_macros += [("WITH_CUDA", None)]
40 | extra_compile_args["nvcc"] = [
41 | "-DCUDA_HAS_FP16=1",
42 | "-D__CUDA_NO_HALF_OPERATORS__",
43 | "-D__CUDA_NO_HALF_CONVERSIONS__",
44 | "-D__CUDA_NO_HALF2_OPERATORS__",
45 | ]
46 | else:
47 | raise NotImplementedError('CUDA is not available')
48 |
49 | sources = [os.path.join(extensions_dir, s) for s in sources]
50 | include_dirs = [extensions_dir]
51 | ext_modules = [
52 | extension(
53 | "MultiScaleDeformableAttention",
54 | sources,
55 | include_dirs=include_dirs,
56 | define_macros=define_macros,
57 | extra_compile_args=extra_compile_args,
58 | )
59 | ]
60 | return ext_modules
61 |
62 | setup(
63 | name="MultiScaleDeformableAttention",
64 | version="1.0",
65 | author="Weijie Su",
66 | url="https://github.com/fundamentalvision/Deformable-DETR",
67 | description="PyTorch Wrapper for CUDA Functions of Multi-Scale Deformable Attention",
68 | packages=find_packages(exclude=("configs", "tests",)),
69 | ext_modules=get_extensions(),
70 | cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension},
71 | )
72 |
--------------------------------------------------------------------------------
/models_swin/ops/src/cpu/ms_deform_attn_cpu.cpp:
--------------------------------------------------------------------------------
1 | /*!
2 | **************************************************************************************************
3 | * Deformable DETR
4 | * Copyright (c) 2020 SenseTime. All Rights Reserved.
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details]
6 | **************************************************************************************************
7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
8 | **************************************************************************************************
9 | */
10 |
11 | #include <vector>
12 |
13 | #include <ATen/ATen.h>
14 | #include <ATen/cuda/CUDAContext.h>
15 |
16 |
17 | at::Tensor
18 | ms_deform_attn_cpu_forward(
19 | const at::Tensor &value,
20 | const at::Tensor &spatial_shapes,
21 | const at::Tensor &level_start_index,
22 | const at::Tensor &sampling_loc,
23 | const at::Tensor &attn_weight,
24 | const int im2col_step)
25 | {
26 | AT_ERROR("Not implemented on the CPU");
27 | }
28 |
29 | std::vector<at::Tensor>
30 | ms_deform_attn_cpu_backward(
31 | const at::Tensor &value,
32 | const at::Tensor &spatial_shapes,
33 | const at::Tensor &level_start_index,
34 | const at::Tensor &sampling_loc,
35 | const at::Tensor &attn_weight,
36 | const at::Tensor &grad_output,
37 | const int im2col_step)
38 | {
39 | AT_ERROR("Not implemented on the CPU");
40 | }
41 |
42 |
--------------------------------------------------------------------------------
/models_swin/ops/src/cpu/ms_deform_attn_cpu.h:
--------------------------------------------------------------------------------
1 | /*!
2 | **************************************************************************************************
3 | * Deformable DETR
4 | * Copyright (c) 2020 SenseTime. All Rights Reserved.
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details]
6 | **************************************************************************************************
7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
8 | **************************************************************************************************
9 | */
10 |
11 | #pragma once
12 | #include <torch/extension.h>
13 |
14 | at::Tensor
15 | ms_deform_attn_cpu_forward(
16 | const at::Tensor &value,
17 | const at::Tensor &spatial_shapes,
18 | const at::Tensor &level_start_index,
19 | const at::Tensor &sampling_loc,
20 | const at::Tensor &attn_weight,
21 | const int im2col_step);
22 |
23 | std::vector<at::Tensor>
24 | ms_deform_attn_cpu_backward(
25 | const at::Tensor &value,
26 | const at::Tensor &spatial_shapes,
27 | const at::Tensor &level_start_index,
28 | const at::Tensor &sampling_loc,
29 | const at::Tensor &attn_weight,
30 | const at::Tensor &grad_output,
31 | const int im2col_step);
32 |
33 |
34 |
--------------------------------------------------------------------------------
/models_swin/ops/src/cuda/ms_deform_attn_cuda.cu:
--------------------------------------------------------------------------------
1 | /*!
2 | **************************************************************************************************
3 | * Deformable DETR
4 | * Copyright (c) 2020 SenseTime. All Rights Reserved.
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details]
6 | **************************************************************************************************
7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
8 | **************************************************************************************************
9 | */
10 |
11 | #include <vector>
12 | #include "cuda/ms_deform_im2col_cuda.cuh"
13 |
14 | #include <ATen/ATen.h>
15 | #include <ATen/cuda/CUDAContext.h>
16 | #include <cuda.h>
17 | #include <cuda_runtime.h>
18 |
19 |
20 | at::Tensor ms_deform_attn_cuda_forward(
21 | const at::Tensor &value,
22 | const at::Tensor &spatial_shapes,
23 | const at::Tensor &level_start_index,
24 | const at::Tensor &sampling_loc,
25 | const at::Tensor &attn_weight,
26 | const int im2col_step)
27 | {
28 | AT_ASSERTM(value.is_contiguous(), "value tensor has to be contiguous");
29 | AT_ASSERTM(spatial_shapes.is_contiguous(), "spatial_shapes tensor has to be contiguous");
30 | AT_ASSERTM(level_start_index.is_contiguous(), "level_start_index tensor has to be contiguous");
31 | AT_ASSERTM(sampling_loc.is_contiguous(), "sampling_loc tensor has to be contiguous");
32 | AT_ASSERTM(attn_weight.is_contiguous(), "attn_weight tensor has to be contiguous");
33 |
34 | AT_ASSERTM(value.type().is_cuda(), "value must be a CUDA tensor");
35 | AT_ASSERTM(spatial_shapes.type().is_cuda(), "spatial_shapes must be a CUDA tensor");
36 | AT_ASSERTM(level_start_index.type().is_cuda(), "level_start_index must be a CUDA tensor");
37 | AT_ASSERTM(sampling_loc.type().is_cuda(), "sampling_loc must be a CUDA tensor");
38 | AT_ASSERTM(attn_weight.type().is_cuda(), "attn_weight must be a CUDA tensor");
39 |
40 | const int batch = value.size(0);
41 | const int spatial_size = value.size(1);
42 | const int num_heads = value.size(2);
43 | const int channels = value.size(3);
44 |
45 | const int num_levels = spatial_shapes.size(0);
46 |
47 | const int num_query = sampling_loc.size(1);
48 | const int num_point = sampling_loc.size(4);
49 |
50 | const int im2col_step_ = std::min(batch, im2col_step);
51 |
52 | AT_ASSERTM(batch % im2col_step_ == 0, "batch(%d) must divide im2col_step(%d)", batch, im2col_step_);
53 |
54 | auto output = at::zeros({batch, num_query, num_heads, channels}, value.options());
55 |
56 | const int batch_n = im2col_step_;
57 | auto output_n = output.view({batch/im2col_step_, batch_n, num_query, num_heads, channels});
58 | auto per_value_size = spatial_size * num_heads * channels;
59 | auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2;
60 | auto per_attn_weight_size = num_query * num_heads * num_levels * num_point;
61 | for (int n = 0; n < batch/im2col_step_; ++n)
62 | {
63 | auto columns = output_n.select(0, n);
64 | AT_DISPATCH_FLOATING_TYPES(value.type(), "ms_deform_attn_forward_cuda", ([&] {
65 | ms_deformable_im2col_cuda(at::cuda::getCurrentCUDAStream(),
66 | value.data<scalar_t>() + n * im2col_step_ * per_value_size,
67 | spatial_shapes.data<int64_t>(),
68 | level_start_index.data<int64_t>(),
69 | sampling_loc.data<scalar_t>() + n * im2col_step_ * per_sample_loc_size,
70 | attn_weight.data<scalar_t>() + n * im2col_step_ * per_attn_weight_size,
71 | batch_n, spatial_size, num_heads, channels, num_levels, num_query, num_point,
72 | columns.data<scalar_t>());
73 |
74 | }));
75 | }
76 |
77 | output = output.view({batch, num_query, num_heads*channels});
78 |
79 | return output;
80 | }
81 |
82 |
83 | std::vector<at::Tensor> ms_deform_attn_cuda_backward(
84 | const at::Tensor &value,
85 | const at::Tensor &spatial_shapes,
86 | const at::Tensor &level_start_index,
87 | const at::Tensor &sampling_loc,
88 | const at::Tensor &attn_weight,
89 | const at::Tensor &grad_output,
90 | const int im2col_step)
91 | {
92 |
93 | AT_ASSERTM(value.is_contiguous(), "value tensor has to be contiguous");
94 | AT_ASSERTM(spatial_shapes.is_contiguous(), "spatial_shapes tensor has to be contiguous");
95 | AT_ASSERTM(level_start_index.is_contiguous(), "level_start_index tensor has to be contiguous");
96 | AT_ASSERTM(sampling_loc.is_contiguous(), "sampling_loc tensor has to be contiguous");
97 | AT_ASSERTM(attn_weight.is_contiguous(), "attn_weight tensor has to be contiguous");
98 | AT_ASSERTM(grad_output.is_contiguous(), "grad_output tensor has to be contiguous");
99 |
100 | AT_ASSERTM(value.type().is_cuda(), "value must be a CUDA tensor");
101 | AT_ASSERTM(spatial_shapes.type().is_cuda(), "spatial_shapes must be a CUDA tensor");
102 | AT_ASSERTM(level_start_index.type().is_cuda(), "level_start_index must be a CUDA tensor");
103 | AT_ASSERTM(sampling_loc.type().is_cuda(), "sampling_loc must be a CUDA tensor");
104 | AT_ASSERTM(attn_weight.type().is_cuda(), "attn_weight must be a CUDA tensor");
105 | AT_ASSERTM(grad_output.type().is_cuda(), "grad_output must be a CUDA tensor");
106 |
107 | const int batch = value.size(0);
108 | const int spatial_size = value.size(1);
109 | const int num_heads = value.size(2);
110 | const int channels = value.size(3);
111 |
112 | const int num_levels = spatial_shapes.size(0);
113 |
114 | const int num_query = sampling_loc.size(1);
115 | const int num_point = sampling_loc.size(4);
116 |
117 | const int im2col_step_ = std::min(batch, im2col_step);
118 |
119 | AT_ASSERTM(batch % im2col_step_ == 0, "batch(%d) must divide im2col_step(%d)", batch, im2col_step_);
120 |
121 | auto grad_value = at::zeros_like(value);
122 | auto grad_sampling_loc = at::zeros_like(sampling_loc);
123 | auto grad_attn_weight = at::zeros_like(attn_weight);
124 |
125 | const int batch_n = im2col_step_;
126 | auto per_value_size = spatial_size * num_heads * channels;
127 | auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2;
128 | auto per_attn_weight_size = num_query * num_heads * num_levels * num_point;
129 | auto grad_output_n = grad_output.view({batch/im2col_step_, batch_n, num_query, num_heads, channels});
130 |
131 | for (int n = 0; n < batch/im2col_step_; ++n)
132 | {
133 | auto grad_output_g = grad_output_n.select(0, n);
134 | AT_DISPATCH_FLOATING_TYPES(value.type(), "ms_deform_attn_backward_cuda", ([&] {
135 | ms_deformable_col2im_cuda(at::cuda::getCurrentCUDAStream(),
136 | grad_output_g.data<scalar_t>(),
137 | value.data<scalar_t>() + n * im2col_step_ * per_value_size,
138 | spatial_shapes.data<int64_t>(),
139 | level_start_index.data<int64_t>(),
140 | sampling_loc.data<scalar_t>() + n * im2col_step_ * per_sample_loc_size,
141 | attn_weight.data<scalar_t>() + n * im2col_step_ * per_attn_weight_size,
142 | batch_n, spatial_size, num_heads, channels, num_levels, num_query, num_point,
143 | grad_value.data<scalar_t>() + n * im2col_step_ * per_value_size,
144 | grad_sampling_loc.data<scalar_t>() + n * im2col_step_ * per_sample_loc_size,
145 | grad_attn_weight.data<scalar_t>() + n * im2col_step_ * per_attn_weight_size);
146 |
147 | }));
148 | }
149 |
150 | return {
151 | grad_value, grad_sampling_loc, grad_attn_weight
152 | };
153 | }
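// Shape contract implied by the size() reads above: value is
// [batch, spatial_size, num_heads, channels], spatial_shapes is [num_levels, 2],
// level_start_index is [num_levels], sampling_loc is
// [batch, num_query, num_heads, num_levels, num_point, 2] and attn_weight is
// [batch, num_query, num_heads, num_levels, num_point]. The forward pass returns
// [batch, num_query, num_heads*channels]; the backward pass returns gradients
// shaped like value, sampling_loc and attn_weight.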
--------------------------------------------------------------------------------
/models_swin/ops/src/cuda/ms_deform_attn_cuda.h:
--------------------------------------------------------------------------------
1 | /*!
2 | **************************************************************************************************
3 | * Deformable DETR
4 | * Copyright (c) 2020 SenseTime. All Rights Reserved.
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details]
6 | **************************************************************************************************
7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
8 | **************************************************************************************************
9 | */
10 |
11 | #pragma once
12 | #include <torch/extension.h>
13 |
14 | at::Tensor ms_deform_attn_cuda_forward(
15 | const at::Tensor &value,
16 | const at::Tensor &spatial_shapes,
17 | const at::Tensor &level_start_index,
18 | const at::Tensor &sampling_loc,
19 | const at::Tensor &attn_weight,
20 | const int im2col_step);
21 |
22 | std::vector<at::Tensor> ms_deform_attn_cuda_backward(
23 | const at::Tensor &value,
24 | const at::Tensor &spatial_shapes,
25 | const at::Tensor &level_start_index,
26 | const at::Tensor &sampling_loc,
27 | const at::Tensor &attn_weight,
28 | const at::Tensor &grad_output,
29 | const int im2col_step);
30 |
31 |
--------------------------------------------------------------------------------
/models_swin/ops/src/ms_deform_attn.h:
--------------------------------------------------------------------------------
1 | /*!
2 | **************************************************************************************************
3 | * Deformable DETR
4 | * Copyright (c) 2020 SenseTime. All Rights Reserved.
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details]
6 | **************************************************************************************************
7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
8 | **************************************************************************************************
9 | */
10 |
11 | #pragma once
12 |
13 | #include "cpu/ms_deform_attn_cpu.h"
14 |
15 | #ifdef WITH_CUDA
16 | #include "cuda/ms_deform_attn_cuda.h"
17 | #endif
18 |
19 |
20 | at::Tensor
21 | ms_deform_attn_forward(
22 | const at::Tensor &value,
23 | const at::Tensor &spatial_shapes,
24 | const at::Tensor &level_start_index,
25 | const at::Tensor &sampling_loc,
26 | const at::Tensor &attn_weight,
27 | const int im2col_step)
28 | {
29 | if (value.type().is_cuda())
30 | {
31 | #ifdef WITH_CUDA
32 | return ms_deform_attn_cuda_forward(
33 | value, spatial_shapes, level_start_index, sampling_loc, attn_weight, im2col_step);
34 | #else
35 | AT_ERROR("Not compiled with GPU support");
36 | #endif
37 | }
38 | AT_ERROR("Not implemented on the CPU");
39 | }
40 |
41 | std::vector<at::Tensor>
42 | ms_deform_attn_backward(
43 | const at::Tensor &value,
44 | const at::Tensor &spatial_shapes,
45 | const at::Tensor &level_start_index,
46 | const at::Tensor &sampling_loc,
47 | const at::Tensor &attn_weight,
48 | const at::Tensor &grad_output,
49 | const int im2col_step)
50 | {
51 | if (value.type().is_cuda())
52 | {
53 | #ifdef WITH_CUDA
54 | return ms_deform_attn_cuda_backward(
55 | value, spatial_shapes, level_start_index, sampling_loc, attn_weight, grad_output, im2col_step);
56 | #else
57 | AT_ERROR("Not compiled with GPU support");
58 | #endif
59 | }
60 | AT_ERROR("Not implemented on the CPU");
61 | }
62 |
63 |
--------------------------------------------------------------------------------
/models_swin/ops/src/vision.cpp:
--------------------------------------------------------------------------------
1 | /*!
2 | **************************************************************************************************
3 | * Deformable DETR
4 | * Copyright (c) 2020 SenseTime. All Rights Reserved.
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details]
6 | **************************************************************************************************
7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
8 | **************************************************************************************************
9 | */
10 |
11 | #include "ms_deform_attn.h"
12 |
13 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
14 | m.def("ms_deform_attn_forward", &ms_deform_attn_forward, "ms_deform_attn_forward");
15 | m.def("ms_deform_attn_backward", &ms_deform_attn_backward, "ms_deform_attn_backward");
16 | }
17 |
--------------------------------------------------------------------------------
/models_swin/ops/test.py:
--------------------------------------------------------------------------------
1 | # ------------------------------------------------------------------------------------------------
2 | # Deformable DETR
3 | # Copyright (c) 2020 SenseTime. All Rights Reserved.
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
5 | # ------------------------------------------------------------------------------------------------
6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
7 | # ------------------------------------------------------------------------------------------------
8 |
9 | from __future__ import absolute_import
10 | from __future__ import print_function
11 | from __future__ import division
12 |
13 | import time
14 | import torch
15 | import torch.nn as nn
16 | from torch.autograd import gradcheck
17 |
18 | from functions.ms_deform_attn_func import MSDeformAttnFunction, ms_deform_attn_core_pytorch
19 |
20 |
21 | N, M, D = 1, 2, 2
22 | Lq, L, P = 2, 2, 2
23 | shapes = torch.as_tensor([(6, 4), (3, 2)], dtype=torch.long).cuda()
24 | level_start_index = torch.cat((shapes.new_zeros((1, )), shapes.prod(1).cumsum(0)[:-1]))
25 | S = sum([(H*W).item() for H, W in shapes])
26 |
27 |
28 | torch.manual_seed(3)
29 |
30 |
31 | @torch.no_grad()
32 | def check_forward_equal_with_pytorch_double():
33 | value = torch.rand(N, S, M, D).cuda() * 0.01
34 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda()
35 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5
36 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True)
37 | im2col_step = 2
38 | output_pytorch = ms_deform_attn_core_pytorch(value.double(), shapes, sampling_locations.double(), attention_weights.double()).detach().cpu()
39 | output_cuda = MSDeformAttnFunction.apply(value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step).detach().cpu()
40 | fwdok = torch.allclose(output_cuda, output_pytorch)
41 | max_abs_err = (output_cuda - output_pytorch).abs().max()
42 | max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max()
43 |
44 | print(f'* {fwdok} check_forward_equal_with_pytorch_double: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}')
45 |
46 |
47 | @torch.no_grad()
48 | def check_forward_equal_with_pytorch_float():
49 | value = torch.rand(N, S, M, D).cuda() * 0.01
50 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda()
51 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5
52 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True)
53 | im2col_step = 2
54 | output_pytorch = ms_deform_attn_core_pytorch(value, shapes, sampling_locations, attention_weights).detach().cpu()
55 | output_cuda = MSDeformAttnFunction.apply(value, shapes, level_start_index, sampling_locations, attention_weights, im2col_step).detach().cpu()
56 | fwdok = torch.allclose(output_cuda, output_pytorch, rtol=1e-2, atol=1e-3)
57 | max_abs_err = (output_cuda - output_pytorch).abs().max()
58 | max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max()
59 |
60 | print(f'* {fwdok} check_forward_equal_with_pytorch_float: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}')
61 |
62 |
63 | def check_gradient_numerical(channels=4, grad_value=True, grad_sampling_loc=True, grad_attn_weight=True):
64 |
65 | value = torch.rand(N, S, M, channels).cuda() * 0.01
66 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda()
67 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5
68 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True)
69 | im2col_step = 2
70 | func = MSDeformAttnFunction.apply
71 |
72 | value.requires_grad = grad_value
73 | sampling_locations.requires_grad = grad_sampling_loc
74 | attention_weights.requires_grad = grad_attn_weight
75 |
76 | gradok = gradcheck(func, (value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step))
77 |
78 | print(f'* {gradok} check_gradient_numerical(D={channels})')
79 |
80 |
81 | if __name__ == '__main__':
82 | check_forward_equal_with_pytorch_double()
83 | check_forward_equal_with_pytorch_float()
84 |
85 | for channels in [30, 32, 64, 71, 1025, 2048, 3096]:
86 | check_gradient_numerical(channels, True, True, True)
87 |
88 |
89 |
90 |
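# The checks above compare the compiled CUDA kernel against the pure-PyTorch
# reference and run torch.autograd.gradcheck in double precision. A minimal
# sketch of getting them to run, assuming the make.sh / setup.py shipped in this
# ops directory (the commands are a suggestion, not a verified recipe):
#
#   cd models_swin/ops
#   sh ./make.sh          # or: python setup.py build install
#   python test.py
#
# Building installs the extension under the name MultiScaleDeformableAttention,
# which is the module the functions/ms_deform_attn_func.py wrapper expects to import.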
--------------------------------------------------------------------------------
/models_swin/position_encoding.py:
--------------------------------------------------------------------------------
1 | """
2 | Various positional encodings for the transformer.
3 | """
4 | import math
5 | import torch
6 | from torch import nn
7 |
8 | from util.misc import NestedTensor
9 | import numpy as np
10 |
11 |
12 | class PositionalEncoding3D(nn.Module):
13 | def __init__(self, channels):
14 | """
15 | :param channels: The last dimension of the tensor you want to apply pos emb to.
16 | """
17 | super(PositionalEncoding3D, self).__init__()
18 | channels = int(np.ceil(channels/6)*2)
19 | if channels % 2:
20 | channels += 1
21 | self.channels = channels
22 | inv_freq = 1. / (10000 ** (torch.arange(0, channels, 2).float() / channels))
23 | self.register_buffer('inv_freq', inv_freq)
24 |
25 | def forward(self, tensor):
26 | """
27 | :param tensor: A 5d tensor of size (batch_size, x, y, z, ch)
28 | :return: Positional Encoding Matrix of size (batch_size, x, y, z, ch)
29 | """
30 | if len(tensor.shape) != 5:
31 | raise RuntimeError("The input tensor has to be 5d!")
32 | batch_size, x, y, z, orig_ch = tensor.shape
33 | # print('tensor.shape shape:', tensor.shape)
34 | pos_x = torch.arange(x, device=tensor.device).type(self.inv_freq.type())
35 | pos_y = torch.arange(y, device=tensor.device).type(self.inv_freq.type())
36 | pos_z = torch.arange(z, device=tensor.device).type(self.inv_freq.type())
37 | sin_inp_x = torch.einsum("i,j->ij", pos_x, self.inv_freq)
38 | sin_inp_y = torch.einsum("i,j->ij", pos_y, self.inv_freq)
39 | sin_inp_z = torch.einsum("i,j->ij", pos_z, self.inv_freq)
40 | emb_x = torch.cat((sin_inp_x.sin(), sin_inp_x.cos()), dim=-1).unsqueeze(1).unsqueeze(1)
41 | emb_y = torch.cat((sin_inp_y.sin(), sin_inp_y.cos()), dim=-1).unsqueeze(1)
42 | emb_z = torch.cat((sin_inp_z.sin(), sin_inp_z.cos()), dim=-1)
43 | emb = torch.zeros((x,y,z,self.channels*3),device=tensor.device).type(tensor.type())
44 | emb[:,:,:,:self.channels] = emb_x
45 | emb[:,:,:,self.channels:2*self.channels] = emb_y
46 | emb[:,:,:,2*self.channels:] = emb_z
47 |
48 | return emb[None,:,:,:,:orig_ch].repeat(batch_size, 1, 1, 1, 1)
49 |
50 |
51 | class PositionEmbeddingSine(nn.Module):
52 | """
53 | This is a more standard version of the position embedding, very similar to the one
54 | used by the Attention is all you need paper, generalized to work on images.
55 | """
56 | def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None):
57 | super().__init__()
58 | self.num_pos_feats = num_pos_feats
59 | self.temperature = temperature
60 | self.normalize = normalize
61 | if scale is not None and normalize is False:
62 | raise ValueError("normalize should be True if scale is passed")
63 | if scale is None:
64 | scale = 2 * math.pi
65 | self.scale = scale
66 |
67 | def forward(self, tensor_list: NestedTensor):
68 | x = tensor_list.tensors
69 | mask = tensor_list.mask
70 | assert mask is not None
71 | not_mask = ~mask
72 | y_embed = not_mask.cumsum(1, dtype=torch.float32)
73 | x_embed = not_mask.cumsum(2, dtype=torch.float32)
74 | if self.normalize:
75 | eps = 1e-6
76 | y_embed = (y_embed - 0.5) / (y_embed[:, -1:, :] + eps) * self.scale
77 | x_embed = (x_embed - 0.5) / (x_embed[:, :, -1:] + eps) * self.scale
78 |
79 | dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device)
80 | dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats)
81 |
82 | pos_x = x_embed[:, :, :, None] / dim_t
83 | pos_y = y_embed[:, :, :, None] / dim_t
84 | pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3)
85 | pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3)
86 | pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2)
87 | return pos
88 |
89 |
90 | class PositionEmbeddingLearned(nn.Module):
91 | """
92 | Absolute pos embedding, learned.
93 | """
94 | def __init__(self, num_pos_feats=256):
95 | super().__init__()
96 | self.row_embed = nn.Embedding(50, num_pos_feats)
97 | self.col_embed = nn.Embedding(50, num_pos_feats)
98 | self.reset_parameters()
99 |
100 | def reset_parameters(self):
101 | nn.init.uniform_(self.row_embed.weight)
102 | nn.init.uniform_(self.col_embed.weight)
103 |
104 | def forward(self, tensor_list: NestedTensor):
105 | x = tensor_list.tensors
106 | h, w = x.shape[-2:]
107 | i = torch.arange(w, device=x.device)
108 | j = torch.arange(h, device=x.device)
109 | x_emb = self.col_embed(i)
110 | y_emb = self.row_embed(j)
111 | pos = torch.cat([
112 | x_emb.unsqueeze(0).repeat(h, 1, 1),
113 | y_emb.unsqueeze(1).repeat(1, w, 1),
114 | ], dim=-1).permute(2, 0, 1).unsqueeze(0).repeat(x.shape[0], 1, 1, 1)
115 | return pos
116 |
117 |
118 | def build_position_encoding(args):
119 | N_steps = args.hidden_dim // 2
120 | if args.position_embedding in ('v2', 'sine'):
121 | # TODO find a better way of exposing other arguments
122 | position_embedding = PositionEmbeddingSine(N_steps, normalize=True)
123 | elif args.position_embedding in ('v3', 'learned'):
124 | position_embedding = PositionEmbeddingLearned(N_steps)
125 | else:
126 | raise ValueError(f"not supported {args.position_embedding}")
127 |
128 | return position_embedding
129 |
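# Minimal usage sketch (illustrative only; the argparse namespace and feature
# shapes below are assumptions, not values taken from this repo's configs):
#
#   import argparse, torch
#   from util.misc import NestedTensor
#   from models_swin.position_encoding import build_position_encoding
#
#   args = argparse.Namespace(hidden_dim=256, position_embedding='sine')
#   pos_enc = build_position_encoding(args)          # PositionEmbeddingSine(128, normalize=True)
#   feats = torch.randn(2, 256, 32, 32)
#   mask = torch.zeros(2, 32, 32, dtype=torch.bool)  # False = valid pixel, no padding
#   pos = pos_enc(NestedTensor(feats, mask))         # -> [2, 256, 32, 32]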
--------------------------------------------------------------------------------
/models_swin/x101_64d.py:
--------------------------------------------------------------------------------
1 | from torchvision.models.resnet import _resnet, Bottleneck
2 |
3 |
4 | def resnext101_64x4d(pretrained=False, progress=True, **kwargs):
5 | r"""ResNeXt-101 64*4d model from
6 | `"Aggregated Residual Transformation for Deep Neural Networks" `_
7 |
8 | Args:
9 | pretrained (bool): If True, returns a model pre-trained on ImageNet
10 | progress (bool): If True, displays a progress bar of the download to stderr
11 | """
12 | kwargs['groups'] = 64
13 | kwargs['width_per_group'] = 4
14 | return _resnet('resnext101_64x4d', Bottleneck, [3, 4, 23, 3],
15 | False, progress, **kwargs)
16 |
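# Usage sketch (illustrative): the helper always passes pretrained=False to
# torchvision's _resnet, so weights are randomly initialised regardless of the flag.
#
#   from models_swin.x101_64d import resnext101_64x4d
#   backbone = resnext101_64x4d()
#   # Bottleneck blocks [3, 4, 23, 3] with groups=64 and width_per_group=4,
#   # i.e. the ResNeXt-101 64x4d trunk.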
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | pycocotools
2 | tqdm
3 | cython
4 | scipy
5 | timm
6 | imgaug
7 | opencv-python
8 | kornia==0.5.11
9 |
10 |
11 |
12 |
13 |
--------------------------------------------------------------------------------
/scripts/eval_r101_test.sh:
--------------------------------------------------------------------------------
1 | CUDA_VISIBLE_DEVICES=0 python3 -m tools.inference_test --masks --backbone resnet101 --model_path ./pretrained_model/checkpoint_r101_final.pth --save_path exp_r101_hq_test_result.json
2 |
--------------------------------------------------------------------------------
/scripts/eval_r101_val.sh:
--------------------------------------------------------------------------------
1 | CUDA_VISIBLE_DEVICES=0 python3 -m tools.inference --masks --backbone resnet101 --model_path ./pretrained_model/checkpoint_r101_final.pth --save_path exp_r101_hq_val_result.json
2 |
--------------------------------------------------------------------------------
/scripts/eval_r50_test.sh:
--------------------------------------------------------------------------------
1 | python3 -m tools.inference_test --masks --backbone resnet50 --model_path ./pretrained_model/checkpoint_r50_final.pth --save_path exp_r50_hq_test_result.json
2 |
--------------------------------------------------------------------------------
/scripts/eval_r50_val.sh:
--------------------------------------------------------------------------------
1 | python3 -m tools.inference --masks --backbone resnet50 --model_path ./pretrained_model/checkpoint_r50_final.pth --save_path exp_r50_hq_val_result.json
--------------------------------------------------------------------------------
/scripts/eval_swin_test.sh:
--------------------------------------------------------------------------------
1 | CUDA_VISIBLE_DEVICES=0 python3 -m tools.inference_swin_test --masks --backbone swin_l_p4w12 --model_path ./pretrained_model/checkpoint_swinl_final.pth --save_path exp_swin_hq_test_result.json
2 |
3 |
4 |
--------------------------------------------------------------------------------
/scripts/eval_swin_val.sh:
--------------------------------------------------------------------------------
1 | CUDA_VISIBLE_DEVICES=0 python3 -m tools.inference_swin --masks --backbone swin_l_p4w12 --model_path ./pretrained_model/checkpoint_swinl_final.pth --save_path exp_swin_hq_val_result.json
2 |
3 |
4 |
--------------------------------------------------------------------------------
/scripts/eval_swin_val_vis.sh:
--------------------------------------------------------------------------------
1 | python3 -m tools.inference_swin_with_vis --masks --backbone swin_l_p4w12 --output vis_output_swin_vmt --model_path ./pretrained_model/checkpoint_swinl_final.pth --save_path exp_swin_hq_val_result.json --save-frames True
2 |
3 |
--------------------------------------------------------------------------------
/tools/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SysCV/vmt/b0973e8609885d9ac83f803a6b51b00a30a7f564/tools/__init__.py
--------------------------------------------------------------------------------
/tools/visualizer.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import numpy as np
3 | import matplotlib.colors as mplc
4 |
5 | from detectron2.utils.visualizer import ColorMode, GenericMask, Visualizer, _create_text_labels
6 |
7 |
8 | _ID_JITTERS = [[0.9047944201469568, 0.3241718265806123, 0.33443746665210006], [0.4590171386127151, 0.9095038146383864, 0.3143840671974788], [0.4769356899795538, 0.5044406738441948, 0.5354530846360839], [0.00820945625670777, 0.24099210193126785, 0.15471834055332978], [0.6195684374237388, 0.4020380013509799, 0.26100266066404676], [0.08281237756545068, 0.05900744492710419, 0.06106221202154216], [0.2264886829978755, 0.04925271007292076, 0.10214429345996079], [0.1888247470009874, 0.11275000298612425, 0.46112894830685514], [0.37415767691880975, 0.844284596118331, 0.950471611180866], [0.3817344218157631, 0.3483259270707101, 0.6572989333690541], [0.2403115731054466, 0.03078280287279167, 0.5385975692534737], [0.7035076951650824, 0.12352084932325424, 0.12873080308790197], [0.12607434914489934, 0.111244793010015, 0.09333334699716023], [0.6551607300342269, 0.7003064103554443, 0.4131794512286162], [0.13592107365596595, 0.5390702818232149, 0.004540643174930525], [0.38286244894454347, 0.709142545393449, 0.529074791609835], [0.4279376583651734, 0.5634708596431771, 0.8505569717104301], [0.3460488523902999, 0.464769595519293, 0.6676839675477276], [0.8544063246675081, 0.5041190233407755, 0.9081217697141578], [0.9207009090747208, 0.2403865944739051, 0.05375410999863772], [0.6515786136947107, 0.6299918449948327, 0.45292029442034387], [0.986174217295693, 0.2424849846977214, 0.3981993323108266], [0.22101915872994693, 0.3408589198278038, 0.006381420347677524], [0.3159785813515982, 0.1145748921741011, 0.595754317197274], [0.10263421488052715, 0.5864139253490858, 0.23908000741142432], [0.8272999391532938, 0.6123527260897751, 0.3365197327803193], [0.5269583712937912, 0.25668929554516506, 0.7888411215078127], [0.2433880265410031, 0.7240751234287827, 0.8483215810528648], [0.7254601709704898, 0.8316525547295984, 0.9325253855921963], [0.5574483824856672, 0.2935331727879944, 0.6594839453793155], [0.6209642371433579, 0.054030693198821256, 0.5080873988178534], [0.9055507077365624, 0.12865888619203514, 0.9309191861440005], [0.9914469722960537, 0.3074114506206205, 0.8762107657323488], [0.4812682518247371, 0.15055826298548158, 0.9656340505308308], [0.6459219454316445, 0.9144794010251625, 0.751338812155106], [0.860840174209798, 0.8844626353077639, 0.3604624506769899], [0.8194991672032272, 0.926399617787601, 0.8059222327343247], [0.6540413175393658, 0.04579445254618297, 0.26891917826531275], [0.37778835833987046, 0.36247927666109536, 0.7989799305827889], [0.22738304978177726, 0.9038018263773739, 0.6970838854138303], [0.6362015495896184, 0.527680794236961, 0.5570915425178721], [0.6436401915860954, 0.6316925317144524, 0.9137151236993912], [0.04161828388587163, 0.3832413349082706, 0.6880829921949752], [0.7768167825719299, 0.8933821497682587, 0.7221278391266809], [0.8632760876301346, 0.3278628094906323, 0.8421587587114462], [0.8556499133262127, 0.6497385872901932, 0.5436895688477963], [0.9861940318610894, 0.03562313777386272, 0.9183454677106616], [0.8042586091176366, 0.6167222703170994, 0.24181981557207644], [0.9504247117633057, 0.3454233714011461, 0.6883727005547743], [0.9611909135491202, 0.46384154263898114, 0.32700443315058914], [0.523542176970206, 0.446222414615845, 0.9067402987747814], [0.7536954008682911, 0.6675512338797588, 0.22538238957839196], [0.1554052265688285, 0.05746097492966129, 0.8580358872587424], [0.8540838640971405, 0.9165504335482566, 0.6806982829158964], [0.7065090319405029, 0.8683059983962002, 0.05167128320624026], [0.39134812961899124, 0.8910075505622979, 0.7639815712623922], [0.1578117311479783, 
0.20047326898284668, 0.9220177338840568], [0.2017488993096358, 0.6949259970936679, 0.8729196864798128], [0.5591089340651949, 0.15576770423813258, 0.1469857469387812], [0.14510398622626974, 0.24451497734532168, 0.46574271993578786], [0.13286397822351492, 0.4178244533944635, 0.03728728952131943], [0.556463206310225, 0.14027595183361663, 0.2731537988657907], [0.4093837966398032, 0.8015225687789814, 0.8033567296903834], [0.527442563956637, 0.902232617214431, 0.7066626674362227], [0.9058355503297827, 0.34983989180213004, 0.8353262183839384], [0.7108382186953104, 0.08591307895133471, 0.21434688012521974], [0.22757345065207668, 0.7943075496583976, 0.2992305547627421], [0.20454109788173636, 0.8251670332103687, 0.012981987094547232], [0.7672562637297392, 0.005429019973062554, 0.022163616037108702], [0.37487345910117564, 0.5086240194440863, 0.9061216063654387], [0.9878004014101087, 0.006345852772772331, 0.17499753379350858], [0.030061528704491303, 0.1409704315546606, 0.3337131835834506], [0.5022506782611504, 0.5448435505388706, 0.40584238936140726], [0.39560774627423445, 0.8905943695833262, 0.5850815030921116], [0.058615671926786406, 0.5365713844300387, 0.1620457551256279], [0.41843842882069693, 0.1536005983609976, 0.3127878501592438], [0.05947621790155899, 0.5412421167331932, 0.2611322146455659], [0.5196159938235607, 0.7066461551682705, 0.970261497412556], [0.30443031606149007, 0.45158581060034975, 0.4331841153149706], [0.8848298403933996, 0.7241791700943656, 0.8917110054596072], [0.5720260591898779, 0.3072801598203052, 0.8891066705989902], [0.13964015336177327, 0.2531778096760302, 0.5703756837403124], [0.2156307542329836, 0.4139947500641685, 0.87051676884144], [0.10800455881891169, 0.05554646035458266, 0.2947027428551443], [0.35198009410633857, 0.365849666213808, 0.06525787683513773], [0.5223264108118847, 0.9032195574351178, 0.28579084943315025], [0.7607724246546966, 0.3087194381828555, 0.6253235528354899], [0.5060485442077824, 0.19173600467625274, 0.9931175692203702], [0.5131805830323746, 0.07719515392040577, 0.923212006754969], [0.3629762141280106, 0.02429179642710888, 0.6963754952399983], [0.7542592485456767, 0.6478893299494212, 0.3424965345400731], [0.49944574453364454, 0.6775665366832825, 0.33758796076989583], [0.010621818120767679, 0.8221571611173205, 0.5186257457566332], [0.5857910304290109, 0.7178133992025467, 0.9729243483606071], [0.16987399482717613, 0.9942570210657463, 0.18120758122552927], [0.016362572521240848, 0.17582788603087263, 0.7255176922640298], [0.10981764283706419, 0.9078582203470377, 0.7638063718334003], [0.9252097840441119, 0.3330197086990039, 0.27888705301420136], [0.12769972651171546, 0.11121470804891687, 0.12710743734391716], [0.5753520518360334, 0.2763862879599456, 0.6115636613363361]]
9 |
10 |
11 | class TrackVisualizer(Visualizer):
12 | def __init__(self, img_rgb, metadata=None, scale=1.0, instance_mode=ColorMode.IMAGE):
13 | super().__init__(
14 | img_rgb, metadata=metadata, scale=scale, instance_mode=instance_mode
15 | )
16 | self.cpu_device = torch.device("cpu")
17 |
18 | def _jitter(self, color, id):
19 | """
20 | Randomly modifies given color to produce a slightly different color than the color given.
21 |
22 | Args:
23 | color (tuple[double]): a tuple of 3 elements, containing the RGB values of the color
24 | picked. The values in the list are in the [0.0, 1.0] range.
25 |
26 | Returns:
27 | jittered_color (tuple[double]): a tuple of 3 elements, containing the RGB values of the
28 | color after being jittered. The values in the list are in the [0.0, 1.0] range.
29 | """
30 | color = mplc.to_rgb(color)
31 | vec = _ID_JITTERS[id]
32 | # better to do it in another color space
33 | vec = vec / np.linalg.norm(vec) * 0.5
34 | res = np.clip(vec + color, 0, 1)
35 | return tuple(res)
36 |
37 | def draw_instance_predictions(self, predictions):
38 | """
39 | Draw instance-level prediction results on an image.
40 |
41 | Args:
42 | predictions (Instances): the output of an instance detection/segmentation
43 | model. Following fields will be used to draw:
44 | "pred_boxes", "pred_classes", "scores", "pred_masks" (or "pred_masks_rle").
45 |
46 | Returns:
47 | output (VisImage): image object with visualizations.
48 | """
49 | preds = predictions.to(self.cpu_device)
50 |
51 | boxes = preds.pred_boxes if preds.has("pred_boxes") else None
52 | scores = preds.scores if preds.has("scores") else None
53 | classes = preds.pred_classes if preds.has("pred_classes") else None
54 | labels = _create_text_labels(classes, scores, self.metadata.get("thing_classes", None))
55 | labels = None
56 | # print('come here=====visualizer.py')
57 | if labels is not None:
58 | labels = ["[{}] ".format(_id) + l for _id, l in enumerate(labels)]
59 |
60 | if preds.has("pred_masks"):
61 | masks = np.asarray(preds.pred_masks)
62 | #masks = [GenericMask(x, self.output.height, self.output.width) for x in masks]
63 | else:
64 | masks = None
65 |
66 | if classes is None:
67 | return self.output
68 |
69 | colors = [
70 | self._jitter([x / 255 for x in self.metadata.thing_colors[c]], id) for id, c in enumerate(classes)
71 | ]
72 | alpha = 0.5
73 |
74 | if self._instance_mode == ColorMode.IMAGE_BW:
75 | self.output.img = self._create_grayscale_image(
76 | (preds.pred_masks.any(dim=0) > 0).numpy()
77 | if preds.has("pred_masks")
78 | else None
79 | )
80 | alpha = 0.3
81 |
82 | self.overlay_instances(
83 | masks=masks,
84 | boxes=boxes,
85 | labels=labels,
86 | assigned_colors=colors,
87 | alpha=alpha,
88 | )
89 |
90 | return self.output
91 |
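# Minimal usage sketch (illustrative; the metadata name and field values are
# placeholders, not this repo's dataset registration):
#
#   import numpy as np, torch
#   from detectron2.data import MetadataCatalog
#   from detectron2.structures import Instances
#
#   meta = MetadataCatalog.get("vis_demo")
#   meta.thing_classes = ["person"]
#   meta.thing_colors = [[220, 20, 60]]
#
#   frame = np.zeros((480, 640, 3), dtype=np.uint8)
#   inst = Instances((480, 640))
#   inst.pred_classes = torch.tensor([0])
#   inst.scores = torch.tensor([0.9])
#   inst.pred_masks = torch.zeros(1, 480, 640, dtype=torch.bool)
#
#   vis = TrackVisualizer(frame, metadata=meta)
#   out = vis.draw_instance_predictions(inst)   # VisImage; out.get_image() for the array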
--------------------------------------------------------------------------------
/util/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SysCV/vmt/b0973e8609885d9ac83f803a6b51b00a30a7f564/util/__init__.py
--------------------------------------------------------------------------------
/util/box_ops.py:
--------------------------------------------------------------------------------
1 | # ------------------------------------------------------------------------
2 | # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR)
3 | # Copyright (c) 2020 SenseTime. All Rights Reserved.
4 | # ------------------------------------------------------------------------
5 |
6 | """
7 | Utilities for bounding box manipulation and GIoU.
8 | """
9 | import torch
10 | from torchvision.ops.boxes import box_area
11 |
12 |
13 | def box_cxcywh_to_xyxy(x):
14 | # print('box:\n', x)
15 |
16 | x_c, y_c, w, h = x.unbind(-1)
17 | b = [(x_c - 0.5 * w), (y_c - 0.5 * h),
18 | (x_c + 0.5 * w), (y_c + 0.5 * h)]
19 | return torch.stack(b, dim=-1)
20 |
21 |
22 | def box_xyxy_to_cxcywh(x):
23 | x0, y0, x1, y1 = x.unbind(-1)
24 | b = [(x0 + x1) / 2, (y0 + y1) / 2,
25 | (x1 - x0), (y1 - y0)]
26 | return torch.stack(b, dim=-1)
27 |
28 |
29 | # modified from torchvision to also return the union
30 | def box_iou(boxes1, boxes2):
31 | area1 = box_area(boxes1)
32 | area2 = box_area(boxes2)
33 |
34 | lt = torch.max(boxes1[:, None, :2], boxes2[:, :2]) # [N,M,2]
35 | rb = torch.min(boxes1[:, None, 2:], boxes2[:, 2:]) # [N,M,2]
36 |
37 | wh = (rb - lt).clamp(min=0) # [N,M,2]
38 | inter = wh[:, :, 0] * wh[:, :, 1] # [N,M]
39 |
40 | union = area1[:, None] + area2 - inter
41 |
42 | iou = inter / union
43 | return iou, union
44 |
45 | def multi_box_iou(boxes1, boxes2):
46 | area1 = box_area(boxes1.flatten(0,1)).reshape(boxes1.shape[0], boxes1.shape[1])
47 | area2 = box_area(boxes2.flatten(0,1)).reshape(boxes2.shape[0], boxes2.shape[1])
48 |
49 | lt = torch.max(boxes1[:, :, None, :2], boxes2[:, None, :, :2]) # [nf,N,M,2]
50 | rb = torch.min(boxes1[:, :, None, 2:], boxes2[:, None, :, 2:]) # [nf,N,M,2]
51 |
52 | wh = (rb - lt).clamp(min=0) # [nf,N,M,2]
53 | inter = wh[:, :, :, 0] * wh[:, :, :, 1] # [nf,N,M]
54 |
55 | union = area1[:, :, None] + area2[:, None, :] - inter
56 |
57 | iou = inter / union
58 | return iou, union
59 |
60 | def generalized_box_iou(boxes1, boxes2):
61 | """
62 | Generalized IoU from https://giou.stanford.edu/
63 |
64 | The boxes should be in [x0, y0, x1, y1] format
65 |
66 | Returns a [N, M] pairwise matrix, where N = len(boxes1)
67 | and M = len(boxes2)
68 | """
69 | # degenerate boxes gives inf / nan results
70 | # so do an early check
71 |
72 | assert (boxes1[:, 2:] >= boxes1[:, :2]).all()
73 | assert (boxes2[:, 2:] >= boxes2[:, :2]).all()
74 | iou, union = box_iou(boxes1, boxes2)
75 |
76 | lt = torch.min(boxes1[:, None, :2], boxes2[:, :2])
77 | rb = torch.max(boxes1[:, None, 2:], boxes2[:, 2:])
78 |
79 | wh = (rb - lt).clamp(min=0) # [N,M,2]
80 | area = wh[:, :, 0] * wh[:, :, 1]
81 |
82 | return iou - (area - union) / area
83 |
84 |
85 | def generalized_multi_box_iou(boxes1, boxes2):
86 | """
87 | Generalized IoU from https://giou.stanford.edu/
88 |
89 | The boxes should be in [x0, y0, x1, y1] format
90 | boxes1.shape = [nf, N, 4]
91 | boxes2.shape = [nf, M, 4]
92 | Returns a [nf, N, M] pairwise matrix, where N = boxes1.shape[1]
93 | and M = boxes2.shape[1]
94 | """
95 | # degenerate boxes gives inf / nan results
96 | # so do an early check
97 |
98 | assert (boxes1[:, :, 2:] >= boxes1[:, :, :2]).all()
99 | assert (boxes2[:, :, 2:] >= boxes2[:, :, :2]).all()
100 | iou, union = multi_box_iou(boxes1, boxes2)
101 |
102 | lt = torch.min(boxes1[:, :, None, :2], boxes2[:, None, :, :2])
103 | rb = torch.max(boxes1[:, :, None, 2:], boxes2[:, None, :, 2:])
104 |
105 | wh = (rb - lt).clamp(min=0) # [nf,N,M,2]
106 | area = wh[:, :, :, 0] * wh[:, :, :, 1]
107 |
108 |
109 | return iou - (area - union) / (area + 1e-7)
110 |
111 |
112 | def masks_to_boxes(masks):
113 | """Compute the bounding boxes around the provided masks
114 |
115 | The masks should be in format [N, H, W] where N is the number of masks, (H, W) are the spatial dimensions.
116 |
117 | Returns a [N, 4] tensors, with the boxes in xyxy format
118 | """
119 | if masks.numel() == 0:
120 | return torch.zeros((0, 4), device=masks.device)
121 |
122 | h, w = masks.shape[-2:]
123 |
124 | y = torch.arange(0, h, dtype=torch.float, device=masks.device)
125 | x = torch.arange(0, w, dtype=torch.float, device=masks.device)
126 | y, x = torch.meshgrid(y, x)
127 |
128 | x_mask = (masks * x.unsqueeze(0))
129 | x_max = x_mask.flatten(1).max(-1)[0]
130 | x_min = x_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0]
131 |
132 | y_mask = (masks * y.unsqueeze(0))
133 | y_max = y_mask.flatten(1).max(-1)[0]
134 | y_min = y_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0]
135 |
136 | return torch.stack([x_min, y_min, x_max, y_max], 1)
137 |
138 |
139 |
140 |
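# Worked example (minimal sketch; the boxes below are arbitrary, not test data):
#
#   import torch
#   from util.box_ops import box_iou, generalized_box_iou
#
#   a = torch.tensor([[0., 0., 2., 2.]])
#   b = torch.tensor([[1., 1., 3., 3.]])
#   iou, union = box_iou(a, b)          # inter = 1, union = 7  -> iou ~= 0.143
#   giou = generalized_box_iou(a, b)    # enclosing area = 9    -> 1/7 - 2/9 ~= -0.079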
--------------------------------------------------------------------------------
/util/plot_utils.py:
--------------------------------------------------------------------------------
1 | # ------------------------------------------------------------------------
2 | # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR)
3 | # Copyright (c) 2020 SenseTime. All Rights Reserved.
4 | # ------------------------------------------------------------------------
5 | """
6 | Plotting utilities to visualize training logs.
7 | """
8 | import torch
9 | import pandas as pd
10 | import seaborn as sns
11 | import matplotlib.pyplot as plt
12 |
13 | from pathlib import Path, PurePath
14 |
15 |
16 | def plot_logs(logs, fields=('class_error', 'loss_bbox_unscaled', 'mAP'), ewm_col=0, log_name='log.txt'):
17 | '''
18 | Function to plot specific fields from training log(s). Plots both training and test results.
19 |
20 | :: Inputs - logs = list containing Path objects, each pointing to individual dir with a log file
21 | - fields = which results to plot from each log file - plots both training and test for each field.
22 | - ewm_col = optional, which column to use as the exponential weighted smoothing of the plots
23 | - log_name = optional, name of log file if different than default 'log.txt'.
24 |
25 | :: Outputs - matplotlib plots of results in fields, color coded for each log file.
26 | - solid lines are training results, dashed lines are test results.
27 |
28 | '''
29 | func_name = "plot_utils.py::plot_logs"
30 |
31 | # verify logs is a list of Paths (list[Paths]) or single Pathlib object Path,
32 | # convert single Path to list to avoid 'not iterable' error
33 |
34 | if not isinstance(logs, list):
35 | if isinstance(logs, PurePath):
36 | logs = [logs]
37 | print(f"{func_name} info: logs param expects a list argument, converted to list[Path].")
38 | else:
39 | raise ValueError(f"{func_name} - invalid argument for logs parameter.\n \
40 | Expect list[Path] or single Path obj, received {type(logs)}")
41 |
42 | # verify valid dir(s) and that every item in list is Path object
43 | for i, dir in enumerate(logs):
44 | if not isinstance(dir, PurePath):
45 | raise ValueError(f"{func_name} - non-Path object in logs argument of {type(dir)}: \n{dir}")
46 | if dir.exists():
47 | continue
48 | raise ValueError(f"{func_name} - invalid directory in logs argument:\n{dir}")
49 |
50 | # load log file(s) and plot
51 | dfs = [pd.read_json(Path(p) / log_name, lines=True) for p in logs]
52 |
53 | fig, axs = plt.subplots(ncols=len(fields), figsize=(16, 5))
54 |
55 | for df, color in zip(dfs, sns.color_palette(n_colors=len(logs))):
56 | for j, field in enumerate(fields):
57 | if field == 'mAP':
58 | coco_eval = pd.DataFrame(pd.np.stack(df.test_coco_eval.dropna().values)[:, 1]).ewm(com=ewm_col).mean()
59 | axs[j].plot(coco_eval, c=color)
60 | else:
61 | df.interpolate().ewm(com=ewm_col).mean().plot(
62 | y=[f'train_{field}', f'test_{field}'],
63 | ax=axs[j],
64 | color=[color] * 2,
65 | style=['-', '--']
66 | )
67 | for ax, field in zip(axs, fields):
68 | ax.legend([Path(p).name for p in logs])
69 | ax.set_title(field)
70 |
71 |
72 | def plot_precision_recall(files, naming_scheme='iter'):
73 | if naming_scheme == 'exp_id':
74 | # name becomes exp_id
75 | names = [f.parts[-3] for f in files]
76 | elif naming_scheme == 'iter':
77 | names = [f.stem for f in files]
78 | else:
79 | raise ValueError(f'not supported {naming_scheme}')
80 | fig, axs = plt.subplots(ncols=2, figsize=(16, 5))
81 | for f, color, name in zip(files, sns.color_palette("Blues", n_colors=len(files)), names):
82 | data = torch.load(f)
83 | # precision is n_iou, n_points, n_cat, n_area, max_det
84 | precision = data['precision']
85 | recall = data['params'].recThrs
86 | scores = data['scores']
87 | # take precision for all classes, all areas and 100 detections
88 | precision = precision[0, :, :, 0, -1].mean(1)
89 | scores = scores[0, :, :, 0, -1].mean(1)
90 | prec = precision.mean()
91 | rec = data['recall'][0, :, 0, -1].mean()
92 | print(f'{naming_scheme} {name}: mAP@50={prec * 100: 05.1f}, ' +
93 | f'score={scores.mean():0.3f}, ' +
94 | f'f1={2 * prec * rec / (prec + rec + 1e-8):0.3f}'
95 | )
96 | axs[0].plot(recall, precision, c=color)
97 | axs[1].plot(recall, scores, c=color)
98 |
99 | axs[0].set_title('Precision / Recall')
100 | axs[0].legend(names)
101 | axs[1].set_title('Scores / Recall')
102 | axs[1].legend(names)
103 | return fig, axs
104 |
105 |
106 |
107 |
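# Usage sketch (illustrative; the run directories are placeholders, each expected
# to contain the default 'log.txt' metrics file):
#
#   from pathlib import Path
#   from util.plot_utils import plot_logs
#
#   plot_logs([Path('exp/run_a'), Path('exp/run_b')],
#             fields=('class_error', 'loss_bbox_unscaled', 'mAP'))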
--------------------------------------------------------------------------------