├── INSTALL.md ├── LICENSE ├── README.md ├── USAGE.md ├── __init__.py ├── cocoapi_hq ├── .travis.yml ├── LuaAPI │ ├── CocoApi.lua │ ├── MaskApi.lua │ ├── cocoDemo.lua │ ├── env.lua │ ├── init.lua │ └── rocks │ │ └── coco-scm-1.rockspec ├── MatlabAPI │ ├── CocoApi.m │ ├── CocoEval.m │ ├── CocoUtils.m │ ├── MaskApi.m │ ├── cocoDemo.m │ ├── evalDemo.m │ ├── gason.m │ └── private │ │ ├── gasonMex.cpp │ │ ├── gasonMex.mexa64 │ │ ├── gasonMex.mexmaci64 │ │ ├── getPrmDflt.m │ │ └── maskApiMex.c ├── PythonAPI │ ├── Makefile │ ├── pycocoDemo.ipynb │ ├── pycocoEvalDemo.ipynb │ ├── pycocotools.egg-info │ │ ├── PKG-INFO │ │ ├── SOURCES.txt │ │ ├── dependency_links.txt │ │ ├── requires.txt │ │ └── top_level.txt │ ├── pycocotools │ │ ├── __init__.py │ │ ├── _mask.c │ │ ├── _mask.pyx │ │ ├── boundary_utils.py │ │ ├── coco.py │ │ ├── cocoeval.py │ │ ├── mask.py │ │ ├── ytvos.py │ │ └── ytvoseval.py │ └── setup.py ├── README.md ├── README.txt ├── common │ ├── gason.cpp │ ├── gason.h │ ├── maskApi.c │ └── maskApi.h ├── license.txt └── results │ ├── captions_val2014_fakecap_results.json │ ├── instances_val2014_fakebbox100_results.json │ ├── instances_val2014_fakesegm100_results.json │ ├── person_keypoints_val2014_fakekeypoints100_results.json │ └── val2014_fake_eval_res.txt ├── datasets ├── __init__.py ├── coco.py ├── coco2seq.py ├── coco_eval.py ├── coco_panoptic.py ├── concat_dataset.py ├── data_prefetcher.py ├── image_to_seq_augmenter.py ├── panoptic_eval.py ├── samplers.py ├── torchvision_datasets │ ├── __init__.py │ └── coco.py ├── transforms.py ├── transforms_clip.py └── ytvos.py ├── eval_hqvis.py ├── figures ├── data1_new.gif ├── dataset_compare_s.png ├── result_demo1.gif └── vmt_banner_img.png ├── models ├── __init__.py ├── backbone.py ├── deformable_transformer.py ├── matcher.py ├── ops │ ├── MultiScaleDeformableAttention.egg-info │ │ ├── PKG-INFO │ │ ├── SOURCES.txt │ │ ├── dependency_links.txt │ │ └── top_level.txt │ ├── functions │ │ ├── __init__.py │ │ └── ms_deform_attn_func.py │ ├── make.sh │ ├── modules │ │ ├── __init__.py │ │ └── ms_deform_attn.py │ ├── setup.py │ ├── src │ │ ├── cpu │ │ │ ├── ms_deform_attn_cpu.cpp │ │ │ └── ms_deform_attn_cpu.h │ │ ├── cuda │ │ │ ├── ms_deform_attn_cuda.cu │ │ │ ├── ms_deform_attn_cuda.h │ │ │ └── ms_deform_im2col_cuda.cuh │ │ ├── ms_deform_attn.h │ │ └── vision.cpp │ └── test.py ├── position_encoding.py ├── segmentation.py ├── swin_transformer.py ├── vmt.py └── x101_64d.py ├── models_swin ├── __init__.py ├── backbone.py ├── deformable_transformer.py ├── matcher.py ├── ops │ ├── MultiScaleDeformableAttention.egg-info │ │ ├── PKG-INFO │ │ ├── SOURCES.txt │ │ ├── dependency_links.txt │ │ └── top_level.txt │ ├── functions │ │ ├── __init__.py │ │ └── ms_deform_attn_func.py │ ├── make.sh │ ├── modules │ │ ├── __init__.py │ │ └── ms_deform_attn.py │ ├── setup.py │ ├── src │ │ ├── cpu │ │ │ ├── ms_deform_attn_cpu.cpp │ │ │ └── ms_deform_attn_cpu.h │ │ ├── cuda │ │ │ ├── ms_deform_attn_cuda.cu │ │ │ ├── ms_deform_attn_cuda.h │ │ │ └── ms_deform_im2col_cuda.cuh │ │ ├── ms_deform_attn.h │ │ └── vision.cpp │ └── test.py ├── position_encoding.py ├── segmentation.py ├── swin_transformer.py ├── vmt.py └── x101_64d.py ├── requirements.txt ├── scripts ├── eval_r101_test.sh ├── eval_r101_val.sh ├── eval_r50_test.sh ├── eval_r50_val.sh ├── eval_swin_test.sh ├── eval_swin_val.sh └── eval_swin_val_vis.sh ├── tools ├── __init__.py ├── inference.py ├── inference_swin.py ├── inference_swin_test.py ├── inference_swin_with_vis.py ├── inference_test.py ├── inference_with_vis.py └── 
visualizer.py └── util ├── __init__.py ├── box_ops.py ├── misc.py └── plot_utils.py /INSTALL.md: -------------------------------------------------------------------------------- 1 | ### Installation 2 | 3 | First, create and activate the conda environment, install PyTorch, and clone the repository: 4 | 5 | ```bash 6 | conda create -n vmt python=3.7 -y 7 | 8 | conda activate vmt 9 | 10 | conda install pytorch==1.7.1 torchvision==0.8.2 torchaudio==0.7.2 -c pytorch 11 | 12 | git clone --recursive https://github.com/SysCV/vmt.git 13 | ``` 14 | 15 | Install detectron2 for visualization under your working directory: 16 | ```bash 17 | git clone https://github.com/facebookresearch/detectron2.git 18 | cd detectron2 19 | pip install -e . 20 | ``` 21 | 22 | Install the dependencies and pycocotools for VIS and HQ-YTVIS: 23 | ```bash 24 | pip install -r requirements.txt 25 | 26 | cd cocoapi_hq/PythonAPI 27 | # To compile and install locally 28 | python setup.py build_ext --inplace 29 | # To install library to Python site-packages 30 | python setup.py build_ext install 31 | ``` 32 | 33 | Compile the CUDA operators: 34 | 35 | ```bash 36 | cd ./models/ops 37 | sh ./make.sh 38 | # unit test (all checks should report True) 39 | python test.py 40 | 41 | cd ./models_swin/ops 42 | sh ./make.sh 43 | ``` 44 | 45 | ### Data Preparation 46 | 47 | Download and extract the 2019 version of the YouTube-VIS train and val images with annotations from [YouTubeVIS](https://youtube-vos.org/dataset/vis/), and download the [HQ-YTVIS annotations](https://www.vis.xyz/data/hqvis/) and the COCO 2017 dataset. We expect the directory structure to be the following: 48 | 49 | 50 | ``` 51 | vmt 52 | ├── datasets 53 | │ ├── coco_keepfor_ytvis19_new.json 54 | ... 55 | ytvis 56 | ├── train 57 | ├── val 58 | ├── annotations 59 | │ ├── instances_train_sub.json 60 | │ ├── instances_val_sub.json 61 | │ ├── ytvis_hq-train.json 62 | │ ├── ytvis_hq-val.json 63 | │ ├── ytvis_hq-test.json 64 | coco 65 | ├── train2017 66 | ├── val2017 67 | ├── annotations 68 | │ ├── instances_train2017.json 69 | │ ├── instances_val2017.json 70 | ``` 71 | 72 | The modified COCO annotations 'coco_keepfor_ytvis19_new.json' for joint training can be downloaded from [[google]](https://drive.google.com/file/d/18yKpc8wt7xJK26QFpR5Xa0vjM5HN6ieg/view?usp=sharing). The HQ-YTVIS annotations can be downloaded from [[google]](https://drive.google.com/drive/folders/1ZU8_qO8HnJ_-vvxIAn8-_kJ4xtOdkefh?usp=sharing). 73 | 74 | 75 | 76 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Video Mask Transfiner 2 | Video Mask Transfiner for High-Quality Video Instance Segmentation [ECCV 2022] 3 | 4 | [[Project Page](https://www.vis.xyz/pub/vmt/) | [Dataset Page](https://www.vis.xyz/data/hqvis/) | [Paper](https://arxiv.org/abs/2207.14012)\] 5 | 6 | > [**Video Mask Transfiner for High-Quality Video Instance Segmentation**](http://arxiv.org/abs/2207.14012), 7 | > Lei Ke, Henghui Ding, Martin Danelljan, Yu-Wing Tai, Chi-Keung Tang, Fisher Yu 8 | > *ECCV 2022 ([arXiv 2207.14012](https://arxiv.org/abs/2207.14012))* 9 | 10 |

11 | 12 | ## HQ-YTVIS: High-Quality Video Instance Segmentation Dataset 13 | Mask annotation comparison between **Youtube-VIS** and **HQ-YTVIS**. HQ-YTVIS serves as a new benchmark to facilitate future development (training & evaluation) of VIS methods aiming at higher mask quality. 14 | 15 | 16 | 17 | https://user-images.githubusercontent.com/17427852/181796696-bfe9a9dd-2d39-42a2-b218-283c210e5ffd.mp4 18 | 19 | Mask annotations in **Youtube-VIS** (Left Video) vs. Mask annotations in **HQ-YTVIS** (Right Video). Please visit our [Dataset Page](https://www.vis.xyz/data/hqvis/) for detailed descriptions of using the HQ-YTVIS benchmark. 20 | 21 | **Dataset Download:** [HQ-YTVIS Annotation Link](https://drive.google.com/drive/folders/1ZU8_qO8HnJ_-vvxIAn8-_kJ4xtOdkefh?usp=sharing)\ 22 | **Dataset Usage:** replace the original YTVIS annotation files with our annotation JSON files. 23 | 24 | ## HQ-YTVIS Evaluation API 25 | Please refer to our [Installation Guidance](cocoapi_hq/) and [Tube-Mask AP & Tube-Boundary AP Usage Example](eval_hqvis.py). 26 | 27 | ``` 28 | python eval_hqvis.py --save-path prediction_results.json 29 | ``` 30 | 31 | ## VMT Code (under construction) 32 | 33 | ### Install 34 | 35 | Please refer to [INSTALL.md](INSTALL.md) for installation instructions and dataset preparation. 36 | 37 | ### Usage 38 | 39 | Please refer to [USAGE.md](USAGE.md) for dataset preparation and detailed running (including testing, visualization, etc.) instructions. 40 | 41 | https://user-images.githubusercontent.com/17427852/181796768-3e79ee74-2465-4af8-ba89-b5c837098e00.mp4 42 | 43 | ### Model zoo on HQ-YTVIS 44 | 45 | Train on the [HQ-YTVIS](https://www.vis.xyz/data/hqvis/) **train** set and COCO, evaluate on the [HQ-YTVIS](https://www.vis.xyz/data/hqvis/) **test** set. 46 | 47 | APB: Tube-Boundary AP (proposed in Eq.1 of the paper) 48 | 49 | APM: Tube-Mask AP (proposed in the YTVIS paper) 50 | 51 | | Model | APB | APB75 | ARB1 | APM | ARM75 | download | 52 | | ------------------------------------------------------------ | ---- | ---- | ---- | ---- | ---- | ------------------------------------------------------------ | 53 | | VMT_r50 | 30.7 | 24.2 | 31.5 | 50.5 | 54.5 | [weight](https://drive.google.com/file/d/1e9hKCC-pAGB-wSO0_qyUNoEe-5XzRocz/view?usp=sharing) | 54 | | VMT_r101 | 33.0 | 29.3 | 33.3 | 51.6 | 55.8 | [weight](https://drive.google.com/file/d/1TQs_meDaomLz56xCjAZKT1BNtS3K3sla/view?usp=sharing) | 55 | | VMT_swin_L | 44.8 | 43.4 | 43.0 | 64.8 | 70.1 | [weight](https://drive.google.com/file/d/13cDni9olYd6-xdURQMWstsW0VLbkgIKt/view?usp=sharing) | 56 | 57 | ### Citation 58 | 59 | ```bibtex 60 | @inproceedings{vmt, 61 | title = {Video Mask Transfiner for High-Quality Video Instance Segmentation}, 62 | author = {Ke, Lei and Ding, Henghui and Danelljan, Martin and Tai, Yu-Wing and Tang, Chi-Keung and Yu, Fisher}, 63 | booktitle = {European Conference on Computer Vision (ECCV)}, 64 | year = {2022} 65 | } 66 | 67 | @inproceedings{transfiner, 68 | title={Mask Transfiner for High-Quality Instance Segmentation}, 69 | author={Ke, Lei and Danelljan, Martin and Li, Xia and Tai, Yu-Wing and Tang, Chi-Keung and Yu, Fisher}, 70 | booktitle = {CVPR}, 71 | year = {2022} 72 | } 73 | ``` 74 | 75 | ## Acknowledgement 76 | We thank [Mask Transfiner](https://github.com/SysCV/transfiner) and [SeqFormer](https://github.com/wjf5203/SeqFormer) for their open-source code.
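For quick reference, the sketch below shows how the HQ-YTVIS evaluation could also be driven directly from Python rather than through the `eval_hqvis.py` command above. This is a minimal sketch only: it assumes the HQ-YTVIS package keeps the class names of the upstream youtubevos cocoapi (`YTVOS` in `pycocotools/ytvos.py`, `YTVOSeval` in `pycocotools/ytvoseval.py`), and the file paths are placeholders; `eval_hqvis.py` remains the supported entry point.

```python
# Minimal sketch, assuming the HQ-YTVIS API mirrors the upstream youtubevos
# cocoapi class names (YTVOS / YTVOSeval); the paths below are placeholders.
from pycocotools.ytvos import YTVOS
from pycocotools.ytvoseval import YTVOSeval

gt_path = "ytvis/annotations/ytvis_hq-test.json"  # HQ-YTVIS ground-truth annotations
dt_path = "prediction_results.json"               # your per-video results in YTVIS format

ytvos_gt = YTVOS(gt_path)             # load ground-truth annotations
ytvos_dt = ytvos_gt.loadRes(dt_path)  # load predictions against the ground truth

# Tube-Mask AP uses the 'segm' IoU type; the Tube-Boundary AP of the paper may
# use a different iouType in this fork -- see eval_hqvis.py for the exact call.
evaluator = YTVOSeval(ytvos_gt, ytvos_dt, iouType="segm")
evaluator.evaluate()
evaluator.accumulate()
evaluator.summarize()
```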
77 | -------------------------------------------------------------------------------- /USAGE.md: -------------------------------------------------------------------------------- 1 | ### Pretrained Models 2 | 3 | Download the pretrained models from the Model zoo table: 4 | ``` 5 | mkdir pretrained_model 6 | #And put the downloaded pretrained models in this directory. 7 | ``` 8 | 9 | ### Inference & Evaluation on HQ-YTVIS 10 | 11 | Refer to our [scripts folder](./scripts) for more commands: 12 | 13 | Evaluating on HQ-YTVIS test: 14 | ``` 15 | bash scripts/eval_swin_test.sh 16 | ``` 17 | or 18 | ``` 19 | bash scripts/eval_r101_test.sh 20 | ``` 21 | 22 | ### Results Visualization 23 | 24 | ``` 25 | bash scripts/eval_swin_val_vis.sh 26 | ``` 27 | or 28 | ``` 29 | python3 -m tools.inference_swin_with_vis --masks --backbone swin_l_p4w12 --output vis_output_swin_vmt --model_path ./pretrained_model/checkpoint_swinl_final.pth --save_path exp_swin_hq_val_result.json --save-frames True 30 | ``` 31 | 32 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SysCV/vmt/b0973e8609885d9ac83f803a6b51b00a30a7f564/__init__.py -------------------------------------------------------------------------------- /cocoapi_hq/.travis.yml: -------------------------------------------------------------------------------- 1 | group: travis_latest 2 | language: python 3 | cache: pip 4 | python: 5 | - 2.7 6 | - 3.6 7 | install: 8 | - pip install --upgrade pip 9 | - pip install pycocotools 10 | script: 11 | - true 12 | -------------------------------------------------------------------------------- /cocoapi_hq/LuaAPI/cocoDemo.lua: -------------------------------------------------------------------------------- 1 | -- Demo for the CocoApi (see CocoApi.lua) 2 | coco = require 'coco' 3 | image = require 'image' 4 | 5 | -- initialize COCO api (please specify dataType/annType below) 6 | annTypes = { 'instances', 'captions', 'person_keypoints' } 7 | dataType, annType = 'val2014', annTypes[1]; -- specify dataType/annType 8 | annFile = '../annotations/'..annType..'_'..dataType..'.json' 9 | cocoApi=coco.CocoApi(annFile) 10 | 11 | -- get all image ids, select one at random 12 | imgIds = cocoApi:getImgIds() 13 | imgId = imgIds[torch.random(imgIds:numel())] 14 | 15 | -- load image 16 | img = cocoApi:loadImgs(imgId)[1] 17 | I = image.load('../images/'..dataType..'/'..img.file_name,3) 18 | 19 | -- load and display instance annotations 20 | annIds = cocoApi:getAnnIds({imgId=imgId}) 21 | anns = cocoApi:loadAnns(annIds) 22 | J = cocoApi:showAnns(I,anns) 23 | image.save('RES_'..img.file_name,J:double()) 24 | -------------------------------------------------------------------------------- /cocoapi_hq/LuaAPI/env.lua: -------------------------------------------------------------------------------- 1 | --[[---------------------------------------------------------------------------- 2 | 3 | Common Objects in COntext (COCO) Toolbox. version 3.0 4 | Data, paper, and tutorials available at: http://mscoco.org/ 5 | Code written by Pedro O. Pinheiro and Piotr Dollar, 2016. 
6 | Licensed under the Simplified BSD License [see coco/license.txt] 7 | 8 | ------------------------------------------------------------------------------]] 9 | 10 | local coco = {} 11 | return coco 12 | -------------------------------------------------------------------------------- /cocoapi_hq/LuaAPI/init.lua: -------------------------------------------------------------------------------- 1 | --[[---------------------------------------------------------------------------- 2 | 3 | Common Objects in COntext (COCO) Toolbox. version 3.0 4 | Data, paper, and tutorials available at: http://mscoco.org/ 5 | Code written by Pedro O. Pinheiro and Piotr Dollar, 2016. 6 | Licensed under the Simplified BSD License [see coco/license.txt] 7 | 8 | ------------------------------------------------------------------------------]] 9 | 10 | local coco = require 'coco.env' 11 | require 'coco.CocoApi' 12 | require 'coco.MaskApi' 13 | return coco 14 | -------------------------------------------------------------------------------- /cocoapi_hq/LuaAPI/rocks/coco-scm-1.rockspec: -------------------------------------------------------------------------------- 1 | package = "coco" 2 | version = "scm-1" 3 | 4 | source = { 5 | url = "git://github.com/pdollar/coco.git" 6 | } 7 | 8 | description = { 9 | summary = "Interface for accessing the Microsoft COCO dataset", 10 | detailed = "See http://mscoco.org/ for more details", 11 | homepage = "https://github.com/pdollar/coco", 12 | license = "Simplified BSD" 13 | } 14 | 15 | dependencies = { 16 | "lua >= 5.1", 17 | "torch >= 7.0", 18 | "lua-cjson" 19 | } 20 | 21 | build = { 22 | type = "builtin", 23 | modules = { 24 | ["coco.env"] = "LuaAPI/env.lua", 25 | ["coco.init"] = "LuaAPI/init.lua", 26 | ["coco.MaskApi"] = "LuaAPI/MaskApi.lua", 27 | ["coco.CocoApi"] = "LuaAPI/CocoApi.lua", 28 | libmaskapi = { 29 | sources = { "common/maskApi.c" }, 30 | incdirs = { "common/" } 31 | } 32 | } 33 | } 34 | 35 | -- luarocks make LuaAPI/rocks/coco-scm-1.rockspec 36 | -- https://github.com/pdollar/coco/raw/master/LuaAPI/rocks/coco-scm-1.rockspec 37 | -------------------------------------------------------------------------------- /cocoapi_hq/MatlabAPI/MaskApi.m: -------------------------------------------------------------------------------- 1 | classdef MaskApi 2 | % Interface for manipulating masks stored in RLE format. 3 | % 4 | % RLE is a simple yet efficient format for storing binary masks. RLE 5 | % first divides a vector (or vectorized image) into a series of piecewise 6 | % constant regions and then for each piece simply stores the length of 7 | % that piece. For example, given M=[0 0 1 1 1 0 1] the RLE counts would 8 | % be [2 3 1 1], or for M=[1 1 1 1 1 1 0] the counts would be [0 6 1] 9 | % (note that the odd counts are always the numbers of zeros). Instead of 10 | % storing the counts directly, additional compression is achieved with a 11 | % variable bitrate representation based on a common scheme called LEB128. 12 | % 13 | % Compression is greatest given large piecewise constant regions. 14 | % Specifically, the size of the RLE is proportional to the number of 15 | % *boundaries* in M (or for an image the number of boundaries in the y 16 | % direction). Assuming fairly simple shapes, the RLE representation is 17 | % O(sqrt(n)) where n is number of pixels in the object. Hence space usage 18 | % is substantially lower, especially for large simple objects (large n). 
19 | % 20 | % Many common operations on masks can be computed directly using the RLE 21 | % (without need for decoding). This includes computations such as area, 22 | % union, intersection, etc. All of these operations are linear in the 23 | % size of the RLE, in other words they are O(sqrt(n)) where n is the area 24 | % of the object. Computing these operations on the original mask is O(n). 25 | % Thus, using the RLE can result in substantial computational savings. 26 | % 27 | % The following API functions are defined: 28 | % encode - Encode binary masks using RLE. 29 | % decode - Decode binary masks encoded via RLE. 30 | % merge - Compute union or intersection of encoded masks. 31 | % iou - Compute intersection over union between masks. 32 | % nms - Compute non-maximum suppression between ordered masks. 33 | % area - Compute area of encoded masks. 34 | % toBbox - Get bounding boxes surrounding encoded masks. 35 | % frBbox - Convert bounding boxes to encoded masks. 36 | % frPoly - Convert polygon to encoded mask. 37 | % 38 | % Usage: 39 | % Rs = MaskApi.encode( masks ) 40 | % masks = MaskApi.decode( Rs ) 41 | % R = MaskApi.merge( Rs, [intersect=false] ) 42 | % o = MaskApi.iou( dt, gt, [iscrowd=false] ) 43 | % keep = MaskApi.nms( dt, thr ) 44 | % a = MaskApi.area( Rs ) 45 | % bbs = MaskApi.toBbox( Rs ) 46 | % Rs = MaskApi.frBbox( bbs, h, w ) 47 | % R = MaskApi.frPoly( poly, h, w ) 48 | % 49 | % In the API the following formats are used: 50 | % R,Rs - [struct] Run-length encoding of binary mask(s) 51 | % masks - [hxwxn] Binary mask(s) (must have type uint8) 52 | % bbs - [nx4] Bounding box(es) stored as [x y w h] 53 | % poly - Polygon stored as {[x1 y1 x2 y2...],[x1 y1 ...],...} 54 | % dt,gt - May be either bounding boxes or encoded masks 55 | % Both poly and bbs are 0-indexed (bbox=[0 0 1 1] encloses first pixel). 56 | % 57 | % Finally, a note about the intersection over union (iou) computation. 58 | % The standard iou of a ground truth (gt) and detected (dt) object is 59 | % iou(gt,dt) = area(intersect(gt,dt)) / area(union(gt,dt)) 60 | % For "crowd" regions, we use a modified criteria. If a gt object is 61 | % marked as "iscrowd", we allow a dt to match any subregion of the gt. 62 | % Choosing gt' in the crowd gt that best matches the dt can be done using 63 | % gt'=intersect(dt,gt). Since by definition union(gt',dt)=dt, computing 64 | % iou(gt,dt,iscrowd) = iou(gt',dt) = area(intersect(gt,dt)) / area(dt) 65 | % For crowd gt regions we use this modified criteria above for the iou. 66 | % 67 | % To compile use the following (some precompiled binaries are included): 68 | % mex('CFLAGS=\$CFLAGS -Wall -std=c99','-largeArrayDims',... 69 | % 'private/maskApiMex.c','../common/maskApi.c',... 70 | % '-I../common/','-outdir','private'); 71 | % Please do not contact us for help with compiling. 72 | % 73 | % Microsoft COCO Toolbox. version 2.0 74 | % Data, paper, and tutorials available at: http://mscoco.org/ 75 | % Code written by Piotr Dollar and Tsung-Yi Lin, 2015. 
76 | % Licensed under the Simplified BSD License [see coco/license.txt] 77 | 78 | methods( Static ) 79 | function Rs = encode( masks ) 80 | Rs = maskApiMex( 'encode', masks ); 81 | end 82 | 83 | function masks = decode( Rs ) 84 | masks = maskApiMex( 'decode', Rs ); 85 | end 86 | 87 | function R = merge( Rs, varargin ) 88 | R = maskApiMex( 'merge', Rs, varargin{:} ); 89 | end 90 | 91 | function o = iou( dt, gt, varargin ) 92 | o = maskApiMex( 'iou', dt', gt', varargin{:} ); 93 | end 94 | 95 | function keep = nms( dt, thr ) 96 | keep = maskApiMex('nms',dt',thr); 97 | end 98 | 99 | function a = area( Rs ) 100 | a = maskApiMex( 'area', Rs ); 101 | end 102 | 103 | function bbs = toBbox( Rs ) 104 | bbs = maskApiMex( 'toBbox', Rs )'; 105 | end 106 | 107 | function Rs = frBbox( bbs, h, w ) 108 | Rs = maskApiMex( 'frBbox', bbs', h, w ); 109 | end 110 | 111 | function R = frPoly( poly, h, w ) 112 | R = maskApiMex( 'frPoly', poly, h , w ); 113 | end 114 | end 115 | 116 | end 117 | -------------------------------------------------------------------------------- /cocoapi_hq/MatlabAPI/cocoDemo.m: -------------------------------------------------------------------------------- 1 | %% Demo for the CocoApi (see CocoApi.m) 2 | 3 | %% initialize COCO api (please specify dataType/annType below) 4 | annTypes = { 'instances', 'captions', 'person_keypoints' }; 5 | dataType='val2014'; annType=annTypes{1}; % specify dataType/annType 6 | annFile=sprintf('../annotations/%s_%s.json',annType,dataType); 7 | coco=CocoApi(annFile); 8 | 9 | %% display COCO categories and supercategories 10 | if( ~strcmp(annType,'captions') ) 11 | cats = coco.loadCats(coco.getCatIds()); 12 | nms={cats.name}; fprintf('COCO categories: '); 13 | fprintf('%s, ',nms{:}); fprintf('\n'); 14 | nms=unique({cats.supercategory}); fprintf('COCO supercategories: '); 15 | fprintf('%s, ',nms{:}); fprintf('\n'); 16 | end 17 | 18 | %% get all images containing given categories, select one at random 19 | catIds = coco.getCatIds('catNms',{'person','dog','skateboard'}); 20 | imgIds = coco.getImgIds('catIds',catIds); 21 | imgId = imgIds(randi(length(imgIds))); 22 | 23 | %% load and display image 24 | img = coco.loadImgs(imgId); 25 | I = imread(sprintf('../images/%s/%s',dataType,img.file_name)); 26 | figure(1); imagesc(I); axis('image'); set(gca,'XTick',[],'YTick',[]) 27 | 28 | %% load and display annotations 29 | annIds = coco.getAnnIds('imgIds',imgId,'catIds',catIds,'iscrowd',[]); 30 | anns = coco.loadAnns(annIds); coco.showAnns(anns); 31 | -------------------------------------------------------------------------------- /cocoapi_hq/MatlabAPI/evalDemo.m: -------------------------------------------------------------------------------- 1 | %% Demo demonstrating the algorithm result formats for COCO 2 | 3 | %% select results type for demo (either bbox or segm) 4 | type = {'segm','bbox','keypoints'}; type = type{1}; % specify type here 5 | fprintf('Running demo for *%s* results.\n\n',type); 6 | 7 | %% initialize COCO ground truth api 8 | dataDir='../'; prefix='instances'; dataType='val2014'; 9 | if(strcmp(type,'keypoints')), prefix='person_keypoints'; end 10 | annFile=sprintf('%s/annotations/%s_%s.json',dataDir,prefix,dataType); 11 | cocoGt=CocoApi(annFile); 12 | 13 | %% initialize COCO detections api 14 | resFile='%s/results/%s_%s_fake%s100_results.json'; 15 | resFile=sprintf(resFile,dataDir,prefix,dataType,type); 16 | cocoDt=cocoGt.loadRes(resFile); 17 | 18 | %% visialuze gt and dt side by side 19 | imgIds=sort(cocoGt.getImgIds()); imgIds=imgIds(1:100); 20 | 
imgId = imgIds(randi(100)); img = cocoGt.loadImgs(imgId); 21 | I = imread(sprintf('%s/images/val2014/%s',dataDir,img.file_name)); 22 | figure(1); subplot(1,2,1); imagesc(I); axis('image'); axis off; 23 | annIds = cocoGt.getAnnIds('imgIds',imgId); title('ground truth') 24 | anns = cocoGt.loadAnns(annIds); cocoGt.showAnns(anns); 25 | figure(1); subplot(1,2,2); imagesc(I); axis('image'); axis off; 26 | annIds = cocoDt.getAnnIds('imgIds',imgId); title('results') 27 | anns = cocoDt.loadAnns(annIds); cocoDt.showAnns(anns); 28 | 29 | %% load raw JSON and show exact format for results 30 | fprintf('results structure have the following format:\n'); 31 | res = gason(fileread(resFile)); disp(res) 32 | 33 | %% the following command can be used to save the results back to disk 34 | if(0), f=fopen(resFile,'w'); fwrite(f,gason(res)); fclose(f); end 35 | 36 | %% run COCO evaluation code (see CocoEval.m) 37 | cocoEval=CocoEval(cocoGt,cocoDt,type); 38 | cocoEval.params.imgIds=imgIds; 39 | cocoEval.evaluate(); 40 | cocoEval.accumulate(); 41 | cocoEval.summarize(); 42 | 43 | %% generate Derek Hoiem style analyis of false positives (slow) 44 | if(0), cocoEval.analyze(); end 45 | -------------------------------------------------------------------------------- /cocoapi_hq/MatlabAPI/gason.m: -------------------------------------------------------------------------------- 1 | function out = gason( in ) 2 | % Convert between JSON strings and corresponding JSON objects. 3 | % 4 | % This parser is based on Gason written and maintained by Ivan Vashchaev: 5 | % https://github.com/vivkin/gason 6 | % Gason is a "lightweight and fast JSON parser for C++". Please see the 7 | % above link for license information and additional details about Gason. 8 | % 9 | % Given a JSON string, gason calls the C++ parser and converts the output 10 | % into an appropriate Matlab structure. As the parsing is performed in mex 11 | % the resulting parser is blazingly fast. Large JSON structs (100MB+) take 12 | % only a few seconds to parse (compared to hours for pure Matlab parsers). 13 | % 14 | % Given a JSON object, gason calls the C++ encoder to convert the object 15 | % back into a JSON string representation. Nearly any Matlab struct, cell 16 | % array, or numeric array represent a valid JSON object. Note that gason() 17 | % can be used to go both from JSON string to JSON object and back. 18 | % 19 | % Gason requires C++11 to compile (for GCC this requires version 4.7 or 20 | % later). The following command compiles the parser (may require tweaking): 21 | % mex('CXXFLAGS=\$CXXFLAGS -std=c++11 -Wall','-largeArrayDims',... 22 | % 'private/gasonMex.cpp','../common/gason.cpp',... 23 | % '-I../common/','-outdir','private'); 24 | % Note the use of the "-std=c++11" flag. A number of precompiled binaries 25 | % are included, please do not contact us for help with compiling. If needed 26 | % you can specify a compiler by adding the option 'CXX="/usr/bin/g++"'. 27 | % 28 | % Note that by default JSON arrays that contain only numbers are stored as 29 | % regular Matlab arrays. Likewise, JSON arrays that contain only objects of 30 | % the same type are stored as Matlab struct arrays. This is much faster and 31 | % can use considerably less memory than always using Matlab cell arrays. 
32 | % 33 | % USAGE 34 | % object = gason( string ) 35 | % string = gason( object ) 36 | % 37 | % INPUTS/OUTPUTS 38 | % string - JSON string 39 | % object - JSON object 40 | % 41 | % EXAMPLE 42 | % o = struct('first',{'piotr','ty'},'last',{'dollar','lin'}) 43 | % s = gason( o ) % convert JSON object -> JSON string 44 | % p = gason( s ) % convert JSON string -> JSON object 45 | % 46 | % See also 47 | % 48 | % Microsoft COCO Toolbox. version 2.0 49 | % Data, paper, and tutorials available at: http://mscoco.org/ 50 | % Code written by Piotr Dollar and Tsung-Yi Lin, 2015. 51 | % Licensed under the Simplified BSD License [see coco/license.txt] 52 | 53 | out = gasonMex( 'convert', in ); 54 | -------------------------------------------------------------------------------- /cocoapi_hq/MatlabAPI/private/gasonMex.mexa64: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SysCV/vmt/b0973e8609885d9ac83f803a6b51b00a30a7f564/cocoapi_hq/MatlabAPI/private/gasonMex.mexa64 -------------------------------------------------------------------------------- /cocoapi_hq/MatlabAPI/private/gasonMex.mexmaci64: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SysCV/vmt/b0973e8609885d9ac83f803a6b51b00a30a7f564/cocoapi_hq/MatlabAPI/private/gasonMex.mexmaci64 -------------------------------------------------------------------------------- /cocoapi_hq/MatlabAPI/private/getPrmDflt.m: -------------------------------------------------------------------------------- 1 | function varargout = getPrmDflt( prm, dfs, checkExtra ) 2 | % Helper to set default values (if not already set) of parameter struct. 3 | % 4 | % Takes input parameters and a list of 'name'/default pairs, and for each 5 | % 'name' for which prm has no value (prm.(name) is not a field or 'name' 6 | % does not appear in prm list), getPrmDflt assigns the given default 7 | % value. If default value for variable 'name' is 'REQ', and value for 8 | % 'name' is not given, an error is thrown. See below for usage details. 9 | % 10 | % USAGE (nargout==1) 11 | % prm = getPrmDflt( prm, dfs, [checkExtra] ) 12 | % 13 | % USAGE (nargout>1) 14 | % [ param1 ... paramN ] = getPrmDflt( prm, dfs, [checkExtra] ) 15 | % 16 | % INPUTS 17 | % prm - param struct or cell of form {'name1' v1 'name2' v2 ...} 18 | % dfs - cell of form {'name1' def1 'name2' def2 ...} 19 | % checkExtra - [0] if 1 throw error if prm contains params not in dfs 20 | % if -1 if prm contains params not in dfs adds them 21 | % 22 | % OUTPUTS (nargout==1) 23 | % prm - parameter struct with fields 'name1' through 'nameN' assigned 24 | % 25 | % OUTPUTS (nargout>1) 26 | % param1 - value assigned to parameter with 'name1' 27 | % ... 28 | % paramN - value assigned to parameter with 'nameN' 29 | % 30 | % EXAMPLE 31 | % dfs = { 'x','REQ', 'y',0, 'z',[], 'eps',1e-3 }; 32 | % prm = getPrmDflt( struct('x',1,'y',1), dfs ) 33 | % [ x y z eps ] = getPrmDflt( {'x',2,'y',1}, dfs ) 34 | % 35 | % See also INPUTPARSER 36 | % 37 | % Piotr's Computer Vision Matlab Toolbox Version 2.60 38 | % Copyright 2014 Piotr Dollar. 
[pdollar-at-gmail.com] 39 | % Licensed under the Simplified BSD License [see external/bsd.txt] 40 | 41 | if( mod(length(dfs),2) ), error('odd number of default parameters'); end 42 | if nargin<=2, checkExtra = 0; end 43 | 44 | % get the input parameters as two cell arrays: prmVal and prmField 45 | if iscell(prm) && length(prm)==1, prm=prm{1}; end 46 | if iscell(prm) 47 | if(mod(length(prm),2)), error('odd number of parameters in prm'); end 48 | prmField = prm(1:2:end); prmVal = prm(2:2:end); 49 | else 50 | if(~isstruct(prm)), error('prm must be a struct or a cell'); end 51 | prmVal = struct2cell(prm); prmField = fieldnames(prm); 52 | end 53 | 54 | % get and update default values using quick for loop 55 | dfsField = dfs(1:2:end); dfsVal = dfs(2:2:end); 56 | if checkExtra>0 57 | for i=1:length(prmField) 58 | j = find(strcmp(prmField{i},dfsField)); 59 | if isempty(j), error('parameter %s is not valid', prmField{i}); end 60 | dfsVal(j) = prmVal(i); 61 | end 62 | elseif checkExtra<0 63 | for i=1:length(prmField) 64 | j = find(strcmp(prmField{i},dfsField)); 65 | if isempty(j), j=length(dfsVal)+1; dfsField{j}=prmField{i}; end 66 | dfsVal(j) = prmVal(i); 67 | end 68 | else 69 | for i=1:length(prmField) 70 | dfsVal(strcmp(prmField{i},dfsField)) = prmVal(i); 71 | end 72 | end 73 | 74 | % check for missing values 75 | if any(strcmp('REQ',dfsVal)) 76 | cmpArray = find(strcmp('REQ',dfsVal)); 77 | error(['Required field ''' dfsField{cmpArray(1)} ''' not specified.'] ); 78 | end 79 | 80 | % set output 81 | if nargout==1 82 | varargout{1} = cell2struct( dfsVal, dfsField, 2 ); 83 | else 84 | varargout = dfsVal; 85 | end 86 | -------------------------------------------------------------------------------- /cocoapi_hq/MatlabAPI/private/maskApiMex.c: -------------------------------------------------------------------------------- 1 | /************************************************************************** 2 | * Microsoft COCO Toolbox. version 2.0 3 | * Data, paper, and tutorials available at: http://mscoco.org/ 4 | * Code written by Piotr Dollar and Tsung-Yi Lin, 2015. 5 | * Licensed under the Simplified BSD License [see coco/license.txt] 6 | **************************************************************************/ 7 | #include "mex.h" 8 | #include "maskApi.h" 9 | #include 10 | 11 | void checkType( const mxArray *M, mxClassID id ) { 12 | if(mxGetClassID(M)!=id) mexErrMsgTxt("Invalid type."); 13 | } 14 | 15 | mxArray* toMxArray( const RLE *R, siz n ) { 16 | const char *fs[] = {"size", "counts"}; 17 | mxArray *M=mxCreateStructMatrix(1,n,2,fs); 18 | for( siz i=0; i1) mexErrMsgTxt(err); 35 | for( i=0; i<*n; i++ ) { 36 | mxArray *S, *C; double *s; void *c; 37 | S=mxGetFieldByNumber(M,i,O[0]); checkType(S,mxDOUBLE_CLASS); 38 | C=mxGetFieldByNumber(M,i,O[1]); s=mxGetPr(S); c=mxGetData(C); 39 | h=(siz)s[0]; w=(siz)s[1]; m=mxGetNumberOfElements(C); 40 | if(same && i>0 && (h!=R[0].h || w!=R[0].w)) mexErrMsgTxt(err); 41 | if( mxGetClassID(C)==mxDOUBLE_CLASS ) { 42 | rleInit(R+i,h,w,m,0); 43 | for(j=0; j=2) ? (mxGetScalar(pr[1])>0) : false; 74 | rleMerge(R,&M,n,intersect); pl[0]=toMxArray(&M,1); rleFree(&M); 75 | 76 | } else if(!strcmp(action,"area")) { 77 | R=frMxArray(pr[0],&n,0); 78 | pl[0]=mxCreateNumericMatrix(1,n,mxUINT32_CLASS,mxREAL); 79 | uint *a=(uint*) mxGetPr(pl[0]); rleArea(R,n,a); 80 | 81 | } else if(!strcmp(action,"iou")) { 82 | if(nr>2) checkType(pr[2],mxUINT8_CLASS); siz nDt, nGt; 83 | byte *iscrowd = nr>2 ? 
(byte*) mxGetPr(pr[2]) : NULL; 84 | if(mxIsStruct(pr[0]) || mxIsStruct(pr[1])) { 85 | RLE *dt=frMxArray(pr[0],&nDt,1), *gt=frMxArray(pr[1],&nGt,1); 86 | pl[0]=mxCreateNumericMatrix(nDt,nGt,mxDOUBLE_CLASS,mxREAL); 87 | double *o=mxGetPr(pl[0]); rleIou(dt,gt,nDt,nGt,iscrowd,o); 88 | rlesFree(&dt,nDt); rlesFree(>,nGt); 89 | } else { 90 | checkType(pr[0],mxDOUBLE_CLASS); checkType(pr[1],mxDOUBLE_CLASS); 91 | double *dt=mxGetPr(pr[0]); nDt=mxGetN(pr[0]); 92 | double *gt=mxGetPr(pr[1]); nGt=mxGetN(pr[1]); 93 | pl[0]=mxCreateNumericMatrix(nDt,nGt,mxDOUBLE_CLASS,mxREAL); 94 | double *o=mxGetPr(pl[0]); bbIou(dt,gt,nDt,nGt,iscrowd,o); 95 | } 96 | 97 | } else if(!strcmp(action,"nms")) { 98 | siz n; uint *keep; double thr=(double) mxGetScalar(pr[1]); 99 | if(mxIsStruct(pr[0])) { 100 | RLE *dt=frMxArray(pr[0],&n,1); 101 | pl[0]=mxCreateNumericMatrix(1,n,mxUINT32_CLASS,mxREAL); 102 | keep=(uint*) mxGetPr(pl[0]); rleNms(dt,n,keep,thr); 103 | rlesFree(&dt,n); 104 | } else { 105 | checkType(pr[0],mxDOUBLE_CLASS); 106 | double *dt=mxGetPr(pr[0]); n=mxGetN(pr[0]); 107 | pl[0]=mxCreateNumericMatrix(1,n,mxUINT32_CLASS,mxREAL); 108 | keep=(uint*) mxGetPr(pl[0]); bbNms(dt,n,keep,thr); 109 | } 110 | 111 | } else if(!strcmp(action,"toBbox")) { 112 | R=frMxArray(pr[0],&n,0); 113 | pl[0]=mxCreateNumericMatrix(4,n,mxDOUBLE_CLASS,mxREAL); 114 | BB bb=mxGetPr(pl[0]); rleToBbox(R,bb,n); 115 | 116 | } else if(!strcmp(action,"frBbox")) { 117 | checkType(pr[0],mxDOUBLE_CLASS); 118 | double *bb=mxGetPr(pr[0]); n=mxGetN(pr[0]); 119 | h=(siz)mxGetScalar(pr[1]); w=(siz)mxGetScalar(pr[2]); 120 | rlesInit(&R,n); rleFrBbox(R,bb,h,w,n); pl[0]=toMxArray(R,n); 121 | 122 | } else if(!strcmp(action,"frPoly")) { 123 | checkType(pr[0],mxCELL_CLASS); n=mxGetNumberOfElements(pr[0]); 124 | h=(siz)mxGetScalar(pr[1]); w=(siz)mxGetScalar(pr[2]); rlesInit(&R,n); 125 | for(siz i=0; i=18.0 2 | cython>=0.27.3 3 | matplotlib>=2.1.0 4 | -------------------------------------------------------------------------------- /cocoapi_hq/PythonAPI/pycocotools.egg-info/top_level.txt: -------------------------------------------------------------------------------- 1 | pycocotools 2 | -------------------------------------------------------------------------------- /cocoapi_hq/PythonAPI/pycocotools/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tylin' 2 | -------------------------------------------------------------------------------- /cocoapi_hq/PythonAPI/pycocotools/boundary_utils.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | import multiprocessing 3 | import math 4 | 5 | import cv2 6 | import numpy as np 7 | 8 | import pycocotools.mask as mask_utils 9 | 10 | 11 | # General util function to get the boundary of a binary mask. 12 | def mask_to_boundary(mask, dilation_ratio=0.02): 13 | """ 14 | Convert binary mask to boundary mask. 15 | :param mask (numpy array, uint8): binary mask 16 | :param dilation_ratio (float): ratio to calculate dilation = dilation_ratio * image_diagonal 17 | :return: boundary mask (numpy array) 18 | """ 19 | h, w = mask.shape 20 | img_diag = np.sqrt(h ** 2 + w ** 2) 21 | dilation = int(round(dilation_ratio * img_diag)) 22 | if dilation < 1: 23 | dilation = 1 24 | # Pad image so mask truncated by the image border is also considered as boundary. 
25 | new_mask = cv2.copyMakeBorder(mask, 1, 1, 1, 1, cv2.BORDER_CONSTANT, value=0) 26 | kernel = np.ones((3, 3), dtype=np.uint8) 27 | new_mask_erode = cv2.erode(new_mask, kernel, iterations=dilation) 28 | mask_erode = new_mask_erode[1 : h + 1, 1 : w + 1] 29 | # G_d intersects G in the paper. 30 | return mask - mask_erode 31 | 32 | 33 | # COCO/LVIS related util functions, to get the boundary for every annotations. 34 | def augment_annotations_with_boundary_single_core(proc_id, annotations, ann_to_mask, dilation_ratio=0.02): 35 | new_annotations = [] 36 | 37 | for ann in annotations: 38 | mask_list = ann_to_mask(ann) 39 | # Find mask boundary. 40 | bound_list = [] 41 | for mask in mask_list: 42 | boundary = mask_to_boundary(mask, dilation_ratio) 43 | bound_list.append(mask_utils.encode(np.array(boundary[:, :, None], order="F", dtype="uint8"))[0]) 44 | 45 | # Add boundary to annotation in RLE format. 46 | ann['boundary_list'] = bound_list 47 | # print('ann keys:', ann.keys()) 48 | new_annotations.append(ann) 49 | 50 | return new_annotations 51 | 52 | 53 | def augment_annotations_with_boundary_multi_core(annotations, ann_to_mask, dilation_ratio=0.02): 54 | cpu_num = multiprocessing.cpu_count() 55 | annotations_split = np.array_split(annotations, cpu_num) 56 | print("Number of cores: {}, annotations per core: {}".format(cpu_num, len(annotations_split[0]))) 57 | workers = multiprocessing.Pool(processes=cpu_num) 58 | processes = [] 59 | 60 | for proc_id, annotation_set in enumerate(annotations_split): 61 | p = workers.apply_async(augment_annotations_with_boundary_single_core, 62 | (proc_id, annotation_set, ann_to_mask, dilation_ratio)) 63 | processes.append(p) 64 | 65 | new_annotations = [] 66 | for p in processes: 67 | new_annotations.extend(p.get()) 68 | 69 | workers.close() 70 | workers.join() 71 | 72 | return new_annotations -------------------------------------------------------------------------------- /cocoapi_hq/PythonAPI/pycocotools/mask.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tsungyi' 2 | 3 | import pycocotools._mask as _mask 4 | 5 | # Interface for manipulating masks stored in RLE format. 6 | # 7 | # RLE is a simple yet efficient format for storing binary masks. RLE 8 | # first divides a vector (or vectorized image) into a series of piecewise 9 | # constant regions and then for each piece simply stores the length of 10 | # that piece. For example, given M=[0 0 1 1 1 0 1] the RLE counts would 11 | # be [2 3 1 1], or for M=[1 1 1 1 1 1 0] the counts would be [0 6 1] 12 | # (note that the odd counts are always the numbers of zeros). Instead of 13 | # storing the counts directly, additional compression is achieved with a 14 | # variable bitrate representation based on a common scheme called LEB128. 15 | # 16 | # Compression is greatest given large piecewise constant regions. 17 | # Specifically, the size of the RLE is proportional to the number of 18 | # *boundaries* in M (or for an image the number of boundaries in the y 19 | # direction). Assuming fairly simple shapes, the RLE representation is 20 | # O(sqrt(n)) where n is number of pixels in the object. Hence space usage 21 | # is substantially lower, especially for large simple objects (large n). 22 | # 23 | # Many common operations on masks can be computed directly using the RLE 24 | # (without need for decoding). This includes computations such as area, 25 | # union, intersection, etc. 
All of these operations are linear in the 26 | # size of the RLE, in other words they are O(sqrt(n)) where n is the area 27 | # of the object. Computing these operations on the original mask is O(n). 28 | # Thus, using the RLE can result in substantial computational savings. 29 | # 30 | # The following API functions are defined: 31 | # encode - Encode binary masks using RLE. 32 | # decode - Decode binary masks encoded via RLE. 33 | # merge - Compute union or intersection of encoded masks. 34 | # iou - Compute intersection over union between masks. 35 | # area - Compute area of encoded masks. 36 | # toBbox - Get bounding boxes surrounding encoded masks. 37 | # frPyObjects - Convert polygon, bbox, and uncompressed RLE to encoded RLE mask. 38 | # 39 | # Usage: 40 | # Rs = encode( masks ) 41 | # masks = decode( Rs ) 42 | # R = merge( Rs, intersect=false ) 43 | # o = iou( dt, gt, iscrowd ) 44 | # a = area( Rs ) 45 | # bbs = toBbox( Rs ) 46 | # Rs = frPyObjects( [pyObjects], h, w ) 47 | # 48 | # In the API the following formats are used: 49 | # Rs - [dict] Run-length encoding of binary masks 50 | # R - dict Run-length encoding of binary mask 51 | # masks - [hxwxn] Binary mask(s) (must have type np.ndarray(dtype=uint8) in column-major order) 52 | # iscrowd - [nx1] list of np.ndarray. 1 indicates corresponding gt image has crowd region to ignore 53 | # bbs - [nx4] Bounding box(es) stored as [x y w h] 54 | # poly - Polygon stored as [[x1 y1 x2 y2...],[x1 y1 ...],...] (2D list) 55 | # dt,gt - May be either bounding boxes or encoded masks 56 | # Both poly and bbs are 0-indexed (bbox=[0 0 1 1] encloses first pixel). 57 | # 58 | # Finally, a note about the intersection over union (iou) computation. 59 | # The standard iou of a ground truth (gt) and detected (dt) object is 60 | # iou(gt,dt) = area(intersect(gt,dt)) / area(union(gt,dt)) 61 | # For "crowd" regions, we use a modified criteria. If a gt object is 62 | # marked as "iscrowd", we allow a dt to match any subregion of the gt. 63 | # Choosing gt' in the crowd gt that best matches the dt can be done using 64 | # gt'=intersect(dt,gt). Since by definition union(gt',dt)=dt, computing 65 | # iou(gt,dt,iscrowd) = iou(gt',dt) = area(intersect(gt,dt)) / area(dt) 66 | # For crowd gt regions we use this modified criteria above for the iou. 67 | # 68 | # To compile run "python setup.py build_ext --inplace" 69 | # Please do not contact us for help with compiling. 70 | # 71 | # Microsoft COCO Toolbox. version 2.0 72 | # Data, paper, and tutorials available at: http://mscoco.org/ 73 | # Code written by Piotr Dollar and Tsung-Yi Lin, 2015. 
74 | # Licensed under the Simplified BSD License [see coco/license.txt] 75 | 76 | iou = _mask.iou 77 | merge = _mask.merge 78 | frPyObjects = _mask.frPyObjects 79 | 80 | def encode(bimask): 81 | if len(bimask.shape) == 3: 82 | return _mask.encode(bimask) 83 | elif len(bimask.shape) == 2: 84 | h, w = bimask.shape 85 | return _mask.encode(bimask.reshape((h, w, 1), order='F'))[0] 86 | 87 | def decode(rleObjs): 88 | if type(rleObjs) == list: 89 | return _mask.decode(rleObjs) 90 | else: 91 | return _mask.decode([rleObjs])[:,:,0] 92 | 93 | def area(rleObjs): 94 | if type(rleObjs) == list: 95 | return _mask.area(rleObjs) 96 | else: 97 | return _mask.area([rleObjs])[0] 98 | 99 | def toBbox(rleObjs): 100 | if type(rleObjs) == list: 101 | return _mask.toBbox(rleObjs) 102 | else: 103 | return _mask.toBbox([rleObjs])[0] -------------------------------------------------------------------------------- /cocoapi_hq/PythonAPI/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, Extension 2 | import numpy as np 3 | 4 | # To compile and install locally run "python setup.py build_ext --inplace" 5 | # To install library to Python site-packages run "python setup.py build_ext install" 6 | 7 | ext_modules = [ 8 | Extension( 9 | 'pycocotools._mask', 10 | sources=['../common/maskApi.c', 'pycocotools/_mask.pyx'], 11 | include_dirs = [np.get_include(), '../common'], 12 | extra_compile_args=['-Wno-cpp', '-Wno-unused-function', '-std=c99'], 13 | ) 14 | ] 15 | 16 | setup( 17 | name='pycocotools', 18 | packages=['pycocotools'], 19 | package_dir = {'pycocotools': 'pycocotools'}, 20 | install_requires=[ 21 | 'setuptools>=18.0', 22 | 'cython>=0.27.3', 23 | 'matplotlib>=2.1.0' 24 | ], 25 | version='2.0', 26 | ext_modules= ext_modules 27 | ) 28 | -------------------------------------------------------------------------------- /cocoapi_hq/README.md: -------------------------------------------------------------------------------- 1 | # HQ-YTVIS data loading and evaluation 2 | 3 | It supports both the tube-boundary AP evaluation proposed in Video Mask Transfiner for High-Quality Video Instance Segmentation [ECCV 2022] and the traditional tube-mask AP evaluation. 4 | ## Introduction 5 | 6 | This package provides data loading and evaluation functionalities for high-quality video instance segmentation on HQ-YTVIS. It is built on the [youtubevos API](https://github.com/youtubevos/cocoapi/) designed for the Youtube VOS dataset (https://youtube-vos.org/dataset/vis/). For the evaluation metrics, please refer to Video Mask Transfiner for High-Quality Video Instance Segmentation [ECCV 2022]. 7 | 8 | We have only implemented the Python API for HQ-YTVIS. 9 | 10 | ## Installation 11 | To install: 12 | ``` 13 | cd PythonAPI 14 | # To compile and install locally 15 | python setup.py build_ext --inplace 16 | # To install library to Python site-packages 17 | python setup.py build_ext install 18 | ``` 19 | 20 | ## Contact 21 | If you have any questions regarding the repo, please create an issue. 22 | -------------------------------------------------------------------------------- /cocoapi_hq/README.txt: -------------------------------------------------------------------------------- 1 | COCO API - http://cocodataset.org/ 2 | 3 | COCO is a large image dataset designed for object detection, segmentation, person keypoints detection, stuff segmentation, and caption generation.
This package provides Matlab, Python, and Lua APIs that assists in loading, parsing, and visualizing the annotations in COCO. Please visit http://cocodataset.org/ for more information on COCO, including for the data, paper, and tutorials. The exact format of the annotations is also described on the COCO website. The Matlab and Python APIs are complete, the Lua API provides only basic functionality. 4 | 5 | In addition to this API, please download both the COCO images and annotations in order to run the demos and use the API. Both are available on the project website. 6 | -Please download, unzip, and place the images in: coco/images/ 7 | -Please download and place the annotations in: coco/annotations/ 8 | For substantially more details on the API please see http://cocodataset.org/#download. 9 | 10 | After downloading the images and annotations, run the Matlab, Python, or Lua demos for example usage. 11 | 12 | To install: 13 | -For Matlab, add coco/MatlabApi to the Matlab path (OSX/Linux binaries provided) 14 | -For Python, run "make" under coco/PythonAPI 15 | -For Lua, run “luarocks make LuaAPI/rocks/coco-scm-1.rockspec” under coco/ 16 | -------------------------------------------------------------------------------- /cocoapi_hq/common/gason.h: -------------------------------------------------------------------------------- 1 | // https://github.com/vivkin/gason - pulled January 10, 2016 2 | #pragma once 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | enum JsonTag { 9 | JSON_NUMBER = 0, 10 | JSON_STRING, 11 | JSON_ARRAY, 12 | JSON_OBJECT, 13 | JSON_TRUE, 14 | JSON_FALSE, 15 | JSON_NULL = 0xF 16 | }; 17 | 18 | struct JsonNode; 19 | 20 | #define JSON_VALUE_PAYLOAD_MASK 0x00007FFFFFFFFFFFULL 21 | #define JSON_VALUE_NAN_MASK 0x7FF8000000000000ULL 22 | #define JSON_VALUE_TAG_MASK 0xF 23 | #define JSON_VALUE_TAG_SHIFT 47 24 | 25 | union JsonValue { 26 | uint64_t ival; 27 | double fval; 28 | 29 | JsonValue(double x) 30 | : fval(x) { 31 | } 32 | JsonValue(JsonTag tag = JSON_NULL, void *payload = nullptr) { 33 | assert((uintptr_t)payload <= JSON_VALUE_PAYLOAD_MASK); 34 | ival = JSON_VALUE_NAN_MASK | ((uint64_t)tag << JSON_VALUE_TAG_SHIFT) | (uintptr_t)payload; 35 | } 36 | bool isDouble() const { 37 | return (int64_t)ival <= (int64_t)JSON_VALUE_NAN_MASK; 38 | } 39 | JsonTag getTag() const { 40 | return isDouble() ? 
JSON_NUMBER : JsonTag((ival >> JSON_VALUE_TAG_SHIFT) & JSON_VALUE_TAG_MASK); 41 | } 42 | uint64_t getPayload() const { 43 | assert(!isDouble()); 44 | return ival & JSON_VALUE_PAYLOAD_MASK; 45 | } 46 | double toNumber() const { 47 | assert(getTag() == JSON_NUMBER); 48 | return fval; 49 | } 50 | char *toString() const { 51 | assert(getTag() == JSON_STRING); 52 | return (char *)getPayload(); 53 | } 54 | JsonNode *toNode() const { 55 | assert(getTag() == JSON_ARRAY || getTag() == JSON_OBJECT); 56 | return (JsonNode *)getPayload(); 57 | } 58 | }; 59 | 60 | struct JsonNode { 61 | JsonValue value; 62 | JsonNode *next; 63 | char *key; 64 | }; 65 | 66 | struct JsonIterator { 67 | JsonNode *p; 68 | 69 | void operator++() { 70 | p = p->next; 71 | } 72 | bool operator!=(const JsonIterator &x) const { 73 | return p != x.p; 74 | } 75 | JsonNode *operator*() const { 76 | return p; 77 | } 78 | JsonNode *operator->() const { 79 | return p; 80 | } 81 | }; 82 | 83 | inline JsonIterator begin(JsonValue o) { 84 | return JsonIterator{o.toNode()}; 85 | } 86 | inline JsonIterator end(JsonValue) { 87 | return JsonIterator{nullptr}; 88 | } 89 | 90 | #define JSON_ERRNO_MAP(XX) \ 91 | XX(OK, "ok") \ 92 | XX(BAD_NUMBER, "bad number") \ 93 | XX(BAD_STRING, "bad string") \ 94 | XX(BAD_IDENTIFIER, "bad identifier") \ 95 | XX(STACK_OVERFLOW, "stack overflow") \ 96 | XX(STACK_UNDERFLOW, "stack underflow") \ 97 | XX(MISMATCH_BRACKET, "mismatch bracket") \ 98 | XX(UNEXPECTED_CHARACTER, "unexpected character") \ 99 | XX(UNQUOTED_KEY, "unquoted key") \ 100 | XX(BREAKING_BAD, "breaking bad") \ 101 | XX(ALLOCATION_FAILURE, "allocation failure") 102 | 103 | enum JsonErrno { 104 | #define XX(no, str) JSON_##no, 105 | JSON_ERRNO_MAP(XX) 106 | #undef XX 107 | }; 108 | 109 | const char *jsonStrError(int err); 110 | 111 | class JsonAllocator { 112 | struct Zone { 113 | Zone *next; 114 | size_t used; 115 | } *head = nullptr; 116 | 117 | public: 118 | JsonAllocator() = default; 119 | JsonAllocator(const JsonAllocator &) = delete; 120 | JsonAllocator &operator=(const JsonAllocator &) = delete; 121 | JsonAllocator(JsonAllocator &&x) : head(x.head) { 122 | x.head = nullptr; 123 | } 124 | JsonAllocator &operator=(JsonAllocator &&x) { 125 | head = x.head; 126 | x.head = nullptr; 127 | return *this; 128 | } 129 | ~JsonAllocator() { 130 | deallocate(); 131 | } 132 | void *allocate(size_t size); 133 | void deallocate(); 134 | }; 135 | 136 | int jsonParse(char *str, char **endptr, JsonValue *value, JsonAllocator &allocator); 137 | -------------------------------------------------------------------------------- /cocoapi_hq/common/maskApi.c: -------------------------------------------------------------------------------- 1 | /************************************************************************** 2 | * Microsoft COCO Toolbox. version 2.0 3 | * Data, paper, and tutorials available at: http://mscoco.org/ 4 | * Code written by Piotr Dollar and Tsung-Yi Lin, 2015. 5 | * Licensed under the Simplified BSD License [see coco/license.txt] 6 | **************************************************************************/ 7 | #include "maskApi.h" 8 | #include 9 | #include 10 | 11 | uint umin( uint a, uint b ) { return (ab) ? 
a : b; } 13 | 14 | void rleInit( RLE *R, siz h, siz w, siz m, uint *cnts ) { 15 | R->h=h; R->w=w; R->m=m; R->cnts=(m==0)?0:malloc(sizeof(uint)*m); 16 | siz j; if(cnts) for(j=0; jcnts[j]=cnts[j]; 17 | } 18 | 19 | void rleFree( RLE *R ) { 20 | free(R->cnts); R->cnts=0; 21 | } 22 | 23 | void rlesInit( RLE **R, siz n ) { 24 | siz i; *R = (RLE*) malloc(sizeof(RLE)*n); 25 | for(i=0; i0 ) { 61 | c=umin(ca,cb); cc+=c; ct=0; 62 | ca-=c; if(!ca && a0) { 83 | crowd=iscrowd!=NULL && iscrowd[g]; 84 | if(dt[d].h!=gt[g].h || dt[d].w!=gt[g].w) { o[g*m+d]=-1; continue; } 85 | siz ka, kb, a, b; uint c, ca, cb, ct, i, u; int va, vb; 86 | ca=dt[d].cnts[0]; ka=dt[d].m; va=vb=0; 87 | cb=gt[g].cnts[0]; kb=gt[g].m; a=b=1; i=u=0; ct=1; 88 | while( ct>0 ) { 89 | c=umin(ca,cb); if(va||vb) { u+=c; if(va&&vb) i+=c; } ct=0; 90 | ca-=c; if(!ca && athr) keep[j]=0; 105 | } 106 | } 107 | } 108 | 109 | void bbIou( BB dt, BB gt, siz m, siz n, byte *iscrowd, double *o ) { 110 | double h, w, i, u, ga, da; siz g, d; int crowd; 111 | for( g=0; gthr) keep[j]=0; 129 | } 130 | } 131 | } 132 | 133 | void rleToBbox( const RLE *R, BB bb, siz n ) { 134 | siz i; for( i=0; id?1:c=dy && xs>xe) || (dxye); 174 | if(flip) { t=xs; xs=xe; xe=t; t=ys; ys=ye; ye=t; } 175 | s = dx>=dy ? (double)(ye-ys)/dx : (double)(xe-xs)/dy; 176 | if(dx>=dy) for( d=0; d<=dx; d++ ) { 177 | t=flip?dx-d:d; u[m]=t+xs; v[m]=(int)(ys+s*t+.5); m++; 178 | } else for( d=0; d<=dy; d++ ) { 179 | t=flip?dy-d:d; v[m]=t+ys; u[m]=(int)(xs+s*t+.5); m++; 180 | } 181 | } 182 | /* get points along y-boundary and downsample */ 183 | free(x); free(y); k=m; m=0; double xd, yd; 184 | x=malloc(sizeof(int)*k); y=malloc(sizeof(int)*k); 185 | for( j=1; jw-1 ) continue; 188 | yd=(double)(v[j]h) yd=h; yd=ceil(yd); 190 | x[m]=(int) xd; y[m]=(int) yd; m++; 191 | } 192 | /* compute rle encoding given y-boundary points */ 193 | k=m; a=malloc(sizeof(uint)*(k+1)); 194 | for( j=0; j0) b[m++]=a[j++]; else { 200 | j++; if(jm, p=0; long x; int more; 207 | char *s=malloc(sizeof(char)*m*6); 208 | for( i=0; icnts[i]; if(i>2) x-=(long) R->cnts[i-2]; more=1; 210 | while( more ) { 211 | char c=x & 0x1f; x >>= 5; more=(c & 0x10) ? x!=-1 : x!=0; 212 | if(more) c |= 0x20; c+=48; s[p++]=c; 213 | } 214 | } 215 | s[p]=0; return s; 216 | } 217 | 218 | void rleFrString( RLE *R, char *s, siz h, siz w ) { 219 | siz m=0, p=0, k; long x; int more; uint *cnts; 220 | while( s[m] ) m++; cnts=malloc(sizeof(uint)*m); m=0; 221 | while( s[p] ) { 222 | x=0; k=0; more=1; 223 | while( more ) { 224 | char c=s[p]-48; x |= (c & 0x1f) << 5*k; 225 | more = c & 0x20; p++; k++; 226 | if(!more && (c & 0x10)) x |= -1 << 5*k; 227 | } 228 | if(m>2) x+=(long) cnts[m-2]; cnts[m++]=(uint) x; 229 | } 230 | rleInit(R,h,w,m,cnts); free(cnts); 231 | } 232 | -------------------------------------------------------------------------------- /cocoapi_hq/common/maskApi.h: -------------------------------------------------------------------------------- 1 | /************************************************************************** 2 | * Microsoft COCO Toolbox. version 2.0 3 | * Data, paper, and tutorials available at: http://mscoco.org/ 4 | * Code written by Piotr Dollar and Tsung-Yi Lin, 2015. 
5 | * Licensed under the Simplified BSD License [see coco/license.txt] 6 | **************************************************************************/ 7 | #pragma once 8 | 9 | typedef unsigned int uint; 10 | typedef unsigned long siz; 11 | typedef unsigned char byte; 12 | typedef double* BB; 13 | typedef struct { siz h, w, m; uint *cnts; } RLE; 14 | 15 | /* Initialize/destroy RLE. */ 16 | void rleInit( RLE *R, siz h, siz w, siz m, uint *cnts ); 17 | void rleFree( RLE *R ); 18 | 19 | /* Initialize/destroy RLE array. */ 20 | void rlesInit( RLE **R, siz n ); 21 | void rlesFree( RLE **R, siz n ); 22 | 23 | /* Encode binary masks using RLE. */ 24 | void rleEncode( RLE *R, const byte *mask, siz h, siz w, siz n ); 25 | 26 | /* Decode binary masks encoded via RLE. */ 27 | void rleDecode( const RLE *R, byte *mask, siz n ); 28 | 29 | /* Compute union or intersection of encoded masks. */ 30 | void rleMerge( const RLE *R, RLE *M, siz n, int intersect ); 31 | 32 | /* Compute area of encoded masks. */ 33 | void rleArea( const RLE *R, siz n, uint *a ); 34 | 35 | /* Compute intersection over union between masks. */ 36 | void rleIou( RLE *dt, RLE *gt, siz m, siz n, byte *iscrowd, double *o ); 37 | 38 | /* Compute non-maximum suppression between bounding masks */ 39 | void rleNms( RLE *dt, siz n, uint *keep, double thr ); 40 | 41 | /* Compute intersection over union between bounding boxes. */ 42 | void bbIou( BB dt, BB gt, siz m, siz n, byte *iscrowd, double *o ); 43 | 44 | /* Compute non-maximum suppression between bounding boxes */ 45 | void bbNms( BB dt, siz n, uint *keep, double thr ); 46 | 47 | /* Get bounding boxes surrounding encoded masks. */ 48 | void rleToBbox( const RLE *R, BB bb, siz n ); 49 | 50 | /* Convert bounding boxes to encoded masks. */ 51 | void rleFrBbox( RLE *R, const BB bb, siz h, siz w, siz n ); 52 | 53 | /* Convert polygon to encoded mask. */ 54 | void rleFrPoly( RLE *R, const double *xy, siz k, siz h, siz w ); 55 | 56 | /* Get compressed string representation of encoded mask. */ 57 | char* rleToString( const RLE *R ); 58 | 59 | /* Convert from compressed string representation of encoded mask. */ 60 | void rleFrString( RLE *R, char *s, siz h, siz w ); 61 | -------------------------------------------------------------------------------- /cocoapi_hq/license.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2014, Piotr Dollar and Tsung-Yi Lin 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | 1. Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 2. Redistributions in binary form must reproduce the above copyright notice, 10 | this list of conditions and the following disclaimer in the documentation 11 | and/or other materials provided with the distribution. 12 | 13 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 14 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 15 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 16 | DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 17 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 18 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 19 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 20 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 21 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 22 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 23 | 24 | The views and conclusions contained in the software and documentation are those 25 | of the authors and should not be interpreted as representing official policies, 26 | either expressed or implied, of the FreeBSD Project. 27 | -------------------------------------------------------------------------------- /cocoapi_hq/results/val2014_fake_eval_res.txt: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------------------------ 2 | type=segm 3 | Running per image evaluation... DONE (t=0.45s). 4 | Accumulating evaluation results... DONE (t=0.08s). 5 | Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.320 6 | Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.562 7 | Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.299 8 | Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.387 9 | Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.310 10 | Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.327 11 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.268 12 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.415 13 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.417 14 | Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.469 15 | Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.377 16 | Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.381 17 | 18 | ------------------------------------------------------------------------------ 19 | type=bbox 20 | Running per image evaluation... DONE (t=0.34s). 21 | Accumulating evaluation results... DONE (t=0.08s). 22 | Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.505 23 | Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.697 24 | Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.573 25 | Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.586 26 | Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.519 27 | Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.501 28 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.387 29 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.594 30 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.595 31 | Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.640 32 | Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.566 33 | Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.564 34 | 35 | ------------------------------------------------------------------------------ 36 | type=keypoints 37 | Running per image evaluation... DONE (t=0.06s). 38 | Accumulating evaluation results... DONE (t=0.00s). 
39 | Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets= 20 ] = 0.372 40 | Average Precision (AP) @[ IoU=0.50 | area= all | maxDets= 20 ] = 0.636 41 | Average Precision (AP) @[ IoU=0.75 | area= all | maxDets= 20 ] = 0.348 42 | Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets= 20 ] = 0.384 43 | Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets= 20 ] = 0.386 44 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 20 ] = 0.514 45 | Average Recall (AR) @[ IoU=0.50 | area= all | maxDets= 20 ] = 0.734 46 | Average Recall (AR) @[ IoU=0.75 | area= all | maxDets= 20 ] = 0.504 47 | Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets= 20 ] = 0.508 48 | Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets= 20 ] = 0.522 49 | -------------------------------------------------------------------------------- /datasets/__init__.py: -------------------------------------------------------------------------------- 1 | import torch.utils.data 2 | from .torchvision_datasets import CocoDetection 3 | from datasets.ytvos import YTVOSDataset as YTVOSDataset 4 | 5 | from .coco import build as build_coco 6 | from .coco2seq import build as build_seq_coco 7 | from .concat_dataset import build as build_joint 8 | from .ytvos import build as build_ytvs 9 | 10 | 11 | 12 | def get_coco_api_from_dataset(dataset): 13 | for _ in range(10): 14 | if isinstance(dataset, torch.utils.data.Subset): 15 | dataset = dataset.dataset 16 | if isinstance(dataset, CocoDetection): 17 | return dataset.coco 18 | if isinstance(dataset, YTVOSDataset): 19 | return dataset.ytvos 20 | 21 | 22 | ### build_type only works for YoutubeVIS ### 23 | def build_dataset(image_set, args): 24 | if args.dataset_file == 'YoutubeVIS': 25 | return build_ytvs(image_set, args) 26 | 27 | if args.dataset_file == 'coco': 28 | return build_coco(image_set, args) 29 | if args.dataset_file == 'Seq_coco': 30 | return build_seq_coco(image_set, args) 31 | if args.dataset_file == 'jointcoco': 32 | return build_joint(image_set, args) 33 | 34 | 35 | raise ValueError(f'dataset {args.dataset_file} not supported') 36 | 37 | 38 | -------------------------------------------------------------------------------- /datasets/coco.py: -------------------------------------------------------------------------------- 1 | """ 2 | COCO dataset which returns image_id for evaluation. 
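Images that end up with no instances after annotation filtering are resampled at random (see CocoDetection.__getitem__ below).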
3 | 4 | Mostly copy-paste from https://github.com/pytorch/vision/blob/13b35ff/references/detection/coco_utils.py 5 | """ 6 | from pathlib import Path 7 | 8 | import torch 9 | import torch.utils.data 10 | from pycocotools import mask as coco_mask 11 | 12 | from .torchvision_datasets import CocoDetection as TvCocoDetection 13 | from util.misc import get_local_rank, get_local_size 14 | import datasets.transforms as T 15 | import random 16 | 17 | 18 | class CocoDetection(TvCocoDetection): 19 | def __init__(self, img_folder, ann_file, transforms, return_masks, cache_mode=False, local_rank=0, local_size=1): 20 | super(CocoDetection, self).__init__(img_folder, ann_file, 21 | cache_mode=cache_mode, local_rank=local_rank, local_size=local_size) 22 | self._transforms = transforms 23 | self.prepare = ConvertCocoPolysToMask(return_masks) 24 | 25 | def __getitem__(self, idx): 26 | 27 | instance_check = False 28 | while not instance_check: 29 | img, target = super(CocoDetection, self).__getitem__(idx) 30 | image_id = self.ids[idx] 31 | target = {'image_id': image_id, 'annotations': target} 32 | img, target = self.prepare(img, target) 33 | if self._transforms is not None: 34 | img, target = self._transforms(img, target) 35 | 36 | if len(target['labels']) == 0: # None instance 37 | idx = random.randint(0,self.__len__()-1) 38 | else: 39 | instance_check=True 40 | 41 | return img, target 42 | 43 | 44 | def convert_coco_poly_to_mask(segmentations, height, width): 45 | masks = [] 46 | for polygons in segmentations: 47 | rles = coco_mask.frPyObjects(polygons, height, width) 48 | mask = coco_mask.decode(rles) 49 | if len(mask.shape) < 3: 50 | mask = mask[..., None] 51 | mask = torch.as_tensor(mask, dtype=torch.uint8) 52 | mask = mask.any(dim=2) 53 | masks.append(mask) 54 | if masks: 55 | masks = torch.stack(masks, dim=0) 56 | else: 57 | masks = torch.zeros((0, height, width), dtype=torch.uint8) 58 | return masks 59 | 60 | 61 | class ConvertCocoPolysToMask(object): 62 | def __init__(self, return_masks=False): 63 | self.return_masks = return_masks 64 | 65 | def __call__(self, image, target): 66 | w, h = image.size 67 | 68 | image_id = target["image_id"] 69 | image_id = torch.tensor([image_id]) 70 | 71 | anno = target["annotations"] 72 | 73 | anno = [obj for obj in anno if 'iscrowd' not in obj or obj['iscrowd'] == 0] 74 | 75 | boxes = [obj["bbox"] for obj in anno] 76 | # guard against no boxes via resizing 77 | boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4) 78 | boxes[:, 2:] += boxes[:, :2] 79 | boxes[:, 0::2].clamp_(min=0, max=w) 80 | boxes[:, 1::2].clamp_(min=0, max=h) 81 | 82 | classes = [obj["category_id"] for obj in anno] 83 | classes = torch.tensor(classes, dtype=torch.int64) 84 | 85 | if self.return_masks: 86 | segmentations = [obj["segmentation_refined"] for obj in anno] 87 | masks = convert_coco_poly_to_mask(segmentations, h, w) 88 | 89 | keypoints = None 90 | if anno and "keypoints" in anno[0]: 91 | keypoints = [obj["keypoints"] for obj in anno] 92 | keypoints = torch.as_tensor(keypoints, dtype=torch.float32) 93 | num_keypoints = keypoints.shape[0] 94 | if num_keypoints: 95 | keypoints = keypoints.view(num_keypoints, -1, 3) 96 | 97 | keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0]) 98 | boxes = boxes[keep] 99 | classes = classes[keep] 100 | if self.return_masks: 101 | masks = masks[keep] 102 | if keypoints is not None: 103 | keypoints = keypoints[keep] 104 | 105 | target = {} 106 | target["boxes"] = boxes 107 | target["labels"] = classes 108 | if self.return_masks: 109 
| target["masks"] = masks 110 | target["image_id"] = image_id 111 | if keypoints is not None: 112 | target["keypoints"] = keypoints 113 | 114 | # for conversion to coco api 115 | area = torch.tensor([obj["area"] for obj in anno]) 116 | iscrowd = torch.tensor([obj["iscrowd"] if "iscrowd" in obj else 0 for obj in anno]) 117 | target["area"] = area[keep] 118 | target["iscrowd"] = iscrowd[keep] 119 | 120 | target["orig_size"] = torch.as_tensor([int(h), int(w)]) 121 | target["size"] = torch.as_tensor([int(h), int(w)]) 122 | 123 | return image, target 124 | 125 | 126 | def make_coco_transforms(image_set): 127 | 128 | normalize = T.Compose([ 129 | T.ToTensor(), 130 | T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) 131 | ]) 132 | 133 | scales = [480, 512, 544, 576, 608, 640, 672, 704, 736, 768] 134 | # scales = [296, 328, 360, 392] 135 | 136 | if image_set == 'train': 137 | return T.Compose([ 138 | T.RandomHorizontalFlip(), 139 | T.RandomSelect( 140 | T.RandomResize(scales, max_size=1333), 141 | T.Compose([ 142 | T.RandomResize([400, 500, 600]), 143 | T.RandomSizeCrop(384, 600), 144 | T.RandomResize(scales, max_size=1333), 145 | ]) 146 | ), 147 | normalize, 148 | ]) 149 | 150 | if image_set == 'val': 151 | return T.Compose([ 152 | T.RandomResize([800], max_size=1333), 153 | normalize, 154 | ]) 155 | 156 | raise ValueError(f'unknown {image_set}') 157 | 158 | 159 | def build(image_set, args): 160 | root = Path(args.coco_path) 161 | assert root.exists(), f'provided COCO path {root} does not exist' 162 | mode = 'instances' 163 | dataset_type = args.dataset_type 164 | if args.dataset_file == 'coco': 165 | PATHS = { 166 | "train": (root / "train2017", root / "annotations" / f'{mode}_train2017.json'), 167 | "val": (root / "val2017", root / "annotations" / f'{mode}_val2017.json'), 168 | } 169 | 170 | 171 | img_folder, ann_file = PATHS[image_set] 172 | dataset = CocoDetection(img_folder, ann_file, transforms=make_coco_transforms(image_set), return_masks=args.masks, 173 | cache_mode=args.cache_mode, local_rank=get_local_rank(), local_size=get_local_size()) 174 | return dataset 175 | 176 | 177 | -------------------------------------------------------------------------------- /datasets/coco_panoptic.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------ 6 | # Modified from DETR (https://github.com/facebookresearch/detr) 7 | # Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved 8 | # ------------------------------------------------------------------------ 9 | 10 | import json 11 | from pathlib import Path 12 | 13 | import numpy as np 14 | import torch 15 | from PIL import Image 16 | 17 | from panopticapi.utils import rgb2id 18 | from util.box_ops import masks_to_boxes 19 | 20 | from .coco import make_coco_transforms 21 | 22 | 23 | class CocoPanoptic: 24 | def __init__(self, img_folder, ann_folder, ann_file, transforms=None, return_masks=True): 25 | with open(ann_file, 'r') as f: 26 | self.coco = json.load(f) 27 | 28 | # sort 'images' field so that they are aligned with 'annotations' 29 | # i.e., in alphabetical order 30 | self.coco['images'] = sorted(self.coco['images'], key=lambda x: x['id']) 31 | # sanity check 32 | if "annotations" in self.coco: 33 | for img, ann in zip(self.coco['images'], self.coco['annotations']): 34 | assert img['file_name'][:-4] == ann['file_name'][:-4] 35 | 36 | self.img_folder = img_folder 37 | self.ann_folder = ann_folder 38 | self.ann_file = ann_file 39 | self.transforms = transforms 40 | self.return_masks = return_masks 41 | 42 | def __getitem__(self, idx): 43 | ann_info = self.coco['annotations'][idx] if "annotations" in self.coco else self.coco['images'][idx] 44 | img_path = Path(self.img_folder) / ann_info['file_name'].replace('.png', '.jpg') 45 | ann_path = Path(self.ann_folder) / ann_info['file_name'] 46 | 47 | img = Image.open(img_path).convert('RGB') 48 | w, h = img.size 49 | if "segments_info" in ann_info: 50 | masks = np.asarray(Image.open(ann_path), dtype=np.uint32) 51 | masks = rgb2id(masks) 52 | 53 | ids = np.array([ann['id'] for ann in ann_info['segments_info']]) 54 | masks = masks == ids[:, None, None] 55 | 56 | masks = torch.as_tensor(masks, dtype=torch.uint8) 57 | labels = torch.tensor([ann['category_id'] for ann in ann_info['segments_info']], dtype=torch.int64) 58 | 59 | target = {} 60 | target['image_id'] = torch.tensor([ann_info['image_id'] if "image_id" in ann_info else ann_info["id"]]) 61 | if self.return_masks: 62 | target['masks'] = masks 63 | target['labels'] = labels 64 | 65 | target["boxes"] = masks_to_boxes(masks) 66 | 67 | target['size'] = torch.as_tensor([int(h), int(w)]) 68 | target['orig_size'] = torch.as_tensor([int(h), int(w)]) 69 | if "segments_info" in ann_info: 70 | for name in ['iscrowd', 'area']: 71 | target[name] = torch.tensor([ann[name] for ann in ann_info['segments_info']]) 72 | 73 | if self.transforms is not None: 74 | img, target = self.transforms(img, target) 75 | 76 | return img, target 77 | 78 | def __len__(self): 79 | return len(self.coco['images']) 80 | 81 | def get_height_and_width(self, idx): 82 | img_info = self.coco['images'][idx] 83 | height = img_info['height'] 84 | width = img_info['width'] 85 | return height, width 86 | 87 | 88 | def build(image_set, args): 89 | img_folder_root = Path(args.coco_path) 90 | ann_folder_root = Path(args.coco_panoptic_path) 91 | assert img_folder_root.exists(), f'provided COCO path {img_folder_root} does not exist' 92 | assert ann_folder_root.exists(), f'provided COCO path {ann_folder_root} does not exist' 93 | mode = 'panoptic' 94 | PATHS = { 95 | "train": ("train2017", Path("annotations") / f'{mode}_train2017.json'), 96 | "val": ("val2017", Path("annotations") / f'{mode}_val2017.json'), 97 | } 98 | 99 | img_folder, ann_file = PATHS[image_set] 100 | img_folder_path = img_folder_root / img_folder 101 | ann_folder = ann_folder_root / f'{mode}_{img_folder}' 102 | ann_file = ann_folder_root / ann_file 103 | 104 | dataset = 
CocoPanoptic(img_folder_path, ann_folder, ann_file, 105 | transforms=make_coco_transforms(image_set), return_masks=args.masks) 106 | 107 | return dataset 108 | -------------------------------------------------------------------------------- /datasets/concat_dataset.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | import torch 4 | import torch.utils.data 5 | 6 | from util.misc import get_local_rank, get_local_size 7 | import datasets.transforms_clip as T 8 | from torch.utils.data import Dataset, ConcatDataset 9 | from .coco2seq import build as build_seq_coco 10 | from .ytvos import build as build_ytvs 11 | 12 | 13 | 14 | def build(image_set, args): 15 | print('preparing coco2seq dataset ....') 16 | coco_seq = build_seq_coco(image_set, args) 17 | print('preparing hq ytvis dataset .... ') 18 | ytvis_dataset = build_ytvs(image_set, args) 19 | 20 | concat_data = ConcatDataset([ytvis_dataset, coco_seq]) 21 | 22 | return concat_data 23 | 24 | 25 | -------------------------------------------------------------------------------- /datasets/data_prefetcher.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------ 6 | 7 | import torch 8 | 9 | def to_cuda(samples, targets, device): 10 | samples = samples.to(device, non_blocking=True) 11 | targets = [{k: v.to(device, non_blocking=True) for k, v in t.items()} for t in targets] 12 | # targets_n = [] 13 | # for t in targets: 14 | # t_d = {} 15 | # for k, v in t.items(): 16 | # if k != 'path': 17 | # t_d[k] = v.to(device, non_blocking=True) 18 | # else: 19 | # t_d[k] = v 20 | # #targets_n.append({k: v.to(device, non_blocking=True) for k, v in t.items()}) 21 | # targets_n.append(t_d) 22 | 23 | return samples, targets 24 | 25 | class data_prefetcher(): 26 | def __init__(self, loader, device, prefetch=True): 27 | self.loader = iter(loader) 28 | self.prefetch = prefetch 29 | self.device = device 30 | if prefetch: 31 | self.stream = torch.cuda.Stream() 32 | self.preload() 33 | 34 | def preload(self): 35 | try: 36 | self.next_samples, self.next_targets = next(self.loader) 37 | except StopIteration: 38 | self.next_samples = None 39 | self.next_targets = None 40 | return 41 | # if record_stream() doesn't work, another option is to make sure device inputs are created 42 | # on the main stream. 43 | # self.next_input_gpu = torch.empty_like(self.next_input, device='cuda') 44 | # self.next_target_gpu = torch.empty_like(self.next_target, device='cuda') 45 | # Need to make sure the memory allocated for next_* is not still in use by the main stream 46 | # at the time we start copying to next_*: 47 | # self.stream.wait_stream(torch.cuda.current_stream()) 48 | with torch.cuda.stream(self.stream): 49 | self.next_samples, self.next_targets = to_cuda(self.next_samples, self.next_targets, self.device) 50 | # more code for the alternative if record_stream() doesn't work: 51 | # copy_ will record the use of the pinned source tensor in this side stream. 
52 | # self.next_input_gpu.copy_(self.next_input, non_blocking=True) 53 | # self.next_target_gpu.copy_(self.next_target, non_blocking=True) 54 | # self.next_input = self.next_input_gpu 55 | # self.next_target = self.next_target_gpu 56 | 57 | # With Amp, it isn't necessary to manually convert data to half. 58 | # if args.fp16: 59 | # self.next_input = self.next_input.half() 60 | # else: 61 | 62 | def next(self): 63 | if self.prefetch: 64 | torch.cuda.current_stream().wait_stream(self.stream) 65 | samples = self.next_samples 66 | targets = self.next_targets 67 | if samples is not None: 68 | samples.record_stream(torch.cuda.current_stream()) 69 | if targets is not None: 70 | for t in targets: 71 | for k, v in t.items(): 72 | v.record_stream(torch.cuda.current_stream()) 73 | self.preload() 74 | else: 75 | try: 76 | samples, targets = next(self.loader) 77 | samples, targets = to_cuda(samples, targets, self.device) 78 | except StopIteration: 79 | samples = None 80 | targets = None 81 | return samples, targets 82 | -------------------------------------------------------------------------------- /datasets/image_to_seq_augmenter.py: -------------------------------------------------------------------------------- 1 | import imgaug 2 | import imgaug.augmenters as iaa 3 | import numpy as np 4 | 5 | from datetime import datetime 6 | 7 | from imgaug.augmentables.segmaps import SegmentationMapsOnImage 8 | from imgaug.augmentables.bbs import BoundingBox, BoundingBoxesOnImage 9 | 10 | 11 | class ImageToSeqAugmenter(object): 12 | def __init__(self, perspective=True, affine=True, motion_blur=True, 13 | brightness_range=(-50, 50), hue_saturation_range=(-15, 15), perspective_magnitude=0.12, 14 | scale_range=1.0, translate_range={"x": (-0.15, 0.15), "y": (-0.15, 0.15)}, rotation_range=(-20, 20), 15 | motion_blur_kernel_sizes=(7, 9), motion_blur_prob=0.5): 16 | 17 | self.basic_augmenter = iaa.SomeOf((1, None), [ 18 | iaa.Add(brightness_range), 19 | iaa.AddToHueAndSaturation(hue_saturation_range) 20 | ] 21 | ) 22 | 23 | transforms = [] 24 | if perspective: 25 | transforms.append(iaa.PerspectiveTransform(perspective_magnitude)) 26 | if affine: 27 | transforms.append(iaa.Affine(scale=scale_range, 28 | translate_percent=translate_range, 29 | rotate=rotation_range, 30 | order=1, # cv2.INTER_LINEAR 31 | backend='auto')) 32 | transforms = iaa.Sequential(transforms) 33 | transforms = [transforms] 34 | 35 | if motion_blur: 36 | blur = iaa.Sometimes(motion_blur_prob, iaa.OneOf( 37 | [ 38 | iaa.MotionBlur(ksize) 39 | for ksize in motion_blur_kernel_sizes 40 | ] 41 | )) 42 | transforms.append(blur) 43 | 44 | self.frame_shift_augmenter = iaa.Sequential(transforms) 45 | 46 | @staticmethod 47 | def condense_masks(instance_masks): 48 | condensed_mask = np.zeros_like(instance_masks[0], dtype=np.int8) 49 | for instance_id, mask in enumerate(instance_masks, 1): 50 | condensed_mask = np.where(mask, instance_id, condensed_mask) 51 | 52 | return condensed_mask 53 | 54 | @staticmethod 55 | def expand_masks(condensed_mask, num_instances): 56 | return [(condensed_mask == instance_id).astype(np.uint8) for instance_id in range(1, num_instances + 1)] 57 | 58 | def __call__(self, image, masks=None, boxes=None): 59 | det_augmenter = self.frame_shift_augmenter.to_deterministic() 60 | 61 | 62 | if masks is not None: 63 | masks_np, is_binary_mask = [], [] 64 | boxs_np = [] 65 | 66 | for mask in masks: 67 | 68 | if isinstance(mask, np.ndarray): 69 | masks_np.append(mask.astype(np.bool)) 70 | is_binary_mask.append(False) 71 | else: 72 | raise 
ValueError("Invalid mask type: {}".format(type(mask))) 73 | 74 | num_instances = len(masks_np) 75 | masks_np = SegmentationMapsOnImage(self.condense_masks(masks_np), shape=image.shape[:2]) 76 | # boxs_np = BoundingBoxesOnImage(boxs_np, shape=image.shape[:2]) 77 | 78 | seed = int(datetime.now().strftime('%M%S%f')[-8:]) 79 | imgaug.seed(seed) 80 | aug_image, aug_masks = det_augmenter(image=self.basic_augmenter(image=image) , segmentation_maps=masks_np) 81 | imgaug.seed(seed) 82 | invalid_pts_mask = det_augmenter(image=np.ones(image.shape[:2] + (1,), np.uint8)).squeeze(2) 83 | aug_masks = self.expand_masks(aug_masks.get_arr(), num_instances) 84 | # aug_boxes = aug_boxes.remove_out_of_image().clip_out_of_image() 85 | aug_masks = [mask for mask, is_bm in zip(aug_masks, is_binary_mask)] 86 | return aug_image, aug_masks #, aug_boxes.to_xyxy_array() 87 | 88 | else: 89 | masks = [SegmentationMapsOnImage(np.ones(image.shape[:2], np.bool), shape=image.shape[:2])] 90 | aug_image, invalid_pts_mask = det_augmenter(image=image, segmentation_maps=masks) 91 | return aug_image, invalid_pts_mask.get_arr() == 0 92 | -------------------------------------------------------------------------------- /datasets/panoptic_eval.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------ 6 | # Modified from DETR (https://github.com/facebookresearch/detr) 7 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 8 | # ------------------------------------------------------------------------ 9 | 10 | import json 11 | import os 12 | 13 | import util.misc as utils 14 | 15 | try: 16 | from panopticapi.evaluation import pq_compute 17 | except ImportError: 18 | pass 19 | 20 | 21 | class PanopticEvaluator(object): 22 | def __init__(self, ann_file, ann_folder, output_dir="panoptic_eval"): 23 | self.gt_json = ann_file 24 | self.gt_folder = ann_folder 25 | if utils.is_main_process(): 26 | if not os.path.exists(output_dir): 27 | os.mkdir(output_dir) 28 | self.output_dir = output_dir 29 | self.predictions = [] 30 | 31 | def update(self, predictions): 32 | for p in predictions: 33 | with open(os.path.join(self.output_dir, p["file_name"]), "wb") as f: 34 | f.write(p.pop("png_string")) 35 | 36 | self.predictions += predictions 37 | 38 | def synchronize_between_processes(self): 39 | all_predictions = utils.all_gather(self.predictions) 40 | merged_predictions = [] 41 | for p in all_predictions: 42 | merged_predictions += p 43 | self.predictions = merged_predictions 44 | 45 | def summarize(self): 46 | if utils.is_main_process(): 47 | json_data = {"annotations": self.predictions} 48 | predictions_json = os.path.join(self.output_dir, "predictions.json") 49 | with open(predictions_json, "w") as f: 50 | f.write(json.dumps(json_data)) 51 | return pq_compute(self.gt_json, predictions_json, gt_folder=self.gt_folder, pred_folder=self.output_dir) 52 | return None 53 | -------------------------------------------------------------------------------- /datasets/samplers.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. 
All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------ 6 | # Modified from codes in torch.utils.data.distributed 7 | # ------------------------------------------------------------------------ 8 | 9 | import os 10 | import math 11 | import torch 12 | import torch.distributed as dist 13 | from torch.utils.data.sampler import Sampler 14 | 15 | 16 | class DistributedSampler(Sampler): 17 | """Sampler that restricts data loading to a subset of the dataset. 18 | It is especially useful in conjunction with 19 | :class:`torch.nn.parallel.DistributedDataParallel`. In such case, each 20 | process can pass a DistributedSampler instance as a DataLoader sampler, 21 | and load a subset of the original dataset that is exclusive to it. 22 | .. note:: 23 | Dataset is assumed to be of constant size. 24 | Arguments: 25 | dataset: Dataset used for sampling. 26 | num_replicas (optional): Number of processes participating in 27 | distributed training. 28 | rank (optional): Rank of the current process within num_replicas. 29 | """ 30 | 31 | def __init__(self, dataset, num_replicas=None, rank=None, local_rank=None, local_size=None, shuffle=True): 32 | if num_replicas is None: 33 | if not dist.is_available(): 34 | raise RuntimeError("Requires distributed package to be available") 35 | num_replicas = dist.get_world_size() 36 | if rank is None: 37 | if not dist.is_available(): 38 | raise RuntimeError("Requires distributed package to be available") 39 | rank = dist.get_rank() 40 | self.dataset = dataset 41 | self.num_replicas = num_replicas 42 | self.rank = rank 43 | self.epoch = 0 44 | self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.num_replicas)) 45 | self.total_size = self.num_samples * self.num_replicas 46 | self.shuffle = shuffle 47 | 48 | def __iter__(self): 49 | if self.shuffle: 50 | # deterministically shuffle based on epoch 51 | g = torch.Generator() 52 | g.manual_seed(self.epoch) 53 | indices = torch.randperm(len(self.dataset), generator=g).tolist() 54 | else: 55 | indices = torch.arange(len(self.dataset)).tolist() 56 | 57 | # add extra samples to make it evenly divisible 58 | indices += indices[: (self.total_size - len(indices))] 59 | assert len(indices) == self.total_size 60 | 61 | # subsample 62 | offset = self.num_samples * self.rank 63 | indices = indices[offset : offset + self.num_samples] 64 | assert len(indices) == self.num_samples 65 | 66 | return iter(indices) 67 | 68 | def __len__(self): 69 | return self.num_samples 70 | 71 | def set_epoch(self, epoch): 72 | self.epoch = epoch 73 | 74 | 75 | class NodeDistributedSampler(Sampler): 76 | """Sampler that restricts data loading to a subset of the dataset. 77 | It is especially useful in conjunction with 78 | :class:`torch.nn.parallel.DistributedDataParallel`. In such case, each 79 | process can pass a DistributedSampler instance as a DataLoader sampler, 80 | and load a subset of the original dataset that is exclusive to it. 81 | .. note:: 82 | Dataset is assumed to be of constant size. 83 | Arguments: 84 | dataset: Dataset used for sampling. 85 | num_replicas (optional): Number of processes participating in 86 | distributed training. 87 | rank (optional): Rank of the current process within num_replicas. 
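local_rank (optional): Rank of the current process within its node. Together with local_size it restricts this process to the indices it cached locally (cf. CocoDetection.cache_images in datasets/torchvision_datasets/coco.py).
local_size (optional): Number of processes on the current node.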
88 | """ 89 | 90 | def __init__(self, dataset, num_replicas=None, rank=None, local_rank=None, local_size=None, shuffle=True): 91 | if num_replicas is None: 92 | if not dist.is_available(): 93 | raise RuntimeError("Requires distributed package to be available") 94 | num_replicas = dist.get_world_size() 95 | if rank is None: 96 | if not dist.is_available(): 97 | raise RuntimeError("Requires distributed package to be available") 98 | rank = dist.get_rank() 99 | if local_rank is None: 100 | local_rank = int(os.environ.get('LOCAL_RANK', 0)) 101 | if local_size is None: 102 | local_size = int(os.environ.get('LOCAL_SIZE', 1)) 103 | self.dataset = dataset 104 | self.shuffle = shuffle 105 | self.num_replicas = num_replicas 106 | self.num_parts = local_size 107 | self.rank = rank 108 | self.local_rank = local_rank 109 | self.epoch = 0 110 | self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.num_replicas)) 111 | self.total_size = self.num_samples * self.num_replicas 112 | 113 | self.total_size_parts = self.num_samples * self.num_replicas // self.num_parts 114 | 115 | def __iter__(self): 116 | if self.shuffle: 117 | # deterministically shuffle based on epoch 118 | g = torch.Generator() 119 | g.manual_seed(self.epoch) 120 | indices = torch.randperm(len(self.dataset), generator=g).tolist() 121 | else: 122 | indices = torch.arange(len(self.dataset)).tolist() 123 | indices = [i for i in indices if i % self.num_parts == self.local_rank] 124 | 125 | # add extra samples to make it evenly divisible 126 | indices += indices[:(self.total_size_parts - len(indices))] 127 | assert len(indices) == self.total_size_parts 128 | 129 | # subsample 130 | indices = indices[self.rank // self.num_parts:self.total_size_parts:self.num_replicas // self.num_parts] 131 | assert len(indices) == self.num_samples 132 | 133 | return iter(indices) 134 | 135 | def __len__(self): 136 | return self.num_samples 137 | 138 | def set_epoch(self, epoch): 139 | self.epoch = epoch 140 | 141 | -------------------------------------------------------------------------------- /datasets/torchvision_datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------ 6 | 7 | from .coco import CocoDetection 8 | -------------------------------------------------------------------------------- /datasets/torchvision_datasets/coco.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------ 6 | # Modified from torchvision 7 | # ------------------------------------------------------------------------ 8 | 9 | """ 10 | Copy-Paste from torchvision, but add utility of caching images on memory 11 | """ 12 | from torchvision.datasets.vision import VisionDataset 13 | from PIL import Image 14 | import os 15 | import os.path 16 | import tqdm 17 | from io import BytesIO 18 | 19 | 20 | class CocoDetection(VisionDataset): 21 | """`MS Coco Detection `_ Dataset. 
22 | Args: 23 | root (string): Root directory where images are downloaded to. 24 | annFile (string): Path to json annotation file. 25 | transform (callable, optional): A function/transform that takes in an PIL image 26 | and returns a transformed version. E.g, ``transforms.ToTensor`` 27 | target_transform (callable, optional): A function/transform that takes in the 28 | target and transforms it. 29 | transforms (callable, optional): A function/transform that takes input sample and its target as entry 30 | and returns a transformed version. 31 | """ 32 | 33 | def __init__(self, root, annFile, transform=None, target_transform=None, transforms=None, 34 | cache_mode=False, local_rank=0, local_size=1): 35 | super(CocoDetection, self).__init__(root, transforms, transform, target_transform) 36 | from pycocotools.coco import COCO 37 | self.coco = COCO(annFile) 38 | self.ids = list(sorted(self.coco.imgs.keys())) 39 | self.cache_mode = cache_mode 40 | self.local_rank = local_rank 41 | self.local_size = local_size 42 | if cache_mode: 43 | self.cache = {} 44 | self.cache_images() 45 | 46 | def cache_images(self): 47 | self.cache = {} 48 | for index, img_id in zip(tqdm.trange(len(self.ids)), self.ids): 49 | if index % self.local_size != self.local_rank: 50 | continue 51 | path = self.coco.loadImgs(img_id)[0]['file_name'] 52 | with open(os.path.join(self.root, path), 'rb') as f: 53 | self.cache[path] = f.read() 54 | 55 | def get_image(self, path): 56 | if self.cache_mode: 57 | if path not in self.cache.keys(): 58 | with open(os.path.join(self.root, path), 'rb') as f: 59 | self.cache[path] = f.read() 60 | return Image.open(BytesIO(self.cache[path])).convert('RGB') 61 | return Image.open(os.path.join(self.root, path)).convert('RGB') 62 | 63 | def __getitem__(self, index): 64 | """ 65 | Args: 66 | index (int): Index 67 | Returns: 68 | tuple: Tuple (image, target). target is the object returned by ``coco.loadAnns``. 
69 | """ 70 | coco = self.coco 71 | img_id = self.ids[index] 72 | ann_ids = coco.getAnnIds(imgIds=img_id) 73 | target = coco.loadAnns(ann_ids) 74 | 75 | path = coco.loadImgs(img_id)[0]['file_name'] 76 | 77 | img = self.get_image(path) 78 | if self.transforms is not None: 79 | img, target = self.transforms(img, target) 80 | 81 | return img, target, path 82 | 83 | def __len__(self): 84 | return len(self.ids) 85 | -------------------------------------------------------------------------------- /eval_hqvis.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | 4 | from pycocotools.ytvos import YTVOS 5 | from pycocotools.ytvoseval import YTVOSeval 6 | 7 | import warnings 8 | warnings.filterwarnings("ignore", category=DeprecationWarning) 9 | 10 | 11 | def ytvos_eval(result_file, result_types, ytvos, get_boundary_out, max_dets=(100, 300, 1000)): 12 | 13 | ytvos = YTVOS(ytvos, get_boundary=get_boundary_out) 14 | assert isinstance(ytvos, YTVOS) 15 | 16 | if len(ytvos.anns) == 0: 17 | print("Annotations does not exist") 18 | return 19 | 20 | assert result_file.endswith('.json') 21 | ytvos_dets = ytvos.loadRes(result_file) 22 | 23 | vid_ids = ytvos.getVidIds() 24 | for res_type in result_types: 25 | iou_type = res_type 26 | ytvosEval = YTVOSeval(ytvos, ytvos_dets, iou_type) 27 | ytvosEval.params.vidIds = vid_ids 28 | if res_type == 'proposal': 29 | ytvosEval.params.useCats = 0 30 | ytvosEval.params.maxDets = list(max_dets) 31 | ytvosEval.evaluate() 32 | ytvosEval.accumulate() 33 | ytvosEval.summarize() 34 | 35 | def main(args): 36 | result_file = args.save_path 37 | ytvos = 'ytvos' 38 | ytvos_eval(result_file, ['boundary'], 'ytvis/annotations/ytvis_hq-test.json', True, max_dets=(100, 300, 1000)) 39 | ytvos_eval(result_file, ['segm'], 'ytvis/annotations/ytvis_hq-test.json', False, max_dets=(100, 300, 1000)) 40 | 41 | if __name__ == '__main__': 42 | parser = argparse.ArgumentParser('inference script') 43 | parser.add_argument('--save-path') 44 | args = parser.parse_args() 45 | main(args) 46 | -------------------------------------------------------------------------------- /figures/data1_new.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SysCV/vmt/b0973e8609885d9ac83f803a6b51b00a30a7f564/figures/data1_new.gif -------------------------------------------------------------------------------- /figures/dataset_compare_s.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SysCV/vmt/b0973e8609885d9ac83f803a6b51b00a30a7f564/figures/dataset_compare_s.png -------------------------------------------------------------------------------- /figures/result_demo1.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SysCV/vmt/b0973e8609885d9ac83f803a6b51b00a30a7f564/figures/result_demo1.gif -------------------------------------------------------------------------------- /figures/vmt_banner_img.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SysCV/vmt/b0973e8609885d9ac83f803a6b51b00a30a7f564/figures/vmt_banner_img.png -------------------------------------------------------------------------------- /models/__init__.py: -------------------------------------------------------------------------------- 1 | from .vmt import build 2 | 3 | 4 | def build_model(args): 5 | return build(args) 6 | 7 | 
-------------------------------------------------------------------------------- /models/backbone.py: -------------------------------------------------------------------------------- 1 | 2 | """ 3 | Backbone modules. 4 | """ 5 | from collections import OrderedDict 6 | 7 | import torch 8 | import torch.nn.functional as F 9 | import torchvision 10 | from torch import nn 11 | from torchvision.models._utils import IntermediateLayerGetter 12 | from typing import Dict, List 13 | 14 | from util.misc import NestedTensor, is_main_process 15 | 16 | from .position_encoding import build_position_encoding 17 | from .x101_64d import resnext101_64x4d 18 | 19 | class FrozenBatchNorm2d(torch.nn.Module): 20 | """ 21 | BatchNorm2d where the batch statistics and the affine parameters are fixed. 22 | 23 | Copy-paste from torchvision.misc.ops with added eps before rqsrt, 24 | without which any other models than torchvision.models.resnet[18,34,50,101] 25 | produce nans. 26 | """ 27 | 28 | def __init__(self, n, eps=1e-5): 29 | super(FrozenBatchNorm2d, self).__init__() 30 | self.register_buffer("weight", torch.ones(n)) 31 | self.register_buffer("bias", torch.zeros(n)) 32 | self.register_buffer("running_mean", torch.zeros(n)) 33 | self.register_buffer("running_var", torch.ones(n)) 34 | self.eps = eps 35 | 36 | def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, 37 | missing_keys, unexpected_keys, error_msgs): 38 | num_batches_tracked_key = prefix + 'num_batches_tracked' 39 | if num_batches_tracked_key in state_dict: 40 | del state_dict[num_batches_tracked_key] 41 | 42 | super(FrozenBatchNorm2d, self)._load_from_state_dict( 43 | state_dict, prefix, local_metadata, strict, 44 | missing_keys, unexpected_keys, error_msgs) 45 | 46 | def forward(self, x): 47 | # move reshapes to the beginning 48 | # to make it fuser-friendly 49 | w = self.weight.reshape(1, -1, 1, 1) 50 | b = self.bias.reshape(1, -1, 1, 1) 51 | rv = self.running_var.reshape(1, -1, 1, 1) 52 | rm = self.running_mean.reshape(1, -1, 1, 1) 53 | eps = self.eps 54 | scale = w * (rv + eps).rsqrt() 55 | bias = b - rm * scale 56 | return x * scale + bias 57 | 58 | 59 | class BackboneBase(nn.Module): 60 | 61 | def __init__(self, backbone: nn.Module, train_backbone: bool, return_interm_layers: bool): 62 | super().__init__() 63 | for name, parameter in backbone.named_parameters(): 64 | if not train_backbone or 'layer2' not in name and 'layer3' not in name and 'layer4' not in name: 65 | parameter.requires_grad_(False) 66 | if return_interm_layers: 67 | return_layers = {"layer1": "0", "layer2": "1", "layer3": "2", "layer4": "3", "conv1": "4"} 68 | self.strides = [8, 16, 32] 69 | self.num_channels = [512, 1024, 2048] 70 | else: 71 | return_layers = {'layer4': "0"} 72 | self.strides = [32] 73 | self.num_channels = [2048] 74 | self.body = IntermediateLayerGetter(backbone, return_layers=return_layers) 75 | 76 | def forward(self, tensor_list: NestedTensor): 77 | xs = self.body(tensor_list.tensors) 78 | out: Dict[str, NestedTensor] = {} 79 | for name, x in xs.items(): 80 | m = tensor_list.mask 81 | assert m is not None 82 | mask = F.interpolate(m[None].float(), size=x.shape[-2:]).to(torch.bool)[0] 83 | out[name] = NestedTensor(x, mask) 84 | return out 85 | 86 | 87 | class Backbone(BackboneBase): 88 | """ResNet backbone with frozen BatchNorm.""" 89 | def __init__(self, name: str, 90 | train_backbone: bool, 91 | return_interm_layers: bool, 92 | dilation: bool): 93 | norm_layer = FrozenBatchNorm2d 94 | if name == 'resnext101_64x4d': 95 | backbone = 
resnext101_64x4d(replace_stride_with_dilation=[False, False, dilation], 96 | pretrained=is_main_process(), norm_layer=norm_layer) 97 | else: 98 | backbone = getattr(torchvision.models, name)( 99 | replace_stride_with_dilation=[False, False, dilation], 100 | pretrained=is_main_process(), norm_layer=norm_layer) #pretrained=is_main_process() 101 | assert name not in ('resnet18', 'resnet34'), "number of channels are hard coded" 102 | super().__init__(backbone, train_backbone, return_interm_layers) 103 | if dilation: 104 | self.strides[-1] = self.strides[-1] // 2 105 | 106 | 107 | class Joiner(nn.Sequential): 108 | def __init__(self, backbone, position_embedding): 109 | super().__init__(backbone, position_embedding) 110 | self.strides = backbone.strides 111 | self.num_channels = backbone.num_channels 112 | 113 | def forward(self, tensor_list: NestedTensor): 114 | xs = self[0](tensor_list) 115 | out: List[NestedTensor] = [] 116 | pos = [] 117 | for name, x in sorted(xs.items()): 118 | out.append(x) 119 | 120 | # position encoding 121 | for x in out: 122 | pos.append(self[1](x).to(x.tensors.dtype)) 123 | 124 | return out, pos 125 | 126 | 127 | def build_backbone(args): 128 | position_embedding = build_position_encoding(args) 129 | train_backbone = args.lr_backbone > 0 130 | return_interm_layers = args.masks or (args.num_feature_levels > 1) 131 | backbone = Backbone(args.backbone, train_backbone, return_interm_layers, args.dilation) 132 | model = Joiner(backbone, position_embedding) 133 | return model 134 | 135 | -------------------------------------------------------------------------------- /models/matcher.py: -------------------------------------------------------------------------------- 1 | """ 2 | Modules to compute the matching cost and solve the corresponding LSAP. 
3 | """ 4 | import torch 5 | from scipy.optimize import linear_sum_assignment 6 | from torch import nn 7 | import torch.nn.functional as F 8 | 9 | from util.box_ops import box_cxcywh_to_xyxy, generalized_box_iou, generalized_multi_box_iou 10 | 11 | 12 | class HungarianMatcher(nn.Module): 13 | 14 | 15 | def __init__(self, 16 | multi_frame: bool, 17 | cost_class: float = 1, 18 | cost_bbox: float = 1, 19 | cost_giou: float = 1, 20 | cost_mask: float = 1): 21 | """Creates the matcher 22 | Params: 23 | cost_class: This is the relative weight of the classification error in the matching cost 24 | cost_bbox: This is the relative weight of the L1 error of the bounding box coordinates in the matching cost 25 | cost_giou: This is the relative weight of the giou loss of the bounding box in the matching cost 26 | """ 27 | super().__init__() 28 | self.multi_frame = multi_frame 29 | self.cost_class = cost_class 30 | self.cost_bbox = cost_bbox 31 | self.cost_giou = cost_giou 32 | self.cost_mask = cost_mask 33 | assert cost_class != 0 or cost_bbox != 0 or cost_giou != 0 or cost_mask != 0, "all costs cant be 0" 34 | 35 | 36 | def forward(self, outputs, targets, nf, valid_ratios): 37 | 38 | with torch.no_grad(): 39 | bs, num_queries = outputs["pred_logits"].shape[:2] 40 | 41 | # We flatten to compute the cost matrices in a batch 42 | out_prob = outputs["pred_logits"].flatten(0, 1).sigmoid() 43 | # Also concat the target labels and boxes 44 | tgt_ids = torch.cat([v["labels"] for v in targets]) 45 | tgt_bbox = torch.cat([v["boxes"] for v in targets]) 46 | num_insts = len(tgt_ids) 47 | 48 | 49 | out_bbox = outputs["pred_boxes"].permute(0,2,1,3).flatten(0, 1) # [batch_size * num_queries,nf, 4] 50 | num_insts = len(tgt_ids) 51 | tgt_bbox = tgt_bbox.reshape(num_insts,nf,4) 52 | 53 | cost_bbox = torch.cdist(out_bbox.flatten(1,2), tgt_bbox.flatten(1,2)) 54 | cost_giou = 0 55 | for i in range(nf): 56 | cost_giou += -generalized_box_iou(box_cxcywh_to_xyxy(out_bbox[:,i]), 57 | box_cxcywh_to_xyxy(tgt_bbox[:,i])) 58 | cost_giou = cost_giou/nf 59 | 60 | # Compute the classification cost. 
61 | alpha = 0.25 62 | gamma = 2.0 63 | neg_cost_class = (1 - alpha) * (out_prob ** gamma) * (-(1 - out_prob + 1e-8).log()) 64 | pos_cost_class = alpha * ((1 - out_prob) ** gamma) * (-(out_prob + 1e-8).log()) 65 | cost_class = pos_cost_class[:, tgt_ids] - neg_cost_class[:, tgt_ids] 66 | 67 | # Final cost matrix 68 | C = self.cost_bbox * cost_bbox + self.cost_class * cost_class + self.cost_giou * cost_giou 69 | C = C.view(bs, num_queries, -1).cpu() 70 | 71 | sizes = [len(v["labels"]) for v in targets] 72 | indices = [linear_sum_assignment(c[i]) for i, c in enumerate(C.split(sizes, -1))] 73 | return [(torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) for i, j in indices] 74 | 75 | 76 | def build_matcher(args): 77 | # output single frame, multi frame 78 | return HungarianMatcher(multi_frame=True, # True, False 79 | cost_class=args.set_cost_class, 80 | cost_bbox=args.set_cost_bbox, 81 | cost_giou=args.set_cost_giou) 82 | 83 | 84 | -------------------------------------------------------------------------------- /models/ops/MultiScaleDeformableAttention.egg-info/PKG-INFO: -------------------------------------------------------------------------------- 1 | Metadata-Version: 2.1 2 | Name: MultiScaleDeformableAttention 3 | Version: 1.0 4 | Summary: PyTorch Wrapper for CUDA Functions of Multi-Scale Deformable Attention 5 | Home-page: https://github.com/fundamentalvision/Deformable-DETR 6 | Author: Weijie Su 7 | License: UNKNOWN 8 | Platform: UNKNOWN 9 | 10 | UNKNOWN 11 | 12 | -------------------------------------------------------------------------------- /models/ops/MultiScaleDeformableAttention.egg-info/SOURCES.txt: -------------------------------------------------------------------------------- 1 | setup.py 2 | /scratch/work/vmt_organized_code/models/ops/src/vision.cpp 3 | /scratch/work/vmt_organized_code/models/ops/src/cpu/ms_deform_attn_cpu.cpp 4 | /scratch/work/vmt_organized_code/models/ops/src/cuda/ms_deform_attn_cuda.cu 5 | MultiScaleDeformableAttention.egg-info/PKG-INFO 6 | MultiScaleDeformableAttention.egg-info/SOURCES.txt 7 | MultiScaleDeformableAttention.egg-info/dependency_links.txt 8 | MultiScaleDeformableAttention.egg-info/top_level.txt 9 | functions/__init__.py 10 | functions/ms_deform_attn_func.py 11 | modules/__init__.py 12 | modules/ms_deform_attn.py -------------------------------------------------------------------------------- /models/ops/MultiScaleDeformableAttention.egg-info/dependency_links.txt: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /models/ops/MultiScaleDeformableAttention.egg-info/top_level.txt: -------------------------------------------------------------------------------- 1 | MultiScaleDeformableAttention 2 | functions 3 | modules 4 | -------------------------------------------------------------------------------- /models/ops/functions/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | from .ms_deform_attn_func import MSDeformAttnFunction 10 | 11 | -------------------------------------------------------------------------------- /models/ops/functions/ms_deform_attn_func.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | from __future__ import absolute_import 10 | from __future__ import print_function 11 | from __future__ import division 12 | 13 | import torch 14 | import torch.nn.functional as F 15 | from torch.autograd import Function 16 | from torch.autograd.function import once_differentiable 17 | 18 | import MultiScaleDeformableAttention as MSDA 19 | 20 | 21 | class MSDeformAttnFunction(Function): 22 | @staticmethod 23 | def forward(ctx, value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, im2col_step): 24 | ctx.im2col_step = im2col_step 25 | output = MSDA.ms_deform_attn_forward( 26 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, ctx.im2col_step) 27 | ctx.save_for_backward(value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights) 28 | return output 29 | 30 | @staticmethod 31 | @once_differentiable 32 | def backward(ctx, grad_output): 33 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights = ctx.saved_tensors 34 | grad_output = grad_output.contiguous() 35 | grad_value, grad_sampling_loc, grad_attn_weight = \ 36 | MSDA.ms_deform_attn_backward( 37 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, grad_output, ctx.im2col_step) 38 | 39 | return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None 40 | 41 | 42 | def ms_deform_attn_core_pytorch(value, value_spatial_shapes, sampling_locations, attention_weights): 43 | # for debug and test only, 44 | # need to use cuda version instead 45 | N_, S_, M_, D_ = value.shape 46 | _, Lq_, M_, L_, P_, _ = sampling_locations.shape 47 | value_list = value.split([H_ * W_ for H_, W_ in value_spatial_shapes], dim=1) 48 | sampling_grids = 2 * sampling_locations - 1 49 | sampling_value_list = [] 50 | for lid_, (H_, W_) in enumerate(value_spatial_shapes): 51 | # N_, H_*W_, M_, D_ -> N_, H_*W_, M_*D_ -> N_, M_*D_, H_*W_ -> N_*M_, D_, H_, W_ 52 | value_l_ = value_list[lid_].flatten(2).transpose(1, 2).reshape(N_*M_, D_, H_, W_) 53 | # N_, Lq_, M_, P_, 2 -> N_, M_, Lq_, P_, 2 -> N_*M_, Lq_, P_, 2 54 | sampling_grid_l_ = sampling_grids[:, :, :, lid_].transpose(1, 2).flatten(0, 1) 55 | # N_*M_, D_, Lq_, P_ 56 | sampling_value_l_ = 
F.grid_sample(value_l_, sampling_grid_l_, 57 | mode='bilinear', padding_mode='zeros', align_corners=False) 58 | sampling_value_list.append(sampling_value_l_) 59 | # (N_, Lq_, M_, L_, P_) -> (N_, M_, Lq_, L_, P_) -> (N_, M_, 1, Lq_, L_*P_) 60 | attention_weights = attention_weights.transpose(1, 2).reshape(N_*M_, 1, Lq_, L_*P_) 61 | output = (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights).sum(-1).view(N_, M_*D_, Lq_) 62 | return output.transpose(1, 2).contiguous() 63 | -------------------------------------------------------------------------------- /models/ops/make.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # ------------------------------------------------------------------------------------------------ 3 | # Deformable DETR 4 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | # ------------------------------------------------------------------------------------------------ 7 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | # ------------------------------------------------------------------------------------------------ 9 | 10 | python3 setup.py build install --user 11 | -------------------------------------------------------------------------------- /models/ops/modules/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR) 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # ------------------------------------------------------------------------ 5 | # Modified from DETR (https://github.com/facebookresearch/detr) 6 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 7 | # ------------------------------------------------------------------------ 8 | 9 | from .ms_deform_attn import MSDeformAttn -------------------------------------------------------------------------------- /models/ops/setup.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | import os 10 | import glob 11 | 12 | import torch 13 | 14 | from torch.utils.cpp_extension import CUDA_HOME 15 | from torch.utils.cpp_extension import CppExtension 16 | from torch.utils.cpp_extension import CUDAExtension 17 | 18 | from setuptools import find_packages 19 | from setuptools import setup 20 | 21 | requirements = ["torch", "torchvision"] 22 | 23 | def get_extensions(): 24 | this_dir = os.path.dirname(os.path.abspath(__file__)) 25 | extensions_dir = os.path.join(this_dir, "src") 26 | 27 | main_file = glob.glob(os.path.join(extensions_dir, "*.cpp")) 28 | source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp")) 29 | source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu")) 30 | 31 | sources = main_file + source_cpu 32 | extension = CppExtension 33 | extra_compile_args = {"cxx": []} 34 | define_macros = [] 35 | 36 | if torch.cuda.is_available() and CUDA_HOME is not None: 37 | extension = CUDAExtension 38 | sources += source_cuda 39 | define_macros += [("WITH_CUDA", None)] 40 | extra_compile_args["nvcc"] = [ 41 | "-DCUDA_HAS_FP16=1", 42 | "-D__CUDA_NO_HALF_OPERATORS__", 43 | "-D__CUDA_NO_HALF_CONVERSIONS__", 44 | "-D__CUDA_NO_HALF2_OPERATORS__", 45 | ] 46 | else: 47 | raise NotImplementedError('Cuda is not availabel') 48 | 49 | sources = [os.path.join(extensions_dir, s) for s in sources] 50 | include_dirs = [extensions_dir] 51 | ext_modules = [ 52 | extension( 53 | "MultiScaleDeformableAttention", 54 | sources, 55 | include_dirs=include_dirs, 56 | define_macros=define_macros, 57 | extra_compile_args=extra_compile_args, 58 | ) 59 | ] 60 | return ext_modules 61 | 62 | setup( 63 | name="MultiScaleDeformableAttention", 64 | version="1.0", 65 | author="Weijie Su", 66 | url="https://github.com/fundamentalvision/Deformable-DETR", 67 | description="PyTorch Wrapper for CUDA Functions of Multi-Scale Deformable Attention", 68 | packages=find_packages(exclude=("configs", "tests",)), 69 | ext_modules=get_extensions(), 70 | cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension}, 71 | ) 72 | -------------------------------------------------------------------------------- /models/ops/src/cpu/ms_deform_attn_cpu.cpp: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | #include <vector> 12 | 13 | #include <ATen/ATen.h> 14 | #include <ATen/cuda/CUDAContext.h> 15 | 16 | 17 | at::Tensor 18 | ms_deform_attn_cpu_forward( 19 | const at::Tensor &value, 20 | const at::Tensor &spatial_shapes, 21 | const at::Tensor &level_start_index, 22 | const at::Tensor &sampling_loc, 23 | const at::Tensor &attn_weight, 24 | const int im2col_step) 25 | { 26 | AT_ERROR("Not implemented on the CPU"); 27 | } 28 | 29 | std::vector<at::Tensor> 30 | ms_deform_attn_cpu_backward( 31 | const at::Tensor &value, 32 | const at::Tensor &spatial_shapes, 33 | const at::Tensor &level_start_index, 34 | const at::Tensor &sampling_loc, 35 | const at::Tensor &attn_weight, 36 | const at::Tensor &grad_output, 37 | const int im2col_step) 38 | { 39 | AT_ERROR("Not implemented on the CPU"); 40 | } 41 | 42 | -------------------------------------------------------------------------------- /models/ops/src/cpu/ms_deform_attn_cpu.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | #pragma once 12 | #include <torch/extension.h> 13 | 14 | at::Tensor 15 | ms_deform_attn_cpu_forward( 16 | const at::Tensor &value, 17 | const at::Tensor &spatial_shapes, 18 | const at::Tensor &level_start_index, 19 | const at::Tensor &sampling_loc, 20 | const at::Tensor &attn_weight, 21 | const int im2col_step); 22 | 23 | std::vector<at::Tensor> 24 | ms_deform_attn_cpu_backward( 25 | const at::Tensor &value, 26 | const at::Tensor &spatial_shapes, 27 | const at::Tensor &level_start_index, 28 | const at::Tensor &sampling_loc, 29 | const at::Tensor &attn_weight, 30 | const at::Tensor &grad_output, 31 | const int im2col_step); 32 | 33 | 34 | -------------------------------------------------------------------------------- /models/ops/src/cuda/ms_deform_attn_cuda.cu: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved.
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | #include 12 | #include "cuda/ms_deform_im2col_cuda.cuh" 13 | 14 | #include 15 | #include 16 | #include 17 | #include 18 | 19 | 20 | at::Tensor ms_deform_attn_cuda_forward( 21 | const at::Tensor &value, 22 | const at::Tensor &spatial_shapes, 23 | const at::Tensor &level_start_index, 24 | const at::Tensor &sampling_loc, 25 | const at::Tensor &attn_weight, 26 | const int im2col_step) 27 | { 28 | AT_ASSERTM(value.is_contiguous(), "value tensor has to be contiguous"); 29 | AT_ASSERTM(spatial_shapes.is_contiguous(), "spatial_shapes tensor has to be contiguous"); 30 | AT_ASSERTM(level_start_index.is_contiguous(), "level_start_index tensor has to be contiguous"); 31 | AT_ASSERTM(sampling_loc.is_contiguous(), "sampling_loc tensor has to be contiguous"); 32 | AT_ASSERTM(attn_weight.is_contiguous(), "attn_weight tensor has to be contiguous"); 33 | 34 | AT_ASSERTM(value.type().is_cuda(), "value must be a CUDA tensor"); 35 | AT_ASSERTM(spatial_shapes.type().is_cuda(), "spatial_shapes must be a CUDA tensor"); 36 | AT_ASSERTM(level_start_index.type().is_cuda(), "level_start_index must be a CUDA tensor"); 37 | AT_ASSERTM(sampling_loc.type().is_cuda(), "sampling_loc must be a CUDA tensor"); 38 | AT_ASSERTM(attn_weight.type().is_cuda(), "attn_weight must be a CUDA tensor"); 39 | 40 | const int batch = value.size(0); 41 | const int spatial_size = value.size(1); 42 | const int num_heads = value.size(2); 43 | const int channels = value.size(3); 44 | 45 | const int num_levels = spatial_shapes.size(0); 46 | 47 | const int num_query = sampling_loc.size(1); 48 | const int num_point = sampling_loc.size(4); 49 | 50 | const int im2col_step_ = std::min(batch, im2col_step); 51 | 52 | AT_ASSERTM(batch % im2col_step_ == 0, "batch(%d) must divide im2col_step(%d)", batch, im2col_step_); 53 | 54 | auto output = at::zeros({batch, num_query, num_heads, channels}, value.options()); 55 | 56 | const int batch_n = im2col_step_; 57 | auto output_n = output.view({batch/im2col_step_, batch_n, num_query, num_heads, channels}); 58 | auto per_value_size = spatial_size * num_heads * channels; 59 | auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2; 60 | auto per_attn_weight_size = num_query * num_heads * num_levels * num_point; 61 | for (int n = 0; n < batch/im2col_step_; ++n) 62 | { 63 | auto columns = output_n.select(0, n); 64 | AT_DISPATCH_FLOATING_TYPES(value.type(), "ms_deform_attn_forward_cuda", ([&] { 65 | ms_deformable_im2col_cuda(at::cuda::getCurrentCUDAStream(), 66 | value.data() + n * im2col_step_ * per_value_size, 67 | spatial_shapes.data(), 68 | level_start_index.data(), 69 | sampling_loc.data() + n * im2col_step_ * per_sample_loc_size, 70 | attn_weight.data() + n * im2col_step_ * per_attn_weight_size, 71 | batch_n, spatial_size, num_heads, channels, num_levels, num_query, num_point, 72 | columns.data()); 73 | 74 | })); 75 | } 76 | 77 | output = output.view({batch, num_query, num_heads*channels}); 78 | 79 | return output; 80 | } 81 | 82 | 83 | std::vector ms_deform_attn_cuda_backward( 84 | const at::Tensor &value, 85 | const at::Tensor &spatial_shapes, 86 | const at::Tensor &level_start_index, 87 | 
const at::Tensor &sampling_loc, 88 | const at::Tensor &attn_weight, 89 | const at::Tensor &grad_output, 90 | const int im2col_step) 91 | { 92 | 93 | AT_ASSERTM(value.is_contiguous(), "value tensor has to be contiguous"); 94 | AT_ASSERTM(spatial_shapes.is_contiguous(), "spatial_shapes tensor has to be contiguous"); 95 | AT_ASSERTM(level_start_index.is_contiguous(), "level_start_index tensor has to be contiguous"); 96 | AT_ASSERTM(sampling_loc.is_contiguous(), "sampling_loc tensor has to be contiguous"); 97 | AT_ASSERTM(attn_weight.is_contiguous(), "attn_weight tensor has to be contiguous"); 98 | AT_ASSERTM(grad_output.is_contiguous(), "grad_output tensor has to be contiguous"); 99 | 100 | AT_ASSERTM(value.type().is_cuda(), "value must be a CUDA tensor"); 101 | AT_ASSERTM(spatial_shapes.type().is_cuda(), "spatial_shapes must be a CUDA tensor"); 102 | AT_ASSERTM(level_start_index.type().is_cuda(), "level_start_index must be a CUDA tensor"); 103 | AT_ASSERTM(sampling_loc.type().is_cuda(), "sampling_loc must be a CUDA tensor"); 104 | AT_ASSERTM(attn_weight.type().is_cuda(), "attn_weight must be a CUDA tensor"); 105 | AT_ASSERTM(grad_output.type().is_cuda(), "grad_output must be a CUDA tensor"); 106 | 107 | const int batch = value.size(0); 108 | const int spatial_size = value.size(1); 109 | const int num_heads = value.size(2); 110 | const int channels = value.size(3); 111 | 112 | const int num_levels = spatial_shapes.size(0); 113 | 114 | const int num_query = sampling_loc.size(1); 115 | const int num_point = sampling_loc.size(4); 116 | 117 | const int im2col_step_ = std::min(batch, im2col_step); 118 | 119 | AT_ASSERTM(batch % im2col_step_ == 0, "batch(%d) must divide im2col_step(%d)", batch, im2col_step_); 120 | 121 | auto grad_value = at::zeros_like(value); 122 | auto grad_sampling_loc = at::zeros_like(sampling_loc); 123 | auto grad_attn_weight = at::zeros_like(attn_weight); 124 | 125 | const int batch_n = im2col_step_; 126 | auto per_value_size = spatial_size * num_heads * channels; 127 | auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2; 128 | auto per_attn_weight_size = num_query * num_heads * num_levels * num_point; 129 | auto grad_output_n = grad_output.view({batch/im2col_step_, batch_n, num_query, num_heads, channels}); 130 | 131 | for (int n = 0; n < batch/im2col_step_; ++n) 132 | { 133 | auto grad_output_g = grad_output_n.select(0, n); 134 | AT_DISPATCH_FLOATING_TYPES(value.type(), "ms_deform_attn_backward_cuda", ([&] { 135 | ms_deformable_col2im_cuda(at::cuda::getCurrentCUDAStream(), 136 | grad_output_g.data(), 137 | value.data() + n * im2col_step_ * per_value_size, 138 | spatial_shapes.data(), 139 | level_start_index.data(), 140 | sampling_loc.data() + n * im2col_step_ * per_sample_loc_size, 141 | attn_weight.data() + n * im2col_step_ * per_attn_weight_size, 142 | batch_n, spatial_size, num_heads, channels, num_levels, num_query, num_point, 143 | grad_value.data() + n * im2col_step_ * per_value_size, 144 | grad_sampling_loc.data() + n * im2col_step_ * per_sample_loc_size, 145 | grad_attn_weight.data() + n * im2col_step_ * per_attn_weight_size); 146 | 147 | })); 148 | } 149 | 150 | return { 151 | grad_value, grad_sampling_loc, grad_attn_weight 152 | }; 153 | } -------------------------------------------------------------------------------- /models/ops/src/cuda/ms_deform_attn_cuda.h: -------------------------------------------------------------------------------- 1 | /*! 
2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | #pragma once 12 | #include 13 | 14 | at::Tensor ms_deform_attn_cuda_forward( 15 | const at::Tensor &value, 16 | const at::Tensor &spatial_shapes, 17 | const at::Tensor &level_start_index, 18 | const at::Tensor &sampling_loc, 19 | const at::Tensor &attn_weight, 20 | const int im2col_step); 21 | 22 | std::vector ms_deform_attn_cuda_backward( 23 | const at::Tensor &value, 24 | const at::Tensor &spatial_shapes, 25 | const at::Tensor &level_start_index, 26 | const at::Tensor &sampling_loc, 27 | const at::Tensor &attn_weight, 28 | const at::Tensor &grad_output, 29 | const int im2col_step); 30 | 31 | -------------------------------------------------------------------------------- /models/ops/src/ms_deform_attn.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | #pragma once 12 | 13 | #include "cpu/ms_deform_attn_cpu.h" 14 | 15 | #ifdef WITH_CUDA 16 | #include "cuda/ms_deform_attn_cuda.h" 17 | #endif 18 | 19 | 20 | at::Tensor 21 | ms_deform_attn_forward( 22 | const at::Tensor &value, 23 | const at::Tensor &spatial_shapes, 24 | const at::Tensor &level_start_index, 25 | const at::Tensor &sampling_loc, 26 | const at::Tensor &attn_weight, 27 | const int im2col_step) 28 | { 29 | if (value.type().is_cuda()) 30 | { 31 | #ifdef WITH_CUDA 32 | return ms_deform_attn_cuda_forward( 33 | value, spatial_shapes, level_start_index, sampling_loc, attn_weight, im2col_step); 34 | #else 35 | AT_ERROR("Not compiled with GPU support"); 36 | #endif 37 | } 38 | AT_ERROR("Not implemented on the CPU"); 39 | } 40 | 41 | std::vector 42 | ms_deform_attn_backward( 43 | const at::Tensor &value, 44 | const at::Tensor &spatial_shapes, 45 | const at::Tensor &level_start_index, 46 | const at::Tensor &sampling_loc, 47 | const at::Tensor &attn_weight, 48 | const at::Tensor &grad_output, 49 | const int im2col_step) 50 | { 51 | if (value.type().is_cuda()) 52 | { 53 | #ifdef WITH_CUDA 54 | return ms_deform_attn_cuda_backward( 55 | value, spatial_shapes, level_start_index, sampling_loc, attn_weight, grad_output, im2col_step); 56 | #else 57 | AT_ERROR("Not compiled with GPU support"); 58 | #endif 59 | } 60 | AT_ERROR("Not implemented on the CPU"); 61 | } 62 | 63 | -------------------------------------------------------------------------------- /models/ops/src/vision.cpp: 
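As an aside (not part of the source tree): once the extension has been built with models/ops/make.sh, the module name chosen in setup.py and the two symbols bound in vision.cpp below can be smoke-tested from Python.

import MultiScaleDeformableAttention as MSDA

# vision.cpp registers exactly these two ops via pybind11.
print(hasattr(MSDA, "ms_deform_attn_forward"))   # True
print(hasattr(MSDA, "ms_deform_attn_backward"))  # True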
-------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | #include "ms_deform_attn.h" 12 | 13 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 14 | m.def("ms_deform_attn_forward", &ms_deform_attn_forward, "ms_deform_attn_forward"); 15 | m.def("ms_deform_attn_backward", &ms_deform_attn_backward, "ms_deform_attn_backward"); 16 | } 17 | -------------------------------------------------------------------------------- /models/ops/test.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | from __future__ import absolute_import 10 | from __future__ import print_function 11 | from __future__ import division 12 | 13 | import time 14 | import torch 15 | import torch.nn as nn 16 | from torch.autograd import gradcheck 17 | 18 | from functions.ms_deform_attn_func import MSDeformAttnFunction, ms_deform_attn_core_pytorch 19 | 20 | 21 | N, M, D = 1, 2, 2 22 | Lq, L, P = 2, 2, 2 23 | shapes = torch.as_tensor([(6, 4), (3, 2)], dtype=torch.long).cuda() 24 | level_start_index = torch.cat((shapes.new_zeros((1, )), shapes.prod(1).cumsum(0)[:-1])) 25 | S = sum([(H*W).item() for H, W in shapes]) 26 | 27 | 28 | torch.manual_seed(3) 29 | 30 | 31 | @torch.no_grad() 32 | def check_forward_equal_with_pytorch_double(): 33 | value = torch.rand(N, S, M, D).cuda() * 0.01 34 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() 35 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 36 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) 37 | im2col_step = 2 38 | output_pytorch = ms_deform_attn_core_pytorch(value.double(), shapes, sampling_locations.double(), attention_weights.double()).detach().cpu() 39 | output_cuda = MSDeformAttnFunction.apply(value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step).detach().cpu() 40 | fwdok = torch.allclose(output_cuda, output_pytorch) 41 | max_abs_err = (output_cuda - output_pytorch).abs().max() 42 | max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max() 43 | 44 | print(f'* {fwdok} check_forward_equal_with_pytorch_double: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}') 45 | 46 | 47 | @torch.no_grad() 48 | def check_forward_equal_with_pytorch_float(): 49 | value = torch.rand(N, S, M, D).cuda() * 0.01 50 | sampling_locations = torch.rand(N, Lq, 
M, L, P, 2).cuda() 51 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 52 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) 53 | im2col_step = 2 54 | output_pytorch = ms_deform_attn_core_pytorch(value, shapes, sampling_locations, attention_weights).detach().cpu() 55 | output_cuda = MSDeformAttnFunction.apply(value, shapes, level_start_index, sampling_locations, attention_weights, im2col_step).detach().cpu() 56 | fwdok = torch.allclose(output_cuda, output_pytorch, rtol=1e-2, atol=1e-3) 57 | max_abs_err = (output_cuda - output_pytorch).abs().max() 58 | max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max() 59 | 60 | print(f'* {fwdok} check_forward_equal_with_pytorch_float: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}') 61 | 62 | 63 | def check_gradient_numerical(channels=4, grad_value=True, grad_sampling_loc=True, grad_attn_weight=True): 64 | 65 | value = torch.rand(N, S, M, channels).cuda() * 0.01 66 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() 67 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 68 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) 69 | im2col_step = 2 70 | func = MSDeformAttnFunction.apply 71 | 72 | value.requires_grad = grad_value 73 | sampling_locations.requires_grad = grad_sampling_loc 74 | attention_weights.requires_grad = grad_attn_weight 75 | 76 | gradok = gradcheck(func, (value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step)) 77 | 78 | print(f'* {gradok} check_gradient_numerical(D={channels})') 79 | 80 | 81 | if __name__ == '__main__': 82 | check_forward_equal_with_pytorch_double() 83 | check_forward_equal_with_pytorch_float() 84 | 85 | for channels in [30, 32, 64, 71, 1025, 2048, 3096]: 86 | check_gradient_numerical(channels, True, True, True) 87 | 88 | 89 | 90 | -------------------------------------------------------------------------------- /models/position_encoding.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR) 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # ------------------------------------------------------------------------ 5 | # Modified from DETR (https://github.com/facebookresearch/detr) 6 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 7 | # ------------------------------------------------------------------------ 8 | 9 | 10 | """ 11 | Various positional encodings for the transformer. 12 | """ 13 | import math 14 | import torch 15 | from torch import nn 16 | 17 | from util.misc import NestedTensor 18 | import numpy as np 19 | 20 | 21 | class PositionalEncoding3D(nn.Module): 22 | def __init__(self, channels): 23 | """ 24 | :param channels: The last dimension of the tensor you want to apply pos emb to. 25 | """ 26 | super(PositionalEncoding3D, self).__init__() 27 | channels = int(np.ceil(channels/6)*2) 28 | if channels % 2: 29 | channels += 1 30 | self.channels = channels 31 | inv_freq = 1. 
/ (10000 ** (torch.arange(0, channels, 2).float() / channels)) 32 | self.register_buffer('inv_freq', inv_freq) 33 | 34 | def forward(self, tensor): 35 | """ 36 | :param tensor: A 5d tensor of size (batch_size, x, y, z, ch) 37 | :return: Positional Encoding Matrix of size (batch_size, x, y, z, ch) 38 | """ 39 | if len(tensor.shape) != 5: 40 | raise RuntimeError("The input tensor has to be 5d!") 41 | batch_size, x, y, z, orig_ch = tensor.shape 42 | # print('tensor.shape shape:', tensor.shape) 43 | pos_x = torch.arange(x, device=tensor.device).type(self.inv_freq.type()) 44 | pos_y = torch.arange(y, device=tensor.device).type(self.inv_freq.type()) 45 | pos_z = torch.arange(z, device=tensor.device).type(self.inv_freq.type()) 46 | sin_inp_x = torch.einsum("i,j->ij", pos_x, self.inv_freq) 47 | sin_inp_y = torch.einsum("i,j->ij", pos_y, self.inv_freq) 48 | sin_inp_z = torch.einsum("i,j->ij", pos_z, self.inv_freq) 49 | emb_x = torch.cat((sin_inp_x.sin(), sin_inp_x.cos()), dim=-1).unsqueeze(1).unsqueeze(1) 50 | emb_y = torch.cat((sin_inp_y.sin(), sin_inp_y.cos()), dim=-1).unsqueeze(1) 51 | emb_z = torch.cat((sin_inp_z.sin(), sin_inp_z.cos()), dim=-1) 52 | emb = torch.zeros((x,y,z,self.channels*3),device=tensor.device).type(tensor.type()) 53 | emb[:,:,:,:self.channels] = emb_x 54 | emb[:,:,:,self.channels:2*self.channels] = emb_y 55 | emb[:,:,:,2*self.channels:] = emb_z 56 | 57 | return emb[None,:,:,:,:orig_ch].repeat(batch_size, 1, 1, 1, 1) 58 | 59 | 60 | class PositionEmbeddingSine(nn.Module): 61 | """ 62 | This is a more standard version of the position embedding, very similar to the one 63 | used by the Attention is all you need paper, generalized to work on images. 64 | """ 65 | def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None): 66 | super().__init__() 67 | self.num_pos_feats = num_pos_feats 68 | self.temperature = temperature 69 | self.normalize = normalize 70 | if scale is not None and normalize is False: 71 | raise ValueError("normalize should be True if scale is passed") 72 | if scale is None: 73 | scale = 2 * math.pi 74 | self.scale = scale 75 | 76 | def forward(self, tensor_list: NestedTensor): 77 | x = tensor_list.tensors 78 | mask = tensor_list.mask 79 | assert mask is not None 80 | not_mask = ~mask 81 | y_embed = not_mask.cumsum(1, dtype=torch.float32) 82 | x_embed = not_mask.cumsum(2, dtype=torch.float32) 83 | if self.normalize: 84 | eps = 1e-6 85 | y_embed = (y_embed - 0.5) / (y_embed[:, -1:, :] + eps) * self.scale 86 | x_embed = (x_embed - 0.5) / (x_embed[:, :, -1:] + eps) * self.scale 87 | 88 | dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device) 89 | dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats) 90 | 91 | pos_x = x_embed[:, :, :, None] / dim_t 92 | pos_y = y_embed[:, :, :, None] / dim_t 93 | pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3) 94 | pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3) 95 | pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) 96 | return pos 97 | 98 | 99 | class PositionEmbeddingLearned(nn.Module): 100 | """ 101 | Absolute pos embedding, learned. 
102 | """ 103 | def __init__(self, num_pos_feats=256): 104 | super().__init__() 105 | self.row_embed = nn.Embedding(50, num_pos_feats) 106 | self.col_embed = nn.Embedding(50, num_pos_feats) 107 | self.reset_parameters() 108 | 109 | def reset_parameters(self): 110 | nn.init.uniform_(self.row_embed.weight) 111 | nn.init.uniform_(self.col_embed.weight) 112 | 113 | def forward(self, tensor_list: NestedTensor): 114 | x = tensor_list.tensors 115 | h, w = x.shape[-2:] 116 | i = torch.arange(w, device=x.device) 117 | j = torch.arange(h, device=x.device) 118 | x_emb = self.col_embed(i) 119 | y_emb = self.row_embed(j) 120 | pos = torch.cat([ 121 | x_emb.unsqueeze(0).repeat(h, 1, 1), 122 | y_emb.unsqueeze(1).repeat(1, w, 1), 123 | ], dim=-1).permute(2, 0, 1).unsqueeze(0).repeat(x.shape[0], 1, 1, 1) 124 | return pos 125 | 126 | 127 | def build_position_encoding(args): 128 | N_steps = args.hidden_dim // 2 129 | if args.position_embedding in ('v2', 'sine'): 130 | # TODO find a better way of exposing other arguments 131 | position_embedding = PositionEmbeddingSine(N_steps, normalize=True) 132 | elif args.position_embedding in ('v3', 'learned'): 133 | position_embedding = PositionEmbeddingLearned(N_steps) 134 | else: 135 | raise ValueError(f"not supported {args.position_embedding}") 136 | 137 | return position_embedding 138 | -------------------------------------------------------------------------------- /models/x101_64d.py: -------------------------------------------------------------------------------- 1 | from torchvision.models.resnet import _resnet, Bottleneck 2 | 3 | 4 | def resnext101_64x4d(pretrained=False, progress=True, **kwargs): 5 | r"""ResNeXt-101 64*4d model from 6 | `"Aggregated Residual Transformation for Deep Neural Networks" `_ 7 | 8 | Args: 9 | pretrained (bool): If True, returns a model pre-trained on ImageNet 10 | progress (bool): If True, displays a progress bar of the download to stderr 11 | """ 12 | kwargs['groups'] = 64 13 | kwargs['width_per_group'] = 4 14 | return _resnet('resnext101_64x4d', Bottleneck, [3, 4, 23, 3], 15 | False, progress, **kwargs) 16 | -------------------------------------------------------------------------------- /models_swin/__init__.py: -------------------------------------------------------------------------------- 1 | from .vmt import build 2 | 3 | 4 | def build_model(args): 5 | return build(args) 6 | 7 | -------------------------------------------------------------------------------- /models_swin/backbone.py: -------------------------------------------------------------------------------- 1 | """ 2 | Backbone modules. 3 | """ 4 | from collections import OrderedDict 5 | 6 | import torch 7 | import torch.nn.functional as F 8 | import torchvision 9 | from torch import nn 10 | from torchvision.models._utils import IntermediateLayerGetter 11 | from typing import Dict, List 12 | 13 | from util.misc import NestedTensor, is_main_process 14 | 15 | from .position_encoding import build_position_encoding 16 | from .x101_64d import resnext101_64x4d 17 | 18 | class FrozenBatchNorm2d(torch.nn.Module): 19 | """ 20 | BatchNorm2d where the batch statistics and the affine parameters are fixed. 21 | 22 | Copy-paste from torchvision.misc.ops with added eps before rqsrt, 23 | without which any other models than torchvision.models.resnet[18,34,50,101] 24 | produce nans. 
25 | """ 26 | 27 | def __init__(self, n, eps=1e-5): 28 | super(FrozenBatchNorm2d, self).__init__() 29 | self.register_buffer("weight", torch.ones(n)) 30 | self.register_buffer("bias", torch.zeros(n)) 31 | self.register_buffer("running_mean", torch.zeros(n)) 32 | self.register_buffer("running_var", torch.ones(n)) 33 | self.eps = eps 34 | 35 | def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, 36 | missing_keys, unexpected_keys, error_msgs): 37 | num_batches_tracked_key = prefix + 'num_batches_tracked' 38 | if num_batches_tracked_key in state_dict: 39 | del state_dict[num_batches_tracked_key] 40 | 41 | super(FrozenBatchNorm2d, self)._load_from_state_dict( 42 | state_dict, prefix, local_metadata, strict, 43 | missing_keys, unexpected_keys, error_msgs) 44 | 45 | def forward(self, x): 46 | # move reshapes to the beginning 47 | # to make it fuser-friendly 48 | w = self.weight.reshape(1, -1, 1, 1) 49 | b = self.bias.reshape(1, -1, 1, 1) 50 | rv = self.running_var.reshape(1, -1, 1, 1) 51 | rm = self.running_mean.reshape(1, -1, 1, 1) 52 | eps = self.eps 53 | scale = w * (rv + eps).rsqrt() 54 | bias = b - rm * scale 55 | return x * scale + bias 56 | 57 | 58 | class BackboneBase(nn.Module): 59 | 60 | def __init__(self, backbone: nn.Module, train_backbone: bool, return_interm_layers: bool): 61 | super().__init__() 62 | for name, parameter in backbone.named_parameters(): 63 | if not train_backbone or 'layer2' not in name and 'layer3' not in name and 'layer4' not in name: 64 | parameter.requires_grad_(False) 65 | if return_interm_layers: 66 | return_layers = {"layer1": "0", "layer2": "1", "layer3": "2", "layer4": "3", "conv1": "4"} 67 | self.strides = [8, 16, 32] 68 | self.num_channels = [512, 1024, 2048] 69 | else: 70 | return_layers = {'layer4': "0"} 71 | self.strides = [32] 72 | self.num_channels = [2048] 73 | self.body = IntermediateLayerGetter(backbone, return_layers=return_layers) 74 | 75 | def forward(self, tensor_list: NestedTensor): 76 | xs = self.body(tensor_list.tensors) 77 | out: Dict[str, NestedTensor] = {} 78 | for name, x in xs.items(): 79 | m = tensor_list.mask 80 | assert m is not None 81 | mask = F.interpolate(m[None].float(), size=x.shape[-2:]).to(torch.bool)[0] 82 | out[name] = NestedTensor(x, mask) 83 | return out 84 | 85 | 86 | class Backbone(BackboneBase): 87 | """ResNet backbone with frozen BatchNorm.""" 88 | def __init__(self, name: str, 89 | train_backbone: bool, 90 | return_interm_layers: bool, 91 | dilation: bool): 92 | norm_layer = FrozenBatchNorm2d 93 | if name == 'resnext101_64x4d': 94 | backbone = resnext101_64x4d(replace_stride_with_dilation=[False, False, dilation], 95 | pretrained=is_main_process(), norm_layer=norm_layer) 96 | else: 97 | backbone = getattr(torchvision.models, name)( 98 | replace_stride_with_dilation=[False, False, dilation], 99 | pretrained=is_main_process(), norm_layer=norm_layer) #pretrained=is_main_process() 100 | assert name not in ('resnet18', 'resnet34'), "number of channels are hard coded" 101 | super().__init__(backbone, train_backbone, return_interm_layers) 102 | if dilation: 103 | self.strides[-1] = self.strides[-1] // 2 104 | 105 | 106 | class Joiner(nn.Sequential): 107 | def __init__(self, backbone, position_embedding): 108 | super().__init__(backbone, position_embedding) 109 | self.strides = backbone.strides 110 | self.num_channels = backbone.num_channels 111 | 112 | def forward(self, tensor_list: NestedTensor): 113 | xs = self[0](tensor_list) 114 | out: List[NestedTensor] = [] 115 | pos = [] 116 | for name, x in 
sorted(xs.items()): 117 | out.append(x) 118 | 119 | # position encoding 120 | for x in out: 121 | pos.append(self[1](x).to(x.tensors.dtype)) 122 | 123 | return out, pos 124 | 125 | 126 | def build_backbone(args): 127 | position_embedding = build_position_encoding(args) 128 | train_backbone = args.lr_backbone > 0 129 | return_interm_layers = args.masks or (args.num_feature_levels > 1) 130 | backbone = Backbone(args.backbone, train_backbone, return_interm_layers, args.dilation) 131 | model = Joiner(backbone, position_embedding) 132 | return model 133 | 134 | -------------------------------------------------------------------------------- /models_swin/matcher.py: -------------------------------------------------------------------------------- 1 | """ 2 | Modules to compute the matching cost and solve the corresponding LSAP. 3 | """ 4 | import torch 5 | from scipy.optimize import linear_sum_assignment 6 | from torch import nn 7 | import torch.nn.functional as F 8 | 9 | from util.box_ops import box_cxcywh_to_xyxy, generalized_box_iou, generalized_multi_box_iou 10 | 11 | 12 | class HungarianMatcher(nn.Module): 13 | 14 | 15 | def __init__(self, 16 | multi_frame: bool, 17 | cost_class: float = 1, 18 | cost_bbox: float = 1, 19 | cost_giou: float = 1, 20 | cost_mask: float = 1): 21 | """Creates the matcher 22 | Params: 23 | cost_class: This is the relative weight of the classification error in the matching cost 24 | cost_bbox: This is the relative weight of the L1 error of the bounding box coordinates in the matching cost 25 | cost_giou: This is the relative weight of the giou loss of the bounding box in the matching cost 26 | """ 27 | super().__init__() 28 | self.multi_frame = multi_frame 29 | self.cost_class = cost_class 30 | self.cost_bbox = cost_bbox 31 | self.cost_giou = cost_giou 32 | self.cost_mask = cost_mask 33 | assert cost_class != 0 or cost_bbox != 0 or cost_giou != 0 or cost_mask != 0, "all costs cant be 0" 34 | 35 | 36 | def forward(self, outputs, targets, nf, valid_ratios): 37 | 38 | with torch.no_grad(): 39 | bs, num_queries = outputs["pred_logits"].shape[:2] 40 | 41 | # We flatten to compute the cost matrices in a batch 42 | out_prob = outputs["pred_logits"].flatten(0, 1).sigmoid() 43 | # Also concat the target labels and boxes 44 | tgt_ids = torch.cat([v["labels"] for v in targets]) 45 | tgt_bbox = torch.cat([v["boxes"] for v in targets]) 46 | num_insts = len(tgt_ids) 47 | 48 | 49 | out_bbox = outputs["pred_boxes"].permute(0,2,1,3).flatten(0, 1) # [batch_size * num_queries,nf, 4] 50 | num_insts = len(tgt_ids) 51 | tgt_bbox = tgt_bbox.reshape(num_insts,nf,4) 52 | 53 | cost_bbox = torch.cdist(out_bbox.flatten(1,2), tgt_bbox.flatten(1,2)) 54 | cost_giou = 0 55 | for i in range(nf): 56 | cost_giou += -generalized_box_iou(box_cxcywh_to_xyxy(out_bbox[:,i]), 57 | box_cxcywh_to_xyxy(tgt_bbox[:,i])) 58 | cost_giou = cost_giou/nf 59 | 60 | # Compute the classification cost. 
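# The classification term below mirrors the sigmoid focal loss used at training time
# (alpha = 0.25, gamma = 2.0): pos_cost_class is the focal-weighted cost of predicting a
# target's class, neg_cost_class the cost of not predicting it, and their difference,
# gathered at the target class ids (tgt_ids), becomes the class component of the cost matrix.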
61 | alpha = 0.25 62 | gamma = 2.0 63 | neg_cost_class = (1 - alpha) * (out_prob ** gamma) * (-(1 - out_prob + 1e-8).log()) 64 | pos_cost_class = alpha * ((1 - out_prob) ** gamma) * (-(out_prob + 1e-8).log()) 65 | cost_class = pos_cost_class[:, tgt_ids] - neg_cost_class[:, tgt_ids] 66 | 67 | # Final cost matrix 68 | C = self.cost_bbox * cost_bbox + self.cost_class * cost_class + self.cost_giou * cost_giou 69 | C = C.view(bs, num_queries, -1).cpu() 70 | 71 | sizes = [len(v["labels"]) for v in targets] 72 | indices = [linear_sum_assignment(c[i]) for i, c in enumerate(C.split(sizes, -1))] 73 | return [(torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) for i, j in indices] 74 | 75 | 76 | def build_matcher(args): 77 | # output single frame, multi frame 78 | return HungarianMatcher(multi_frame=True, # True, False 79 | cost_class=args.set_cost_class, 80 | cost_bbox=args.set_cost_bbox, 81 | cost_giou=args.set_cost_giou) 82 | 83 | 84 | -------------------------------------------------------------------------------- /models_swin/ops/MultiScaleDeformableAttention.egg-info/PKG-INFO: -------------------------------------------------------------------------------- 1 | Metadata-Version: 1.0 2 | Name: MultiScaleDeformableAttention 3 | Version: 1.0 4 | Summary: PyTorch Wrapper for CUDA Functions of Multi-Scale Deformable Attention 5 | Home-page: https://github.com/fundamentalvision/Deformable-DETR 6 | Author: Weijie Su 7 | Author-email: UNKNOWN 8 | License: UNKNOWN 9 | Description: UNKNOWN 10 | Platform: UNKNOWN 11 | -------------------------------------------------------------------------------- /models_swin/ops/MultiScaleDeformableAttention.egg-info/SOURCES.txt: -------------------------------------------------------------------------------- 1 | setup.py 2 | MultiScaleDeformableAttention.egg-info/PKG-INFO 3 | MultiScaleDeformableAttention.egg-info/SOURCES.txt 4 | MultiScaleDeformableAttention.egg-info/dependency_links.txt 5 | MultiScaleDeformableAttention.egg-info/top_level.txt 6 | functions/__init__.py 7 | functions/ms_deform_attn_func.py 8 | modules/__init__.py 9 | modules/ms_deform_attn.py -------------------------------------------------------------------------------- /models_swin/ops/MultiScaleDeformableAttention.egg-info/dependency_links.txt: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /models_swin/ops/MultiScaleDeformableAttention.egg-info/top_level.txt: -------------------------------------------------------------------------------- 1 | MultiScaleDeformableAttention 2 | functions 3 | modules 4 | -------------------------------------------------------------------------------- /models_swin/ops/functions/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 
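Not from the repository: a toy sketch of the tensor layout the HungarianMatcher above expects for a clip of nf frames (boxes in normalized cxcywh). The module path and cost weights are illustrative, and it assumes the repo root (with its util package and scipy installed) is importable.

import torch
from models_swin.matcher import HungarianMatcher

bs, num_queries, num_classes, nf = 1, 5, 3, 2
outputs = {
    "pred_logits": torch.randn(bs, num_queries, num_classes),
    "pred_boxes": torch.rand(bs, nf, num_queries, 4),   # per-frame boxes for every query
}
targets = [{
    "labels": torch.tensor([0, 2]),          # two ground-truth instances in this clip
    "boxes": torch.rand(2, nf, 4),           # one box per instance per frame
}]

matcher = HungarianMatcher(multi_frame=True, cost_class=2, cost_bbox=5, cost_giou=2)
indices = matcher(outputs, targets, nf=nf, valid_ratios=None)
print(indices)   # one (query_indices, target_indices) pair per batch element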
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | from .ms_deform_attn_func import MSDeformAttnFunction 10 | 11 | -------------------------------------------------------------------------------- /models_swin/ops/functions/ms_deform_attn_func.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | from __future__ import absolute_import 10 | from __future__ import print_function 11 | from __future__ import division 12 | 13 | import torch 14 | import torch.nn.functional as F 15 | from torch.autograd import Function 16 | from torch.autograd.function import once_differentiable 17 | 18 | import MultiScaleDeformableAttention as MSDA 19 | 20 | 21 | class MSDeformAttnFunction(Function): 22 | @staticmethod 23 | def forward(ctx, value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, im2col_step): 24 | ctx.im2col_step = im2col_step 25 | output = MSDA.ms_deform_attn_forward( 26 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, ctx.im2col_step) 27 | ctx.save_for_backward(value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights) 28 | return output 29 | 30 | @staticmethod 31 | @once_differentiable 32 | def backward(ctx, grad_output): 33 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights = ctx.saved_tensors 34 | grad_output = grad_output.contiguous() 35 | grad_value, grad_sampling_loc, grad_attn_weight = \ 36 | MSDA.ms_deform_attn_backward( 37 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, grad_output, ctx.im2col_step) 38 | 39 | return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None 40 | 41 | 42 | def ms_deform_attn_core_pytorch(value, value_spatial_shapes, sampling_locations, attention_weights): 43 | # for debug and test only, 44 | # need to use cuda version instead 45 | N_, S_, M_, D_ = value.shape 46 | _, Lq_, M_, L_, P_, _ = sampling_locations.shape 47 | value_list = value.split([H_ * W_ for H_, W_ in value_spatial_shapes], dim=1) 48 | sampling_grids = 2 * sampling_locations - 1 49 | sampling_value_list = [] 50 | for lid_, (H_, W_) in enumerate(value_spatial_shapes): 51 | # N_, H_*W_, M_, D_ -> N_, H_*W_, M_*D_ -> N_, M_*D_, H_*W_ -> N_*M_, D_, H_, W_ 52 | value_l_ = value_list[lid_].flatten(2).transpose(1, 2).reshape(N_*M_, D_, H_, W_) 53 | # N_, Lq_, M_, P_, 2 -> N_, M_, Lq_, P_, 2 -> N_*M_, Lq_, P_, 2 54 | sampling_grid_l_ = sampling_grids[:, :, :, lid_].transpose(1, 2).flatten(0, 1) 55 | # N_*M_, D_, Lq_, P_ 56 | sampling_value_l_ = 
F.grid_sample(value_l_, sampling_grid_l_, 57 | mode='bilinear', padding_mode='zeros', align_corners=False) 58 | sampling_value_list.append(sampling_value_l_) 59 | # (N_, Lq_, M_, L_, P_) -> (N_, M_, Lq_, L_, P_) -> (N_, M_, 1, Lq_, L_*P_) 60 | attention_weights = attention_weights.transpose(1, 2).reshape(N_*M_, 1, Lq_, L_*P_) 61 | output = (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights).sum(-1).view(N_, M_*D_, Lq_) 62 | return output.transpose(1, 2).contiguous() 63 | -------------------------------------------------------------------------------- /models_swin/ops/make.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # ------------------------------------------------------------------------------------------------ 3 | # Deformable DETR 4 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | # ------------------------------------------------------------------------------------------------ 7 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | # ------------------------------------------------------------------------------------------------ 9 | 10 | python3 setup.py build install --user 11 | -------------------------------------------------------------------------------- /models_swin/ops/modules/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR) 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # ------------------------------------------------------------------------ 5 | # Modified from DETR (https://github.com/facebookresearch/detr) 6 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 7 | # ------------------------------------------------------------------------ 8 | 9 | from .ms_deform_attn import MSDeformAttn -------------------------------------------------------------------------------- /models_swin/ops/setup.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | import os 10 | import glob 11 | 12 | import torch 13 | 14 | from torch.utils.cpp_extension import CUDA_HOME 15 | from torch.utils.cpp_extension import CppExtension 16 | from torch.utils.cpp_extension import CUDAExtension 17 | 18 | from setuptools import find_packages 19 | from setuptools import setup 20 | 21 | requirements = ["torch", "torchvision"] 22 | 23 | def get_extensions(): 24 | this_dir = os.path.dirname(os.path.abspath(__file__)) 25 | extensions_dir = os.path.join(this_dir, "src") 26 | 27 | main_file = glob.glob(os.path.join(extensions_dir, "*.cpp")) 28 | source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp")) 29 | source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu")) 30 | 31 | sources = main_file + source_cpu 32 | extension = CppExtension 33 | extra_compile_args = {"cxx": []} 34 | define_macros = [] 35 | 36 | if torch.cuda.is_available() and CUDA_HOME is not None: 37 | extension = CUDAExtension 38 | sources += source_cuda 39 | define_macros += [("WITH_CUDA", None)] 40 | extra_compile_args["nvcc"] = [ 41 | "-DCUDA_HAS_FP16=1", 42 | "-D__CUDA_NO_HALF_OPERATORS__", 43 | "-D__CUDA_NO_HALF_CONVERSIONS__", 44 | "-D__CUDA_NO_HALF2_OPERATORS__", 45 | ] 46 | else: 47 | raise NotImplementedError('CUDA is not available') 48 | 49 | sources = [os.path.join(extensions_dir, s) for s in sources] 50 | include_dirs = [extensions_dir] 51 | ext_modules = [ 52 | extension( 53 | "MultiScaleDeformableAttention", 54 | sources, 55 | include_dirs=include_dirs, 56 | define_macros=define_macros, 57 | extra_compile_args=extra_compile_args, 58 | ) 59 | ] 60 | return ext_modules 61 | 62 | setup( 63 | name="MultiScaleDeformableAttention", 64 | version="1.0", 65 | author="Weijie Su", 66 | url="https://github.com/fundamentalvision/Deformable-DETR", 67 | description="PyTorch Wrapper for CUDA Functions of Multi-Scale Deformable Attention", 68 | packages=find_packages(exclude=("configs", "tests",)), 69 | ext_modules=get_extensions(), 70 | cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension}, 71 | ) 72 | -------------------------------------------------------------------------------- /models_swin/ops/src/cpu/ms_deform_attn_cpu.cpp: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved.
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | #include <vector> 12 | 13 | #include <ATen/ATen.h> 14 | #include <ATen/cuda/CUDAContext.h> 15 | 16 | 17 | at::Tensor 18 | ms_deform_attn_cpu_forward( 19 | const at::Tensor &value, 20 | const at::Tensor &spatial_shapes, 21 | const at::Tensor &level_start_index, 22 | const at::Tensor &sampling_loc, 23 | const at::Tensor &attn_weight, 24 | const int im2col_step) 25 | { 26 | AT_ERROR("Not implemented on the CPU"); 27 | } 28 | 29 | std::vector<at::Tensor> 30 | ms_deform_attn_cpu_backward( 31 | const at::Tensor &value, 32 | const at::Tensor &spatial_shapes, 33 | const at::Tensor &level_start_index, 34 | const at::Tensor &sampling_loc, 35 | const at::Tensor &attn_weight, 36 | const at::Tensor &grad_output, 37 | const int im2col_step) 38 | { 39 | AT_ERROR("Not implemented on the CPU"); 40 | } 41 | 42 | -------------------------------------------------------------------------------- /models_swin/ops/src/cpu/ms_deform_attn_cpu.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | #pragma once 12 | #include <torch/extension.h> 13 | 14 | at::Tensor 15 | ms_deform_attn_cpu_forward( 16 | const at::Tensor &value, 17 | const at::Tensor &spatial_shapes, 18 | const at::Tensor &level_start_index, 19 | const at::Tensor &sampling_loc, 20 | const at::Tensor &attn_weight, 21 | const int im2col_step); 22 | 23 | std::vector<at::Tensor> 24 | ms_deform_attn_cpu_backward( 25 | const at::Tensor &value, 26 | const at::Tensor &spatial_shapes, 27 | const at::Tensor &level_start_index, 28 | const at::Tensor &sampling_loc, 29 | const at::Tensor &attn_weight, 30 | const at::Tensor &grad_output, 31 | const int im2col_step); 32 | 33 | 34 | -------------------------------------------------------------------------------- /models_swin/ops/src/cuda/ms_deform_attn_cuda.cu: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved.
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | #include 12 | #include "cuda/ms_deform_im2col_cuda.cuh" 13 | 14 | #include 15 | #include 16 | #include 17 | #include 18 | 19 | 20 | at::Tensor ms_deform_attn_cuda_forward( 21 | const at::Tensor &value, 22 | const at::Tensor &spatial_shapes, 23 | const at::Tensor &level_start_index, 24 | const at::Tensor &sampling_loc, 25 | const at::Tensor &attn_weight, 26 | const int im2col_step) 27 | { 28 | AT_ASSERTM(value.is_contiguous(), "value tensor has to be contiguous"); 29 | AT_ASSERTM(spatial_shapes.is_contiguous(), "spatial_shapes tensor has to be contiguous"); 30 | AT_ASSERTM(level_start_index.is_contiguous(), "level_start_index tensor has to be contiguous"); 31 | AT_ASSERTM(sampling_loc.is_contiguous(), "sampling_loc tensor has to be contiguous"); 32 | AT_ASSERTM(attn_weight.is_contiguous(), "attn_weight tensor has to be contiguous"); 33 | 34 | AT_ASSERTM(value.type().is_cuda(), "value must be a CUDA tensor"); 35 | AT_ASSERTM(spatial_shapes.type().is_cuda(), "spatial_shapes must be a CUDA tensor"); 36 | AT_ASSERTM(level_start_index.type().is_cuda(), "level_start_index must be a CUDA tensor"); 37 | AT_ASSERTM(sampling_loc.type().is_cuda(), "sampling_loc must be a CUDA tensor"); 38 | AT_ASSERTM(attn_weight.type().is_cuda(), "attn_weight must be a CUDA tensor"); 39 | 40 | const int batch = value.size(0); 41 | const int spatial_size = value.size(1); 42 | const int num_heads = value.size(2); 43 | const int channels = value.size(3); 44 | 45 | const int num_levels = spatial_shapes.size(0); 46 | 47 | const int num_query = sampling_loc.size(1); 48 | const int num_point = sampling_loc.size(4); 49 | 50 | const int im2col_step_ = std::min(batch, im2col_step); 51 | 52 | AT_ASSERTM(batch % im2col_step_ == 0, "batch(%d) must divide im2col_step(%d)", batch, im2col_step_); 53 | 54 | auto output = at::zeros({batch, num_query, num_heads, channels}, value.options()); 55 | 56 | const int batch_n = im2col_step_; 57 | auto output_n = output.view({batch/im2col_step_, batch_n, num_query, num_heads, channels}); 58 | auto per_value_size = spatial_size * num_heads * channels; 59 | auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2; 60 | auto per_attn_weight_size = num_query * num_heads * num_levels * num_point; 61 | for (int n = 0; n < batch/im2col_step_; ++n) 62 | { 63 | auto columns = output_n.select(0, n); 64 | AT_DISPATCH_FLOATING_TYPES(value.type(), "ms_deform_attn_forward_cuda", ([&] { 65 | ms_deformable_im2col_cuda(at::cuda::getCurrentCUDAStream(), 66 | value.data() + n * im2col_step_ * per_value_size, 67 | spatial_shapes.data(), 68 | level_start_index.data(), 69 | sampling_loc.data() + n * im2col_step_ * per_sample_loc_size, 70 | attn_weight.data() + n * im2col_step_ * per_attn_weight_size, 71 | batch_n, spatial_size, num_heads, channels, num_levels, num_query, num_point, 72 | columns.data()); 73 | 74 | })); 75 | } 76 | 77 | output = output.view({batch, num_query, num_heads*channels}); 78 | 79 | return output; 80 | } 81 | 82 | 83 | std::vector ms_deform_attn_cuda_backward( 84 | const at::Tensor &value, 85 | const at::Tensor &spatial_shapes, 86 | const at::Tensor &level_start_index, 87 | 
const at::Tensor &sampling_loc, 88 | const at::Tensor &attn_weight, 89 | const at::Tensor &grad_output, 90 | const int im2col_step) 91 | { 92 | 93 | AT_ASSERTM(value.is_contiguous(), "value tensor has to be contiguous"); 94 | AT_ASSERTM(spatial_shapes.is_contiguous(), "spatial_shapes tensor has to be contiguous"); 95 | AT_ASSERTM(level_start_index.is_contiguous(), "level_start_index tensor has to be contiguous"); 96 | AT_ASSERTM(sampling_loc.is_contiguous(), "sampling_loc tensor has to be contiguous"); 97 | AT_ASSERTM(attn_weight.is_contiguous(), "attn_weight tensor has to be contiguous"); 98 | AT_ASSERTM(grad_output.is_contiguous(), "grad_output tensor has to be contiguous"); 99 | 100 | AT_ASSERTM(value.type().is_cuda(), "value must be a CUDA tensor"); 101 | AT_ASSERTM(spatial_shapes.type().is_cuda(), "spatial_shapes must be a CUDA tensor"); 102 | AT_ASSERTM(level_start_index.type().is_cuda(), "level_start_index must be a CUDA tensor"); 103 | AT_ASSERTM(sampling_loc.type().is_cuda(), "sampling_loc must be a CUDA tensor"); 104 | AT_ASSERTM(attn_weight.type().is_cuda(), "attn_weight must be a CUDA tensor"); 105 | AT_ASSERTM(grad_output.type().is_cuda(), "grad_output must be a CUDA tensor"); 106 | 107 | const int batch = value.size(0); 108 | const int spatial_size = value.size(1); 109 | const int num_heads = value.size(2); 110 | const int channels = value.size(3); 111 | 112 | const int num_levels = spatial_shapes.size(0); 113 | 114 | const int num_query = sampling_loc.size(1); 115 | const int num_point = sampling_loc.size(4); 116 | 117 | const int im2col_step_ = std::min(batch, im2col_step); 118 | 119 | AT_ASSERTM(batch % im2col_step_ == 0, "batch(%d) must divide im2col_step(%d)", batch, im2col_step_); 120 | 121 | auto grad_value = at::zeros_like(value); 122 | auto grad_sampling_loc = at::zeros_like(sampling_loc); 123 | auto grad_attn_weight = at::zeros_like(attn_weight); 124 | 125 | const int batch_n = im2col_step_; 126 | auto per_value_size = spatial_size * num_heads * channels; 127 | auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2; 128 | auto per_attn_weight_size = num_query * num_heads * num_levels * num_point; 129 | auto grad_output_n = grad_output.view({batch/im2col_step_, batch_n, num_query, num_heads, channels}); 130 | 131 | for (int n = 0; n < batch/im2col_step_; ++n) 132 | { 133 | auto grad_output_g = grad_output_n.select(0, n); 134 | AT_DISPATCH_FLOATING_TYPES(value.type(), "ms_deform_attn_backward_cuda", ([&] { 135 | ms_deformable_col2im_cuda(at::cuda::getCurrentCUDAStream(), 136 | grad_output_g.data(), 137 | value.data() + n * im2col_step_ * per_value_size, 138 | spatial_shapes.data(), 139 | level_start_index.data(), 140 | sampling_loc.data() + n * im2col_step_ * per_sample_loc_size, 141 | attn_weight.data() + n * im2col_step_ * per_attn_weight_size, 142 | batch_n, spatial_size, num_heads, channels, num_levels, num_query, num_point, 143 | grad_value.data() + n * im2col_step_ * per_value_size, 144 | grad_sampling_loc.data() + n * im2col_step_ * per_sample_loc_size, 145 | grad_attn_weight.data() + n * im2col_step_ * per_attn_weight_size); 146 | 147 | })); 148 | } 149 | 150 | return { 151 | grad_value, grad_sampling_loc, grad_attn_weight 152 | }; 153 | } -------------------------------------------------------------------------------- /models_swin/ops/src/cuda/ms_deform_attn_cuda.h: -------------------------------------------------------------------------------- 1 | /*! 
2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | #pragma once 12 | #include 13 | 14 | at::Tensor ms_deform_attn_cuda_forward( 15 | const at::Tensor &value, 16 | const at::Tensor &spatial_shapes, 17 | const at::Tensor &level_start_index, 18 | const at::Tensor &sampling_loc, 19 | const at::Tensor &attn_weight, 20 | const int im2col_step); 21 | 22 | std::vector ms_deform_attn_cuda_backward( 23 | const at::Tensor &value, 24 | const at::Tensor &spatial_shapes, 25 | const at::Tensor &level_start_index, 26 | const at::Tensor &sampling_loc, 27 | const at::Tensor &attn_weight, 28 | const at::Tensor &grad_output, 29 | const int im2col_step); 30 | 31 | -------------------------------------------------------------------------------- /models_swin/ops/src/ms_deform_attn.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | #pragma once 12 | 13 | #include "cpu/ms_deform_attn_cpu.h" 14 | 15 | #ifdef WITH_CUDA 16 | #include "cuda/ms_deform_attn_cuda.h" 17 | #endif 18 | 19 | 20 | at::Tensor 21 | ms_deform_attn_forward( 22 | const at::Tensor &value, 23 | const at::Tensor &spatial_shapes, 24 | const at::Tensor &level_start_index, 25 | const at::Tensor &sampling_loc, 26 | const at::Tensor &attn_weight, 27 | const int im2col_step) 28 | { 29 | if (value.type().is_cuda()) 30 | { 31 | #ifdef WITH_CUDA 32 | return ms_deform_attn_cuda_forward( 33 | value, spatial_shapes, level_start_index, sampling_loc, attn_weight, im2col_step); 34 | #else 35 | AT_ERROR("Not compiled with GPU support"); 36 | #endif 37 | } 38 | AT_ERROR("Not implemented on the CPU"); 39 | } 40 | 41 | std::vector 42 | ms_deform_attn_backward( 43 | const at::Tensor &value, 44 | const at::Tensor &spatial_shapes, 45 | const at::Tensor &level_start_index, 46 | const at::Tensor &sampling_loc, 47 | const at::Tensor &attn_weight, 48 | const at::Tensor &grad_output, 49 | const int im2col_step) 50 | { 51 | if (value.type().is_cuda()) 52 | { 53 | #ifdef WITH_CUDA 54 | return ms_deform_attn_cuda_backward( 55 | value, spatial_shapes, level_start_index, sampling_loc, attn_weight, grad_output, im2col_step); 56 | #else 57 | AT_ERROR("Not compiled with GPU support"); 58 | #endif 59 | } 60 | AT_ERROR("Not implemented on the CPU"); 61 | } 62 | 63 | -------------------------------------------------------------------------------- /models_swin/ops/src/vision.cpp: 
-------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | #include "ms_deform_attn.h" 12 | 13 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 14 | m.def("ms_deform_attn_forward", &ms_deform_attn_forward, "ms_deform_attn_forward"); 15 | m.def("ms_deform_attn_backward", &ms_deform_attn_backward, "ms_deform_attn_backward"); 16 | } 17 | -------------------------------------------------------------------------------- /models_swin/ops/test.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | from __future__ import absolute_import 10 | from __future__ import print_function 11 | from __future__ import division 12 | 13 | import time 14 | import torch 15 | import torch.nn as nn 16 | from torch.autograd import gradcheck 17 | 18 | from functions.ms_deform_attn_func import MSDeformAttnFunction, ms_deform_attn_core_pytorch 19 | 20 | 21 | N, M, D = 1, 2, 2 22 | Lq, L, P = 2, 2, 2 23 | shapes = torch.as_tensor([(6, 4), (3, 2)], dtype=torch.long).cuda() 24 | level_start_index = torch.cat((shapes.new_zeros((1, )), shapes.prod(1).cumsum(0)[:-1])) 25 | S = sum([(H*W).item() for H, W in shapes]) 26 | 27 | 28 | torch.manual_seed(3) 29 | 30 | 31 | @torch.no_grad() 32 | def check_forward_equal_with_pytorch_double(): 33 | value = torch.rand(N, S, M, D).cuda() * 0.01 34 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() 35 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 36 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) 37 | im2col_step = 2 38 | output_pytorch = ms_deform_attn_core_pytorch(value.double(), shapes, sampling_locations.double(), attention_weights.double()).detach().cpu() 39 | output_cuda = MSDeformAttnFunction.apply(value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step).detach().cpu() 40 | fwdok = torch.allclose(output_cuda, output_pytorch) 41 | max_abs_err = (output_cuda - output_pytorch).abs().max() 42 | max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max() 43 | 44 | print(f'* {fwdok} check_forward_equal_with_pytorch_double: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}') 45 | 46 | 47 | @torch.no_grad() 48 | def check_forward_equal_with_pytorch_float(): 49 | value = torch.rand(N, S, M, D).cuda() * 0.01 50 | sampling_locations = torch.rand(N, 
Lq, M, L, P, 2).cuda() 51 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 52 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) 53 | im2col_step = 2 54 | output_pytorch = ms_deform_attn_core_pytorch(value, shapes, sampling_locations, attention_weights).detach().cpu() 55 | output_cuda = MSDeformAttnFunction.apply(value, shapes, level_start_index, sampling_locations, attention_weights, im2col_step).detach().cpu() 56 | fwdok = torch.allclose(output_cuda, output_pytorch, rtol=1e-2, atol=1e-3) 57 | max_abs_err = (output_cuda - output_pytorch).abs().max() 58 | max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max() 59 | 60 | print(f'* {fwdok} check_forward_equal_with_pytorch_float: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}') 61 | 62 | 63 | def check_gradient_numerical(channels=4, grad_value=True, grad_sampling_loc=True, grad_attn_weight=True): 64 | 65 | value = torch.rand(N, S, M, channels).cuda() * 0.01 66 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() 67 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 68 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) 69 | im2col_step = 2 70 | func = MSDeformAttnFunction.apply 71 | 72 | value.requires_grad = grad_value 73 | sampling_locations.requires_grad = grad_sampling_loc 74 | attention_weights.requires_grad = grad_attn_weight 75 | 76 | gradok = gradcheck(func, (value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step)) 77 | 78 | print(f'* {gradok} check_gradient_numerical(D={channels})') 79 | 80 | 81 | if __name__ == '__main__': 82 | check_forward_equal_with_pytorch_double() 83 | check_forward_equal_with_pytorch_float() 84 | 85 | for channels in [30, 32, 64, 71, 1025, 2048, 3096]: 86 | check_gradient_numerical(channels, True, True, True) 87 | 88 | 89 | 90 | -------------------------------------------------------------------------------- /models_swin/position_encoding.py: -------------------------------------------------------------------------------- 1 | """ 2 | Various positional encodings for the transformer. 3 | """ 4 | import math 5 | import torch 6 | from torch import nn 7 | 8 | from util.misc import NestedTensor 9 | import numpy as np 10 | 11 | 12 | class PositionalEncoding3D(nn.Module): 13 | def __init__(self, channels): 14 | """ 15 | :param channels: The last dimension of the tensor you want to apply pos emb to. 16 | """ 17 | super(PositionalEncoding3D, self).__init__() 18 | channels = int(np.ceil(channels/6)*2) 19 | if channels % 2: 20 | channels += 1 21 | self.channels = channels 22 | inv_freq = 1. 
/ (10000 ** (torch.arange(0, channels, 2).float() / channels)) 23 | self.register_buffer('inv_freq', inv_freq) 24 | 25 | def forward(self, tensor): 26 | """ 27 | :param tensor: A 5d tensor of size (batch_size, x, y, z, ch) 28 | :return: Positional Encoding Matrix of size (batch_size, x, y, z, ch) 29 | """ 30 | if len(tensor.shape) != 5: 31 | raise RuntimeError("The input tensor has to be 5d!") 32 | batch_size, x, y, z, orig_ch = tensor.shape 33 | # print('tensor.shape shape:', tensor.shape) 34 | pos_x = torch.arange(x, device=tensor.device).type(self.inv_freq.type()) 35 | pos_y = torch.arange(y, device=tensor.device).type(self.inv_freq.type()) 36 | pos_z = torch.arange(z, device=tensor.device).type(self.inv_freq.type()) 37 | sin_inp_x = torch.einsum("i,j->ij", pos_x, self.inv_freq) 38 | sin_inp_y = torch.einsum("i,j->ij", pos_y, self.inv_freq) 39 | sin_inp_z = torch.einsum("i,j->ij", pos_z, self.inv_freq) 40 | emb_x = torch.cat((sin_inp_x.sin(), sin_inp_x.cos()), dim=-1).unsqueeze(1).unsqueeze(1) 41 | emb_y = torch.cat((sin_inp_y.sin(), sin_inp_y.cos()), dim=-1).unsqueeze(1) 42 | emb_z = torch.cat((sin_inp_z.sin(), sin_inp_z.cos()), dim=-1) 43 | emb = torch.zeros((x,y,z,self.channels*3),device=tensor.device).type(tensor.type()) 44 | emb[:,:,:,:self.channels] = emb_x 45 | emb[:,:,:,self.channels:2*self.channels] = emb_y 46 | emb[:,:,:,2*self.channels:] = emb_z 47 | 48 | return emb[None,:,:,:,:orig_ch].repeat(batch_size, 1, 1, 1, 1) 49 | 50 | 51 | class PositionEmbeddingSine(nn.Module): 52 | """ 53 | This is a more standard version of the position embedding, very similar to the one 54 | used by the Attention is all you need paper, generalized to work on images. 55 | """ 56 | def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None): 57 | super().__init__() 58 | self.num_pos_feats = num_pos_feats 59 | self.temperature = temperature 60 | self.normalize = normalize 61 | if scale is not None and normalize is False: 62 | raise ValueError("normalize should be True if scale is passed") 63 | if scale is None: 64 | scale = 2 * math.pi 65 | self.scale = scale 66 | 67 | def forward(self, tensor_list: NestedTensor): 68 | x = tensor_list.tensors 69 | mask = tensor_list.mask 70 | assert mask is not None 71 | not_mask = ~mask 72 | y_embed = not_mask.cumsum(1, dtype=torch.float32) 73 | x_embed = not_mask.cumsum(2, dtype=torch.float32) 74 | if self.normalize: 75 | eps = 1e-6 76 | y_embed = (y_embed - 0.5) / (y_embed[:, -1:, :] + eps) * self.scale 77 | x_embed = (x_embed - 0.5) / (x_embed[:, :, -1:] + eps) * self.scale 78 | 79 | dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device) 80 | dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats) 81 | 82 | pos_x = x_embed[:, :, :, None] / dim_t 83 | pos_y = y_embed[:, :, :, None] / dim_t 84 | pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3) 85 | pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3) 86 | pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) 87 | return pos 88 | 89 | 90 | class PositionEmbeddingLearned(nn.Module): 91 | """ 92 | Absolute pos embedding, learned. 
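Row and column indices are looked up in two separate nn.Embedding tables (50 learned positions each) and concatenated per pixel to form the final embedding.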
93 | """ 94 | def __init__(self, num_pos_feats=256): 95 | super().__init__() 96 | self.row_embed = nn.Embedding(50, num_pos_feats) 97 | self.col_embed = nn.Embedding(50, num_pos_feats) 98 | self.reset_parameters() 99 | 100 | def reset_parameters(self): 101 | nn.init.uniform_(self.row_embed.weight) 102 | nn.init.uniform_(self.col_embed.weight) 103 | 104 | def forward(self, tensor_list: NestedTensor): 105 | x = tensor_list.tensors 106 | h, w = x.shape[-2:] 107 | i = torch.arange(w, device=x.device) 108 | j = torch.arange(h, device=x.device) 109 | x_emb = self.col_embed(i) 110 | y_emb = self.row_embed(j) 111 | pos = torch.cat([ 112 | x_emb.unsqueeze(0).repeat(h, 1, 1), 113 | y_emb.unsqueeze(1).repeat(1, w, 1), 114 | ], dim=-1).permute(2, 0, 1).unsqueeze(0).repeat(x.shape[0], 1, 1, 1) 115 | return pos 116 | 117 | 118 | def build_position_encoding(args): 119 | N_steps = args.hidden_dim // 2 120 | if args.position_embedding in ('v2', 'sine'): 121 | # TODO find a better way of exposing other arguments 122 | position_embedding = PositionEmbeddingSine(N_steps, normalize=True) 123 | elif args.position_embedding in ('v3', 'learned'): 124 | position_embedding = PositionEmbeddingLearned(N_steps) 125 | else: 126 | raise ValueError(f"not supported {args.position_embedding}") 127 | 128 | return position_embedding 129 | -------------------------------------------------------------------------------- /models_swin/x101_64d.py: -------------------------------------------------------------------------------- 1 | from torchvision.models.resnet import _resnet, Bottleneck 2 | 3 | 4 | def resnext101_64x4d(pretrained=False, progress=True, **kwargs): 5 | r"""ResNeXt-101 64*4d model from 6 | `"Aggregated Residual Transformation for Deep Neural Networks" `_ 7 | 8 | Args: 9 | pretrained (bool): If True, returns a model pre-trained on ImageNet 10 | progress (bool): If True, displays a progress bar of the download to stderr 11 | """ 12 | kwargs['groups'] = 64 13 | kwargs['width_per_group'] = 4 14 | return _resnet('resnext101_64x4d', Bottleneck, [3, 4, 23, 3], 15 | False, progress, **kwargs) 16 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | pycocotools 2 | tqdm 3 | cython 4 | scipy 5 | timm 6 | imgaug 7 | opencv-python 8 | kornia==0.5.11 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /scripts/eval_r101_test.sh: -------------------------------------------------------------------------------- 1 | CUDA_VISIBLE_DEVICES=0 python3 -m tools.inference_test --masks --backbone resnet101 --model_path ./pretrained_model/checkpoint_r101_final.pth --save_path exp_r101_hq_test_result.json 2 | -------------------------------------------------------------------------------- /scripts/eval_r101_val.sh: -------------------------------------------------------------------------------- 1 | CUDA_VISIBLE_DEVICES=0 python3 -m tools.inference --masks --backbone resnet101 --model_path ./pretrained_model/checkpoint_r101_final.pth --save_path exp_r101_hq_val_result.json 2 | -------------------------------------------------------------------------------- /scripts/eval_r50_test.sh: -------------------------------------------------------------------------------- 1 | python3 -m tools.inference_test --masks --backbone resnet50 --model_path ./pretrained_model/checkpoint_r50_final.pth --save_path exp_r50_hq_test_result.json 2 | 
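# Note: unlike the eval_r101_*.sh and eval_swin_*.sh scripts, this one does not pin a GPU; if needed, prepend CUDA_VISIBLE_DEVICES=0 as in eval_r101_test.sh.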
-------------------------------------------------------------------------------- /scripts/eval_r50_val.sh: -------------------------------------------------------------------------------- 1 | python3 -m tools.inference --masks --backbone resnet50 --model_path ./pretrained_model/checkpoint_r50_final.pth --save_path exp_r50_hq_val_result.json -------------------------------------------------------------------------------- /scripts/eval_swin_test.sh: -------------------------------------------------------------------------------- 1 | CUDA_VISIBLE_DEVICES=0 python3 -m tools.inference_swin_test --masks --backbone swin_l_p4w12 --model_path ./pretrained_model/checkpoint_swinl_final.pth --save_path exp_swin_hq_test_result.json 2 | 3 | 4 | -------------------------------------------------------------------------------- /scripts/eval_swin_val.sh: -------------------------------------------------------------------------------- 1 | CUDA_VISIBLE_DEVICES=0 python3 -m tools.inference_swin --masks --backbone swin_l_p4w12 --model_path ./pretrained_model/checkpoint_swinl_final.pth --save_path exp_swin_hq_val_result.json 2 | 3 | 4 | -------------------------------------------------------------------------------- /scripts/eval_swin_val_vis.sh: -------------------------------------------------------------------------------- 1 | python3 -m tools.inference_swin_with_vis --masks --backbone swin_l_p4w12 --output vis_output_swin_vmt --model_path ./pretrained_model/checkpoint_swinl_final.pth --save_path exp_swin_hq_val_result.json --save-frames True 2 | 3 | -------------------------------------------------------------------------------- /tools/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SysCV/vmt/b0973e8609885d9ac83f803a6b51b00a30a7f564/tools/__init__.py -------------------------------------------------------------------------------- /tools/visualizer.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | import matplotlib.colors as mplc 4 | 5 | from detectron2.utils.visualizer import ColorMode, GenericMask, Visualizer, _create_text_labels 6 | 7 | 8 | _ID_JITTERS = [[0.9047944201469568, 0.3241718265806123, 0.33443746665210006], [0.4590171386127151, 0.9095038146383864, 0.3143840671974788], [0.4769356899795538, 0.5044406738441948, 0.5354530846360839], [0.00820945625670777, 0.24099210193126785, 0.15471834055332978], [0.6195684374237388, 0.4020380013509799, 0.26100266066404676], [0.08281237756545068, 0.05900744492710419, 0.06106221202154216], [0.2264886829978755, 0.04925271007292076, 0.10214429345996079], [0.1888247470009874, 0.11275000298612425, 0.46112894830685514], [0.37415767691880975, 0.844284596118331, 0.950471611180866], [0.3817344218157631, 0.3483259270707101, 0.6572989333690541], [0.2403115731054466, 0.03078280287279167, 0.5385975692534737], [0.7035076951650824, 0.12352084932325424, 0.12873080308790197], [0.12607434914489934, 0.111244793010015, 0.09333334699716023], [0.6551607300342269, 0.7003064103554443, 0.4131794512286162], [0.13592107365596595, 0.5390702818232149, 0.004540643174930525], [0.38286244894454347, 0.709142545393449, 0.529074791609835], [0.4279376583651734, 0.5634708596431771, 0.8505569717104301], [0.3460488523902999, 0.464769595519293, 0.6676839675477276], [0.8544063246675081, 0.5041190233407755, 0.9081217697141578], [0.9207009090747208, 0.2403865944739051, 0.05375410999863772], [0.6515786136947107, 0.6299918449948327, 0.45292029442034387], 
[0.986174217295693, 0.2424849846977214, 0.3981993323108266], [0.22101915872994693, 0.3408589198278038, 0.006381420347677524], [0.3159785813515982, 0.1145748921741011, 0.595754317197274], [0.10263421488052715, 0.5864139253490858, 0.23908000741142432], [0.8272999391532938, 0.6123527260897751, 0.3365197327803193], [0.5269583712937912, 0.25668929554516506, 0.7888411215078127], [0.2433880265410031, 0.7240751234287827, 0.8483215810528648], [0.7254601709704898, 0.8316525547295984, 0.9325253855921963], [0.5574483824856672, 0.2935331727879944, 0.6594839453793155], [0.6209642371433579, 0.054030693198821256, 0.5080873988178534], [0.9055507077365624, 0.12865888619203514, 0.9309191861440005], [0.9914469722960537, 0.3074114506206205, 0.8762107657323488], [0.4812682518247371, 0.15055826298548158, 0.9656340505308308], [0.6459219454316445, 0.9144794010251625, 0.751338812155106], [0.860840174209798, 0.8844626353077639, 0.3604624506769899], [0.8194991672032272, 0.926399617787601, 0.8059222327343247], [0.6540413175393658, 0.04579445254618297, 0.26891917826531275], [0.37778835833987046, 0.36247927666109536, 0.7989799305827889], [0.22738304978177726, 0.9038018263773739, 0.6970838854138303], [0.6362015495896184, 0.527680794236961, 0.5570915425178721], [0.6436401915860954, 0.6316925317144524, 0.9137151236993912], [0.04161828388587163, 0.3832413349082706, 0.6880829921949752], [0.7768167825719299, 0.8933821497682587, 0.7221278391266809], [0.8632760876301346, 0.3278628094906323, 0.8421587587114462], [0.8556499133262127, 0.6497385872901932, 0.5436895688477963], [0.9861940318610894, 0.03562313777386272, 0.9183454677106616], [0.8042586091176366, 0.6167222703170994, 0.24181981557207644], [0.9504247117633057, 0.3454233714011461, 0.6883727005547743], [0.9611909135491202, 0.46384154263898114, 0.32700443315058914], [0.523542176970206, 0.446222414615845, 0.9067402987747814], [0.7536954008682911, 0.6675512338797588, 0.22538238957839196], [0.1554052265688285, 0.05746097492966129, 0.8580358872587424], [0.8540838640971405, 0.9165504335482566, 0.6806982829158964], [0.7065090319405029, 0.8683059983962002, 0.05167128320624026], [0.39134812961899124, 0.8910075505622979, 0.7639815712623922], [0.1578117311479783, 0.20047326898284668, 0.9220177338840568], [0.2017488993096358, 0.6949259970936679, 0.8729196864798128], [0.5591089340651949, 0.15576770423813258, 0.1469857469387812], [0.14510398622626974, 0.24451497734532168, 0.46574271993578786], [0.13286397822351492, 0.4178244533944635, 0.03728728952131943], [0.556463206310225, 0.14027595183361663, 0.2731537988657907], [0.4093837966398032, 0.8015225687789814, 0.8033567296903834], [0.527442563956637, 0.902232617214431, 0.7066626674362227], [0.9058355503297827, 0.34983989180213004, 0.8353262183839384], [0.7108382186953104, 0.08591307895133471, 0.21434688012521974], [0.22757345065207668, 0.7943075496583976, 0.2992305547627421], [0.20454109788173636, 0.8251670332103687, 0.012981987094547232], [0.7672562637297392, 0.005429019973062554, 0.022163616037108702], [0.37487345910117564, 0.5086240194440863, 0.9061216063654387], [0.9878004014101087, 0.006345852772772331, 0.17499753379350858], [0.030061528704491303, 0.1409704315546606, 0.3337131835834506], [0.5022506782611504, 0.5448435505388706, 0.40584238936140726], [0.39560774627423445, 0.8905943695833262, 0.5850815030921116], [0.058615671926786406, 0.5365713844300387, 0.1620457551256279], [0.41843842882069693, 0.1536005983609976, 0.3127878501592438], [0.05947621790155899, 0.5412421167331932, 0.2611322146455659], [0.5196159938235607, 
0.7066461551682705, 0.970261497412556], [0.30443031606149007, 0.45158581060034975, 0.4331841153149706], [0.8848298403933996, 0.7241791700943656, 0.8917110054596072], [0.5720260591898779, 0.3072801598203052, 0.8891066705989902], [0.13964015336177327, 0.2531778096760302, 0.5703756837403124], [0.2156307542329836, 0.4139947500641685, 0.87051676884144], [0.10800455881891169, 0.05554646035458266, 0.2947027428551443], [0.35198009410633857, 0.365849666213808, 0.06525787683513773], [0.5223264108118847, 0.9032195574351178, 0.28579084943315025], [0.7607724246546966, 0.3087194381828555, 0.6253235528354899], [0.5060485442077824, 0.19173600467625274, 0.9931175692203702], [0.5131805830323746, 0.07719515392040577, 0.923212006754969], [0.3629762141280106, 0.02429179642710888, 0.6963754952399983], [0.7542592485456767, 0.6478893299494212, 0.3424965345400731], [0.49944574453364454, 0.6775665366832825, 0.33758796076989583], [0.010621818120767679, 0.8221571611173205, 0.5186257457566332], [0.5857910304290109, 0.7178133992025467, 0.9729243483606071], [0.16987399482717613, 0.9942570210657463, 0.18120758122552927], [0.016362572521240848, 0.17582788603087263, 0.7255176922640298], [0.10981764283706419, 0.9078582203470377, 0.7638063718334003], [0.9252097840441119, 0.3330197086990039, 0.27888705301420136], [0.12769972651171546, 0.11121470804891687, 0.12710743734391716], [0.5753520518360334, 0.2763862879599456, 0.6115636613363361]] 9 | 10 | 11 | class TrackVisualizer(Visualizer): 12 | def __init__(self, img_rgb, metadata=None, scale=1.0, instance_mode=ColorMode.IMAGE): 13 | super().__init__( 14 | img_rgb, metadata=metadata, scale=scale, instance_mode=instance_mode 15 | ) 16 | self.cpu_device = torch.device("cpu") 17 | 18 | def _jitter(self, color, id): 19 | """ 20 | Randomly modifies given color to produce a slightly different color than the color given. 21 | 22 | Args: 23 | color (tuple[double]): a tuple of 3 elements, containing the RGB values of the color 24 | picked. The values in the list are in the [0.0, 1.0] range. 25 | 26 | Returns: 27 | jittered_color (tuple[double]): a tuple of 3 elements, containing the RGB values of the 28 | color after being jittered. The values in the list are in the [0.0, 1.0] range. 29 | """ 30 | color = mplc.to_rgb(color) 31 | vec = _ID_JITTERS[id] 32 | # better to do it in another color space 33 | vec = vec / np.linalg.norm(vec) * 0.5 34 | res = np.clip(vec + color, 0, 1) 35 | return tuple(res) 36 | 37 | def draw_instance_predictions(self, predictions): 38 | """ 39 | Draw instance-level prediction results on an image. 40 | 41 | Args: 42 | predictions (Instances): the output of an instance detection/segmentation 43 | model. Following fields will be used to draw: 44 | "pred_boxes", "pred_classes", "scores", "pred_masks" (or "pred_masks_rle"). 45 | 46 | Returns: 47 | output (VisImage): image object with visualizations. 
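Note: text labels are currently suppressed in this tracking visualizer (labels is reset to None below); each instance is drawn with a per-id jittered variant of its class color from metadata.thing_colors.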
48 | """ 49 | preds = predictions.to(self.cpu_device) 50 | 51 | boxes = preds.pred_boxes if preds.has("pred_boxes") else None 52 | scores = preds.scores if preds.has("scores") else None 53 | classes = preds.pred_classes if preds.has("pred_classes") else None 54 | labels = _create_text_labels(classes, scores, self.metadata.get("thing_classes", None)) 55 | labels = None 56 | # print('come here=====visualizer.py') 57 | if labels is not None: 58 | labels = ["[{}] ".format(_id) + l for _id, l in enumerate(labels)] 59 | 60 | if preds.has("pred_masks"): 61 | masks = np.asarray(preds.pred_masks) 62 | #masks = [GenericMask(x, self.output.height, self.output.width) for x in masks] 63 | else: 64 | masks = None 65 | 66 | if classes is None: 67 | return self.output 68 | 69 | colors = [ 70 | self._jitter([x / 255 for x in self.metadata.thing_colors[c]], id) for id, c in enumerate(classes) 71 | ] 72 | alpha = 0.5 73 | 74 | if self._instance_mode == ColorMode.IMAGE_BW: 75 | self.output.img = self._create_grayscale_image( 76 | (preds.pred_masks.any(dim=0) > 0).numpy() 77 | if preds.has("pred_masks") 78 | else None 79 | ) 80 | alpha = 0.3 81 | 82 | self.overlay_instances( 83 | masks=masks, 84 | boxes=boxes, 85 | labels=labels, 86 | assigned_colors=colors, 87 | alpha=alpha, 88 | ) 89 | 90 | return self.output 91 | -------------------------------------------------------------------------------- /util/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SysCV/vmt/b0973e8609885d9ac83f803a6b51b00a30a7f564/util/__init__.py -------------------------------------------------------------------------------- /util/box_ops.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR) 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # ------------------------------------------------------------------------ 5 | 6 | """ 7 | Utilities for bounding box manipulation and GIoU. 
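For illustration, box_cxcywh_to_xyxy(torch.tensor([0.5, 0.5, 0.2, 0.4])) gives tensor([0.4, 0.3, 0.6, 0.7]), i.e. (cx, cy, w, h) -> (x0, y0, x1, y1).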
8 | """ 9 | import torch 10 | from torchvision.ops.boxes import box_area 11 | 12 | 13 | def box_cxcywh_to_xyxy(x): 14 | # print('box:\n', x) 15 | 16 | x_c, y_c, w, h = x.unbind(-1) 17 | b = [(x_c - 0.5 * w), (y_c - 0.5 * h), 18 | (x_c + 0.5 * w), (y_c + 0.5 * h)] 19 | return torch.stack(b, dim=-1) 20 | 21 | 22 | def box_xyxy_to_cxcywh(x): 23 | x0, y0, x1, y1 = x.unbind(-1) 24 | b = [(x0 + x1) / 2, (y0 + y1) / 2, 25 | (x1 - x0), (y1 - y0)] 26 | return torch.stack(b, dim=-1) 27 | 28 | 29 | # modified from torchvision to also return the union 30 | def box_iou(boxes1, boxes2): 31 | area1 = box_area(boxes1) 32 | area2 = box_area(boxes2) 33 | 34 | lt = torch.max(boxes1[:, None, :2], boxes2[:, :2]) # [N,M,2] 35 | rb = torch.min(boxes1[:, None, 2:], boxes2[:, 2:]) # [N,M,2] 36 | 37 | wh = (rb - lt).clamp(min=0) # [N,M,2] 38 | inter = wh[:, :, 0] * wh[:, :, 1] # [N,M] 39 | 40 | union = area1[:, None] + area2 - inter 41 | 42 | iou = inter / union 43 | return iou, union 44 | 45 | def multi_box_iou(boxes1, boxes2): 46 | area1 = box_area(boxes1.flatten(0,1)).reshape(boxes1.shape[0], boxes1.shape[1]) 47 | area2 = box_area(boxes2.flatten(0,1)).reshape(boxes2.shape[0], boxes2.shape[1]) 48 | 49 | lt = torch.max(boxes1[:, :, None, :2], boxes2[:, None, :, :2]) # [nf,N,M,2] 50 | rb = torch.min(boxes1[:, :, None, 2:], boxes2[:, None, :, 2:]) # [nf,N,M,2] 51 | 52 | wh = (rb - lt).clamp(min=0) # [nf,N,M,2] 53 | inter = wh[:, :, :, 0] * wh[:, :, :, 1] # [nf,N,M] 54 | 55 | union = area1[:, :, None] + area2[:, None, :] - inter 56 | 57 | iou = inter / union 58 | return iou, union 59 | 60 | def generalized_box_iou(boxes1, boxes2): 61 | """ 62 | Generalized IoU from https://giou.stanford.edu/ 63 | 64 | The boxes should be in [x0, y0, x1, y1] format 65 | 66 | Returns a [N, M] pairwise matrix, where N = len(boxes1) 67 | and M = len(boxes2) 68 | """ 69 | # degenerate boxes gives inf / nan results 70 | # so do an early check 71 | 72 | assert (boxes1[:, 2:] >= boxes1[:, :2]).all() 73 | assert (boxes2[:, 2:] >= boxes2[:, :2]).all() 74 | iou, union = box_iou(boxes1, boxes2) 75 | 76 | lt = torch.min(boxes1[:, None, :2], boxes2[:, :2]) 77 | rb = torch.max(boxes1[:, None, 2:], boxes2[:, 2:]) 78 | 79 | wh = (rb - lt).clamp(min=0) # [N,M,2] 80 | area = wh[:, :, 0] * wh[:, :, 1] 81 | 82 | return iou - (area - union) / area 83 | 84 | 85 | def generalized_multi_box_iou(boxes1, boxes2): 86 | """ 87 | Generalized IoU from https://giou.stanford.edu/ 88 | 89 | The boxes should be in [x0, y0, x1, y1] format 90 | boxes1.shape = [nf, N, 4] 91 | boxes2.shape = [nf, M, 4] 92 | Returns a [nf, N, M] pairwise matrix, where N = boxes1.shape[1] 93 | and M = boxes2.shape[1] 94 | """ 95 | # degenerate boxes gives inf / nan results 96 | # so do an early check 97 | 98 | assert (boxes1[:, :, 2:] >= boxes1[:, :, :2]).all() 99 | assert (boxes2[:, :, 2:] >= boxes2[:, :, :2]).all() 100 | iou, union = multi_box_iou(boxes1, boxes2) 101 | 102 | lt = torch.min(boxes1[:, :, None, :2], boxes2[:, None, :, :2]) 103 | rb = torch.max(boxes1[:, :, None, 2:], boxes2[:, None, :, 2:]) 104 | 105 | wh = (rb - lt).clamp(min=0) # [nf,N,M,2] 106 | area = wh[:, :, :, 0] * wh[:, :, :, 1] 107 | 108 | 109 | return iou - (area - union) / (area + 1e-7) 110 | 111 | 112 | def masks_to_boxes(masks): 113 | """Compute the bounding boxes around the provided masks 114 | 115 | The masks should be in format [N, H, W] where N is the number of masks, (H, W) are the spatial dimensions. 
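Masks are assumed to be binary (0/1); an all-zero mask yields a degenerate box.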
116 | 117 | Returns a [N, 4] tensors, with the boxes in xyxy format 118 | """ 119 | if masks.numel() == 0: 120 | return torch.zeros((0, 4), device=masks.device) 121 | 122 | h, w = masks.shape[-2:] 123 | 124 | y = torch.arange(0, h, dtype=torch.float, device=masks.device) 125 | x = torch.arange(0, w, dtype=torch.float, device=masks.device) 126 | y, x = torch.meshgrid(y, x) 127 | 128 | x_mask = (masks * x.unsqueeze(0)) 129 | x_max = x_mask.flatten(1).max(-1)[0] 130 | x_min = x_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0] 131 | 132 | y_mask = (masks * y.unsqueeze(0)) 133 | y_max = y_mask.flatten(1).max(-1)[0] 134 | y_min = y_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0] 135 | 136 | return torch.stack([x_min, y_min, x_max, y_max], 1) 137 | 138 | 139 | 140 | -------------------------------------------------------------------------------- /util/plot_utils.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR) 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # ------------------------------------------------------------------------ 5 | """ 6 | Plotting utilities to visualize training logs. 7 | """ 8 | import torch 9 | import pandas as pd 10 | import seaborn as sns 11 | import matplotlib.pyplot as plt 12 | 13 | from pathlib import Path, PurePath 14 | 15 | 16 | def plot_logs(logs, fields=('class_error', 'loss_bbox_unscaled', 'mAP'), ewm_col=0, log_name='log.txt'): 17 | ''' 18 | Function to plot specific fields from training log(s). Plots both training and test results. 19 | 20 | :: Inputs - logs = list containing Path objects, each pointing to individual dir with a log file 21 | - fields = which results to plot from each log file - plots both training and test for each field. 22 | - ewm_col = optional, which column to use as the exponential weighted smoothing of the plots 23 | - log_name = optional, name of log file if different than default 'log.txt'. 24 | 25 | :: Outputs - matplotlib plots of results in fields, color coded for each log file. 26 | - solid lines are training results, dashed lines are test results. 
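:: Example (hypothetical log dirs) - plot_logs([Path('output/exp1'), Path('output/exp2')], fields=('loss', 'mAP'), ewm_col=0)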
27 | 28 | ''' 29 | func_name = "plot_utils.py::plot_logs" 30 | 31 | # verify logs is a list of Paths (list[Paths]) or single Pathlib object Path, 32 | # convert single Path to list to avoid 'not iterable' error 33 | 34 | if not isinstance(logs, list): 35 | if isinstance(logs, PurePath): 36 | logs = [logs] 37 | print(f"{func_name} info: logs param expects a list argument, converted to list[Path].") 38 | else: 39 | raise ValueError(f"{func_name} - invalid argument for logs parameter.\n \ 40 | Expect list[Path] or single Path obj, received {type(logs)}") 41 | 42 | # verify valid dir(s) and that every item in list is Path object 43 | for i, dir in enumerate(logs): 44 | if not isinstance(dir, PurePath): 45 | raise ValueError(f"{func_name} - non-Path object in logs argument of {type(dir)}: \n{dir}") 46 | if dir.exists(): 47 | continue 48 | raise ValueError(f"{func_name} - invalid directory in logs argument:\n{dir}") 49 | 50 | # load log file(s) and plot 51 | dfs = [pd.read_json(Path(p) / log_name, lines=True) for p in logs] 52 | 53 | fig, axs = plt.subplots(ncols=len(fields), figsize=(16, 5)) 54 | 55 | for df, color in zip(dfs, sns.color_palette(n_colors=len(logs))): 56 | for j, field in enumerate(fields): 57 | if field == 'mAP': 58 | coco_eval = pd.DataFrame(pd.np.stack(df.test_coco_eval.dropna().values)[:, 1]).ewm(com=ewm_col).mean() 59 | axs[j].plot(coco_eval, c=color) 60 | else: 61 | df.interpolate().ewm(com=ewm_col).mean().plot( 62 | y=[f'train_{field}', f'test_{field}'], 63 | ax=axs[j], 64 | color=[color] * 2, 65 | style=['-', '--'] 66 | ) 67 | for ax, field in zip(axs, fields): 68 | ax.legend([Path(p).name for p in logs]) 69 | ax.set_title(field) 70 | 71 | 72 | def plot_precision_recall(files, naming_scheme='iter'): 73 | if naming_scheme == 'exp_id': 74 | # name becomes exp_id 75 | names = [f.parts[-3] for f in files] 76 | elif naming_scheme == 'iter': 77 | names = [f.stem for f in files] 78 | else: 79 | raise ValueError(f'not supported {naming_scheme}') 80 | fig, axs = plt.subplots(ncols=2, figsize=(16, 5)) 81 | for f, color, name in zip(files, sns.color_palette("Blues", n_colors=len(files)), names): 82 | data = torch.load(f) 83 | # precision is n_iou, n_points, n_cat, n_area, max_det 84 | precision = data['precision'] 85 | recall = data['params'].recThrs 86 | scores = data['scores'] 87 | # take precision for all classes, all areas and 100 detections 88 | precision = precision[0, :, :, 0, -1].mean(1) 89 | scores = scores[0, :, :, 0, -1].mean(1) 90 | prec = precision.mean() 91 | rec = data['recall'][0, :, 0, -1].mean() 92 | print(f'{naming_scheme} {name}: mAP@50={prec * 100: 05.1f}, ' + 93 | f'score={scores.mean():0.3f}, ' + 94 | f'f1={2 * prec * rec / (prec + rec + 1e-8):0.3f}' 95 | ) 96 | axs[0].plot(recall, precision, c=color) 97 | axs[1].plot(recall, scores, c=color) 98 | 99 | axs[0].set_title('Precision / Recall') 100 | axs[0].legend(names) 101 | axs[1].set_title('Scores / Recall') 102 | axs[1].legend(names) 103 | return fig, axs 104 | 105 | 106 | 107 | --------------------------------------------------------------------------------