├── .gitattributes ├── README.md ├── assets ├── demo.gif ├── demo.mp4 ├── demo_ann.gif └── pipeline.png ├── colorize_poisson.py ├── common.py ├── depth ├── .gitignore ├── .layers.py.swp ├── LICENSE ├── README.md ├── datasets │ ├── __init__.py │ ├── kitti_dataset.py │ └── mono_dataset.py ├── depth_prediction_example.ipynb ├── evaluate_depth.py ├── evaluate_pose.py ├── experiments │ ├── mono+stereo_experiments.sh │ ├── mono_experiments.sh │ ├── odom_experiments.sh │ └── stereo_experiments.sh ├── export_gt_depth.py ├── kitti_utils.py ├── layers.py ├── networks │ ├── __init__.py │ ├── depth_decoder.py │ ├── pose_cnn.py │ ├── pose_decoder.py │ └── resnet_encoder.py ├── options.py ├── test_simple.py ├── train.py ├── trainer.py ├── utils.py └── zyz_test.py ├── environment.yml ├── flow ├── .gitignore ├── README.md ├── core │ ├── corr.py │ ├── datasets.py │ ├── extractor.py │ ├── gma.py │ ├── network.py │ ├── update.py │ └── utils │ │ ├── __init__.py │ │ ├── augmentor.py │ │ ├── flow_viz.py │ │ ├── frame_utils.py │ │ └── utils.py ├── demo.sh ├── evaluate.py ├── evaluate.sh ├── evaluate_single.py ├── things_val_test_set.txt ├── train.py └── train.sh ├── functions.py ├── gen.py ├── invert_font_size.py ├── params.py ├── poisson_reconstruct.py ├── prep_scripts ├── floodFill.py ├── predict_depth.m └── run_ucm.m ├── ransac.py ├── requirements.txt ├── segmentation ├── .gitignore ├── ADVANCED_USAGE.md ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── GETTING_STARTED.md ├── INSTALL.md ├── LICENSE ├── MODEL_ZOO.md ├── README.md ├── cog.yaml ├── configs │ ├── ade20k │ │ ├── instance-segmentation │ │ │ ├── Base-ADE20K-InstanceSegmentation.yaml │ │ │ ├── maskformer2_R50_bs16_160k.yaml │ │ │ └── swin │ │ │ │ └── maskformer2_swin_large_IN21k_384_bs16_160k.yaml │ │ ├── panoptic-segmentation │ │ │ ├── Base-ADE20K-PanopticSegmentation.yaml │ │ │ ├── maskformer2_R50_bs16_160k.yaml │ │ │ └── swin │ │ │ │ └── maskformer2_swin_large_IN21k_384_bs16_160k.yaml │ │ └── semantic-segmentation │ │ │ ├── Base-ADE20K-SemanticSegmentation.yaml │ │ │ ├── maskformer2_R101_bs16_90k.yaml │ │ │ ├── maskformer2_R50_bs16_160k.yaml │ │ │ └── swin │ │ │ ├── maskformer2_swin_base_384_bs16_160k_res640.yaml │ │ │ ├── maskformer2_swin_base_IN21k_384_bs16_160k_res640.yaml │ │ │ ├── maskformer2_swin_large_IN21k_384_bs16_160k_res640.yaml │ │ │ ├── maskformer2_swin_small_bs16_160k.yaml │ │ │ └── maskformer2_swin_tiny_bs16_160k.yaml │ ├── cityscapes │ │ ├── instance-segmentation │ │ │ ├── Base-Cityscapes-InstanceSegmentation.yaml │ │ │ ├── maskformer2_R101_bs16_90k.yaml │ │ │ ├── maskformer2_R50_bs16_90k.yaml │ │ │ └── swin │ │ │ │ ├── maskformer2_swin_base_IN21k_384_bs16_90k.yaml │ │ │ │ ├── maskformer2_swin_large_IN21k_384_bs16_90k.yaml │ │ │ │ ├── maskformer2_swin_small_bs16_90k.yaml │ │ │ │ └── maskformer2_swin_tiny_bs16_90k.yaml │ │ ├── panoptic-segmentation │ │ │ ├── Base-Cityscapes-PanopticSegmentation.yaml │ │ │ ├── maskformer2_R101_bs16_90k.yaml │ │ │ ├── maskformer2_R50_bs16_90k.yaml │ │ │ └── swin │ │ │ │ ├── maskformer2_swin_base_IN21k_384_bs16_90k.yaml │ │ │ │ ├── maskformer2_swin_large_IN21k_384_bs16_90k.yaml │ │ │ │ ├── maskformer2_swin_small_bs16_90k.yaml │ │ │ │ └── maskformer2_swin_tiny_bs16_90k.yaml │ │ └── semantic-segmentation │ │ │ ├── Base-Cityscapes-SemanticSegmentation.yaml │ │ │ ├── maskformer2_R101_bs16_90k.yaml │ │ │ ├── maskformer2_R50_bs16_90k.yaml │ │ │ └── swin │ │ │ ├── maskformer2_swin_base_IN21k_384_bs16_90k.yaml │ │ │ ├── maskformer2_swin_large_IN21k_384_bs16_90k.yaml │ │ │ ├── maskformer2_swin_small_bs16_90k.yaml 
│ │ │ └── maskformer2_swin_tiny_bs16_90k.yaml │ ├── coco │ │ ├── instance-segmentation │ │ │ ├── Base-COCO-InstanceSegmentation.yaml │ │ │ ├── maskformer2_R101_bs16_50ep.yaml │ │ │ ├── maskformer2_R50_bs16_50ep.yaml │ │ │ └── swin │ │ │ │ ├── maskformer2_swin_base_384_bs16_50ep.yaml │ │ │ │ ├── maskformer2_swin_base_IN21k_384_bs16_50ep.yaml │ │ │ │ ├── maskformer2_swin_large_IN21k_384_bs16_100ep.yaml │ │ │ │ ├── maskformer2_swin_small_bs16_50ep.yaml │ │ │ │ └── maskformer2_swin_tiny_bs16_50ep.yaml │ │ └── panoptic-segmentation │ │ │ ├── Base-COCO-PanopticSegmentation.yaml │ │ │ ├── maskformer2_R101_bs16_50ep.yaml │ │ │ ├── maskformer2_R50_bs16_50ep.yaml │ │ │ └── swin │ │ │ ├── maskformer2_swin_base_384_bs16_50ep.yaml │ │ │ ├── maskformer2_swin_base_IN21k_384_bs16_50ep.yaml │ │ │ ├── maskformer2_swin_large_IN21k_384_bs16_100ep.yaml │ │ │ ├── maskformer2_swin_small_bs16_50ep.yaml │ │ │ └── maskformer2_swin_tiny_bs16_50ep.yaml │ ├── mapillary-vistas │ │ ├── panoptic-segmentation │ │ │ ├── Base-MapillaryVistas-PanopticSegmentation.yaml │ │ │ ├── maskformer_R50_bs16_300k.yaml │ │ │ └── swin │ │ │ │ └── maskformer2_swin_large_IN21k_384_bs16_300k.yaml │ │ └── semantic-segmentation │ │ │ ├── Base-MapillaryVistas-SemanticSegmentation.yaml │ │ │ ├── maskformer2_R50_bs16_300k.yaml │ │ │ └── swin │ │ │ └── maskformer2_swin_large_IN21k_384_bs16_300k.yaml │ ├── youtubevis_2019 │ │ ├── Base-YouTubeVIS-VideoInstanceSegmentation.yaml │ │ ├── swin │ │ │ ├── video_maskformer2_swin_base_IN21k_384_bs16_8ep.yaml │ │ │ ├── video_maskformer2_swin_large_IN21k_384_bs16_8ep.yaml │ │ │ ├── video_maskformer2_swin_small_bs16_8ep.yaml │ │ │ └── video_maskformer2_swin_tiny_bs16_8ep.yaml │ │ ├── video_maskformer2_R101_bs16_8ep.yaml │ │ └── video_maskformer2_R50_bs16_8ep.yaml │ └── youtubevis_2021 │ │ ├── Base-YouTubeVIS-VideoInstanceSegmentation.yaml │ │ ├── swin │ │ ├── video_maskformer2_swin_base_IN21k_384_bs16_8ep.yaml │ │ ├── video_maskformer2_swin_large_IN21k_384_bs16_8ep.yaml │ │ ├── video_maskformer2_swin_small_bs16_8ep.yaml │ │ └── video_maskformer2_swin_tiny_bs16_8ep.yaml │ │ ├── video_maskformer2_R101_bs16_8ep.yaml │ │ └── video_maskformer2_R50_bs16_8ep.yaml ├── datasets │ ├── README.md │ ├── ade20k_instance_catid_mapping.txt │ ├── prepare_ade20k_ins_seg.py │ ├── prepare_ade20k_pan_seg.py │ ├── prepare_ade20k_sem_seg.py │ └── prepare_coco_semantic_annos_from_panoptic_annos.py ├── demo │ ├── README.md │ ├── demo.py │ └── predictor.py ├── demo_video │ ├── README.md │ ├── demo.py │ ├── predictor.py │ └── visualizer.py ├── mask2former │ ├── __init__.py │ ├── config.py │ ├── data │ │ ├── __init__.py │ │ ├── dataset_mappers │ │ │ ├── __init__.py │ │ │ ├── coco_instance_new_baseline_dataset_mapper.py │ │ │ ├── coco_panoptic_new_baseline_dataset_mapper.py │ │ │ ├── mask_former_instance_dataset_mapper.py │ │ │ ├── mask_former_panoptic_dataset_mapper.py │ │ │ └── mask_former_semantic_dataset_mapper.py │ │ └── datasets │ │ │ ├── __init__.py │ │ │ ├── register_ade20k_full.py │ │ │ ├── register_ade20k_instance.py │ │ │ ├── register_ade20k_panoptic.py │ │ │ ├── register_coco_panoptic_annos_semseg.py │ │ │ ├── register_coco_stuff_10k.py │ │ │ ├── register_mapillary_vistas.py │ │ │ └── register_mapillary_vistas_panoptic.py │ ├── evaluation │ │ ├── __init__.py │ │ └── instance_evaluation.py │ ├── maskformer_model.py │ ├── modeling │ │ ├── __init__.py │ │ ├── backbone │ │ │ ├── __init__.py │ │ │ └── swin.py │ │ ├── criterion.py │ │ ├── matcher.py │ │ ├── meta_arch │ │ │ ├── __init__.py │ │ │ ├── mask_former_head.py │ │ │ └── 
per_pixel_baseline.py │ │ ├── pixel_decoder │ │ │ ├── __init__.py │ │ │ ├── fpn.py │ │ │ ├── msdeformattn.py │ │ │ └── ops │ │ │ │ ├── functions │ │ │ │ ├── __init__.py │ │ │ │ └── ms_deform_attn_func.py │ │ │ │ ├── make.sh │ │ │ │ ├── modules │ │ │ │ ├── __init__.py │ │ │ │ └── ms_deform_attn.py │ │ │ │ ├── setup.py │ │ │ │ ├── src │ │ │ │ ├── cpu │ │ │ │ │ ├── ms_deform_attn_cpu.cpp │ │ │ │ │ └── ms_deform_attn_cpu.h │ │ │ │ ├── cuda │ │ │ │ │ ├── ms_deform_attn_cuda.cu │ │ │ │ │ ├── ms_deform_attn_cuda.h │ │ │ │ │ └── ms_deform_im2col_cuda.cuh │ │ │ │ ├── ms_deform_attn.h │ │ │ │ └── vision.cpp │ │ │ │ └── test.py │ │ └── transformer_decoder │ │ │ ├── __init__.py │ │ │ ├── mask2former_transformer_decoder.py │ │ │ ├── maskformer_transformer_decoder.py │ │ │ ├── position_encoding.py │ │ │ └── transformer.py │ ├── test_time_augmentation.py │ └── utils │ │ ├── __init__.py │ │ └── misc.py ├── mask2former_video │ ├── __init__.py │ ├── config.py │ ├── data_video │ │ ├── __init__.py │ │ ├── augmentation.py │ │ ├── build.py │ │ ├── dataset_mapper.py │ │ ├── datasets │ │ │ ├── __init__.py │ │ │ ├── builtin.py │ │ │ ├── ytvis.py │ │ │ └── ytvis_api │ │ │ │ ├── __init__.py │ │ │ │ ├── ytvos.py │ │ │ │ └── ytvoseval.py │ │ └── ytvis_eval.py │ ├── modeling │ │ ├── __init__.py │ │ ├── criterion.py │ │ ├── matcher.py │ │ └── transformer_decoder │ │ │ ├── __init__.py │ │ │ ├── position_encoding.py │ │ │ └── video_mask2former_transformer_decoder.py │ ├── utils │ │ ├── __init__.py │ │ └── memory.py │ └── video_maskformer_model.py ├── predict.py ├── requirements.txt ├── tools │ ├── README.md │ ├── analyze_model.py │ ├── convert-pretrained-swin-model-to-d2.py │ ├── convert-torchvision-to-d2.py │ ├── evaluate_coco_boundary_ap.py │ └── evaluate_pq_for_semantic_segmentation.py ├── train_net.py └── train_net_video.py ├── synth_utils.py ├── synthgen.py └── text_utils.py /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /assets/demo.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/callsys/FlowText/f5448c95ab5c35a37a5a4a42a77c8a4f7ff8670b/assets/demo.gif -------------------------------------------------------------------------------- /assets/demo.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/callsys/FlowText/f5448c95ab5c35a37a5a4a42a77c8a4f7ff8670b/assets/demo.mp4 -------------------------------------------------------------------------------- /assets/demo_ann.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/callsys/FlowText/f5448c95ab5c35a37a5a4a42a77c8a4f7ff8670b/assets/demo_ann.gif -------------------------------------------------------------------------------- /assets/pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/callsys/FlowText/f5448c95ab5c35a37a5a4a42a77c8a4f7ff8670b/assets/pipeline.png -------------------------------------------------------------------------------- /common.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import signal 3 | from contextlib import contextmanager 4 | 5 | class Color: #pylint: disable=W0232 6 | GRAY=30 7 | RED=31 8 | GREEN=32 9 
| YELLOW=33 10 | BLUE=34 11 | MAGENTA=35 12 | CYAN=36 13 | WHITE=37 14 | CRIMSON=38 15 | 16 | def colorize(num, string, bold=False, highlight = False): 17 | assert isinstance(num, int) 18 | attr = [] 19 | if highlight: num += 10 20 | attr.append(str(num)) 21 | if bold: attr.append('1') 22 | return '\x1b[%sm%s\x1b[0m' % (';'.join(attr), string) 23 | 24 | def colorprint(colorcode, text, o=sys.stdout, bold=False): 25 | o.write(colorize(colorcode, text, bold=bold)) 26 | 27 | def warn(msg): 28 | print (colorize(Color.YELLOW, msg)) 29 | 30 | def error(msg): 31 | print (colorize(Color.RED, msg)) 32 | 33 | # http://stackoverflow.com/questions/366682/how-to-limit-execution-time-of-a-function-call-in-python 34 | class TimeoutException(Exception): pass 35 | @contextmanager 36 | def time_limit(seconds): 37 | def signal_handler(signum, frame): 38 | raise TimeoutException(colorize(Color.RED, " *** Timed out!", highlight=True)) 39 | signal.signal(signal.SIGALRM, signal_handler) 40 | signal.alarm(seconds) 41 | try: 42 | yield 43 | finally: 44 | signal.alarm(0) 45 | -------------------------------------------------------------------------------- /depth/.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | *.pyc 3 | *_disp.jpg 4 | *_disp.npy 5 | *.npz 6 | kitti_data 7 | models 8 | -------------------------------------------------------------------------------- /depth/.layers.py.swp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/callsys/FlowText/f5448c95ab5c35a37a5a4a42a77c8a4f7ff8670b/depth/.layers.py.swp -------------------------------------------------------------------------------- /depth/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | from .kitti_dataset import KITTIRAWDataset, KITTIOdomDataset, KITTIDepthDataset 2 | -------------------------------------------------------------------------------- /depth/experiments/mono+stereo_experiments.sh: -------------------------------------------------------------------------------- 1 | # Our standard mono+stereo model 2 | python ../train.py --model_name MS_640x192 \ 3 | --use_stereo --frame_ids 0 -1 1 4 | 5 | # Our low resolution mono+stereo model 6 | python ../train.py --model_name MS_416x128 \ 7 | --use_stereo --frame_ids 0 -1 1 \ 8 | --height 128 --width 416 9 | 10 | # Our high resolution mono+stereo model 11 | python ../train.py --model_name MS_1024x320 \ 12 | --use_stereo --frame_ids 0 -1 1 \ 13 | --height 320 --width 1024 \ 14 | --load_weights_folder ~/tmp/MS_640x192/models/weights_9 \ 15 | --num_epochs 5 --learning_rate 1e-5 16 | 17 | # Our standard mono+stereo model w/o pretraining 18 | python ../train.py --model_name MS_640x192_no_pt \ 19 | --use_stereo --frame_ids 0 -1 1 \ 20 | --weights_init scratch \ 21 | --num_epochs 30 22 | 23 | # Baseline mono+stereo model, i.e. 
ours with our contributions turned off 24 | python ../train.py --model_name MS_640x192_baseline \ 25 | --use_stereo --frame_ids 0 -1 1 \ 26 | --v1_multiscale --disable_automasking --avg_reprojection 27 | 28 | # Mono+stereo without full-res multiscale 29 | python ../train.py --model_name MS_640x192_no_full_res_ms \ 30 | --use_stereo --frame_ids 0 -1 1 \ 31 | --v1_multiscale 32 | 33 | # Mono+stereo without automasking 34 | python ../train.py --model_name MS_640x192_no_automasking \ 35 | --use_stereo --frame_ids 0 -1 1 \ 36 | --disable_automasking 37 | 38 | # Mono+stereo without min reproj 39 | python ../train.py --model_name MS_640x192_no_min_reproj \ 40 | --use_stereo --frame_ids 0 -1 1 \ 41 | --avg_reprojection 42 | -------------------------------------------------------------------------------- /depth/experiments/mono_experiments.sh: -------------------------------------------------------------------------------- 1 | # Our standard mono model 2 | python ../train.py --model_name M_640x192 3 | 4 | # Our low resolution mono model 5 | python ../train.py --model_name M_416x128 \ 6 | --height 128 --width 416 7 | 8 | # Our high resolution mono model 9 | python ../train.py --model_name M_1024x320 \ 10 | --height 320 --width 1024 \ 11 | --load_weights_folder ~/tmp/M_640x192/models/weights_9 \ 12 | --num_epochs 5 --learning_rate 1e-5 13 | 14 | # Our standard mono model w/o pretraining 15 | python ../train.py --model_name M_640x192_no_pt \ 16 | --weights_init scratch \ 17 | --num_epochs 30 18 | 19 | # Baseline mono model, i.e. ours with our contributions turned off 20 | python ../train.py --model_name M_640x192_baseline \ 21 | --v1_multiscale --disable_automasking --avg_reprojection 22 | 23 | # Mono without full-res multiscale 24 | python ../train.py --model_name M_640x192_no_full_res_ms \ 25 | --v1_multiscale 26 | 27 | # Mono without automasking 28 | python ../train.py --model_name M_640x192_no_automasking \ 29 | --disable_automasking 30 | 31 | # Mono without min reproj 32 | python ../train.py --model_name M_640x192_no_min_reproj \ 33 | --avg_reprojection 34 | 35 | # Mono with Zhou's masking scheme instead of ours 36 | python ../train.py --model_name M_640x192_zhou_masking \ 37 | --disable_automasking --zhou_mask 38 | -------------------------------------------------------------------------------- /depth/experiments/odom_experiments.sh: -------------------------------------------------------------------------------- 1 | # A different kitti dataset is required for odometry training and evaluation. 2 | # This can be downloaded from http://www.cvlibs.net/datasets/kitti/eval_odometry.php 3 | # We assume this has been extracted to the folder ../kitti_data_odom 4 | 5 | # Standard mono odometry model. 
6 | python ../train.py --model_name M_odom \ 7 | --split odom --dataset kitti_odom --data_path ../kitti_data_odom 8 | 9 | # Mono odometry model without Imagenet pretraining 10 | python ../train.py --model_name M_odom_no_pt \ 11 | --split odom --dataset kitti_odom --data_path ../kitti_data_odom \ 12 | --weights_init scratch --num_epochs 30 13 | 14 | # Mono + stereo odometry model 15 | python ../train.py --model_name MS_odom \ 16 | --split odom --dataset kitti_odom --data_path ../kitti_data_odom \ 17 | --use_stereo 18 | 19 | # Mono + stereo odometry model without Imagenet pretraining 20 | python ../train.py --model_name MS_odom_no_pt \ 21 | --split odom --dataset kitti_odom --data_path ../kitti_data_odom \ 22 | --use_stereo \ 23 | --weights_init scratch --num_epochs 30 24 | -------------------------------------------------------------------------------- /depth/experiments/stereo_experiments.sh: -------------------------------------------------------------------------------- 1 | # Our standard stereo model 2 | python ../train.py --model_name S_640x192 \ 3 | --use_stereo --frame_ids 0 --split eigen_full 4 | 5 | # Our low resolution stereo model 6 | python ../train.py --model_name S_416x128 \ 7 | --use_stereo --frame_ids 0 --split eigen_full \ 8 | --height 128 --width 416 9 | 10 | # Our high resolution stereo model 11 | python ../train.py --model_name S_1024x320 \ 12 | --use_stereo --frame_ids 0 --split eigen_full \ 13 | --height 320 --width 1024 \ 14 | --load_weights_folder ~/tmp/S_640x192/models/weights_9 \ 15 | --models_to_load encoder depth \ 16 | --num_epochs 5 --learning_rate 1e-5 17 | 18 | # Our standard stereo model w/o pretraining 19 | python ../train.py --model_name S_640x192_no_pt \ 20 | --use_stereo --frame_ids 0 --split eigen_full \ 21 | --weights_init scratch \ 22 | --num_epochs 30 23 | 24 | # Baseline stereo model, i.e. ours with our contributions turned off 25 | python ../train.py --model_name S_640x192_baseline \ 26 | --use_stereo --frame_ids 0 --split eigen_full \ 27 | --v1_multiscale --disable_automasking 28 | -------------------------------------------------------------------------------- /depth/export_gt_depth.py: -------------------------------------------------------------------------------- 1 | # Copyright Niantic 2019. Patent Pending. All rights reserved. 2 | # 3 | # This software is licensed under the terms of the Monodepth2 licence 4 | # which allows for non-commercial use only, the full terms of which are made 5 | # available in the LICENSE file. 
6 | 7 | from __future__ import absolute_import, division, print_function 8 | 9 | import os 10 | 11 | import argparse 12 | import numpy as np 13 | import PIL.Image as pil 14 | 15 | from utils import readlines 16 | from kitti_utils import generate_depth_map 17 | 18 | 19 | def export_gt_depths_kitti(): 20 | 21 | parser = argparse.ArgumentParser(description='export_gt_depth') 22 | 23 | parser.add_argument('--data_path', 24 | type=str, 25 | help='path to the root of the KITTI data', 26 | required=True) 27 | parser.add_argument('--split', 28 | type=str, 29 | help='which split to export gt from', 30 | required=True, 31 | choices=["eigen", "eigen_benchmark"]) 32 | opt = parser.parse_args() 33 | 34 | split_folder = os.path.join(os.path.dirname(__file__), "splits", opt.split) 35 | lines = readlines(os.path.join(split_folder, "test_files.txt")) 36 | 37 | print("Exporting ground truth depths for {}".format(opt.split)) 38 | 39 | gt_depths = [] 40 | for line in lines: 41 | 42 | folder, frame_id, _ = line.split() 43 | frame_id = int(frame_id) 44 | 45 | if opt.split == "eigen": 46 | calib_dir = os.path.join(opt.data_path, folder.split("/")[0]) 47 | velo_filename = os.path.join(opt.data_path, folder, 48 | "velodyne_points/data", "{:010d}.bin".format(frame_id)) 49 | gt_depth = generate_depth_map(calib_dir, velo_filename, 2, True) 50 | elif opt.split == "eigen_benchmark": 51 | gt_depth_path = os.path.join(opt.data_path, folder, "proj_depth", 52 | "groundtruth", "image_02", "{:010d}.png".format(frame_id)) 53 | gt_depth = np.array(pil.open(gt_depth_path)).astype(np.float32) / 256 54 | 55 | gt_depths.append(gt_depth.astype(np.float32)) 56 | 57 | output_path = os.path.join(split_folder, "gt_depths.npz") 58 | 59 | print("Saving to {}".format(opt.split)) 60 | 61 | np.savez_compressed(output_path, data=np.array(gt_depths)) 62 | 63 | 64 | if __name__ == "__main__": 65 | export_gt_depths_kitti() 66 | -------------------------------------------------------------------------------- /depth/kitti_utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, division, print_function 2 | 3 | import os 4 | import numpy as np 5 | from collections import Counter 6 | 7 | 8 | def load_velodyne_points(filename): 9 | """Load 3D point cloud from KITTI file format 10 | (adapted from https://github.com/hunse/kitti) 11 | """ 12 | points = np.fromfile(filename, dtype=np.float32).reshape(-1, 4) 13 | points[:, 3] = 1.0 # homogeneous 14 | return points 15 | 16 | 17 | def read_calib_file(path): 18 | """Read KITTI calibration file 19 | (from https://github.com/hunse/kitti) 20 | """ 21 | float_chars = set("0123456789.e+- ") 22 | data = {} 23 | with open(path, 'r') as f: 24 | for line in f.readlines(): 25 | key, value = line.split(':', 1) 26 | value = value.strip() 27 | data[key] = value 28 | if float_chars.issuperset(value): 29 | # try to cast to float array 30 | try: 31 | data[key] = np.array(list(map(float, value.split(' ')))) 32 | except ValueError: 33 | # casting error: data[key] already eq. 
value, so pass 34 | pass 35 | 36 | return data 37 | 38 | 39 | def sub2ind(matrixSize, rowSub, colSub): 40 | """Convert row, col matrix subscripts to linear indices 41 | """ 42 | m, n = matrixSize 43 | return rowSub * (n-1) + colSub - 1 44 | 45 | 46 | def generate_depth_map(calib_dir, velo_filename, cam=2, vel_depth=False): 47 | """Generate a depth map from velodyne data 48 | """ 49 | # load calibration files 50 | cam2cam = read_calib_file(os.path.join(calib_dir, 'calib_cam_to_cam.txt')) 51 | velo2cam = read_calib_file(os.path.join(calib_dir, 'calib_velo_to_cam.txt')) 52 | velo2cam = np.hstack((velo2cam['R'].reshape(3, 3), velo2cam['T'][..., np.newaxis])) 53 | velo2cam = np.vstack((velo2cam, np.array([0, 0, 0, 1.0]))) 54 | 55 | # get image shape 56 | im_shape = cam2cam["S_rect_02"][::-1].astype(np.int32) 57 | 58 | # compute projection matrix velodyne->image plane 59 | R_cam2rect = np.eye(4) 60 | R_cam2rect[:3, :3] = cam2cam['R_rect_00'].reshape(3, 3) 61 | P_rect = cam2cam['P_rect_0'+str(cam)].reshape(3, 4) 62 | P_velo2im = np.dot(np.dot(P_rect, R_cam2rect), velo2cam) 63 | 64 | # load velodyne points and remove all behind image plane (approximation) 65 | # each row of the velodyne data is forward, left, up, reflectance 66 | velo = load_velodyne_points(velo_filename) 67 | velo = velo[velo[:, 0] >= 0, :] 68 | 69 | # project the points to the camera 70 | velo_pts_im = np.dot(P_velo2im, velo.T).T 71 | velo_pts_im[:, :2] = velo_pts_im[:, :2] / velo_pts_im[:, 2][..., np.newaxis] 72 | 73 | if vel_depth: 74 | velo_pts_im[:, 2] = velo[:, 0] 75 | 76 | # check if in bounds 77 | # use minus 1 to get the exact same value as KITTI matlab code 78 | velo_pts_im[:, 0] = np.round(velo_pts_im[:, 0]) - 1 79 | velo_pts_im[:, 1] = np.round(velo_pts_im[:, 1]) - 1 80 | val_inds = (velo_pts_im[:, 0] >= 0) & (velo_pts_im[:, 1] >= 0) 81 | val_inds = val_inds & (velo_pts_im[:, 0] < im_shape[1]) & (velo_pts_im[:, 1] < im_shape[0]) 82 | velo_pts_im = velo_pts_im[val_inds, :] 83 | 84 | # project to image 85 | depth = np.zeros((im_shape[:2])) 86 | depth[velo_pts_im[:, 1].astype(np.int), velo_pts_im[:, 0].astype(np.int)] = velo_pts_im[:, 2] 87 | 88 | # find the duplicate points and choose the closest depth 89 | inds = sub2ind(depth.shape, velo_pts_im[:, 1], velo_pts_im[:, 0]) 90 | dupe_inds = [item for item, count in Counter(inds).items() if count > 1] 91 | for dd in dupe_inds: 92 | pts = np.where(inds == dd)[0] 93 | x_loc = int(velo_pts_im[pts[0], 0]) 94 | y_loc = int(velo_pts_im[pts[0], 1]) 95 | depth[y_loc, x_loc] = velo_pts_im[pts, 2].min() 96 | depth[depth < 0] = 0 97 | 98 | return depth 99 | -------------------------------------------------------------------------------- /depth/networks/__init__.py: -------------------------------------------------------------------------------- 1 | from .resnet_encoder import ResnetEncoder 2 | from .depth_decoder import DepthDecoder 3 | from .pose_decoder import PoseDecoder 4 | from .pose_cnn import PoseCNN 5 | -------------------------------------------------------------------------------- /depth/networks/depth_decoder.py: -------------------------------------------------------------------------------- 1 | # Copyright Niantic 2019. Patent Pending. All rights reserved. 2 | # 3 | # This software is licensed under the terms of the Monodepth2 licence 4 | # which allows for non-commercial use only, the full terms of which are made 5 | # available in the LICENSE file. 
6 | 7 | from __future__ import absolute_import, division, print_function 8 | 9 | import numpy as np 10 | import torch 11 | import torch.nn as nn 12 | 13 | from collections import OrderedDict 14 | from layers import * 15 | 16 | 17 | class DepthDecoder(nn.Module): 18 | def __init__(self, num_ch_enc, scales=range(4), num_output_channels=1, use_skips=True): 19 | super(DepthDecoder, self).__init__() 20 | 21 | self.num_output_channels = num_output_channels 22 | self.use_skips = use_skips 23 | self.upsample_mode = 'nearest' 24 | self.scales = scales 25 | 26 | self.num_ch_enc = num_ch_enc 27 | self.num_ch_dec = np.array([16, 32, 64, 128, 256]) 28 | 29 | # decoder 30 | self.convs = OrderedDict() 31 | for i in range(4, -1, -1): 32 | # upconv_0 33 | num_ch_in = self.num_ch_enc[-1] if i == 4 else self.num_ch_dec[i + 1] 34 | num_ch_out = self.num_ch_dec[i] 35 | self.convs[("upconv", i, 0)] = ConvBlock(num_ch_in, num_ch_out) 36 | 37 | # upconv_1 38 | num_ch_in = self.num_ch_dec[i] 39 | if self.use_skips and i > 0: 40 | num_ch_in += self.num_ch_enc[i - 1] 41 | num_ch_out = self.num_ch_dec[i] 42 | self.convs[("upconv", i, 1)] = ConvBlock(num_ch_in, num_ch_out) 43 | 44 | for s in self.scales: 45 | self.convs[("dispconv", s)] = Conv3x3(self.num_ch_dec[s], self.num_output_channels) 46 | 47 | self.decoder = nn.ModuleList(list(self.convs.values())) 48 | self.sigmoid = nn.Sigmoid() 49 | 50 | def forward(self, input_features): 51 | self.outputs = {} 52 | 53 | # decoder 54 | x = input_features[-1] 55 | for i in range(4, -1, -1): 56 | x = self.convs[("upconv", i, 0)](x) 57 | x = [upsample(x)] 58 | if self.use_skips and i > 0: 59 | x += [input_features[i - 1]] 60 | x = torch.cat(x, 1) 61 | x = self.convs[("upconv", i, 1)](x) 62 | if i in self.scales: 63 | self.outputs[("disp", i)] = self.sigmoid(self.convs[("dispconv", i)](x)) 64 | 65 | return self.outputs 66 | -------------------------------------------------------------------------------- /depth/networks/pose_cnn.py: -------------------------------------------------------------------------------- 1 | # Copyright Niantic 2019. Patent Pending. All rights reserved. 2 | # 3 | # This software is licensed under the terms of the Monodepth2 licence 4 | # which allows for non-commercial use only, the full terms of which are made 5 | # available in the LICENSE file. 
6 | 7 | from __future__ import absolute_import, division, print_function 8 | 9 | import torch 10 | import torch.nn as nn 11 | 12 | 13 | class PoseCNN(nn.Module): 14 | def __init__(self, num_input_frames): 15 | super(PoseCNN, self).__init__() 16 | 17 | self.num_input_frames = num_input_frames 18 | 19 | self.convs = {} 20 | self.convs[0] = nn.Conv2d(3 * num_input_frames, 16, 7, 2, 3) 21 | self.convs[1] = nn.Conv2d(16, 32, 5, 2, 2) 22 | self.convs[2] = nn.Conv2d(32, 64, 3, 2, 1) 23 | self.convs[3] = nn.Conv2d(64, 128, 3, 2, 1) 24 | self.convs[4] = nn.Conv2d(128, 256, 3, 2, 1) 25 | self.convs[5] = nn.Conv2d(256, 256, 3, 2, 1) 26 | self.convs[6] = nn.Conv2d(256, 256, 3, 2, 1) 27 | 28 | self.pose_conv = nn.Conv2d(256, 6 * (num_input_frames - 1), 1) 29 | 30 | self.num_convs = len(self.convs) 31 | 32 | self.relu = nn.ReLU(True) 33 | 34 | self.net = nn.ModuleList(list(self.convs.values())) 35 | 36 | def forward(self, out): 37 | 38 | for i in range(self.num_convs): 39 | out = self.convs[i](out) 40 | out = self.relu(out) 41 | 42 | out = self.pose_conv(out) 43 | out = out.mean(3).mean(2) 44 | 45 | out = 0.01 * out.view(-1, self.num_input_frames - 1, 1, 6) 46 | 47 | axisangle = out[..., :3] 48 | translation = out[..., 3:] 49 | 50 | return axisangle, translation 51 | -------------------------------------------------------------------------------- /depth/networks/pose_decoder.py: -------------------------------------------------------------------------------- 1 | # Copyright Niantic 2019. Patent Pending. All rights reserved. 2 | # 3 | # This software is licensed under the terms of the Monodepth2 licence 4 | # which allows for non-commercial use only, the full terms of which are made 5 | # available in the LICENSE file. 6 | 7 | from __future__ import absolute_import, division, print_function 8 | 9 | import torch 10 | import torch.nn as nn 11 | from collections import OrderedDict 12 | 13 | 14 | class PoseDecoder(nn.Module): 15 | def __init__(self, num_ch_enc, num_input_features, num_frames_to_predict_for=None, stride=1): 16 | super(PoseDecoder, self).__init__() 17 | 18 | self.num_ch_enc = num_ch_enc 19 | self.num_input_features = num_input_features 20 | 21 | if num_frames_to_predict_for is None: 22 | num_frames_to_predict_for = num_input_features - 1 23 | self.num_frames_to_predict_for = num_frames_to_predict_for 24 | 25 | self.convs = OrderedDict() 26 | self.convs[("squeeze")] = nn.Conv2d(self.num_ch_enc[-1], 256, 1) 27 | self.convs[("pose", 0)] = nn.Conv2d(num_input_features * 256, 256, 3, stride, 1) 28 | self.convs[("pose", 1)] = nn.Conv2d(256, 256, 3, stride, 1) 29 | self.convs[("pose", 2)] = nn.Conv2d(256, 6 * num_frames_to_predict_for, 1) 30 | 31 | self.relu = nn.ReLU() 32 | 33 | self.net = nn.ModuleList(list(self.convs.values())) 34 | 35 | def forward(self, input_features): 36 | last_features = [f[-1] for f in input_features] 37 | 38 | cat_features = [self.relu(self.convs["squeeze"](f)) for f in last_features] 39 | cat_features = torch.cat(cat_features, 1) 40 | 41 | out = cat_features 42 | for i in range(3): 43 | out = self.convs[("pose", i)](out) 44 | if i != 2: 45 | out = self.relu(out) 46 | 47 | out = out.mean(3).mean(2) 48 | 49 | out = 0.01 * out.view(-1, self.num_frames_to_predict_for, 1, 6) 50 | 51 | axisangle = out[..., :3] 52 | translation = out[..., 3:] 53 | 54 | return axisangle, translation 55 | -------------------------------------------------------------------------------- /depth/networks/resnet_encoder.py: -------------------------------------------------------------------------------- 
1 | # Copyright Niantic 2019. Patent Pending. All rights reserved. 2 | # 3 | # This software is licensed under the terms of the Monodepth2 licence 4 | # which allows for non-commercial use only, the full terms of which are made 5 | # available in the LICENSE file. 6 | 7 | from __future__ import absolute_import, division, print_function 8 | 9 | import numpy as np 10 | 11 | import torch 12 | import torch.nn as nn 13 | import torchvision.models as models 14 | import torch.utils.model_zoo as model_zoo 15 | 16 | 17 | class ResNetMultiImageInput(models.ResNet): 18 | """Constructs a resnet model with varying number of input images. 19 | Adapted from https://github.com/pytorch/vision/blob/master/torchvision/models/resnet.py 20 | """ 21 | def __init__(self, block, layers, num_classes=1000, num_input_images=1): 22 | super(ResNetMultiImageInput, self).__init__(block, layers) 23 | self.inplanes = 64 24 | self.conv1 = nn.Conv2d( 25 | num_input_images * 3, 64, kernel_size=7, stride=2, padding=3, bias=False) 26 | self.bn1 = nn.BatchNorm2d(64) 27 | self.relu = nn.ReLU(inplace=True) 28 | self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) 29 | self.layer1 = self._make_layer(block, 64, layers[0]) 30 | self.layer2 = self._make_layer(block, 128, layers[1], stride=2) 31 | self.layer3 = self._make_layer(block, 256, layers[2], stride=2) 32 | self.layer4 = self._make_layer(block, 512, layers[3], stride=2) 33 | 34 | for m in self.modules(): 35 | if isinstance(m, nn.Conv2d): 36 | nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') 37 | elif isinstance(m, nn.BatchNorm2d): 38 | nn.init.constant_(m.weight, 1) 39 | nn.init.constant_(m.bias, 0) 40 | 41 | 42 | def resnet_multiimage_input(num_layers, pretrained=False, num_input_images=1): 43 | """Constructs a ResNet model. 44 | Args: 45 | num_layers (int): Number of resnet layers. 
Must be 18 or 50 46 | pretrained (bool): If True, returns a model pre-trained on ImageNet 47 | num_input_images (int): Number of frames stacked as input 48 | """ 49 | assert num_layers in [18, 50], "Can only run with 18 or 50 layer resnet" 50 | blocks = {18: [2, 2, 2, 2], 50: [3, 4, 6, 3]}[num_layers] 51 | block_type = {18: models.resnet.BasicBlock, 50: models.resnet.Bottleneck}[num_layers] 52 | model = ResNetMultiImageInput(block_type, blocks, num_input_images=num_input_images) 53 | 54 | if pretrained: 55 | loaded = model_zoo.load_url(models.resnet.model_urls['resnet{}'.format(num_layers)]) 56 | loaded['conv1.weight'] = torch.cat( 57 | [loaded['conv1.weight']] * num_input_images, 1) / num_input_images 58 | model.load_state_dict(loaded) 59 | return model 60 | 61 | 62 | class ResnetEncoder(nn.Module): 63 | """Pytorch module for a resnet encoder 64 | """ 65 | def __init__(self, num_layers, pretrained, num_input_images=1): 66 | super(ResnetEncoder, self).__init__() 67 | 68 | self.num_ch_enc = np.array([64, 64, 128, 256, 512]) 69 | 70 | resnets = {18: models.resnet18, 71 | 34: models.resnet34, 72 | 50: models.resnet50, 73 | 101: models.resnet101, 74 | 152: models.resnet152} 75 | 76 | if num_layers not in resnets: 77 | raise ValueError("{} is not a valid number of resnet layers".format(num_layers)) 78 | 79 | if num_input_images > 1: 80 | self.encoder = resnet_multiimage_input(num_layers, pretrained, num_input_images) 81 | else: 82 | self.encoder = resnets[num_layers](pretrained) 83 | 84 | if num_layers > 34: 85 | self.num_ch_enc[1:] *= 4 86 | 87 | def forward(self, input_image): 88 | self.features = [] 89 | x = (input_image - 0.45) / 0.225 90 | x = self.encoder.conv1(x) 91 | x = self.encoder.bn1(x) 92 | self.features.append(self.encoder.relu(x)) 93 | self.features.append(self.encoder.layer1(self.encoder.maxpool(self.features[-1]))) 94 | self.features.append(self.encoder.layer2(self.features[-1])) 95 | self.features.append(self.encoder.layer3(self.features[-1])) 96 | self.features.append(self.encoder.layer4(self.features[-1])) 97 | 98 | return self.features 99 | -------------------------------------------------------------------------------- /depth/train.py: -------------------------------------------------------------------------------- 1 | # Copyright Niantic 2019. Patent Pending. All rights reserved. 2 | # 3 | # This software is licensed under the terms of the Monodepth2 licence 4 | # which allows for non-commercial use only, the full terms of which are made 5 | # available in the LICENSE file. 
6 | 7 | from __future__ import absolute_import, division, print_function 8 | 9 | from trainer import Trainer 10 | from options import MonodepthOptions 11 | 12 | options = MonodepthOptions() 13 | opts = options.parse() 14 | 15 | 16 | if __name__ == "__main__": 17 | trainer = Trainer(opts) 18 | trainer.train() 19 | -------------------------------------------------------------------------------- /depth/zyz_test.py: -------------------------------------------------------------------------------- 1 | image = 'assets/test_image.jpg' 2 | depth = 'assets/test_image_disp.npy' 3 | 4 | import cv2 5 | import numpy as np 6 | import matplotlib.pyplot as plt 7 | 8 | if __name__ == '__main__': 9 | 10 | img = np.load(image) 11 | dp = np.load(depth) 12 | 13 | print(img) -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: flowtext 2 | channels: 3 | - defaults 4 | dependencies: 5 | - _libgcc_mutex=0.1=main 6 | - _openmp_mutex=5.1=1_gnu 7 | - ca-certificates=2023.01.10=h06a4308_0 8 | - ld_impl_linux-64=2.38=h1181459_1 9 | - libffi=3.4.2=h6a678d5_6 10 | - libgcc-ng=11.2.0=h1234567_1 11 | - libgomp=11.2.0=h1234567_1 12 | - libstdcxx-ng=11.2.0=h1234567_1 13 | - ncurses=6.4=h6a678d5_0 14 | - openssl=1.1.1t=h7f8727e_0 15 | - pip=23.0.1=py38h06a4308_0 16 | - python=3.8.16=h7a1cb2a_3 17 | - readline=8.2=h5eee18b_0 18 | - setuptools=66.0.0=py38h06a4308_0 19 | - sqlite=3.41.2=h5eee18b_0 20 | - tk=8.6.12=h1ccaba5_0 21 | - wheel=0.38.4=py38h06a4308_0 22 | - xz=5.2.10=h5eee18b_1 23 | - zlib=1.2.13=h5eee18b_0 24 | - pip: 25 | - absl-py==1.4.0 26 | - antlr4-python3-runtime==4.9.3 27 | - black==23.3.0 28 | - cachetools==5.3.0 29 | - certifi==2022.12.7 30 | - charset-normalizer==3.1.0 31 | - click==8.1.3 32 | - cloudpickle==2.2.1 33 | - contourpy==1.0.7 34 | - cycler==0.11.0 35 | - cython==0.29.34 36 | - einops==0.6.1 37 | - filelock==3.12.0 38 | - fonttools==4.39.3 39 | - fsspec==2023.4.0 40 | - fvcore==0.1.5.post20221221 41 | - google-auth==2.17.3 42 | - google-auth-oauthlib==1.0.0 43 | - grpcio==1.54.0 44 | - h5py==3.8.0 45 | - huggingface-hub==0.14.1 46 | - hydra-core==1.3.2 47 | - idna==3.4 48 | - imageio==2.28.1 49 | - importlib-metadata==6.6.0 50 | - importlib-resources==5.12.0 51 | - iopath==0.1.9 52 | - kiwisolver==1.4.4 53 | - lazy-loader==0.2 54 | - markdown==3.4.3 55 | - markupsafe==2.1.2 56 | - matplotlib==3.7.1 57 | - multiscaledeformableattention==1.0 58 | - mypy-extensions==1.0.0 59 | - networkx==3.1 60 | - numpy==1.24.3 61 | - oauthlib==3.2.2 62 | - omegaconf==2.3.0 63 | - opencv-python==4.7.0.72 64 | - packaging==23.1 65 | - pathspec==0.11.1 66 | - pillow==9.5.0 67 | - platformdirs==3.5.0 68 | - portalocker==2.7.0 69 | - protobuf==4.22.3 70 | - pyasn1==0.5.0 71 | - pyasn1-modules==0.3.0 72 | - pycocotools==2.0.6 73 | - pygame==2.0.0 74 | - pyparsing==3.0.9 75 | - python-dateutil==2.8.2 76 | - pywavelets==1.4.1 77 | - pyyaml==6.0 78 | - requests==2.29.0 79 | - requests-oauthlib==1.3.1 80 | - rsa==4.9 81 | - scikit-image==0.20.0 82 | - scipy==1.9.1 83 | - shapely==2.0.1 84 | - six==1.16.0 85 | - submitit==1.4.5 86 | - tabulate==0.9.0 87 | - tensorboard==2.12.2 88 | - tensorboard-data-server==0.7.0 89 | - tensorboard-plugin-wit==1.8.1 90 | - termcolor==2.3.0 91 | - tifffile==2023.4.12 92 | - timm==0.6.13 93 | - tomli==2.0.1 94 | - torch==1.9.0+cu111 95 | - torchaudio==0.9.0 96 | - torchvision==0.10.0+cu111 97 | - tqdm==4.65.0 98 | - typing-extensions==4.5.0 99 | - 
urllib3==1.26.15 100 | - werkzeug==2.3.3 101 | - wget==3.2 102 | - yacs==0.1.8 103 | - zipp==3.15.0 104 | prefix: /home/zyz/anaconda3/envs/flowtext 105 | -------------------------------------------------------------------------------- /flow/.gitignore: -------------------------------------------------------------------------------- 1 | *__pycache__ 2 | .idea 3 | results 4 | -------------------------------------------------------------------------------- /flow/README.md: -------------------------------------------------------------------------------- 1 | # Learning to Estimate Hidden Motions with Global Motion Aggregation 2 | This repository contains the source code for our paper: 3 | 4 | [Learning to Estimate Hidden Motions with Global Motion Aggregation](https://arxiv.org/abs/2104.02409)
5 | ICCV 2021
6 | **Shihao Jiang**, Dylan Campbell, Yao Lu, Hongdong Li, Richard Hartley
7 | ANU, Oxford
8 | 9 | ## Environments 10 | You will have to choose cudatoolkit version to match your compute environment. 11 | The code is tested on PyTorch 1.8.0 but other versions might also work. 12 | ```Shell 13 | conda create --name gma python==3.7 14 | conda activate gma 15 | conda install pytorch=1.8.0 torchvision=0.9.0 cudatoolkit=11.1 -c pytorch -c conda-forge 16 | pip install matplotlib imageio einops scipy opencv-python 17 | ``` 18 | ## Demo 19 | ```Shell 20 | sh demo.sh 21 | ``` 22 | ## Train 23 | ```Shell 24 | sh train.sh 25 | ``` 26 | ## Evaluate 27 | ```Shell 28 | sh evaluate.sh 29 | ``` 30 | ## License 31 | WTFPL. See [LICENSE](LICENSE) file. 32 | 33 | ## Acknowledgement 34 | The overall code framework is adapted from [RAFT](https://github.com/princeton-vl/RAFT). We 35 | thank the authors for the contribution. We also thank [Phil Wang](https://github.com/lucidrains) 36 | for open-sourcing transformer implementations. 37 | -------------------------------------------------------------------------------- /flow/core/corr.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import math 5 | from utils.utils import bilinear_sampler, coords_grid 6 | # from compute_sparse_correlation import compute_sparse_corr, compute_sparse_corr_torch, compute_sparse_corr_mink 7 | 8 | try: 9 | import alt_cuda_corr 10 | except: 11 | # alt_cuda_corr is not compiled 12 | pass 13 | 14 | 15 | class CorrBlock: 16 | def __init__(self, fmap1, fmap2, num_levels=4, radius=4): 17 | self.num_levels = num_levels 18 | self.radius = radius 19 | self.corr_pyramid = [] 20 | 21 | # all pairs correlation 22 | corr = CorrBlock.corr(fmap1, fmap2) 23 | 24 | batch, h1, w1, dim, h2, w2 = corr.shape 25 | corr = corr.reshape(batch * h1 * w1, dim, h2, w2) 26 | 27 | self.corr_pyramid.append(corr) 28 | for i in range(self.num_levels - 1): 29 | corr = F.avg_pool2d(corr, 2, stride=2) 30 | self.corr_pyramid.append(corr) 31 | 32 | def __call__(self, coords): 33 | r = self.radius 34 | coords = coords.permute(0, 2, 3, 1) 35 | batch, h1, w1, _ = coords.shape 36 | 37 | out_pyramid = [] 38 | for i in range(self.num_levels): 39 | corr = self.corr_pyramid[i] 40 | dx = torch.linspace(-r, r, 2 * r + 1) 41 | dy = torch.linspace(-r, r, 2 * r + 1) 42 | delta = torch.stack(torch.meshgrid(dy, dx), axis=-1).to(coords.device) 43 | 44 | centroid_lvl = coords.reshape(batch * h1 * w1, 1, 1, 2) / 2 ** i 45 | delta_lvl = delta.view(1, 2 * r + 1, 2 * r + 1, 2) 46 | coords_lvl = centroid_lvl + delta_lvl 47 | 48 | corr = bilinear_sampler(corr, coords_lvl) 49 | corr = corr.view(batch, h1, w1, -1) 50 | out_pyramid.append(corr) 51 | 52 | out = torch.cat(out_pyramid, dim=-1) 53 | return out.permute(0, 3, 1, 2).contiguous().float() 54 | 55 | @staticmethod 56 | def corr(fmap1, fmap2): 57 | batch, dim, ht, wd = fmap1.shape 58 | fmap1 = fmap1.view(batch, dim, ht * wd) 59 | fmap2 = fmap2.view(batch, dim, ht * wd) 60 | 61 | corr = torch.matmul(fmap1.transpose(1, 2), fmap2) 62 | corr = corr.view(batch, ht, wd, 1, ht, wd) 63 | return corr / torch.sqrt(torch.tensor(dim).float()) 64 | 65 | 66 | class CorrBlockSingleScale(nn.Module): 67 | def __init__(self, fmap1, fmap2, num_levels=4, radius=4): 68 | super().__init__() 69 | self.radius = radius 70 | 71 | # all pairs correlation 72 | corr = CorrBlock.corr(fmap1, fmap2) 73 | batch, h1, w1, dim, h2, w2 = corr.shape 74 | self.corr = corr.reshape(batch * h1 * w1, dim, h2, w2) 75 | 76 | def __call__(self, coords): 77 | r = 
self.radius 78 | coords = coords.permute(0, 2, 3, 1) 79 | batch, h1, w1, _ = coords.shape 80 | 81 | corr = self.corr 82 | dx = torch.linspace(-r, r, 2 * r + 1) 83 | dy = torch.linspace(-r, r, 2 * r + 1) 84 | delta = torch.stack(torch.meshgrid(dy, dx), axis=-1).to(coords.device) 85 | 86 | centroid_lvl = coords.reshape(batch * h1 * w1, 1, 1, 2) 87 | delta_lvl = delta.view(1, 2 * r + 1, 2 * r + 1, 2) 88 | coords_lvl = centroid_lvl + delta_lvl 89 | 90 | corr = bilinear_sampler(corr, coords_lvl) 91 | out = corr.view(batch, h1, w1, -1) 92 | out = out.permute(0, 3, 1, 2).contiguous().float() 93 | return out 94 | 95 | @staticmethod 96 | def corr(fmap1, fmap2): 97 | batch, dim, ht, wd = fmap1.shape 98 | fmap1 = fmap1.view(batch, dim, ht * wd) 99 | fmap2 = fmap2.view(batch, dim, ht * wd) 100 | 101 | corr = torch.matmul(fmap1.transpose(1, 2), fmap2) 102 | corr = corr.view(batch, ht, wd, 1, ht, wd) 103 | return corr / torch.sqrt(torch.tensor(dim).float()) 104 | -------------------------------------------------------------------------------- /flow/core/gma.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn, einsum 3 | from einops import rearrange 4 | 5 | 6 | class RelPosEmb(nn.Module): 7 | def __init__( 8 | self, 9 | max_pos_size, 10 | dim_head 11 | ): 12 | super().__init__() 13 | self.rel_height = nn.Embedding(2 * max_pos_size - 1, dim_head) 14 | self.rel_width = nn.Embedding(2 * max_pos_size - 1, dim_head) 15 | 16 | deltas = torch.arange(max_pos_size).view(1, -1) - torch.arange(max_pos_size).view(-1, 1) 17 | rel_ind = deltas + max_pos_size - 1 18 | self.register_buffer('rel_ind', rel_ind) 19 | 20 | def forward(self, q): 21 | batch, heads, h, w, c = q.shape 22 | height_emb = self.rel_height(self.rel_ind[:h, :h].reshape(-1)) 23 | width_emb = self.rel_width(self.rel_ind[:w, :w].reshape(-1)) 24 | 25 | height_emb = rearrange(height_emb, '(x u) d -> x u () d', x=h) 26 | width_emb = rearrange(width_emb, '(y v) d -> y () v d', y=w) 27 | 28 | height_score = einsum('b h x y d, x u v d -> b h x y u v', q, height_emb) 29 | width_score = einsum('b h x y d, y u v d -> b h x y u v', q, width_emb) 30 | 31 | return height_score + width_score 32 | 33 | 34 | class Attention(nn.Module): 35 | def __init__( 36 | self, 37 | *, 38 | args, 39 | dim, 40 | max_pos_size = 100, 41 | heads = 4, 42 | dim_head = 128, 43 | ): 44 | super().__init__() 45 | self.args = args 46 | self.heads = heads 47 | self.scale = dim_head ** -0.5 48 | inner_dim = heads * dim_head 49 | 50 | self.to_qk = nn.Conv2d(dim, inner_dim * 2, 1, bias=False) 51 | 52 | self.pos_emb = RelPosEmb(max_pos_size, dim_head) 53 | 54 | def forward(self, fmap): 55 | heads, b, c, h, w = self.heads, *fmap.shape 56 | 57 | q, k = self.to_qk(fmap).chunk(2, dim=1) 58 | 59 | q, k = map(lambda t: rearrange(t, 'b (h d) x y -> b h x y d', h=heads), (q, k)) 60 | q = self.scale * q 61 | 62 | if self.args.position_only: 63 | sim = self.pos_emb(q) 64 | 65 | elif self.args.position_and_content: 66 | sim_content = einsum('b h x y d, b h u v d -> b h x y u v', q, k) 67 | sim_pos = self.pos_emb(q) 68 | sim = sim_content + sim_pos 69 | 70 | else: 71 | sim = einsum('b h x y d, b h u v d -> b h x y u v', q, k) 72 | 73 | sim = rearrange(sim, 'b h x y u v -> b h (x y) (u v)') 74 | attn = sim.softmax(dim=-1) 75 | 76 | return attn 77 | 78 | 79 | class Aggregate(nn.Module): 80 | def __init__( 81 | self, 82 | args, 83 | dim, 84 | heads = 4, 85 | dim_head = 128, 86 | ): 87 | super().__init__() 88 | self.args = args 89 | 
self.heads = heads 90 | self.scale = dim_head ** -0.5 91 | inner_dim = heads * dim_head 92 | 93 | self.to_v = nn.Conv2d(dim, inner_dim, 1, bias=False) 94 | 95 | self.gamma = nn.Parameter(torch.zeros(1)) 96 | 97 | if dim != inner_dim: 98 | self.project = nn.Conv2d(inner_dim, dim, 1, bias=False) 99 | else: 100 | self.project = None 101 | 102 | def forward(self, attn, fmap): 103 | heads, b, c, h, w = self.heads, *fmap.shape 104 | 105 | v = self.to_v(fmap) 106 | v = rearrange(v, 'b (h d) x y -> b h (x y) d', h=heads) 107 | out = einsum('b h i j, b h j d -> b h i d', attn, v) 108 | out = rearrange(out, 'b h (x y) d -> b (h d) x y', x=h, y=w) 109 | 110 | if self.project is not None: 111 | out = self.project(out) 112 | 113 | out = fmap + self.gamma * out 114 | 115 | return out 116 | 117 | 118 | if __name__ == "__main__": 119 | import argparse 120 | import numpy as np 121 | parser = argparse.ArgumentParser() 122 | parser.add_argument('--position_only', default=False, action='store_true', 123 | help='only use position-wise attention') 124 | parser.add_argument('--position_and_content', default=True, action='store_true', 125 | help='use position and content-wise attention') 126 | args = parser.parse_args() 127 | 128 | 129 | model = Attention(args=args, dim=128, heads=1) 130 | arr = np.random.random((3, 128, 46, 96)).astype(np.float32) 131 | input = torch.Tensor(arr) 132 | 133 | output = model(input) 134 | 135 | print('input:') 136 | print(input.shape) 137 | print('output:') 138 | print(output.shape) 139 | -------------------------------------------------------------------------------- /flow/core/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/callsys/FlowText/f5448c95ab5c35a37a5a4a42a77c8a4f7ff8670b/flow/core/utils/__init__.py -------------------------------------------------------------------------------- /flow/demo.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | python evaluate_single.py --model checkpoints/gma-sintel.pth --path imgs 3 | -------------------------------------------------------------------------------- /flow/evaluate.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | python evaluate.py --model checkpoints/gma-chairs.pth --dataset chairs 3 | python evaluate.py --model checkpoints/gma-things.pth --dataset sintel 4 | python evaluate.py --model checkpoints/gma-sintel.pth --dataset sintel 5 | python evaluate.py --model checkpoints/gma-kitti.pth --dataset kitti -------------------------------------------------------------------------------- /flow/evaluate_single.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | sys.path.append('core') 4 | 5 | import argparse 6 | import os 7 | import cv2 8 | import glob 9 | import numpy as np 10 | import torch 11 | from PIL import Image 12 | import imageio 13 | import matplotlib.pyplot as plt 14 | 15 | from network import RAFTGMA 16 | from utils import flow_viz 17 | from utils.utils import InputPadder 18 | import os 19 | 20 | 21 | DEVICE = 'cuda' 22 | 23 | 24 | def load_image(imfile): 25 | img = np.array(Image.open(imfile)).astype(np.uint8) 26 | img = torch.from_numpy(img).permute(2, 0, 1).float() 27 | return img[None].to(DEVICE) 28 | 29 | 30 | def viz(img, flo, flow_dir): 31 | img = img[0].permute(1, 2, 0).cpu().numpy() 32 | flo = flo[0].permute(1, 2, 0).cpu().numpy() 33 | 34 | # map flow to rgb 
image 35 | flo = flow_viz.flow_to_image(flo) 36 | 37 | imageio.imwrite(os.path.join(flow_dir, 'flo.png'), flo) 38 | print(f"Saving optical flow visualisation at {os.path.join(flow_dir, 'flo.png')}") 39 | 40 | 41 | def normalize(x): 42 | return x / (x.max() - x.min()) 43 | 44 | 45 | def demo(args): 46 | model = torch.nn.DataParallel(RAFTGMA(args)) 47 | model.load_state_dict(torch.load(args.model)) 48 | print(f"Loaded checkpoint at {args.model}") 49 | 50 | model = model.module 51 | model.to(DEVICE) 52 | model.eval() 53 | 54 | flow_dir = os.path.join(args.path, args.model_name) 55 | if not os.path.exists(flow_dir): 56 | os.makedirs(flow_dir) 57 | 58 | with torch.no_grad(): 59 | images = glob.glob(os.path.join(args.path, '*.png')) + \ 60 | glob.glob(os.path.join(args.path, '*.jpg')) 61 | 62 | images = sorted(images) 63 | 64 | for imfile1, imfile2 in zip(images[:-1], images[1:]): 65 | image1 = load_image(imfile1) 66 | image2 = load_image(imfile2) 67 | print(f"Reading in images at {imfile1} and {imfile2}") 68 | 69 | padder = InputPadder(image1.shape) 70 | image1, image2 = padder.pad(image1, image2) 71 | 72 | flow_low, flow_up = model(image1, image2, iters=12, test_mode=True) 73 | print(f"Estimating optical flow...") 74 | 75 | viz(image1, flow_up, flow_dir) 76 | 77 | 78 | if __name__ == '__main__': 79 | parser = argparse.ArgumentParser() 80 | parser.add_argument('--model', help="restore checkpoint") 81 | parser.add_argument('--model_name', help="define model name", default="GMA") 82 | parser.add_argument('--path', help="dataset for evaluation") 83 | parser.add_argument('--num_heads', default=1, type=int, 84 | help='number of heads in attention and aggregation') 85 | parser.add_argument('--position_only', default=False, action='store_true', 86 | help='only use position-wise attention') 87 | parser.add_argument('--position_and_content', default=False, action='store_true', 88 | help='use position and content-wise attention') 89 | parser.add_argument('--mixed_precision', action='store_true', help='use mixed precision') 90 | args = parser.parse_args() 91 | 92 | demo(args) 93 | -------------------------------------------------------------------------------- /flow/train.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | python train.py --name gma-chairs --stage chairs --validation chairs --output results/chairs/gma --num_steps 120000 --lr 0.00025 --image_size 368 496 --wdecay 0.0001 --gpus 0 1 --batch_size 8 --val_freq 10000 --print_freq 100 --mixed_precision 3 | python train.py --name gma-things --stage things --validation sintel --output results/things/gma --restore_ckpt results/chairs/gma/gma-chairs.pth --num_steps 120000 --lr 0.000125 --image_size 400 720 --wdecay 0.0001 --gpus 0 1 --batch_size 6 --val_freq 10000 --print_freq 100 --mixed_precision 4 | python train.py --name gma-sintel --stage sintel --validation sintel --output results/sintel/gma --restore_ckpt results/things/gma/gma-things.pth --num_steps 120000 --lr 0.000125 --image_size 368 768 --wdecay 0.00001 --gamma 0.85 --gpus 0 1 --batch_size 6 --val_freq 10000 --print_freq 100 --mixed_precision 5 | python train.py --name gma-kitti --stage kitti --validation kitti --output results/kitti/gma --restore_ckpt results/sintel/gma/gma-sintel.pth --num_steps 50000 --lr 0.000125 --image_size 288 960 --wdecay 0.00001 --gamma 0.85 --gpus 0 1 --batch_size 6 --val_freq 10000 --print_freq 100 --mixed_precision 6 | -------------------------------------------------------------------------------- 
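The flow/ components above are driven entirely from the shell (demo.sh, train.sh, evaluate.sh). For reference, below is a minimal sketch of the same inference path that evaluate_single.py follows, called directly from Python: the input image paths are placeholders, and the argument fields handed to RAFTGMA are assumed to be the same ones evaluate_single.py defines via argparse (flow/core/network.py is not listed above, so it may require additional fields).

```python
# Minimal sketch (not a repository file): run GMA flow inference on one image pair,
# mirroring flow/evaluate_single.py. Assumes it is run from the flow/ directory and
# that the checkpoint from demo.sh (checkpoints/gma-sintel.pth) exists; the two
# input image paths are placeholders.
import sys
sys.path.append('core')

from argparse import Namespace

import imageio
import numpy as np
import torch
from PIL import Image

from network import RAFTGMA
from utils import flow_viz
from utils.utils import InputPadder

DEVICE = 'cuda'


def load_image(imfile):
    # HxWx3 uint8 image -> 1x3xHxW float tensor on the GPU, as in evaluate_single.py
    img = np.array(Image.open(imfile)).astype(np.uint8)
    img = torch.from_numpy(img).permute(2, 0, 1).float()
    return img[None].to(DEVICE)


# Fields mirror the argparse options of evaluate_single.py (assumed sufficient for RAFTGMA).
args = Namespace(num_heads=1, position_only=False,
                 position_and_content=False, mixed_precision=False)

model = torch.nn.DataParallel(RAFTGMA(args))
model.load_state_dict(torch.load('checkpoints/gma-sintel.pth'))
model = model.module.to(DEVICE).eval()

with torch.no_grad():
    image1 = load_image('imgs/frame_0001.png')   # placeholder path
    image2 = load_image('imgs/frame_0002.png')   # placeholder path
    padder = InputPadder(image1.shape)           # pad to a size the network accepts
    image1, image2 = padder.pad(image1, image2)
    flow_low, flow_up = model(image1, image2, iters=12, test_mode=True)
    # convert the 2-channel flow field to an RGB visualisation and save it
    flow_rgb = flow_viz.flow_to_image(flow_up[0].permute(1, 2, 0).cpu().numpy())
    imageio.imwrite('flo.png', flow_rgb)
```

As in evaluate_single.py, test_mode=True makes the forward pass return both a low-resolution flow field and the upsampled flow used for visualisation.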
/invert_font_size.py: -------------------------------------------------------------------------------- 1 | # Author: Ankush Gupta 2 | # Date: 2015 3 | "Script to generate font-models." 4 | 5 | import pygame 6 | from pygame import freetype 7 | from text_utils import FontState 8 | import numpy as np 9 | import matplotlib.pyplot as plt 10 | import pickle as cp 11 | 12 | 13 | pygame.init() 14 | 15 | 16 | ys = np.arange(8,200) 17 | A = np.c_[ys,np.ones_like(ys)] 18 | 19 | xs = [] 20 | models = {} #linear model 21 | 22 | FS = FontState() 23 | #plt.figure() 24 | for i in range(len(FS.fonts)): 25 | print(i) 26 | font = freetype.Font(FS.fonts[i], size=12) 27 | h = [] 28 | for y in ys: 29 | h.append(font.get_sized_glyph_height(y)) 30 | h = np.array(h) 31 | m,_,_,_ = np.linalg.lstsq(A,h) 32 | models[font.name] = m 33 | xs.append(h) 34 | 35 | with open('font_px2pt.cp','wb') as f: 36 | cp.dump(models,f) 37 | #plt.plot(xs,ys[i]) 38 | #plt.show() 39 | -------------------------------------------------------------------------------- /params.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | params = { 5 | 'text' : { 6 | 'p_text' : {0.7: 'WORD', 0.25: 'LINE', 0.05: 'PARA'}, #{0.7: 'WORD', 0.25: 'LINE', 0.05: 'PARA'}, 7 | 'size' : [50, 10], 8 | 'source' : 'newsgroup/newsgroup.txt', 9 | }, 10 | 'color' : { 11 | 'source' : 'models/colors_new.cp', 12 | 'merge_range' : (0.72, 0.88, 1.0), 13 | 'color_dis' : 0 # 0 14 | }, 15 | 'depth' : { 16 | 'range' : (0.1, 100) # (0.1,100) 17 | }, 18 | 'method' : { 19 | 'version' : 'v4', # v2 | base | v3 20 | 'region_reuse' : 3, 21 | 'postprocess' : 'hw', # hw | None 22 | 'shelter' : False, 23 | 'overlap' : False, # no overlapping text instances 24 | }, 25 | 'generator' : { 26 | 'save' : 'gen_data/joint_10f_909_large', 27 | 'seed' : 18, # random seed 28 | 'tasks' : None,#'gen_data/act_10f_813_base_1k/task.pkl', # 'data/models/tasks_act.pkl' 29 | 'datasets' : ['data/backgrounds/activitynet.txt'], #'data/backgrounds/activitynet.txt', 'data/backgrounds/got10k.txt', 'data/backgrounds/ytvis.txt'], 30 | 'num_workers' : 6, 31 | 'mode' : 'random', # random | round 32 | 'frame_itv' : 5 33 | } 34 | } 35 | 36 | -------------------------------------------------------------------------------- /prep_scripts/floodFill.py: -------------------------------------------------------------------------------- 1 | """ 2 | Python script to "flood-fill" the segments computed using gPb-UCM. 3 | This assigns the same integer label to all the pixels in the same segment. 4 | 5 | Author: Ankush Gupta 6 | """ 7 | 8 | from __future__ import division 9 | import numpy as np 10 | import matplotlib.pyplot as plt 11 | import cv2 12 | import scipy.io as sio 13 | import h5py 14 | import os.path as osp 15 | import multiprocessing as mp 16 | import traceback, sys 17 | 18 | def get_seed(sx,sy,ucm): 19 | n = sx.size 20 | for i in range(n): 21 | if ucm[sx[i]+1,sy[i]+1] == 0: 22 | return (sy[i],sx[i]) 23 | 24 | def get_mask(ucm,viz=False): 25 | ucm = ucm.copy() 26 | h,w = ucm.shape[:2] 27 | mask = np.zeros((h-2,w-2),'float32') 28 | 29 | i = 0 30 | sx,sy = np.where(mask==0) 31 | seed = get_seed(sx,sy,ucm) 32 | areas = [] 33 | labels=[] 34 | while seed is not None and i<1000: 35 | cv2.floodFill(mask,ucm,seed,i+1) 36 | # calculate the area (no. 
of pixels): 37 | areas.append(np.sum(mask==i+1)) 38 | labels.append(i+1) 39 | 40 | # get the location of the next seed: 41 | sx,sy = np.where(mask==0) 42 | seed = get_seed(sx,sy,ucm) 43 | i += 1 44 | print " > terminated in %d steps"%i 45 | 46 | if viz: 47 | plt.imshow(mask) 48 | plt.show() 49 | 50 | return mask,np.array(areas),np.array(labels) 51 | 52 | def get_mask_parallel(ucm_imname): 53 | ucm,imname = ucm_imname 54 | try: 55 | return (get_mask(ucm.T),imname) 56 | except: 57 | return None 58 | #traceback.print_exc(file=sys.stdout) 59 | 60 | def process_db_parallel(base_dir, th=0.11): 61 | """ 62 | Get segmentation masks from gPb contours. 63 | """ 64 | db_path = osp.join(base_dir,'ucm.mat') 65 | out_path = osp.join(base_dir,'seg_uint16.h5') 66 | # output h5 file: 67 | dbo = h5py.File(out_path,'w') 68 | dbo_mask = dbo.create_group("mask") 69 | 70 | class ucm_iterable(object): 71 | def __init__(self,ucm_path,th): 72 | self.th = th 73 | self.ucm_h5 = h5py.File(db_path,'r') 74 | self.N = self.ucm_h5['names'].size 75 | self.i = 0 76 | 77 | def __iter__(self): 78 | return self 79 | 80 | def get_imname(self,i): 81 | return "".join(map(chr, self.ucm_h5[self.ucm_h5['names'][0,self.i]][:])) 82 | 83 | def __stop__(self): 84 | print "DONE" 85 | self.ucm_h5.close() 86 | raise StopIteration 87 | 88 | def get_valid_name(self): 89 | if self.i >= self.N: 90 | self.__stop__() 91 | 92 | imname = self.get_imname(self.i) 93 | while self.i < self.N-1 and len(imname) < 4: 94 | self.i += 1 95 | imname = self.get_imname(self.i) 96 | 97 | if len(imname) < 4: 98 | self.__stop__() 99 | 100 | return imname 101 | 102 | def next(self): 103 | imname = self.get_valid_name() 104 | print "%d of %d"%(self.i+1,self.N) 105 | ucm = self.ucm_h5[self.ucm_h5['ucms'][0,self.i]][:] 106 | ucm = ucm.copy() 107 | self.i += 1 108 | return ((ucm>self.th).astype('uint8'),imname) 109 | 110 | ucm_iter = ucm_iterable(db_path,th) 111 | print "cpu count: ", mp.cpu_count() 112 | parpool = mp.Pool(4) 113 | ucm_result = parpool.imap_unordered(get_mask_parallel, ucm_iter, chunksize=1) 114 | 115 | for res in ucm_result: 116 | if res is None: 117 | continue 118 | ((mask,area,label),imname) = res 119 | print "got back : ", imname 120 | mask = mask.astype('uint16') 121 | mask_dset = dbo_mask.create_dataset(imname, data=mask) 122 | mask_dset.attrs['area'] = area 123 | mask_dset.attrs['label'] = label 124 | 125 | # close the h5 files: 126 | print "closing DB" 127 | dbo.close() 128 | print ">>>> DONE" 129 | 130 | 131 | base_dir = '/home/' # directory containing the ucm.mat, i.e., output of run_ucm.m 132 | process_db_parallel(base_dir) 133 | -------------------------------------------------------------------------------- /prep_scripts/predict_depth.m: -------------------------------------------------------------------------------- 1 | % MATLAB script to regress a depth mask for an image. 
2 | % uses: (1) https://bitbucket.org/fayao/dcnf-fcsp/ 3 | % (2) vlfeat 4 | % (3) matconvnet 5 | 6 | % Author: Ankush Gupta 7 | 8 | function predict_depth() 9 | % setup vlfeat 10 | run( '../libs/vlfeat-0.9.18/toolbox/vl_setup'); 11 | % setup matconvnet 12 | dir_matConvNet='../libs/matconvnet/matlab/'; 13 | addpath(genpath(dir_matConvNet)); 14 | run([dir_matConvNet 'vl_setupnn.m']); 15 | 16 | opts=[]; 17 | opts.useGpu=true; 18 | opts.inpaint = true; 19 | opts.normalize_depth = false; % limit depth to [0,1] 20 | opts.imdir = '/path/to/image/dir'; 21 | 22 | opts.out_h5 = '/path/to/save/output/depth.h5'; 23 | 24 | % these should point to the pre-trained models from: 25 | % https://bitbucket.org/fayao/dcnf-fcsp/ 26 | opts.model_file.indoor = '../model_trained/model_dcnf-fcsp_NYUD2'; 27 | opts.model_file.outdoor = '../model_trained/model_dcnf-fcsp_Make3D'; 28 | 29 | fprintf('\nloading trained model...\n\n'); 30 | mdl = load(opts.model_file.indoor); 31 | model.indoor = mdl.data_obj; 32 | mdl = load(opts.model_file.outdoor); 33 | model.outdoor = mdl.data_obj; 34 | 35 | if gpuDeviceCount==0 36 | fprintf(' ** No GPU found. Using CPU...\n'); 37 | opts.useGpu=false; 38 | end 39 | 40 | imnames = dir(fullfile(opts.imdir),'*'); 41 | imnames = {imnames.name}; 42 | N = numel(imnames); 43 | for i = 1:N 44 | fprintf('%d of %d\n',i,N); 45 | imname = imnames{i}; 46 | imtype = 'outdoor'; 47 | img = read_img_rgb(fullfile(opts.imdir,imname)); 48 | if strcmp(imtype, 'outdoor') 49 | opts.sp_size=16; 50 | opts.max_edge=600; 51 | elseif strcmp(imtype, 'indoor') 52 | opts.sp_size=20; 53 | opts.max_edge=640; 54 | end 55 | depth = get_depth(img,model.(imtype),opts); 56 | save_depth(imname,depth,opts); 57 | end 58 | end 59 | 60 | function save_depth(imname,depth,opts) 61 | dset_name = ['/',imname]; 62 | h5create(opts.out_h5, dset_name, size(depth), 'Datatype', 'single'); 63 | h5write(opts.out_h5, dset_name, depth); 64 | end 65 | 66 | function depth = get_depth(im_rgb,model,opts) 67 | % limit the maximum edge size of the image: 68 | if ~isempty(opts.max_edge) 69 | sz = size(im_rgb); 70 | [~,max_dim] = max(sz(1:2)); 71 | osz = NaN*ones(1,2); 72 | osz(max_dim) = opts.max_edge; 73 | im_rgb = imresize(im_rgb, osz); 74 | end 75 | 76 | % do super-pixels: 77 | fprintf(' > super-pix\n'); 78 | supix = gen_supperpixel_info(im_rgb, opts.sp_size); 79 | pinfo = gen_feature_info_pairwise(im_rgb, supix); 80 | 81 | % build "data-set": 82 | ds=[]; 83 | ds.img_idxes = 1; 84 | ds.img_data = im_rgb; 85 | ds.sp_info{1} = supix; 86 | ds.pws_info = pinfo; 87 | ds.sp_num_imgs = supix.sp_num; 88 | % run cnn: 89 | fprintf(' > CNN\n'); 90 | depth = do_model_evaluate(model, ds, opts); 91 | 92 | if opts.inpaint 93 | fprintf(' > inpaint\n'); 94 | depth = do_inpainting(depth, im_rgb, supix); 95 | end 96 | 97 | if opts.normalize_depth 98 | d_min = min(depth(:)); 99 | d_max = max(depth(:)); 100 | depth = (depth-d_min) / (d_max-d_min); 101 | depth(depth<0) = 0; 102 | depth(depth>1) = 1; 103 | end 104 | end -------------------------------------------------------------------------------- /prep_scripts/run_ucm.m: -------------------------------------------------------------------------------- 1 | % MATLAB script to get Ultrametric Contour Maps for images: 2 | % Clone this github repo first: 3 | % https://github.com/jponttuset/mcg/tree/master/pre-trained 4 | % 5 | % Author: Ankush Gupta 6 | 7 | % path to the directory containing images, which need to be segmented 8 | img_dir = 'dir/containing/images'; 9 | % path to the mcg/pre-trained directory. 
10 | mcg_dir = '/path/to/mcg/pre-trained'; 11 | 12 | imsize = [240,NaN]; 13 | % "install" the MCG toolbox: 14 | run(fullfile(mcg_dir,'install.m')); 15 | 16 | % get the image names: 17 | imname = dir(fullfile(img_dir,'*')); 18 | imname = {imname.name}; 19 | 20 | % process: 21 | names = cell(numel(imname),1); 22 | ucms = cell(numel(imname),1); 23 | 24 | %parpool('AGLocal',4); 25 | parfor i = 1:numel(imname) 26 | fprintf('%d of %d\n',i,numel(imname)); 27 | try 28 | im_name = fullfile(img_dir,imname{i}); 29 | im = imread(im_name); 30 | catch 31 | fprintf('err\n'); 32 | continue; 33 | end 34 | im = uint8(imresize(im,imsize)); 35 | names{i} = imname{i}; 36 | ucms{i} = im2ucm(im,'fast'); 37 | end 38 | save('ucm.mat','ucms','names','-v7.3'); 39 | -------------------------------------------------------------------------------- /ransac.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import random 3 | import numpy as np 4 | 5 | 6 | def fit_plane(xyz,z_pos=None): 7 | """ 8 | if z_pos is not None, the sign 9 | of the normal is flipped to make 10 | the dot product with z_pos (+). 11 | """ 12 | mean = np.mean(xyz,axis=0) 13 | xyz_c = xyz - mean[None,:] 14 | l,v = np.linalg.eig(xyz_c.T.dot(xyz_c)) 15 | abc = v[:,np.argmin(l)] 16 | d = -np.sum(abc*mean) 17 | # unit-norm the plane-normal: 18 | abcd = np.r_[abc,d]/np.linalg.norm(abc) 19 | # flip the normal direction: 20 | if z_pos is not None: 21 | if np.sum(abcd[:3]*z_pos) < 0.0: 22 | abcd *= -1 23 | return abcd 24 | 25 | def fit_plane_ransac(pts, neighbors=None,z_pos=None, dist_inlier=0.05, 26 | min_inlier_frac=0.60, nsample=3, max_iter=100): 27 | """ 28 | Fits a 3D plane model using RANSAC. 29 | pts : (nx3 array) of point coordinates 30 | """ 31 | n,_ = pts.shape 32 | ninlier,models = [],[] 33 | for i in range(max_iter): 34 | if neighbors is None: 35 | p = pts[np.random.choice(pts.shape[0],nsample,replace=False),:] 36 | else: 37 | p = pts[neighbors[:,i],:] 38 | m = fit_plane(p,z_pos) 39 | ds = np.abs(pts.dot(m[:3])+m[3]) 40 | nin = np.sum(ds < dist_inlier) 41 | if nin/pts.shape[0] >= min_inlier_frac: 42 | ninlier.append(nin) 43 | models.append(m) 44 | 45 | if models == []: 46 | print ("RANSAC plane fitting failed!") 47 | return #None 48 | else: #refit the model to inliers: 49 | ninlier = np.array(ninlier) 50 | best_model_idx = np.argsort(-ninlier) 51 | n_refit, m_refit, inliers = [],[],[] 52 | for idx in best_model_idx[:min(10,len(best_model_idx))]: 53 | # re-estimate the model based on inliers: 54 | dists = np.abs(pts.dot(models[idx][:3])+models[idx][3]) 55 | inlier = dists < dist_inlier 56 | m = fit_plane(pts[inlier,:],z_pos) 57 | # compute new inliers: 58 | d = np.abs(pts.dot(m[:3])+m[3]) 59 | inlier = d < dist_inlier/2 # heuristic 60 | n_refit.append(np.sum(inlier)) 61 | m_refit.append(m) 62 | inliers.append(inlier) 63 | best_plane = np.argmax(n_refit) 64 | return m_refit[best_plane],inliers[best_plane] 65 | 66 | 67 | 68 | 69 | if __name__ == '__main__': 70 | from matplotlib import pylab 71 | from mpl_toolkits import mplot3d 72 | fig = pylab.figure() 73 | ax = mplot3d.Axes3D(fig) 74 | 75 | def plot_plane(a, b, c, d): 76 | xx, yy = np.mgrid[10:20, 10:20] 77 | return xx, yy, (-d - a * xx - b * yy) / c 78 | 79 | n = 100 80 | max_iterations = 100 81 | goal_inliers = n * 0.3 82 | 83 | # test data 84 | xyzs = np.random.random((n, 3)) * 10 + 10 85 | xyzs[:90, 2:] = xyzs[:90, :1] 86 | 87 | ax.scatter3D(xyzs.T[0], xyzs.T[1], xyzs.T[2]) 88 | 89 | # RANSAC 90 | m, b = run_ransac(xyzs, 
estimate, lambda x, y: is_inlier(x, y, 0.01), 3, goal_inliers, max_iterations) 91 | a, b, c, d = m 92 | xx, yy, zz = plot_plane(a, b, c, d) 93 | ax.plot_surface(xx, yy, zz, color=(0, 1, 0, 0.5)) 94 | plt.show() 95 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | absl-py==1.4.0 2 | antlr4-python3-runtime==4.9.3 3 | black==23.3.0 4 | cachetools==5.3.0 5 | certifi==2022.12.7 6 | charset-normalizer==3.1.0 7 | click==8.1.3 8 | cloudpickle==2.2.1 9 | contourpy==1.0.7 10 | cycler==0.11.0 11 | Cython==0.29.34 12 | -e git+https://github.com/facebookresearch/detectron2.git@e020497c85873c2b811ac87dd2e4a34a806e4c2b#egg=detectron2 13 | einops==0.6.1 14 | filelock==3.12.0 15 | fonttools==4.39.3 16 | fsspec==2023.4.0 17 | fvcore==0.1.5.post20221221 18 | google-auth==2.17.3 19 | google-auth-oauthlib==1.0.0 20 | grpcio==1.54.0 21 | h5py==3.8.0 22 | huggingface-hub==0.14.1 23 | hydra-core==1.3.2 24 | idna==3.4 25 | imageio==2.28.1 26 | importlib-metadata==6.6.0 27 | importlib-resources==5.12.0 28 | iopath==0.1.9 29 | kiwisolver==1.4.4 30 | lazy_loader==0.2 31 | Markdown==3.4.3 32 | MarkupSafe==2.1.2 33 | matplotlib==3.7.1 34 | mypy-extensions==1.0.0 35 | networkx==3.1 36 | numpy==1.24.3 37 | oauthlib==3.2.2 38 | omegaconf==2.3.0 39 | opencv-python==4.7.0.72 40 | packaging==23.1 41 | pathspec==0.11.1 42 | Pillow==9.5.0 43 | platformdirs==3.5.0 44 | portalocker==2.7.0 45 | protobuf==4.22.3 46 | pyasn1==0.5.0 47 | pyasn1-modules==0.3.0 48 | pycocotools==2.0.6 49 | pygame==2.0.0 50 | pyparsing==3.0.9 51 | python-dateutil==2.8.2 52 | PyWavelets==1.4.1 53 | PyYAML==6.0 54 | requests==2.29.0 55 | requests-oauthlib==1.3.1 56 | rsa==4.9 57 | scikit-image==0.20.0 58 | scipy==1.9.1 59 | shapely==2.0.1 60 | six==1.16.0 61 | submitit==1.4.5 62 | tabulate==0.9.0 63 | tensorboard==2.12.2 64 | tensorboard-data-server==0.7.0 65 | tensorboard-plugin-wit==1.8.1 66 | termcolor==2.3.0 67 | tifffile==2023.4.12 68 | timm==0.6.13 69 | tomli==2.0.1 70 | tqdm==4.65.0 71 | typing_extensions==4.5.0 72 | urllib3==1.26.15 73 | Werkzeug==2.3.3 74 | wget==3.2 75 | yacs==0.1.8 76 | zipp==3.15.0 77 | -------------------------------------------------------------------------------- /segmentation/.gitignore: -------------------------------------------------------------------------------- 1 | # output dir 2 | output 3 | instant_test_output 4 | inference_test_output 5 | 6 | 7 | *.png 8 | *.json 9 | *.diff 10 | *.jpg 11 | !/projects/DensePose/doc/images/*.jpg 12 | 13 | # compilation and distribution 14 | __pycache__ 15 | _ext 16 | *.pyc 17 | *.pyd 18 | *.so 19 | *.dll 20 | *.egg-info/ 21 | build/ 22 | dist/ 23 | wheels/ 24 | 25 | # pytorch/python/numpy formats 26 | *.pth 27 | *.pkl 28 | *.npy 29 | *.ts 30 | model_ts*.txt 31 | 32 | # ipython/jupyter notebooks 33 | *.ipynb 34 | **/.ipynb_checkpoints/ 35 | 36 | # Editor temporaries 37 | *.swn 38 | *.swo 39 | *.swp 40 | *~ 41 | 42 | # editor settings 43 | .idea 44 | .vscode 45 | _darcs 46 | 47 | # project dirs 48 | /detectron2/model_zoo/configs 49 | /datasets/* 50 | !/datasets/*.* 51 | /projects/*/datasets 52 | /models 53 | /snippet -------------------------------------------------------------------------------- /segmentation/ADVANCED_USAGE.md: -------------------------------------------------------------------------------- 1 | ## Advanced Usage of Mask2Former 2 | 3 | This document provides a brief intro of the advanced usage of Mask2Former for research purpose. 
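As a concrete illustration of the pixel-decoder contract spelled out later in this document — a `forward_features(features)` method that returns the per-pixel mask features, `None`, and a list of exactly three multi-scale features — here is a minimal sketch. It is not the repository's `MSDeformAttnPixelDecoder`; the channel counts and the `res2`–`res5` feature keys are assumptions borrowed from the ResNet configs further below.

```python
# Illustrative sketch only, not repository code. Channel sizes and the
# "res2".."res5" keys are assumptions taken from the ResNet-50 configs below.
import torch.nn as nn

class ToyPixelDecoder(nn.Module):
    def __init__(self, in_channels=(256, 512, 1024, 2048), conv_dim=256, mask_dim=256):
        super().__init__()
        c2, c3, c4, c5 = in_channels
        self.mask_proj = nn.Conv2d(c2, mask_dim, kernel_size=1)            # 1/4 resolution
        self.scale_projs = nn.ModuleList(
            [nn.Conv2d(c, conv_dim, kernel_size=1) for c in (c5, c4, c3)]  # 1/32, 1/16, 1/8
        )

    def forward_features(self, features):
        # features: dict of backbone outputs, e.g. {"res2": ..., ..., "res5": ...}
        mask_features = self.mask_proj(features["res2"])
        multi_scale_features = [
            proj(features[k])
            for proj, k in zip(self.scale_projs, ("res5", "res4", "res3"))
        ]
        return mask_features, None, multi_scale_features
```

Any module with this return signature, registered under `mask2former/modeling/pixel_decoder` and named in the config, should slot into the rest of the pipeline.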
4 | 5 | Mask2Former is highly modulized, it consists of three components: a backbone, a pixel decoder and a Transformer decoder. 6 | You can easily replace each of these three components with your own implementation. 7 | 8 | ### Test Mask2Former with your own backbone 9 | 10 | 1. Define and register your backbone under `mask2former/modeling/backbone`. You can follow the Swin Transformer as an example. 11 | 2. Change the config file accordingly. 12 | 13 | ### Test Mask2Former with your own pixel decoder 14 | 15 | 1. Define and register your pixel decoder under `mask2former/modeling/pixel_decoder`. 16 | 2. Change the config file accordingly. 17 | 18 | Note that, your pixel decoder must have a `self.forward_features(features)` methods that returns three values: 19 | 1. `mask_features`, which is the per-pixel embeddings with resolution 1/4 of the original image. This is used to produce binary masks. 20 | 2. `None`, you can simply return `None` for the second value. 21 | 3. `multi_scale_features`, which is the multi-scale inputs to the Transformer decoder. This must be a list with length 3. 22 | We use resolution 1/32, 1/16, and 1/8 but you can use arbitrary resolutions here. 23 | 24 | Example config to use a Transformer-encoder enhanced FPN instead of MSDeformAttn: 25 | ``` 26 | MODEL: 27 | SEM_SEG_HEAD: 28 | # pixel decoder 29 | PIXEL_DECODER_NAME: "TransformerEncoderPixelDecoder" 30 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 31 | COMMON_STRIDE: 4 32 | TRANSFORMER_ENC_LAYERS: 6 33 | ``` 34 | 35 | ### Build a new Transformer decoder. 36 | 37 | Transformer decoders are defined under `mask2former/modeling/transformer_decoder`. 38 | -------------------------------------------------------------------------------- /segmentation/CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Code of Conduct 2 | 3 | Facebook has adopted a Code of Conduct that we expect project participants to adhere to. 4 | Please read the [full text](https://code.fb.com/codeofconduct/) 5 | so that you can understand what actions will and will not be tolerated. 6 | -------------------------------------------------------------------------------- /segmentation/CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to maskformer2 2 | We want to make contributing to this project as easy and transparent as 3 | possible. 4 | 5 | ## Our Development Process 6 | Minor changes and improvements will be released on an ongoing basis. Larger changes (e.g., changesets implementing a new paper) will be released on a more periodic basis. 7 | 8 | ## Pull Requests 9 | We actively welcome your pull requests. 10 | 11 | 1. Fork the repo and create your branch from `main`. 12 | 2. If you've added code that should be tested, add tests. 13 | 3. If you've changed APIs, update the documentation. 14 | 4. Ensure the test suite passes. 15 | 5. Make sure your code lints. 16 | 6. If you haven't already, complete the Contributor License Agreement ("CLA"). 17 | 18 | ## Contributor License Agreement ("CLA") 19 | In order to accept your pull request, we need you to submit a CLA. You only need 20 | to do this once to work on any of Facebook's open source projects. 21 | 22 | Complete your CLA here: 23 | 24 | ## Issues 25 | We use GitHub issues to track public bugs. Please ensure your description is 26 | clear and has sufficient instructions to be able to reproduce the issue. 
27 | 28 | Facebook has a [bounty program](https://www.facebook.com/whitehat/) for the safe 29 | disclosure of security bugs. In those cases, please go through the process 30 | outlined on that page and do not file a public issue. 31 | 32 | ## Coding Style 33 | * 4 spaces for indentation rather than tabs 34 | * 80 character line length 35 | * PEP8 formatting following [Black](https://black.readthedocs.io/en/stable/) 36 | 37 | ## License 38 | By contributing to MaskFormer, you agree that your contributions will be licensed 39 | under the LICENSE file in the root directory of this source tree. 40 | -------------------------------------------------------------------------------- /segmentation/GETTING_STARTED.md: -------------------------------------------------------------------------------- 1 | ## Getting Started with Mask2Former 2 | 3 | This document provides a brief intro of the usage of Mask2Former. 4 | 5 | Please see [Getting Started with Detectron2](https://github.com/facebookresearch/detectron2/blob/master/GETTING_STARTED.md) for full usage. 6 | 7 | 8 | ### Inference Demo with Pre-trained Models 9 | 10 | 1. Pick a model and its config file from 11 | [model zoo](MODEL_ZOO.md), 12 | for example, `configs/coco/panoptic-segmentation/maskformer2_R50_bs16_50ep.yaml`. 13 | 2. We provide `demo.py` that is able to demo builtin configs. Run it with: 14 | ``` 15 | cd demo/ 16 | python demo.py --config-file ../configs/coco/panoptic-segmentation/maskformer2_R50_bs16_50ep.yaml \ 17 | --input input1.jpg input2.jpg \ 18 | [--other-options] 19 | --opts MODEL.WEIGHTS /path/to/checkpoint_file 20 | ``` 21 | The configs are made for training, therefore we need to specify `MODEL.WEIGHTS` to a model from model zoo for evaluation. 22 | This command will run the inference and show visualizations in an OpenCV window. 23 | 24 | For details of the command line arguments, see `demo.py -h` or look at its source code 25 | to understand its behavior. Some common arguments are: 26 | * To run __on your webcam__, replace `--input files` with `--webcam`. 27 | * To run __on a video__, replace `--input files` with `--video-input video.mp4`. 28 | * To run __on cpu__, add `MODEL.DEVICE cpu` after `--opts`. 29 | * To save outputs to a directory (for images) or a file (for webcam or video), use `--output`. 30 | 31 | 32 | ### Training & Evaluation in Command Line 33 | 34 | We provide a script `train_net.py`, that is made to train all the configs provided in Mask2Former. 35 | 36 | To train a model with "train_net.py", first 37 | setup the corresponding datasets following 38 | [datasets/README.md](./datasets/README.md), 39 | then run: 40 | ``` 41 | python train_net.py --num-gpus 8 \ 42 | --config-file configs/coco/panoptic-segmentation/maskformer2_R50_bs16_50ep.yaml 43 | ``` 44 | 45 | The configs are made for 8-GPU training. 46 | Since we use ADAMW optimizer, it is not clear how to scale learning rate with batch size. 47 | To train on 1 GPU, you need to figure out learning rate and batch size by yourself: 48 | ``` 49 | python train_net.py \ 50 | --config-file configs/coco/panoptic-segmentation/maskformer2_R50_bs16_50ep.yaml \ 51 | --num-gpus 1 SOLVER.IMS_PER_BATCH SET_TO_SOME_REASONABLE_VALUE SOLVER.BASE_LR SET_TO_SOME_REASONABLE_VALUE 52 | ``` 53 | 54 | To evaluate a model's performance, use 55 | ``` 56 | python train_net.py \ 57 | --config-file configs/coco/panoptic-segmentation/maskformer2_R50_bs16_50ep.yaml \ 58 | --eval-only MODEL.WEIGHTS /path/to/checkpoint_file 59 | ``` 60 | For more options, see `python train_net.py -h`. 
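Since the single-GPU learning rate and batch size are explicitly left to the user above, one hedged starting point is plain linear scaling from the reference configs in this repository (16 images per batch at a base LR of 0.0001). This is an assumption, not guidance from the authors, who note that the scaling rule for AdamW is unclear.

```python
# Hedged heuristic only: linear LR scaling is an assumption, not a rule
# from the Mask2Former authors.
ref_ims_per_batch, ref_base_lr = 16, 0.0001   # values used by the provided configs
ims_per_batch = 2                             # whatever fits on a single GPU
base_lr = ref_base_lr * ims_per_batch / ref_ims_per_batch

# Paste the resulting overrides into the single-GPU command shown above:
print(f"SOLVER.IMS_PER_BATCH {ims_per_batch} SOLVER.BASE_LR {base_lr}")
```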
61 | 62 | 63 | ### Video instance segmentation 64 | Please use `demo_video/demo.py` for video instance segmentation demo and `train_net_video.py` to train 65 | and evaluate video instance segmentation models. 66 | -------------------------------------------------------------------------------- /segmentation/INSTALL.md: -------------------------------------------------------------------------------- 1 | ## Installation 2 | 3 | ### Requirements 4 | - Linux or macOS with Python ≥ 3.6 5 | - PyTorch ≥ 1.9 and [torchvision](https://github.com/pytorch/vision/) that matches the PyTorch installation. 6 | Install them together at [pytorch.org](https://pytorch.org) to make sure of this. Note, please check 7 | PyTorch version matches that is required by Detectron2. 8 | - Detectron2: follow [Detectron2 installation instructions](https://detectron2.readthedocs.io/tutorials/install.html). 9 | - OpenCV is optional but needed by demo and visualization 10 | - `pip install -r requirements.txt` 11 | 12 | ### CUDA kernel for MSDeformAttn 13 | After preparing the required environment, run the following command to compile CUDA kernel for MSDeformAttn: 14 | 15 | `CUDA_HOME` must be defined and points to the directory of the installed CUDA toolkit. 16 | 17 | ```bash 18 | cd mask2former/modeling/pixel_decoder/ops 19 | sh make.sh 20 | ``` 21 | 22 | #### Building on another system 23 | To build on a system that does not have a GPU device but provide the drivers: 24 | ```bash 25 | TORCH_CUDA_ARCH_LIST='8.0' FORCE_CUDA=1 python setup.py build install 26 | ``` 27 | 28 | ### Example conda environment setup 29 | ```bash 30 | conda create --name mask2former python=3.8 -y 31 | conda activate mask2former 32 | conda install pytorch==1.9.0 torchvision==0.10.0 cudatoolkit=11.1 -c pytorch -c nvidia 33 | pip install -U opencv-python 34 | 35 | # under your working directory 36 | git clone git@github.com:facebookresearch/detectron2.git 37 | cd detectron2 38 | pip install -e . 39 | pip install git+https://github.com/cocodataset/panopticapi.git 40 | pip install git+https://github.com/mcordts/cityscapesScripts.git 41 | 42 | cd .. 43 | git clone git@github.com:facebookresearch/Mask2Former.git 44 | cd Mask2Former 45 | pip install -r requirements.txt 46 | cd mask2former/modeling/pixel_decoder/ops 47 | sh make.sh 48 | ``` 49 | -------------------------------------------------------------------------------- /segmentation/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2022 Meta, Inc. 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | SOFTWARE. 20 | -------------------------------------------------------------------------------- /segmentation/README.md: -------------------------------------------------------------------------------- 1 | # Mask2Former: Masked-attention Mask Transformer for Universal Image Segmentation (CVPR 2022) 2 | 3 | [Bowen Cheng](https://bowenc0221.github.io/), [Ishan Misra](https://imisra.github.io/), [Alexander G. Schwing](https://alexander-schwing.de/), [Alexander Kirillov](https://alexander-kirillov.github.io/), [Rohit Girdhar](https://rohitgirdhar.github.io/) 4 | 5 | [[`arXiv`](https://arxiv.org/abs/2112.01527)] [[`Project`](https://bowenc0221.github.io/mask2former)] [[`BibTeX`](#CitingMask2Former)] 6 | 7 |
8 | 
9 | 
10 | 11 | ### Features 12 | * A single architecture for panoptic, instance and semantic segmentation. 13 | * Support major segmentation datasets: ADE20K, Cityscapes, COCO, Mapillary Vistas. 14 | 15 | ## Updates 16 | * Add Google Colab demo. 17 | * Video instance segmentation is now supported! Please check our [tech report](https://arxiv.org/abs/2112.10764) for more details. 18 | 19 | ## Installation 20 | 21 | See [installation instructions](INSTALL.md). 22 | 23 | ## Getting Started 24 | 25 | See [Preparing Datasets for Mask2Former](datasets/README.md). 26 | 27 | See [Getting Started with Mask2Former](GETTING_STARTED.md). 28 | 29 | Run our demo using Colab: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1uIWE5KbGFSjrxey2aRd5pWkKNY1_SaNq) 30 | 31 | Integrated into [Huggingface Spaces 🤗](https://huggingface.co/spaces) using [Gradio](https://github.com/gradio-app/gradio). Try out the Web Demo: [![Hugging Face Spaces](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue)](https://huggingface.co/spaces/akhaliq/Mask2Former) 32 | 33 | Replicate web demo and docker image is available here: [![Replicate](https://replicate.com/facebookresearch/mask2former/badge)](https://replicate.com/facebookresearch/mask2former) 34 | 35 | ## Advanced usage 36 | 37 | See [Advanced Usage of Mask2Former](ADVANCED_USAGE.md). 38 | 39 | ## Model Zoo and Baselines 40 | 41 | We provide a large set of baseline results and trained models available for download in the [Mask2Former Model Zoo](MODEL_ZOO.md). 42 | 43 | ## License 44 | 45 | Shield: [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) 46 | 47 | The majority of Mask2Former is licensed under a [MIT License](LICENSE). 48 | 49 | 50 | However portions of the project are available under separate license terms: Swin-Transformer-Semantic-Segmentation is licensed under the [MIT license](https://github.com/SwinTransformer/Swin-Transformer-Semantic-Segmentation/blob/main/LICENSE), Deformable-DETR is licensed under the [Apache-2.0 License](https://github.com/fundamentalvision/Deformable-DETR/blob/main/LICENSE). 51 | 52 | ## Citing Mask2Former 53 | 54 | If you use Mask2Former in your research or wish to refer to the baseline results published in the [Model Zoo](MODEL_ZOO.md), please use the following BibTeX entry. 55 | 56 | ```BibTeX 57 | @inproceedings{cheng2021mask2former, 58 | title={Masked-attention Mask Transformer for Universal Image Segmentation}, 59 | author={Bowen Cheng and Ishan Misra and Alexander G. Schwing and Alexander Kirillov and Rohit Girdhar}, 60 | journal={CVPR}, 61 | year={2022} 62 | } 63 | ``` 64 | 65 | If you find the code useful, please also consider the following BibTeX entry. 66 | 67 | ```BibTeX 68 | @inproceedings{cheng2021maskformer, 69 | title={Per-Pixel Classification is Not All You Need for Semantic Segmentation}, 70 | author={Bowen Cheng and Alexander G. Schwing and Alexander Kirillov}, 71 | journal={NeurIPS}, 72 | year={2021} 73 | } 74 | ``` 75 | 76 | ## Acknowledgement 77 | 78 | Code is largely based on MaskFormer (https://github.com/facebookresearch/MaskFormer). 
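The detectron2 config files under `segmentation/configs/` that follow all compose through `_BASE_` inheritance. Below is a hedged sketch of how such a file is typically loaded, assuming detectron2 and this repo's `mask2former` package are installed as described in INSTALL.md and that the snippet is run from the `segmentation/` directory.

```python
# Hedged sketch; assumes the detectron2/mask2former installation from
# INSTALL.md. The config path is one of the files shown below.
from detectron2.config import get_cfg
from detectron2.projects.deeplab import add_deeplab_config
from mask2former import add_maskformer2_config

cfg = get_cfg()
add_deeplab_config(cfg)      # extra keys expected by the base configs
add_maskformer2_config(cfg)  # registers the MODEL.MASK_FORMER keys used below
cfg.merge_from_file("configs/ade20k/panoptic-segmentation/maskformer2_R50_bs16_160k.yaml")
cfg.MODEL.WEIGHTS = "/path/to/checkpoint_file"  # placeholder, as in GETTING_STARTED.md
cfg.freeze()
```

Each YAML below only overrides the keys it changes; everything else is inherited from the file named in its `_BASE_` entry.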
79 | -------------------------------------------------------------------------------- /segmentation/cog.yaml: -------------------------------------------------------------------------------- 1 | build: 2 | gpu: true 3 | cuda: "10.1" 4 | python_version: "3.8" 5 | system_packages: 6 | - "libgl1-mesa-glx" 7 | - "libglib2.0-0" 8 | python_packages: 9 | - "ipython==7.30.1" 10 | - "numpy==1.21.4" 11 | - "torch==1.8.1" 12 | - "torchvision==0.9.1" 13 | - "opencv-python==4.5.5.62" 14 | - "Shapely==1.8.0" 15 | - "h5py==3.6.0" 16 | - "scipy==1.7.3" 17 | - "submitit==1.4.1" 18 | - "scikit-image==0.19.1" 19 | - "Cython==0.29.27" 20 | - "timm==0.4.12" 21 | run: 22 | - pip install detectron2 -f https://dl.fbaipublicfiles.com/detectron2/wheels/cu101/torch1.8/index.html 23 | - pip install git+https://github.com/cocodataset/panopticapi.git 24 | - pip install git+https://github.com/mcordts/cityscapesScripts.git 25 | - git clone https://github.com/facebookresearch/Mask2Former 26 | - TORCH_CUDA_ARCH_LIST='7.5' FORCE_CUDA=1 python Mask2Former/mask2former/modeling/pixel_decoder/ops/setup.py build install 27 | 28 | predict: "predict.py:Predictor" 29 | -------------------------------------------------------------------------------- /segmentation/configs/ade20k/instance-segmentation/Base-ADE20K-InstanceSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | # NORM: "SyncBN" 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("ade20k_instance_train",) 18 | TEST: ("ade20k_instance_val",) 19 | SOLVER: 20 | IMS_PER_BATCH: 16 21 | BASE_LR: 0.0001 22 | MAX_ITER: 160000 23 | WARMUP_FACTOR: 1.0 24 | WARMUP_ITERS: 0 25 | WEIGHT_DECAY: 0.05 26 | OPTIMIZER: "ADAMW" 27 | LR_SCHEDULER_NAME: "WarmupPolyLR" 28 | BACKBONE_MULTIPLIER: 0.1 29 | CLIP_GRADIENTS: 30 | ENABLED: True 31 | CLIP_TYPE: "full_model" 32 | CLIP_VALUE: 0.01 33 | NORM_TYPE: 2.0 34 | AMP: 35 | ENABLED: True 36 | INPUT: 37 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 640) for x in range(5, 21)]"] 38 | MIN_SIZE_TRAIN_SAMPLING: "choice" 39 | MIN_SIZE_TEST: 640 40 | MAX_SIZE_TRAIN: 2560 41 | MAX_SIZE_TEST: 2560 42 | CROP: 43 | ENABLED: True 44 | TYPE: "absolute" 45 | SIZE: (640, 640) 46 | SINGLE_CATEGORY_MAX_AREA: 1.0 47 | COLOR_AUG_SSD: True 48 | SIZE_DIVISIBILITY: 640 # used in dataset mapper 49 | FORMAT: "RGB" 50 | DATASET_MAPPER_NAME: "mask_former_instance" 51 | TEST: 52 | EVAL_PERIOD: 5000 53 | AUG: 54 | ENABLED: False 55 | MIN_SIZES: [320, 480, 640, 800, 960, 1120] 56 | MAX_SIZE: 4480 57 | FLIP: True 58 | DATALOADER: 59 | FILTER_EMPTY_ANNOTATIONS: True 60 | NUM_WORKERS: 4 61 | VERSION: 2 62 | -------------------------------------------------------------------------------- /segmentation/configs/ade20k/instance-segmentation/maskformer2_R50_bs16_160k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-ADE20K-InstanceSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskFormer" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskFormerHead" 6 | IGNORE_VALUE: 255 7 | NUM_CLASSES: 100 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 
| # pixel decoder 13 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 14 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 15 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 16 | COMMON_STRIDE: 4 17 | TRANSFORMER_ENC_LAYERS: 6 18 | MASK_FORMER: 19 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 20 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 21 | DEEP_SUPERVISION: True 22 | NO_OBJECT_WEIGHT: 0.1 23 | CLASS_WEIGHT: 2.0 24 | MASK_WEIGHT: 5.0 25 | DICE_WEIGHT: 5.0 26 | HIDDEN_DIM: 256 27 | NUM_OBJECT_QUERIES: 100 28 | NHEADS: 8 29 | DROPOUT: 0.0 30 | DIM_FEEDFORWARD: 2048 31 | ENC_LAYERS: 0 32 | PRE_NORM: False 33 | ENFORCE_INPUT_PROJ: False 34 | SIZE_DIVISIBILITY: 32 35 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 36 | TRAIN_NUM_POINTS: 12544 37 | OVERSAMPLE_RATIO: 3.0 38 | IMPORTANCE_SAMPLE_RATIO: 0.75 39 | TEST: 40 | SEMANTIC_ON: True 41 | INSTANCE_ON: True 42 | PANOPTIC_ON: True 43 | OVERLAP_THRESHOLD: 0.8 44 | OBJECT_MASK_THRESHOLD: 0.8 45 | -------------------------------------------------------------------------------- /segmentation/configs/ade20k/instance-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_160k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_160k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | MASK_FORMER: 18 | NUM_OBJECT_QUERIES: 200 19 | -------------------------------------------------------------------------------- /segmentation/configs/ade20k/panoptic-segmentation/Base-ADE20K-PanopticSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | # NORM: "SyncBN" 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("ade20k_panoptic_train",) 18 | TEST: ("ade20k_panoptic_val",) 19 | SOLVER: 20 | IMS_PER_BATCH: 16 21 | BASE_LR: 0.0001 22 | MAX_ITER: 160000 23 | WARMUP_FACTOR: 1.0 24 | WARMUP_ITERS: 0 25 | WEIGHT_DECAY: 0.05 26 | OPTIMIZER: "ADAMW" 27 | LR_SCHEDULER_NAME: "WarmupPolyLR" 28 | BACKBONE_MULTIPLIER: 0.1 29 | CLIP_GRADIENTS: 30 | ENABLED: True 31 | CLIP_TYPE: "full_model" 32 | CLIP_VALUE: 0.01 33 | NORM_TYPE: 2.0 34 | AMP: 35 | ENABLED: True 36 | INPUT: 37 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 640) for x in range(5, 21)]"] 38 | MIN_SIZE_TRAIN_SAMPLING: "choice" 39 | MIN_SIZE_TEST: 640 40 | MAX_SIZE_TRAIN: 2560 41 | MAX_SIZE_TEST: 2560 42 | CROP: 43 | ENABLED: True 44 | TYPE: "absolute" 45 | SIZE: (640, 640) 46 | SINGLE_CATEGORY_MAX_AREA: 1.0 47 | COLOR_AUG_SSD: True 48 | SIZE_DIVISIBILITY: 640 # used in dataset mapper 49 | FORMAT: "RGB" 50 | DATASET_MAPPER_NAME: "mask_former_panoptic" 51 | TEST: 52 | EVAL_PERIOD: 5000 53 | AUG: 54 | ENABLED: False 55 | MIN_SIZES: [320, 480, 640, 800, 
960, 1120] 56 | MAX_SIZE: 4480 57 | FLIP: True 58 | DATALOADER: 59 | FILTER_EMPTY_ANNOTATIONS: True 60 | NUM_WORKERS: 4 61 | VERSION: 2 62 | -------------------------------------------------------------------------------- /segmentation/configs/ade20k/panoptic-segmentation/maskformer2_R50_bs16_160k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-ADE20K-PanopticSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskFormer" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskFormerHead" 6 | IGNORE_VALUE: 255 7 | NUM_CLASSES: 150 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 14 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 15 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 16 | COMMON_STRIDE: 4 17 | TRANSFORMER_ENC_LAYERS: 6 18 | MASK_FORMER: 19 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 20 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 21 | DEEP_SUPERVISION: True 22 | NO_OBJECT_WEIGHT: 0.1 23 | CLASS_WEIGHT: 2.0 24 | MASK_WEIGHT: 5.0 25 | DICE_WEIGHT: 5.0 26 | HIDDEN_DIM: 256 27 | NUM_OBJECT_QUERIES: 100 28 | NHEADS: 8 29 | DROPOUT: 0.0 30 | DIM_FEEDFORWARD: 2048 31 | ENC_LAYERS: 0 32 | PRE_NORM: False 33 | ENFORCE_INPUT_PROJ: False 34 | SIZE_DIVISIBILITY: 32 35 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 36 | TRAIN_NUM_POINTS: 12544 37 | OVERSAMPLE_RATIO: 3.0 38 | IMPORTANCE_SAMPLE_RATIO: 0.75 39 | TEST: 40 | SEMANTIC_ON: True 41 | INSTANCE_ON: True 42 | PANOPTIC_ON: True 43 | OVERLAP_THRESHOLD: 0.8 44 | OBJECT_MASK_THRESHOLD: 0.8 45 | -------------------------------------------------------------------------------- /segmentation/configs/ade20k/panoptic-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_160k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_160k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | MASK_FORMER: 18 | NUM_OBJECT_QUERIES: 200 19 | -------------------------------------------------------------------------------- /segmentation/configs/ade20k/semantic-segmentation/Base-ADE20K-SemanticSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | # NORM: "SyncBN" 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("ade20k_sem_seg_train",) 18 | TEST: ("ade20k_sem_seg_val",) 19 | SOLVER: 20 | IMS_PER_BATCH: 16 21 | BASE_LR: 0.0001 22 | MAX_ITER: 160000 23 | WARMUP_FACTOR: 1.0 24 | WARMUP_ITERS: 0 25 | WEIGHT_DECAY: 0.05 26 | OPTIMIZER: "ADAMW" 27 | LR_SCHEDULER_NAME: "WarmupPolyLR" 28 | BACKBONE_MULTIPLIER: 0.1 29 | CLIP_GRADIENTS: 30 | ENABLED: True 31 | CLIP_TYPE: "full_model" 
32 | CLIP_VALUE: 0.01 33 | NORM_TYPE: 2.0 34 | AMP: 35 | ENABLED: True 36 | INPUT: 37 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 512) for x in range(5, 21)]"] 38 | MIN_SIZE_TRAIN_SAMPLING: "choice" 39 | MIN_SIZE_TEST: 512 40 | MAX_SIZE_TRAIN: 2048 41 | MAX_SIZE_TEST: 2048 42 | CROP: 43 | ENABLED: True 44 | TYPE: "absolute" 45 | SIZE: (512, 512) 46 | SINGLE_CATEGORY_MAX_AREA: 1.0 47 | COLOR_AUG_SSD: True 48 | SIZE_DIVISIBILITY: 512 # used in dataset mapper 49 | FORMAT: "RGB" 50 | DATASET_MAPPER_NAME: "mask_former_semantic" 51 | TEST: 52 | EVAL_PERIOD: 5000 53 | AUG: 54 | ENABLED: False 55 | MIN_SIZES: [256, 384, 512, 640, 768, 896] 56 | MAX_SIZE: 3584 57 | FLIP: True 58 | DATALOADER: 59 | FILTER_EMPTY_ANNOTATIONS: True 60 | NUM_WORKERS: 4 61 | VERSION: 2 62 | -------------------------------------------------------------------------------- /segmentation/configs/ade20k/semantic-segmentation/maskformer2_R101_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: maskformer2_R50_bs16_160k.yaml 2 | MODEL: 3 | WEIGHTS: "R-101.pkl" 4 | RESNETS: 5 | DEPTH: 101 6 | STEM_TYPE: "basic" # not used 7 | STEM_OUT_CHANNELS: 64 8 | STRIDE_IN_1X1: False 9 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 10 | NORM: "SyncBN" 11 | RES5_MULTI_GRID: [1, 1, 1] # not used 12 | -------------------------------------------------------------------------------- /segmentation/configs/ade20k/semantic-segmentation/maskformer2_R50_bs16_160k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-ADE20K-SemanticSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskFormer" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskFormerHead" 6 | IGNORE_VALUE: 255 7 | NUM_CLASSES: 150 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 14 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 15 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 16 | COMMON_STRIDE: 4 17 | TRANSFORMER_ENC_LAYERS: 6 18 | MASK_FORMER: 19 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 20 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 21 | DEEP_SUPERVISION: True 22 | NO_OBJECT_WEIGHT: 0.1 23 | CLASS_WEIGHT: 2.0 24 | MASK_WEIGHT: 5.0 25 | DICE_WEIGHT: 5.0 26 | HIDDEN_DIM: 256 27 | NUM_OBJECT_QUERIES: 100 28 | NHEADS: 8 29 | DROPOUT: 0.0 30 | DIM_FEEDFORWARD: 2048 31 | ENC_LAYERS: 0 32 | PRE_NORM: False 33 | ENFORCE_INPUT_PROJ: False 34 | SIZE_DIVISIBILITY: 32 35 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 36 | TRAIN_NUM_POINTS: 12544 37 | OVERSAMPLE_RATIO: 3.0 38 | IMPORTANCE_SAMPLE_RATIO: 0.75 39 | TEST: 40 | SEMANTIC_ON: True 41 | INSTANCE_ON: False 42 | PANOPTIC_ON: False 43 | OVERLAP_THRESHOLD: 0.8 44 | OBJECT_MASK_THRESHOLD: 0.8 45 | -------------------------------------------------------------------------------- /segmentation/configs/ade20k/semantic-segmentation/swin/maskformer2_swin_base_384_bs16_160k_res640.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_160k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 128 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [4, 8, 16, 32] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_base_patch4_window12_384.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | 
PIXEL_STD: [58.395, 57.120, 57.375] 17 | INPUT: 18 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 640) for x in range(5, 21)]"] 19 | MIN_SIZE_TRAIN_SAMPLING: "choice" 20 | MIN_SIZE_TEST: 640 21 | MAX_SIZE_TRAIN: 2560 22 | MAX_SIZE_TEST: 2560 23 | CROP: 24 | ENABLED: True 25 | TYPE: "absolute" 26 | SIZE: (640, 640) 27 | SINGLE_CATEGORY_MAX_AREA: 1.0 28 | COLOR_AUG_SSD: True 29 | SIZE_DIVISIBILITY: 640 # used in dataset mapper 30 | FORMAT: "RGB" 31 | TEST: 32 | EVAL_PERIOD: 5000 33 | AUG: 34 | ENABLED: False 35 | MIN_SIZES: [320, 480, 640, 800, 960, 1120] 36 | MAX_SIZE: 4480 37 | FLIP: True 38 | -------------------------------------------------------------------------------- /segmentation/configs/ade20k/semantic-segmentation/swin/maskformer2_swin_base_IN21k_384_bs16_160k_res640.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_160k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 128 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [4, 8, 16, 32] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_base_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | INPUT: 18 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 640) for x in range(5, 21)]"] 19 | MIN_SIZE_TRAIN_SAMPLING: "choice" 20 | MIN_SIZE_TEST: 640 21 | MAX_SIZE_TRAIN: 2560 22 | MAX_SIZE_TEST: 2560 23 | CROP: 24 | ENABLED: True 25 | TYPE: "absolute" 26 | SIZE: (640, 640) 27 | SINGLE_CATEGORY_MAX_AREA: 1.0 28 | COLOR_AUG_SSD: True 29 | SIZE_DIVISIBILITY: 640 # used in dataset mapper 30 | FORMAT: "RGB" 31 | TEST: 32 | EVAL_PERIOD: 5000 33 | AUG: 34 | ENABLED: False 35 | MIN_SIZES: [320, 480, 640, 800, 960, 1120] 36 | MAX_SIZE: 4480 37 | FLIP: True 38 | -------------------------------------------------------------------------------- /segmentation/configs/ade20k/semantic-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_160k_res640.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_160k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | INPUT: 18 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 640) for x in range(5, 21)]"] 19 | MIN_SIZE_TRAIN_SAMPLING: "choice" 20 | MIN_SIZE_TEST: 640 21 | MAX_SIZE_TRAIN: 2560 22 | MAX_SIZE_TEST: 2560 23 | CROP: 24 | ENABLED: True 25 | TYPE: "absolute" 26 | SIZE: (640, 640) 27 | SINGLE_CATEGORY_MAX_AREA: 1.0 28 | COLOR_AUG_SSD: True 29 | SIZE_DIVISIBILITY: 640 # used in dataset mapper 30 | FORMAT: "RGB" 31 | TEST: 32 | EVAL_PERIOD: 5000 33 | AUG: 34 | ENABLED: False 35 | MIN_SIZES: [320, 480, 640, 800, 960, 1120] 36 | MAX_SIZE: 4480 37 | FLIP: True 38 | -------------------------------------------------------------------------------- /segmentation/configs/ade20k/semantic-segmentation/swin/maskformer2_swin_small_bs16_160k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_160k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: 
"D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_small_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | -------------------------------------------------------------------------------- /segmentation/configs/ade20k/semantic-segmentation/swin/maskformer2_swin_tiny_bs16_160k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_160k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 6, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_tiny_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | -------------------------------------------------------------------------------- /segmentation/configs/cityscapes/instance-segmentation/Base-Cityscapes-InstanceSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | NORM: "SyncBN" # use syncbn for cityscapes dataset 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("cityscapes_fine_instance_seg_train",) 18 | TEST: ("cityscapes_fine_instance_seg_val",) 19 | SOLVER: 20 | IMS_PER_BATCH: 16 21 | BASE_LR: 0.0001 22 | MAX_ITER: 90000 23 | WARMUP_FACTOR: 1.0 24 | WARMUP_ITERS: 0 25 | WEIGHT_DECAY: 0.05 26 | OPTIMIZER: "ADAMW" 27 | LR_SCHEDULER_NAME: "WarmupPolyLR" 28 | BACKBONE_MULTIPLIER: 0.1 29 | CLIP_GRADIENTS: 30 | ENABLED: True 31 | CLIP_TYPE: "full_model" 32 | CLIP_VALUE: 0.01 33 | NORM_TYPE: 2.0 34 | AMP: 35 | ENABLED: True 36 | INPUT: 37 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 1024) for x in range(5, 21)]"] 38 | MIN_SIZE_TRAIN_SAMPLING: "choice" 39 | MIN_SIZE_TEST: 1024 40 | MAX_SIZE_TRAIN: 4096 41 | MAX_SIZE_TEST: 2048 42 | CROP: 43 | ENABLED: True 44 | TYPE: "absolute" 45 | SIZE: (512, 1024) 46 | SINGLE_CATEGORY_MAX_AREA: 1.0 47 | COLOR_AUG_SSD: True 48 | SIZE_DIVISIBILITY: -1 49 | FORMAT: "RGB" 50 | DATASET_MAPPER_NAME: "mask_former_instance" 51 | TEST: 52 | EVAL_PERIOD: 5000 53 | AUG: 54 | ENABLED: False 55 | MIN_SIZES: [512, 768, 1024, 1280, 1536, 1792] 56 | MAX_SIZE: 4096 57 | FLIP: True 58 | DATALOADER: 59 | FILTER_EMPTY_ANNOTATIONS: True 60 | NUM_WORKERS: 4 61 | VERSION: 2 62 | -------------------------------------------------------------------------------- /segmentation/configs/cityscapes/instance-segmentation/maskformer2_R101_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | WEIGHTS: "R-101.pkl" 4 | RESNETS: 5 | DEPTH: 101 6 | STEM_TYPE: "basic" # not used 7 | STEM_OUT_CHANNELS: 64 8 | STRIDE_IN_1X1: False 9 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 10 | NORM: "SyncBN" 11 | RES5_MULTI_GRID: [1, 1, 1] # not used 12 | 
-------------------------------------------------------------------------------- /segmentation/configs/cityscapes/instance-segmentation/maskformer2_R50_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-Cityscapes-InstanceSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskFormer" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskFormerHead" 6 | IGNORE_VALUE: 255 7 | NUM_CLASSES: 8 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 14 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 15 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 16 | COMMON_STRIDE: 4 17 | TRANSFORMER_ENC_LAYERS: 6 18 | MASK_FORMER: 19 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 20 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 21 | DEEP_SUPERVISION: True 22 | NO_OBJECT_WEIGHT: 0.1 23 | CLASS_WEIGHT: 2.0 24 | MASK_WEIGHT: 5.0 25 | DICE_WEIGHT: 5.0 26 | HIDDEN_DIM: 256 27 | NUM_OBJECT_QUERIES: 100 28 | NHEADS: 8 29 | DROPOUT: 0.0 30 | DIM_FEEDFORWARD: 2048 31 | ENC_LAYERS: 0 32 | PRE_NORM: False 33 | ENFORCE_INPUT_PROJ: False 34 | SIZE_DIVISIBILITY: 32 35 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 36 | TRAIN_NUM_POINTS: 12544 37 | OVERSAMPLE_RATIO: 3.0 38 | IMPORTANCE_SAMPLE_RATIO: 0.75 39 | TEST: 40 | SEMANTIC_ON: False 41 | INSTANCE_ON: True 42 | PANOPTIC_ON: False 43 | OVERLAP_THRESHOLD: 0.8 44 | OBJECT_MASK_THRESHOLD: 0.8 45 | -------------------------------------------------------------------------------- /segmentation/configs/cityscapes/instance-segmentation/swin/maskformer2_swin_base_IN21k_384_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 128 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [4, 8, 16, 32] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_base_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | -------------------------------------------------------------------------------- /segmentation/configs/cityscapes/instance-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | MASK_FORMER: 18 | NUM_OBJECT_QUERIES: 200 19 | -------------------------------------------------------------------------------- /segmentation/configs/cityscapes/instance-segmentation/swin/maskformer2_swin_small_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | 
WEIGHTS: "swin_small_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | -------------------------------------------------------------------------------- /segmentation/configs/cityscapes/instance-segmentation/swin/maskformer2_swin_tiny_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 6, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_tiny_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | -------------------------------------------------------------------------------- /segmentation/configs/cityscapes/panoptic-segmentation/Base-Cityscapes-PanopticSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | NORM: "SyncBN" # use syncbn for cityscapes dataset 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("cityscapes_fine_panoptic_train",) 18 | TEST: ("cityscapes_fine_panoptic_val",) 19 | SOLVER: 20 | IMS_PER_BATCH: 16 21 | BASE_LR: 0.0001 22 | MAX_ITER: 90000 23 | WARMUP_FACTOR: 1.0 24 | WARMUP_ITERS: 0 25 | WEIGHT_DECAY: 0.05 26 | OPTIMIZER: "ADAMW" 27 | LR_SCHEDULER_NAME: "WarmupPolyLR" 28 | BACKBONE_MULTIPLIER: 0.1 29 | CLIP_GRADIENTS: 30 | ENABLED: True 31 | CLIP_TYPE: "full_model" 32 | CLIP_VALUE: 0.01 33 | NORM_TYPE: 2.0 34 | AMP: 35 | ENABLED: True 36 | INPUT: 37 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 1024) for x in range(5, 21)]"] 38 | MIN_SIZE_TRAIN_SAMPLING: "choice" 39 | MIN_SIZE_TEST: 1024 40 | MAX_SIZE_TRAIN: 4096 41 | MAX_SIZE_TEST: 2048 42 | CROP: 43 | ENABLED: True 44 | TYPE: "absolute" 45 | SIZE: (512, 1024) 46 | SINGLE_CATEGORY_MAX_AREA: 1.0 47 | COLOR_AUG_SSD: True 48 | SIZE_DIVISIBILITY: -1 49 | FORMAT: "RGB" 50 | DATASET_MAPPER_NAME: "mask_former_panoptic" 51 | TEST: 52 | EVAL_PERIOD: 5000 53 | AUG: 54 | ENABLED: False 55 | MIN_SIZES: [512, 768, 1024, 1280, 1536, 1792] 56 | MAX_SIZE: 4096 57 | FLIP: True 58 | DATALOADER: 59 | FILTER_EMPTY_ANNOTATIONS: True 60 | NUM_WORKERS: 4 61 | VERSION: 2 62 | -------------------------------------------------------------------------------- /segmentation/configs/cityscapes/panoptic-segmentation/maskformer2_R101_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | WEIGHTS: "R-101.pkl" 4 | RESNETS: 5 | DEPTH: 101 6 | STEM_TYPE: "basic" # not used 7 | STEM_OUT_CHANNELS: 64 8 | STRIDE_IN_1X1: False 9 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 10 | NORM: "SyncBN" 11 | RES5_MULTI_GRID: [1, 1, 1] # not used 12 | -------------------------------------------------------------------------------- /segmentation/configs/cityscapes/panoptic-segmentation/maskformer2_R50_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | 
_BASE_: Base-Cityscapes-PanopticSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskFormer" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskFormerHead" 6 | IGNORE_VALUE: 255 7 | NUM_CLASSES: 19 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 14 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 15 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 16 | COMMON_STRIDE: 4 17 | TRANSFORMER_ENC_LAYERS: 6 18 | MASK_FORMER: 19 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 20 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 21 | DEEP_SUPERVISION: True 22 | NO_OBJECT_WEIGHT: 0.1 23 | CLASS_WEIGHT: 2.0 24 | MASK_WEIGHT: 5.0 25 | DICE_WEIGHT: 5.0 26 | HIDDEN_DIM: 256 27 | NUM_OBJECT_QUERIES: 100 28 | NHEADS: 8 29 | DROPOUT: 0.0 30 | DIM_FEEDFORWARD: 2048 31 | ENC_LAYERS: 0 32 | PRE_NORM: False 33 | ENFORCE_INPUT_PROJ: False 34 | SIZE_DIVISIBILITY: 32 35 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 36 | TRAIN_NUM_POINTS: 12544 37 | OVERSAMPLE_RATIO: 3.0 38 | IMPORTANCE_SAMPLE_RATIO: 0.75 39 | TEST: 40 | SEMANTIC_ON: True 41 | INSTANCE_ON: True 42 | PANOPTIC_ON: True 43 | OVERLAP_THRESHOLD: 0.8 44 | OBJECT_MASK_THRESHOLD: 0.8 45 | -------------------------------------------------------------------------------- /segmentation/configs/cityscapes/panoptic-segmentation/swin/maskformer2_swin_base_IN21k_384_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 128 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [4, 8, 16, 32] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_base_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | -------------------------------------------------------------------------------- /segmentation/configs/cityscapes/panoptic-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | MASK_FORMER: 18 | NUM_OBJECT_QUERIES: 200 19 | -------------------------------------------------------------------------------- /segmentation/configs/cityscapes/panoptic-segmentation/swin/maskformer2_swin_small_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_small_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | -------------------------------------------------------------------------------- 
/segmentation/configs/cityscapes/panoptic-segmentation/swin/maskformer2_swin_tiny_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 6, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_tiny_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | -------------------------------------------------------------------------------- /segmentation/configs/cityscapes/semantic-segmentation/Base-Cityscapes-SemanticSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | NORM: "SyncBN" # use syncbn for cityscapes dataset 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("cityscapes_fine_sem_seg_train",) 18 | TEST: ("cityscapes_fine_sem_seg_val",) 19 | SOLVER: 20 | IMS_PER_BATCH: 16 21 | BASE_LR: 0.0001 22 | MAX_ITER: 90000 23 | WARMUP_FACTOR: 1.0 24 | WARMUP_ITERS: 0 25 | WEIGHT_DECAY: 0.05 26 | OPTIMIZER: "ADAMW" 27 | LR_SCHEDULER_NAME: "WarmupPolyLR" 28 | BACKBONE_MULTIPLIER: 0.1 29 | CLIP_GRADIENTS: 30 | ENABLED: True 31 | CLIP_TYPE: "full_model" 32 | CLIP_VALUE: 0.01 33 | NORM_TYPE: 2.0 34 | AMP: 35 | ENABLED: True 36 | INPUT: 37 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 1024) for x in range(5, 21)]"] 38 | MIN_SIZE_TRAIN_SAMPLING: "choice" 39 | MIN_SIZE_TEST: 1024 40 | MAX_SIZE_TRAIN: 4096 41 | MAX_SIZE_TEST: 2048 42 | CROP: 43 | ENABLED: True 44 | TYPE: "absolute" 45 | SIZE: (512, 1024) 46 | SINGLE_CATEGORY_MAX_AREA: 1.0 47 | COLOR_AUG_SSD: True 48 | SIZE_DIVISIBILITY: -1 49 | FORMAT: "RGB" 50 | DATASET_MAPPER_NAME: "mask_former_semantic" 51 | TEST: 52 | EVAL_PERIOD: 5000 53 | AUG: 54 | ENABLED: False 55 | MIN_SIZES: [512, 768, 1024, 1280, 1536, 1792] 56 | MAX_SIZE: 4096 57 | FLIP: True 58 | DATALOADER: 59 | FILTER_EMPTY_ANNOTATIONS: True 60 | NUM_WORKERS: 4 61 | VERSION: 2 62 | -------------------------------------------------------------------------------- /segmentation/configs/cityscapes/semantic-segmentation/maskformer2_R101_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | WEIGHTS: "R-101.pkl" 4 | RESNETS: 5 | DEPTH: 101 6 | STEM_TYPE: "basic" # not used 7 | STEM_OUT_CHANNELS: 64 8 | STRIDE_IN_1X1: False 9 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 10 | NORM: "SyncBN" 11 | RES5_MULTI_GRID: [1, 1, 1] # not used 12 | -------------------------------------------------------------------------------- /segmentation/configs/cityscapes/semantic-segmentation/maskformer2_R50_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-Cityscapes-SemanticSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskFormer" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskFormerHead" 6 | IGNORE_VALUE: 255 7 | NUM_CLASSES: 19 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 
256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 14 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 15 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 16 | COMMON_STRIDE: 4 17 | TRANSFORMER_ENC_LAYERS: 6 18 | MASK_FORMER: 19 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 20 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 21 | DEEP_SUPERVISION: True 22 | NO_OBJECT_WEIGHT: 0.1 23 | CLASS_WEIGHT: 2.0 24 | MASK_WEIGHT: 5.0 25 | DICE_WEIGHT: 5.0 26 | HIDDEN_DIM: 256 27 | NUM_OBJECT_QUERIES: 100 28 | NHEADS: 8 29 | DROPOUT: 0.0 30 | DIM_FEEDFORWARD: 2048 31 | ENC_LAYERS: 0 32 | PRE_NORM: False 33 | ENFORCE_INPUT_PROJ: False 34 | SIZE_DIVISIBILITY: 32 35 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 36 | TRAIN_NUM_POINTS: 12544 37 | OVERSAMPLE_RATIO: 3.0 38 | IMPORTANCE_SAMPLE_RATIO: 0.75 39 | TEST: 40 | SEMANTIC_ON: True 41 | INSTANCE_ON: False 42 | PANOPTIC_ON: False 43 | OVERLAP_THRESHOLD: 0.8 44 | OBJECT_MASK_THRESHOLD: 0.8 45 | -------------------------------------------------------------------------------- /segmentation/configs/cityscapes/semantic-segmentation/swin/maskformer2_swin_base_IN21k_384_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 128 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [4, 8, 16, 32] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_base_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | -------------------------------------------------------------------------------- /segmentation/configs/cityscapes/semantic-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | MASK_FORMER: 18 | NUM_OBJECT_QUERIES: 100 19 | -------------------------------------------------------------------------------- /segmentation/configs/cityscapes/semantic-segmentation/swin/maskformer2_swin_small_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_small_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | -------------------------------------------------------------------------------- /segmentation/configs/cityscapes/semantic-segmentation/swin/maskformer2_swin_tiny_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | 
NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 6, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_tiny_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | -------------------------------------------------------------------------------- /segmentation/configs/coco/instance-segmentation/Base-COCO-InstanceSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | # NORM: "SyncBN" 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("coco_2017_train",) 18 | TEST: ("coco_2017_val",) 19 | SOLVER: 20 | IMS_PER_BATCH: 16 21 | BASE_LR: 0.0001 22 | STEPS: (327778, 355092) 23 | MAX_ITER: 368750 24 | WARMUP_FACTOR: 1.0 25 | WARMUP_ITERS: 10 26 | WEIGHT_DECAY: 0.05 27 | OPTIMIZER: "ADAMW" 28 | BACKBONE_MULTIPLIER: 0.1 29 | CLIP_GRADIENTS: 30 | ENABLED: True 31 | CLIP_TYPE: "full_model" 32 | CLIP_VALUE: 0.01 33 | NORM_TYPE: 2.0 34 | AMP: 35 | ENABLED: True 36 | INPUT: 37 | IMAGE_SIZE: 1024 38 | MIN_SCALE: 0.1 39 | MAX_SCALE: 2.0 40 | FORMAT: "RGB" 41 | DATASET_MAPPER_NAME: "coco_instance_lsj" 42 | TEST: 43 | EVAL_PERIOD: 5000 44 | DATALOADER: 45 | FILTER_EMPTY_ANNOTATIONS: True 46 | NUM_WORKERS: 4 47 | VERSION: 2 48 | -------------------------------------------------------------------------------- /segmentation/configs/coco/instance-segmentation/maskformer2_R101_bs16_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | WEIGHTS: "R-101.pkl" 4 | RESNETS: 5 | DEPTH: 101 6 | STEM_TYPE: "basic" # not used 7 | STEM_OUT_CHANNELS: 64 8 | STRIDE_IN_1X1: False 9 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 10 | # NORM: "SyncBN" 11 | RES5_MULTI_GRID: [1, 1, 1] # not used 12 | -------------------------------------------------------------------------------- /segmentation/configs/coco/instance-segmentation/maskformer2_R50_bs16_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-COCO-InstanceSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskFormer" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskFormerHead" 6 | IGNORE_VALUE: 255 7 | NUM_CLASSES: 80 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 14 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 15 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 16 | COMMON_STRIDE: 4 17 | TRANSFORMER_ENC_LAYERS: 6 18 | MASK_FORMER: 19 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 20 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 21 | DEEP_SUPERVISION: True 22 | NO_OBJECT_WEIGHT: 0.1 23 | CLASS_WEIGHT: 2.0 24 | MASK_WEIGHT: 5.0 25 | DICE_WEIGHT: 5.0 26 | HIDDEN_DIM: 256 27 | NUM_OBJECT_QUERIES: 100 28 | NHEADS: 8 29 | DROPOUT: 0.0 30 | DIM_FEEDFORWARD: 2048 31 | ENC_LAYERS: 0 32 | PRE_NORM: False 33 | ENFORCE_INPUT_PROJ: False 34 | SIZE_DIVISIBILITY: 32 35 | DEC_LAYERS: 
10 # 9 decoder layers, add one for the loss on learnable query 36 | TRAIN_NUM_POINTS: 12544 37 | OVERSAMPLE_RATIO: 3.0 38 | IMPORTANCE_SAMPLE_RATIO: 0.75 39 | TEST: 40 | SEMANTIC_ON: False 41 | INSTANCE_ON: True 42 | PANOPTIC_ON: False 43 | OVERLAP_THRESHOLD: 0.8 44 | OBJECT_MASK_THRESHOLD: 0.8 45 | -------------------------------------------------------------------------------- /segmentation/configs/coco/instance-segmentation/swin/maskformer2_swin_base_384_bs16_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 128 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [4, 8, 16, 32] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_base_patch4_window12_384.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | -------------------------------------------------------------------------------- /segmentation/configs/coco/instance-segmentation/swin/maskformer2_swin_base_IN21k_384_bs16_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 128 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [4, 8, 16, 32] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_base_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | -------------------------------------------------------------------------------- /segmentation/configs/coco/instance-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_100ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | MASK_FORMER: 18 | NUM_OBJECT_QUERIES: 200 19 | SOLVER: 20 | STEPS: (655556, 710184) 21 | MAX_ITER: 737500 22 | -------------------------------------------------------------------------------- /segmentation/configs/coco/instance-segmentation/swin/maskformer2_swin_small_bs16_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_small_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | -------------------------------------------------------------------------------- /segmentation/configs/coco/instance-segmentation/swin/maskformer2_swin_tiny_bs16_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 
| EMBED_DIM: 96 7 | DEPTHS: [2, 2, 6, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_tiny_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | -------------------------------------------------------------------------------- /segmentation/configs/coco/panoptic-segmentation/Base-COCO-PanopticSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | # NORM: "SyncBN" 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("coco_2017_train_panoptic",) 18 | TEST: ("coco_2017_val_panoptic_with_sem_seg",) # to evaluate instance and semantic performance as well 19 | SOLVER: 20 | IMS_PER_BATCH: 16 21 | BASE_LR: 0.0001 22 | STEPS: (327778, 355092) 23 | MAX_ITER: 368750 24 | WARMUP_FACTOR: 1.0 25 | WARMUP_ITERS: 10 26 | WEIGHT_DECAY: 0.05 27 | OPTIMIZER: "ADAMW" 28 | BACKBONE_MULTIPLIER: 0.1 29 | CLIP_GRADIENTS: 30 | ENABLED: True 31 | CLIP_TYPE: "full_model" 32 | CLIP_VALUE: 0.01 33 | NORM_TYPE: 2.0 34 | AMP: 35 | ENABLED: True 36 | INPUT: 37 | IMAGE_SIZE: 1024 38 | MIN_SCALE: 0.1 39 | MAX_SCALE: 2.0 40 | FORMAT: "RGB" 41 | DATASET_MAPPER_NAME: "coco_panoptic_lsj" 42 | TEST: 43 | EVAL_PERIOD: 5000 44 | DATALOADER: 45 | FILTER_EMPTY_ANNOTATIONS: True 46 | NUM_WORKERS: 4 47 | VERSION: 2 48 | -------------------------------------------------------------------------------- /segmentation/configs/coco/panoptic-segmentation/maskformer2_R101_bs16_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | WEIGHTS: "R-101.pkl" 4 | RESNETS: 5 | DEPTH: 101 6 | STEM_TYPE: "basic" # not used 7 | STEM_OUT_CHANNELS: 64 8 | STRIDE_IN_1X1: False 9 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 10 | # NORM: "SyncBN" 11 | RES5_MULTI_GRID: [1, 1, 1] # not used 12 | -------------------------------------------------------------------------------- /segmentation/configs/coco/panoptic-segmentation/maskformer2_R50_bs16_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-COCO-PanopticSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskFormer" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskFormerHead" 6 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 7 | IGNORE_VALUE: 255 8 | NUM_CLASSES: 133 9 | LOSS_WEIGHT: 1.0 10 | CONVS_DIM: 256 11 | MASK_DIM: 256 12 | NORM: "GN" 13 | # pixel decoder 14 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 15 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 16 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 17 | COMMON_STRIDE: 4 18 | TRANSFORMER_ENC_LAYERS: 6 19 | MASK_FORMER: 20 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 21 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 22 | DEEP_SUPERVISION: True 23 | NO_OBJECT_WEIGHT: 0.1 24 | CLASS_WEIGHT: 2.0 25 | MASK_WEIGHT: 5.0 26 | DICE_WEIGHT: 5.0 27 | HIDDEN_DIM: 256 28 | NUM_OBJECT_QUERIES: 100 29 | NHEADS: 8 30 | DROPOUT: 0.0 31 | DIM_FEEDFORWARD: 2048 32 | 
ENC_LAYERS: 0 33 | PRE_NORM: False 34 | ENFORCE_INPUT_PROJ: False 35 | SIZE_DIVISIBILITY: 32 36 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 37 | TRAIN_NUM_POINTS: 12544 38 | OVERSAMPLE_RATIO: 3.0 39 | IMPORTANCE_SAMPLE_RATIO: 0.75 40 | TEST: 41 | SEMANTIC_ON: True 42 | INSTANCE_ON: True 43 | PANOPTIC_ON: True 44 | OVERLAP_THRESHOLD: 0.8 45 | OBJECT_MASK_THRESHOLD: 0.8 46 | -------------------------------------------------------------------------------- /segmentation/configs/coco/panoptic-segmentation/swin/maskformer2_swin_base_384_bs16_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 128 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [4, 8, 16, 32] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_base_patch4_window12_384.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | -------------------------------------------------------------------------------- /segmentation/configs/coco/panoptic-segmentation/swin/maskformer2_swin_base_IN21k_384_bs16_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 128 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [4, 8, 16, 32] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_base_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | -------------------------------------------------------------------------------- /segmentation/configs/coco/panoptic-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_100ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | MASK_FORMER: 18 | NUM_OBJECT_QUERIES: 200 19 | SOLVER: 20 | STEPS: (655556, 710184) 21 | MAX_ITER: 737500 22 | -------------------------------------------------------------------------------- /segmentation/configs/coco/panoptic-segmentation/swin/maskformer2_swin_small_bs16_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_small_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | -------------------------------------------------------------------------------- /segmentation/configs/coco/panoptic-segmentation/swin/maskformer2_swin_tiny_bs16_50ep.yaml: -------------------------------------------------------------------------------- 1 | 
_BASE_: ../maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 6, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_tiny_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | -------------------------------------------------------------------------------- /segmentation/configs/mapillary-vistas/panoptic-segmentation/Base-MapillaryVistas-PanopticSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | # NORM: "SyncBN" 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("mapillary_vistas_panoptic_train",) 18 | TEST: ("mapillary_vistas_panoptic_val",) 19 | SOLVER: 20 | IMS_PER_BATCH: 16 21 | BASE_LR: 0.0001 22 | MAX_ITER: 300000 23 | WARMUP_FACTOR: 1.0 24 | WARMUP_ITERS: 0 25 | WEIGHT_DECAY: 0.05 26 | OPTIMIZER: "ADAMW" 27 | LR_SCHEDULER_NAME: "WarmupPolyLR" 28 | BACKBONE_MULTIPLIER: 0.1 29 | CLIP_GRADIENTS: 30 | ENABLED: True 31 | CLIP_TYPE: "full_model" 32 | CLIP_VALUE: 0.01 33 | NORM_TYPE: 2.0 34 | AMP: 35 | ENABLED: True 36 | INPUT: 37 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 2048) for x in range(5, 21)]"] 38 | MIN_SIZE_TRAIN_SAMPLING: "choice" 39 | MIN_SIZE_TEST: 2048 40 | MAX_SIZE_TRAIN: 8192 41 | MAX_SIZE_TEST: 2048 42 | CROP: 43 | ENABLED: True 44 | TYPE: "absolute" 45 | SIZE: (1024, 1024) 46 | SINGLE_CATEGORY_MAX_AREA: 1.0 47 | COLOR_AUG_SSD: True 48 | SIZE_DIVISIBILITY: 1024 # used in dataset mapper 49 | FORMAT: "RGB" 50 | DATASET_MAPPER_NAME: "mask_former_panoptic" 51 | TEST: 52 | EVAL_PERIOD: 0 53 | DATALOADER: 54 | FILTER_EMPTY_ANNOTATIONS: True 55 | NUM_WORKERS: 10 56 | VERSION: 2 57 | -------------------------------------------------------------------------------- /segmentation/configs/mapillary-vistas/panoptic-segmentation/maskformer_R50_bs16_300k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-MapillaryVistas-PanopticSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskFormer" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskFormerHead" 6 | IGNORE_VALUE: 65 7 | NUM_CLASSES: 65 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 14 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 15 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 16 | COMMON_STRIDE: 4 17 | TRANSFORMER_ENC_LAYERS: 6 18 | MASK_FORMER: 19 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 20 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 21 | DEEP_SUPERVISION: True 22 | NO_OBJECT_WEIGHT: 0.1 23 | CLASS_WEIGHT: 2.0 24 | MASK_WEIGHT: 5.0 25 | DICE_WEIGHT: 5.0 26 | HIDDEN_DIM: 256 27 | NUM_OBJECT_QUERIES: 100 28 | NHEADS: 8 29 | DROPOUT: 0.0 30 | DIM_FEEDFORWARD: 2048 31 | ENC_LAYERS: 0 32 | PRE_NORM: False 33 | ENFORCE_INPUT_PROJ: False 34 | SIZE_DIVISIBILITY: 32 35 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable 
query 36 | TRAIN_NUM_POINTS: 12544 37 | OVERSAMPLE_RATIO: 3.0 38 | IMPORTANCE_SAMPLE_RATIO: 0.75 39 | TEST: 40 | SEMANTIC_ON: True 41 | INSTANCE_ON: False 42 | PANOPTIC_ON: True 43 | OVERLAP_THRESHOLD: 0.8 44 | OBJECT_MASK_THRESHOLD: 0.0 45 | -------------------------------------------------------------------------------- /segmentation/configs/mapillary-vistas/panoptic-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_300k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer_R50_bs16_300k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | MASK_FORMER: 18 | NUM_OBJECT_QUERIES: 200 19 | -------------------------------------------------------------------------------- /segmentation/configs/mapillary-vistas/semantic-segmentation/Base-MapillaryVistas-SemanticSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | # NORM: "SyncBN" 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("mapillary_vistas_sem_seg_train",) 18 | TEST: ("mapillary_vistas_sem_seg_val",) 19 | SOLVER: 20 | IMS_PER_BATCH: 16 21 | BASE_LR: 0.0001 22 | MAX_ITER: 300000 23 | WARMUP_FACTOR: 1.0 24 | WARMUP_ITERS: 0 25 | WEIGHT_DECAY: 0.05 26 | OPTIMIZER: "ADAMW" 27 | LR_SCHEDULER_NAME: "WarmupPolyLR" 28 | BACKBONE_MULTIPLIER: 0.1 29 | CLIP_GRADIENTS: 30 | ENABLED: True 31 | CLIP_TYPE: "full_model" 32 | CLIP_VALUE: 0.01 33 | NORM_TYPE: 2.0 34 | AMP: 35 | ENABLED: True 36 | INPUT: 37 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 2048) for x in range(5, 21)]"] 38 | MIN_SIZE_TRAIN_SAMPLING: "choice" 39 | MIN_SIZE_TEST: 2048 40 | MAX_SIZE_TRAIN: 8192 41 | MAX_SIZE_TEST: 2048 42 | CROP: 43 | ENABLED: True 44 | TYPE: "absolute" 45 | SIZE: (1024, 1024) 46 | SINGLE_CATEGORY_MAX_AREA: 1.0 47 | COLOR_AUG_SSD: True 48 | SIZE_DIVISIBILITY: 1024 # used in dataset mapper 49 | FORMAT: "RGB" 50 | DATASET_MAPPER_NAME: "mask_former_semantic" 51 | TEST: 52 | EVAL_PERIOD: 0 53 | DATALOADER: 54 | FILTER_EMPTY_ANNOTATIONS: True 55 | NUM_WORKERS: 10 56 | VERSION: 2 57 | -------------------------------------------------------------------------------- /segmentation/configs/mapillary-vistas/semantic-segmentation/maskformer2_R50_bs16_300k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-MapillaryVistas-SemanticSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskFormer" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskFormerHead" 6 | IGNORE_VALUE: 65 7 | NUM_CLASSES: 65 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 14 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 15 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", 
"res5"] 16 | COMMON_STRIDE: 4 17 | TRANSFORMER_ENC_LAYERS: 6 18 | MASK_FORMER: 19 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 20 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 21 | DEEP_SUPERVISION: True 22 | NO_OBJECT_WEIGHT: 0.1 23 | CLASS_WEIGHT: 2.0 24 | MASK_WEIGHT: 5.0 25 | DICE_WEIGHT: 5.0 26 | HIDDEN_DIM: 256 27 | NUM_OBJECT_QUERIES: 100 28 | NHEADS: 8 29 | DROPOUT: 0.0 30 | DIM_FEEDFORWARD: 2048 31 | ENC_LAYERS: 0 32 | PRE_NORM: False 33 | ENFORCE_INPUT_PROJ: False 34 | SIZE_DIVISIBILITY: 32 35 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 36 | TRAIN_NUM_POINTS: 12544 37 | OVERSAMPLE_RATIO: 3.0 38 | IMPORTANCE_SAMPLE_RATIO: 0.75 39 | TEST: 40 | SEMANTIC_ON: True 41 | INSTANCE_ON: False 42 | PANOPTIC_ON: False 43 | OVERLAP_THRESHOLD: 0.8 44 | OBJECT_MASK_THRESHOLD: 0.0 45 | -------------------------------------------------------------------------------- /segmentation/configs/mapillary-vistas/semantic-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_300k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_300k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | MASK_FORMER: 18 | NUM_OBJECT_QUERIES: 100 19 | -------------------------------------------------------------------------------- /segmentation/configs/youtubevis_2019/Base-YouTubeVIS-VideoInstanceSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | MASK_ON: True 9 | RESNETS: 10 | DEPTH: 50 11 | STEM_TYPE: "basic" # not used 12 | STEM_OUT_CHANNELS: 64 13 | STRIDE_IN_1X1: False 14 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 15 | # NORM: "SyncBN" 16 | RES5_MULTI_GRID: [1, 1, 1] # not used 17 | DATASETS: 18 | TRAIN: ("ytvis_2019_train",) 19 | TEST: ("ytvis_2019_val",) 20 | SOLVER: 21 | IMS_PER_BATCH: 16 22 | BASE_LR: 0.0001 23 | STEPS: (4000,) 24 | MAX_ITER: 6000 25 | WARMUP_FACTOR: 1.0 26 | WARMUP_ITERS: 10 27 | WEIGHT_DECAY: 0.05 28 | OPTIMIZER: "ADAMW" 29 | BACKBONE_MULTIPLIER: 0.1 30 | CLIP_GRADIENTS: 31 | ENABLED: True 32 | CLIP_TYPE: "full_model" 33 | CLIP_VALUE: 0.01 34 | NORM_TYPE: 2.0 35 | AMP: 36 | ENABLED: True 37 | INPUT: 38 | MIN_SIZE_TRAIN_SAMPLING: "choice_by_clip" 39 | RANDOM_FLIP: "flip_by_clip" 40 | AUGMENTATIONS: [] 41 | MIN_SIZE_TRAIN: (360, 480) 42 | MIN_SIZE_TEST: 360 43 | CROP: 44 | ENABLED: False 45 | TYPE: "absolute_range" 46 | SIZE: (600, 720) 47 | FORMAT: "RGB" 48 | TEST: 49 | EVAL_PERIOD: 0 50 | DATALOADER: 51 | FILTER_EMPTY_ANNOTATIONS: False 52 | NUM_WORKERS: 4 53 | VERSION: 2 54 | -------------------------------------------------------------------------------- /segmentation/configs/youtubevis_2019/swin/video_maskformer2_swin_base_IN21k_384_bs16_8ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../video_maskformer2_R50_bs16_8ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | 
SWIN: 6 | EMBED_DIM: 128 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [4, 8, 16, 32] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "model_final_83d103.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | INPUT: 18 | MIN_SIZE_TEST: 480 19 | -------------------------------------------------------------------------------- /segmentation/configs/youtubevis_2019/swin/video_maskformer2_swin_large_IN21k_384_bs16_8ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../video_maskformer2_R50_bs16_8ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "model_final_e5f453.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | MASK_FORMER: 18 | NUM_OBJECT_QUERIES: 200 19 | INPUT: 20 | MIN_SIZE_TEST: 480 21 | -------------------------------------------------------------------------------- /segmentation/configs/youtubevis_2019/swin/video_maskformer2_swin_small_bs16_8ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../video_maskformer2_R50_bs16_8ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "model_final_1e7f22.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | INPUT: 17 | MIN_SIZE_TEST: 480 18 | -------------------------------------------------------------------------------- /segmentation/configs/youtubevis_2019/swin/video_maskformer2_swin_tiny_bs16_8ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../video_maskformer2_R50_bs16_8ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 6, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "model_final_86143f.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | INPUT: 17 | MIN_SIZE_TEST: 480 18 | -------------------------------------------------------------------------------- /segmentation/configs/youtubevis_2019/video_maskformer2_R101_bs16_8ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: video_maskformer2_R50_bs16_8ep.yaml 2 | MODEL: 3 | WEIGHTS: "model_final_eba159.pkl" 4 | RESNETS: 5 | DEPTH: 101 6 | STEM_TYPE: "basic" # not used 7 | STEM_OUT_CHANNELS: 64 8 | STRIDE_IN_1X1: False 9 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 10 | # NORM: "SyncBN" 11 | RES5_MULTI_GRID: [1, 1, 1] # not used 12 | -------------------------------------------------------------------------------- /segmentation/configs/youtubevis_2019/video_maskformer2_R50_bs16_8ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-YouTubeVIS-VideoInstanceSegmentation.yaml 2 | MODEL: 3 | WEIGHTS: "model_final_3c8ec9.pkl" 4 | META_ARCHITECTURE: "VideoMaskFormer" 5 | SEM_SEG_HEAD: 6 | NAME: "MaskFormerHead" 7 | IGNORE_VALUE: 255 8 | NUM_CLASSES: 40 9 | 
LOSS_WEIGHT: 1.0 10 | CONVS_DIM: 256 11 | MASK_DIM: 256 12 | NORM: "GN" 13 | # pixel decoder 14 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 15 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 16 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 17 | COMMON_STRIDE: 4 18 | TRANSFORMER_ENC_LAYERS: 6 19 | MASK_FORMER: 20 | TRANSFORMER_DECODER_NAME: "VideoMultiScaleMaskedTransformerDecoder" 21 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 22 | DEEP_SUPERVISION: True 23 | NO_OBJECT_WEIGHT: 0.1 24 | CLASS_WEIGHT: 2.0 25 | MASK_WEIGHT: 5.0 26 | DICE_WEIGHT: 5.0 27 | HIDDEN_DIM: 256 28 | NUM_OBJECT_QUERIES: 100 29 | NHEADS: 8 30 | DROPOUT: 0.0 31 | DIM_FEEDFORWARD: 2048 32 | ENC_LAYERS: 0 33 | PRE_NORM: False 34 | ENFORCE_INPUT_PROJ: False 35 | SIZE_DIVISIBILITY: 32 36 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 37 | TRAIN_NUM_POINTS: 12544 38 | OVERSAMPLE_RATIO: 3.0 39 | IMPORTANCE_SAMPLE_RATIO: 0.75 40 | TEST: 41 | SEMANTIC_ON: False 42 | INSTANCE_ON: True 43 | PANOPTIC_ON: False 44 | OVERLAP_THRESHOLD: 0.8 45 | OBJECT_MASK_THRESHOLD: 0.8 46 | -------------------------------------------------------------------------------- /segmentation/configs/youtubevis_2021/Base-YouTubeVIS-VideoInstanceSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | MASK_ON: True 9 | RESNETS: 10 | DEPTH: 50 11 | STEM_TYPE: "basic" # not used 12 | STEM_OUT_CHANNELS: 64 13 | STRIDE_IN_1X1: False 14 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 15 | # NORM: "SyncBN" 16 | RES5_MULTI_GRID: [1, 1, 1] # not used 17 | DATASETS: 18 | TRAIN: ("ytvis_2021_train",) 19 | TEST: ("ytvis_2021_val",) 20 | SOLVER: 21 | IMS_PER_BATCH: 16 22 | BASE_LR: 0.0001 23 | STEPS: (5500,) 24 | MAX_ITER: 8000 25 | WARMUP_FACTOR: 1.0 26 | WARMUP_ITERS: 10 27 | WEIGHT_DECAY: 0.05 28 | OPTIMIZER: "ADAMW" 29 | BACKBONE_MULTIPLIER: 0.1 30 | CLIP_GRADIENTS: 31 | ENABLED: True 32 | CLIP_TYPE: "full_model" 33 | CLIP_VALUE: 0.01 34 | NORM_TYPE: 2.0 35 | AMP: 36 | ENABLED: True 37 | INPUT: 38 | MIN_SIZE_TRAIN_SAMPLING: "choice_by_clip" 39 | RANDOM_FLIP: "flip_by_clip" 40 | AUGMENTATIONS: [] 41 | MIN_SIZE_TRAIN: (360, 480) 42 | MIN_SIZE_TEST: 360 43 | CROP: 44 | ENABLED: False 45 | TYPE: "absolute_range" 46 | SIZE: (600, 720) 47 | FORMAT: "RGB" 48 | TEST: 49 | EVAL_PERIOD: 0 50 | DATALOADER: 51 | FILTER_EMPTY_ANNOTATIONS: False 52 | NUM_WORKERS: 4 53 | VERSION: 2 54 | -------------------------------------------------------------------------------- /segmentation/configs/youtubevis_2021/swin/video_maskformer2_swin_base_IN21k_384_bs16_8ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../video_maskformer2_R50_bs16_8ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 128 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [4, 8, 16, 32] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "model_final_83d103.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | INPUT: 18 | MIN_SIZE_TEST: 480 19 | -------------------------------------------------------------------------------- 
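For reference, the Swin backbone variants used throughout the image and video configs above differ only in four fields; everything else is inherited from the corresponding R50 base config. The summary below transcribes those values into a Python literal; it is not a file in this repository, and the name `SWIN_BACKBONE_VARIANTS` is illustrative.

# Reference only: Swin backbone settings transcribed from the configs above.
SWIN_BACKBONE_VARIANTS = {
    "tiny":  {"embed_dim": 96,  "depths": [2, 2, 6, 2],  "num_heads": [3, 6, 12, 24],  "window_size": 7},
    "small": {"embed_dim": 96,  "depths": [2, 2, 18, 2], "num_heads": [3, 6, 12, 24],  "window_size": 7},
    "base":  {"embed_dim": 128, "depths": [2, 2, 18, 2], "num_heads": [4, 8, 16, 32],  "window_size": 12},
    "large": {"embed_dim": 192, "depths": [2, 2, 18, 2], "num_heads": [6, 12, 24, 48], "window_size": 12},
}
# Base and large additionally set PRETRAIN_IMG_SIZE: 384; the large variants also
# raise NUM_OBJECT_QUERIES to 200 in most of the task configs above.
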
/segmentation/configs/youtubevis_2021/swin/video_maskformer2_swin_large_IN21k_384_bs16_8ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../video_maskformer2_R50_bs16_8ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "model_final_e5f453.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | MASK_FORMER: 18 | NUM_OBJECT_QUERIES: 200 19 | # OOM when using a larger test size 20 | # INPUT: 21 | # MIN_SIZE_TEST: 480 22 | -------------------------------------------------------------------------------- /segmentation/configs/youtubevis_2021/swin/video_maskformer2_swin_small_bs16_8ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../video_maskformer2_R50_bs16_8ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "model_final_1e7f22.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | INPUT: 17 | MIN_SIZE_TEST: 480 18 | -------------------------------------------------------------------------------- /segmentation/configs/youtubevis_2021/swin/video_maskformer2_swin_tiny_bs16_8ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../video_maskformer2_R50_bs16_8ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 6, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "model_final_86143f.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | INPUT: 17 | MIN_SIZE_TEST: 480 18 | -------------------------------------------------------------------------------- /segmentation/configs/youtubevis_2021/video_maskformer2_R101_bs16_8ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: video_maskformer2_R50_bs16_8ep.yaml 2 | MODEL: 3 | WEIGHTS: "model_final_eba159.pkl" 4 | RESNETS: 5 | DEPTH: 101 6 | STEM_TYPE: "basic" # not used 7 | STEM_OUT_CHANNELS: 64 8 | STRIDE_IN_1X1: False 9 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 10 | # NORM: "SyncBN" 11 | RES5_MULTI_GRID: [1, 1, 1] # not used 12 | -------------------------------------------------------------------------------- /segmentation/configs/youtubevis_2021/video_maskformer2_R50_bs16_8ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-YouTubeVIS-VideoInstanceSegmentation.yaml 2 | MODEL: 3 | WEIGHTS: "model_final_3c8ec9.pkl" 4 | META_ARCHITECTURE: "VideoMaskFormer" 5 | SEM_SEG_HEAD: 6 | NAME: "MaskFormerHead" 7 | IGNORE_VALUE: 255 8 | NUM_CLASSES: 40 9 | LOSS_WEIGHT: 1.0 10 | CONVS_DIM: 256 11 | MASK_DIM: 256 12 | NORM: "GN" 13 | # pixel decoder 14 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 15 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 16 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 17 | COMMON_STRIDE: 4 18 | TRANSFORMER_ENC_LAYERS: 6 19 | MASK_FORMER: 20 | TRANSFORMER_DECODER_NAME: 
"VideoMultiScaleMaskedTransformerDecoder" 21 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 22 | DEEP_SUPERVISION: True 23 | NO_OBJECT_WEIGHT: 0.1 24 | CLASS_WEIGHT: 2.0 25 | MASK_WEIGHT: 5.0 26 | DICE_WEIGHT: 5.0 27 | HIDDEN_DIM: 256 28 | NUM_OBJECT_QUERIES: 100 29 | NHEADS: 8 30 | DROPOUT: 0.0 31 | DIM_FEEDFORWARD: 2048 32 | ENC_LAYERS: 0 33 | PRE_NORM: False 34 | ENFORCE_INPUT_PROJ: False 35 | SIZE_DIVISIBILITY: 32 36 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 37 | TRAIN_NUM_POINTS: 12544 38 | OVERSAMPLE_RATIO: 3.0 39 | IMPORTANCE_SAMPLE_RATIO: 0.75 40 | TEST: 41 | SEMANTIC_ON: False 42 | INSTANCE_ON: True 43 | PANOPTIC_ON: False 44 | OVERLAP_THRESHOLD: 0.8 45 | OBJECT_MASK_THRESHOLD: 0.8 46 | -------------------------------------------------------------------------------- /segmentation/datasets/ade20k_instance_catid_mapping.txt: -------------------------------------------------------------------------------- 1 | Instacne100 SceneParse150 FullADE20K 2 | 1 8 165 3 | 2 9 3055 4 | 3 11 350 5 | 4 13 1831 6 | 5 15 774 7 | 5 15 783 8 | 6 16 2684 9 | 7 19 687 10 | 8 20 471 11 | 9 21 401 12 | 10 23 1735 13 | 11 24 2473 14 | 12 25 2329 15 | 13 28 1564 16 | 14 31 57 17 | 15 32 2272 18 | 16 33 907 19 | 17 34 724 20 | 18 36 2985 21 | 18 36 533 22 | 19 37 1395 23 | 20 38 155 24 | 21 39 2053 25 | 22 40 689 26 | 23 42 266 27 | 24 43 581 28 | 25 44 2380 29 | 26 45 491 30 | 27 46 627 31 | 28 48 2388 32 | 29 50 943 33 | 30 51 2096 34 | 31 54 2530 35 | 32 56 420 36 | 33 57 1948 37 | 34 58 1869 38 | 35 59 2251 39 | 36 63 239 40 | 37 65 571 41 | 38 66 2793 42 | 39 67 978 43 | 40 68 236 44 | 41 70 181 45 | 42 71 629 46 | 43 72 2598 47 | 44 73 1744 48 | 45 74 1374 49 | 46 75 591 50 | 47 76 2679 51 | 48 77 223 52 | 49 79 47 53 | 50 81 327 54 | 51 82 2821 55 | 52 83 1451 56 | 53 84 2880 57 | 54 86 480 58 | 55 87 77 59 | 56 88 2616 60 | 57 89 246 61 | 57 89 247 62 | 58 90 2733 63 | 59 91 14 64 | 60 93 38 65 | 61 94 1936 66 | 62 96 120 67 | 63 98 1702 68 | 64 99 249 69 | 65 103 2928 70 | 66 104 2337 71 | 67 105 1023 72 | 68 108 2989 73 | 69 109 1930 74 | 70 111 2586 75 | 71 112 131 76 | 72 113 146 77 | 73 116 95 78 | 74 117 1563 79 | 75 119 1708 80 | 76 120 103 81 | 77 121 1002 82 | 78 122 2569 83 | 79 124 2833 84 | 80 125 1551 85 | 81 126 1981 86 | 82 127 29 87 | 83 128 187 88 | 84 130 747 89 | 85 131 2254 90 | 86 133 2262 91 | 87 134 1260 92 | 88 135 2243 93 | 89 136 2932 94 | 90 137 2836 95 | 91 138 2850 96 | 92 139 64 97 | 93 140 894 98 | 94 143 1919 99 | 95 144 1583 100 | 96 145 318 101 | 97 147 2046 102 | 98 148 1098 103 | 99 149 530 104 | 100 150 954 105 | -------------------------------------------------------------------------------- /segmentation/datasets/prepare_ade20k_ins_seg.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # Copyright (c) Facebook, Inc. and its affiliates. 
4 | import glob 5 | import json 6 | import os 7 | from collections import Counter 8 | 9 | import numpy as np 10 | import tqdm 11 | from panopticapi.utils import IdGenerator, save_json 12 | from PIL import Image 13 | import pycocotools.mask as mask_util 14 | 15 | 16 | if __name__ == "__main__": 17 | dataset_dir = os.getenv("DETECTRON2_DATASETS", "datasets") 18 | 19 | for name, dirname in [("train", "training"), ("val", "validation")]: 20 | image_dir = os.path.join(dataset_dir, f"ADEChallengeData2016/images/{dirname}/") 21 | instance_dir = os.path.join( 22 | dataset_dir, f"ADEChallengeData2016/annotations_instance/{dirname}/" 23 | ) 24 | 25 | # img_id = 0 26 | ann_id = 1 27 | 28 | # json 29 | out_file = os.path.join(dataset_dir, f"ADEChallengeData2016/ade20k_instance_{name}.json") 30 | 31 | # json config 32 | instance_config_file = "datasets/ade20k_instance_imgCatIds.json" 33 | with open(instance_config_file) as f: 34 | category_dict = json.load(f)["categories"] 35 | 36 | # load catid mapping 37 | # it is important to share category id for both instance and panoptic annotations 38 | mapping_file = "datasets/ade20k_instance_catid_mapping.txt" 39 | with open(mapping_file) as f: 40 | map_id = {} 41 | for i, line in enumerate(f.readlines()): 42 | if i == 0: 43 | continue 44 | ins_id, sem_id, _ = line.strip().split() 45 | # shift id by 1 because we want it to start from 0! 46 | # ignore_label becomes 255 47 | map_id[int(ins_id)] = int(sem_id) - 1 48 | 49 | for cat in category_dict: 50 | cat["id"] = map_id[cat["id"]] 51 | 52 | filenames = sorted(glob.glob(os.path.join(image_dir, "*.jpg"))) 53 | 54 | ann_dict = {} 55 | images = [] 56 | annotations = [] 57 | 58 | for idx, filename in enumerate(tqdm.tqdm(filenames)): 59 | image = {} 60 | image_id = os.path.basename(filename).split(".")[0] 61 | 62 | image["id"] = image_id 63 | image["file_name"] = os.path.basename(filename) 64 | 65 | original_format = np.array(Image.open(filename)) 66 | image["width"] = original_format.shape[1] 67 | image["height"] = original_format.shape[0] 68 | 69 | images.append(image) 70 | 71 | filename_instance = os.path.join(instance_dir, image_id + ".png") 72 | ins_seg = np.asarray(Image.open(filename_instance)) 73 | assert ins_seg.dtype == np.uint8 74 | 75 | instance_cat_ids = ins_seg[..., 0] 76 | # instance id starts from 1! 
77 | # because 0 is reserved as VOID label 78 | instance_ins_ids = ins_seg[..., 1] 79 | 80 | # process things 81 | for thing_id in np.unique(instance_ins_ids): 82 | if thing_id == 0: 83 | continue 84 | mask = instance_ins_ids == thing_id 85 | instance_cat_id = np.unique(instance_cat_ids[mask]) 86 | assert len(instance_cat_id) == 1 87 | 88 | anno = {} 89 | anno['id'] = ann_id 90 | ann_id += 1 91 | anno['image_id'] = image['id'] 92 | anno["iscrowd"] = int(0) 93 | anno["category_id"] = int(map_id[instance_cat_id[0]]) 94 | 95 | inds = np.nonzero(mask) 96 | ymin, ymax = inds[0].min(), inds[0].max() 97 | xmin, xmax = inds[1].min(), inds[1].max() 98 | anno["bbox"] = [int(xmin), int(ymin), int(xmax - xmin + 1), int(ymax - ymin + 1)] 99 | # if xmax <= xmin or ymax <= ymin: 100 | # continue 101 | rle = mask_util.encode(np.array(mask[:, :, None], order="F", dtype="uint8"))[0] 102 | rle["counts"] = rle["counts"].decode("utf-8") 103 | anno["segmentation"] = rle 104 | anno["area"] = int(mask_util.area(rle)) 105 | annotations.append(anno) 106 | 107 | # save this 108 | ann_dict['images'] = images 109 | ann_dict['categories'] = category_dict 110 | ann_dict['annotations'] = annotations 111 | 112 | save_json(ann_dict, out_file) 113 | -------------------------------------------------------------------------------- /segmentation/datasets/prepare_ade20k_sem_seg.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # Copyright (c) Facebook, Inc. and its affiliates. 4 | import os 5 | from pathlib import Path 6 | 7 | import numpy as np 8 | import tqdm 9 | from PIL import Image 10 | 11 | 12 | def convert(input, output): 13 | img = np.asarray(Image.open(input)) 14 | assert img.dtype == np.uint8 15 | img = img - 1 # 0 (ignore) becomes 255. others are shifted by 1 16 | Image.fromarray(img).save(output) 17 | 18 | 19 | if __name__ == "__main__": 20 | dataset_dir = Path(os.getenv("DETECTRON2_DATASETS", "datasets")) / "ADEChallengeData2016" 21 | for name in ["training", "validation"]: 22 | annotation_dir = dataset_dir / "annotations" / name 23 | output_dir = dataset_dir / "annotations_detectron2" / name 24 | output_dir.mkdir(parents=True, exist_ok=True) 25 | for file in tqdm.tqdm(list(annotation_dir.iterdir())): 26 | output_file = output_dir / file.name 27 | convert(file, output_file) 28 | -------------------------------------------------------------------------------- /segmentation/datasets/prepare_coco_semantic_annos_from_panoptic_annos.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # Copyright (c) Facebook, Inc. and its affiliates. 
4 | 5 | import functools 6 | import json 7 | import multiprocessing as mp 8 | import numpy as np 9 | import os 10 | import time 11 | from fvcore.common.download import download 12 | from panopticapi.utils import rgb2id 13 | from PIL import Image 14 | 15 | from detectron2.data.datasets.builtin_meta import COCO_CATEGORIES 16 | 17 | 18 | def _process_panoptic_to_semantic(input_panoptic, output_semantic, segments, id_map): 19 | panoptic = np.asarray(Image.open(input_panoptic), dtype=np.uint32) 20 | panoptic = rgb2id(panoptic) 21 | output = np.zeros_like(panoptic, dtype=np.uint8) + 255 22 | for seg in segments: 23 | cat_id = seg["category_id"] 24 | new_cat_id = id_map[cat_id] 25 | output[panoptic == seg["id"]] = new_cat_id 26 | Image.fromarray(output).save(output_semantic) 27 | 28 | 29 | def separate_coco_semantic_from_panoptic(panoptic_json, panoptic_root, sem_seg_root, categories): 30 | """ 31 | Create semantic segmentation annotations from panoptic segmentation 32 | annotations, to be used by PanopticFPN. 33 | It maps all thing categories to class 0, and maps all unlabeled pixels to class 255. 34 | It maps all stuff categories to contiguous ids starting from 1. 35 | Args: 36 | panoptic_json (str): path to the panoptic json file, in COCO's format. 37 | panoptic_root (str): a directory with panoptic annotation files, in COCO's format. 38 | sem_seg_root (str): a directory to output semantic annotation files 39 | categories (list[dict]): category metadata. Each dict needs to have: 40 | "id": corresponds to the "category_id" in the json annotations 41 | "isthing": 0 or 1 42 | """ 43 | os.makedirs(sem_seg_root, exist_ok=True) 44 | 45 | id_map = {} # map from category id to id in the output semantic annotation 46 | assert len(categories) <= 254 47 | for i, k in enumerate(categories): 48 | id_map[k["id"]] = i 49 | # what is id = 0? 50 | # id_map[0] = 255 51 | print(id_map) 52 | 53 | with open(panoptic_json) as f: 54 | obj = json.load(f) 55 | 56 | pool = mp.Pool(processes=max(mp.cpu_count() // 2, 4)) 57 | 58 | def iter_annotations(): 59 | for anno in obj["annotations"]: 60 | file_name = anno["file_name"] 61 | segments = anno["segments_info"] 62 | input = os.path.join(panoptic_root, file_name) 63 | output = os.path.join(sem_seg_root, file_name) 64 | yield input, output, segments 65 | 66 | print("Start writing to {} ...".format(sem_seg_root)) 67 | start = time.time() 68 | pool.starmap( 69 | functools.partial(_process_panoptic_to_semantic, id_map=id_map), 70 | iter_annotations(), 71 | chunksize=100, 72 | ) 73 | print("Finished. time: {:.2f}s".format(time.time() - start)) 74 | 75 | 76 | if __name__ == "__main__": 77 | dataset_dir = os.path.join(os.getenv("DETECTRON2_DATASETS", "datasets"), "coco") 78 | for s in ["val2017", "train2017"]: 79 | separate_coco_semantic_from_panoptic( 80 | os.path.join(dataset_dir, "annotations/panoptic_{}.json".format(s)), 81 | os.path.join(dataset_dir, "panoptic_{}".format(s)), 82 | os.path.join(dataset_dir, "panoptic_semseg_{}".format(s)), 83 | COCO_CATEGORIES, 84 | ) 85 | -------------------------------------------------------------------------------- /segmentation/demo/README.md: -------------------------------------------------------------------------------- 1 | ## Mask2Former Demo 2 | 3 | We provide a command line tool to run a simple demo of builtin configs. 4 | The usage is explained in [GETTING_STARTED.md](../GETTING_STARTED.md). 
5 | -------------------------------------------------------------------------------- /segmentation/demo_video/README.md: -------------------------------------------------------------------------------- 1 | ## Video Mask2Former Demo 2 | 3 | We provide a command line tool to run a simple demo of builtin configs. 4 | The usage is explained in [GETTING_STARTED.md](../GETTING_STARTED.md). 5 | -------------------------------------------------------------------------------- /segmentation/mask2former/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from . import data # register all new datasets 3 | from . import modeling 4 | 5 | # config 6 | from .config import add_maskformer2_config 7 | 8 | # dataset loading 9 | from .data.dataset_mappers.coco_instance_new_baseline_dataset_mapper import COCOInstanceNewBaselineDatasetMapper 10 | from .data.dataset_mappers.coco_panoptic_new_baseline_dataset_mapper import COCOPanopticNewBaselineDatasetMapper 11 | from .data.dataset_mappers.mask_former_instance_dataset_mapper import ( 12 | MaskFormerInstanceDatasetMapper, 13 | ) 14 | from .data.dataset_mappers.mask_former_panoptic_dataset_mapper import ( 15 | MaskFormerPanopticDatasetMapper, 16 | ) 17 | from .data.dataset_mappers.mask_former_semantic_dataset_mapper import ( 18 | MaskFormerSemanticDatasetMapper, 19 | ) 20 | 21 | # models 22 | from .maskformer_model import MaskFormer 23 | from .test_time_augmentation import SemanticSegmentorWithTTA 24 | 25 | # evaluation 26 | from .evaluation.instance_evaluation import InstanceSegEvaluator 27 | -------------------------------------------------------------------------------- /segmentation/mask2former/data/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from . import datasets 3 | -------------------------------------------------------------------------------- /segmentation/mask2former/data/dataset_mappers/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /segmentation/mask2former/data/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from . import ( 3 | register_ade20k_full, 4 | register_ade20k_panoptic, 5 | register_coco_stuff_10k, 6 | register_mapillary_vistas, 7 | register_coco_panoptic_annos_semseg, 8 | register_ade20k_instance, 9 | register_mapillary_vistas_panoptic, 10 | ) 11 | -------------------------------------------------------------------------------- /segmentation/mask2former/evaluation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/callsys/FlowText/f5448c95ab5c35a37a5a4a42a77c8a4f7ff8670b/segmentation/mask2former/evaluation/__init__.py -------------------------------------------------------------------------------- /segmentation/mask2former/modeling/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
2 | from .backbone.swin import D2SwinTransformer 3 | from .pixel_decoder.fpn import BasePixelDecoder 4 | from .pixel_decoder.msdeformattn import MSDeformAttnPixelDecoder 5 | from .meta_arch.mask_former_head import MaskFormerHead 6 | from .meta_arch.per_pixel_baseline import PerPixelBaselineHead, PerPixelBaselinePlusHead 7 | -------------------------------------------------------------------------------- /segmentation/mask2former/modeling/backbone/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /segmentation/mask2former/modeling/meta_arch/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /segmentation/mask2former/modeling/pixel_decoder/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /segmentation/mask2former/modeling/pixel_decoder/ops/functions/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | # Copyright (c) Facebook, Inc. and its affiliates. 10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 11 | 12 | from .ms_deform_attn_func import MSDeformAttnFunction 13 | 14 | -------------------------------------------------------------------------------- /segmentation/mask2former/modeling/pixel_decoder/ops/functions/ms_deform_attn_func.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | # Copyright (c) Facebook, Inc. and its affiliates. 
10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 11 | 12 | from __future__ import absolute_import 13 | from __future__ import print_function 14 | from __future__ import division 15 | 16 | import torch 17 | import torch.nn.functional as F 18 | from torch.autograd import Function 19 | from torch.autograd.function import once_differentiable 20 | 21 | try: 22 | import MultiScaleDeformableAttention as MSDA 23 | except ModuleNotFoundError as e: 24 | info_string = ( 25 | "\n\nPlease compile MultiScaleDeformableAttention CUDA op with the following commands:\n" 26 | "\t`cd mask2former/modeling/pixel_decoder/ops`\n" 27 | "\t`sh make.sh`\n" 28 | ) 29 | raise ModuleNotFoundError(info_string) 30 | 31 | 32 | class MSDeformAttnFunction(Function): 33 | @staticmethod 34 | def forward(ctx, value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, im2col_step): 35 | ctx.im2col_step = im2col_step 36 | output = MSDA.ms_deform_attn_forward( 37 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, ctx.im2col_step) 38 | ctx.save_for_backward(value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights) 39 | return output 40 | 41 | @staticmethod 42 | @once_differentiable 43 | def backward(ctx, grad_output): 44 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights = ctx.saved_tensors 45 | grad_value, grad_sampling_loc, grad_attn_weight = \ 46 | MSDA.ms_deform_attn_backward( 47 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, grad_output, ctx.im2col_step) 48 | 49 | return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None 50 | 51 | 52 | def ms_deform_attn_core_pytorch(value, value_spatial_shapes, sampling_locations, attention_weights): 53 | # for debug and test only, 54 | # need to use cuda version instead 55 | N_, S_, M_, D_ = value.shape 56 | _, Lq_, M_, L_, P_, _ = sampling_locations.shape 57 | value_list = value.split([H_ * W_ for H_, W_ in value_spatial_shapes], dim=1) 58 | sampling_grids = 2 * sampling_locations - 1 59 | sampling_value_list = [] 60 | for lid_, (H_, W_) in enumerate(value_spatial_shapes): 61 | # N_, H_*W_, M_, D_ -> N_, H_*W_, M_*D_ -> N_, M_*D_, H_*W_ -> N_*M_, D_, H_, W_ 62 | value_l_ = value_list[lid_].flatten(2).transpose(1, 2).reshape(N_*M_, D_, H_, W_) 63 | # N_, Lq_, M_, P_, 2 -> N_, M_, Lq_, P_, 2 -> N_*M_, Lq_, P_, 2 64 | sampling_grid_l_ = sampling_grids[:, :, :, lid_].transpose(1, 2).flatten(0, 1) 65 | # N_*M_, D_, Lq_, P_ 66 | sampling_value_l_ = F.grid_sample(value_l_, sampling_grid_l_, 67 | mode='bilinear', padding_mode='zeros', align_corners=False) 68 | sampling_value_list.append(sampling_value_l_) 69 | # (N_, Lq_, M_, L_, P_) -> (N_, M_, Lq_, L_, P_) -> (N_, M_, 1, Lq_, L_*P_) 70 | attention_weights = attention_weights.transpose(1, 2).reshape(N_*M_, 1, Lq_, L_*P_) 71 | output = (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights).sum(-1).view(N_, M_*D_, Lq_) 72 | return output.transpose(1, 2).contiguous() 73 | -------------------------------------------------------------------------------- /segmentation/mask2former/modeling/pixel_decoder/ops/make.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # ------------------------------------------------------------------------------------------------ 3 | # Deformable DETR 4 | # Copyright (c) 2020 SenseTime. 
All Rights Reserved. 5 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | # ------------------------------------------------------------------------------------------------ 7 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | # ------------------------------------------------------------------------------------------------ 9 | 10 | # Copyright (c) Facebook, Inc. and its affiliates. 11 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 12 | 13 | python setup.py build install 14 | -------------------------------------------------------------------------------- /segmentation/mask2former/modeling/pixel_decoder/ops/modules/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | # Copyright (c) Facebook, Inc. and its affiliates. 10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 11 | 12 | from .ms_deform_attn import MSDeformAttn 13 | -------------------------------------------------------------------------------- /segmentation/mask2former/modeling/pixel_decoder/ops/setup.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | # Copyright (c) Facebook, Inc. and its affiliates. 10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 11 | 12 | import os 13 | import glob 14 | 15 | import torch 16 | 17 | from torch.utils.cpp_extension import CUDA_HOME 18 | from torch.utils.cpp_extension import CppExtension 19 | from torch.utils.cpp_extension import CUDAExtension 20 | 21 | from setuptools import find_packages 22 | from setuptools import setup 23 | 24 | requirements = ["torch", "torchvision"] 25 | 26 | def get_extensions(): 27 | this_dir = os.path.dirname(os.path.abspath(__file__)) 28 | extensions_dir = os.path.join(this_dir, "src") 29 | 30 | main_file = glob.glob(os.path.join(extensions_dir, "*.cpp")) 31 | source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp")) 32 | source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu")) 33 | 34 | sources = main_file + source_cpu 35 | extension = CppExtension 36 | extra_compile_args = {"cxx": []} 37 | define_macros = [] 38 | 39 | # Force cuda since torch ask for a device, not if cuda is in fact available. 
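    # In other words: build the CUDA extension when FORCE_CUDA is set or a GPU is
    # visible, provided CUDA_HOME points to a CUDA toolkit; otherwise raise one of
    # the explicit errors below.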
40 | if (os.environ.get('FORCE_CUDA') or torch.cuda.is_available()) and CUDA_HOME is not None: 41 | extension = CUDAExtension 42 | sources += source_cuda 43 | define_macros += [("WITH_CUDA", None)] 44 | extra_compile_args["nvcc"] = [ 45 | "-DCUDA_HAS_FP16=1", 46 | "-D__CUDA_NO_HALF_OPERATORS__", 47 | "-D__CUDA_NO_HALF_CONVERSIONS__", 48 | "-D__CUDA_NO_HALF2_OPERATORS__", 49 | ] 50 | else: 51 | if CUDA_HOME is None: 52 | raise NotImplementedError('CUDA_HOME is None. Please set environment variable CUDA_HOME.') 53 | else: 54 | raise NotImplementedError('No CUDA runtime is found. Please set FORCE_CUDA=1 or test it by running torch.cuda.is_available().') 55 | 56 | sources = [os.path.join(extensions_dir, s) for s in sources] 57 | include_dirs = [extensions_dir] 58 | ext_modules = [ 59 | extension( 60 | "MultiScaleDeformableAttention", 61 | sources, 62 | include_dirs=include_dirs, 63 | define_macros=define_macros, 64 | extra_compile_args=extra_compile_args, 65 | ) 66 | ] 67 | return ext_modules 68 | 69 | setup( 70 | name="MultiScaleDeformableAttention", 71 | version="1.0", 72 | author="Weijie Su", 73 | url="https://github.com/fundamentalvision/Deformable-DETR", 74 | description="PyTorch Wrapper for CUDA Functions of Multi-Scale Deformable Attention", 75 | packages=find_packages(exclude=("configs", "tests",)), 76 | ext_modules=get_extensions(), 77 | cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension}, 78 | ) 79 | -------------------------------------------------------------------------------- /segmentation/mask2former/modeling/pixel_decoder/ops/src/cpu/ms_deform_attn_cpu.cpp: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | /*! 12 | * Copyright (c) Facebook, Inc. and its affiliates. 13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 14 | */ 15 | 16 | #include 17 | 18 | #include 19 | #include 20 | 21 | 22 | at::Tensor 23 | ms_deform_attn_cpu_forward( 24 | const at::Tensor &value, 25 | const at::Tensor &spatial_shapes, 26 | const at::Tensor &level_start_index, 27 | const at::Tensor &sampling_loc, 28 | const at::Tensor &attn_weight, 29 | const int im2col_step) 30 | { 31 | AT_ERROR("Not implement on cpu"); 32 | } 33 | 34 | std::vector 35 | ms_deform_attn_cpu_backward( 36 | const at::Tensor &value, 37 | const at::Tensor &spatial_shapes, 38 | const at::Tensor &level_start_index, 39 | const at::Tensor &sampling_loc, 40 | const at::Tensor &attn_weight, 41 | const at::Tensor &grad_output, 42 | const int im2col_step) 43 | { 44 | AT_ERROR("Not implement on cpu"); 45 | } 46 | 47 | -------------------------------------------------------------------------------- /segmentation/mask2former/modeling/pixel_decoder/ops/src/cpu/ms_deform_attn_cpu.h: -------------------------------------------------------------------------------- 1 | /*! 
2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | /*! 12 | * Copyright (c) Facebook, Inc. and its affiliates. 13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 14 | */ 15 | 16 | #pragma once 17 | #include 18 | 19 | at::Tensor 20 | ms_deform_attn_cpu_forward( 21 | const at::Tensor &value, 22 | const at::Tensor &spatial_shapes, 23 | const at::Tensor &level_start_index, 24 | const at::Tensor &sampling_loc, 25 | const at::Tensor &attn_weight, 26 | const int im2col_step); 27 | 28 | std::vector 29 | ms_deform_attn_cpu_backward( 30 | const at::Tensor &value, 31 | const at::Tensor &spatial_shapes, 32 | const at::Tensor &level_start_index, 33 | const at::Tensor &sampling_loc, 34 | const at::Tensor &attn_weight, 35 | const at::Tensor &grad_output, 36 | const int im2col_step); 37 | 38 | 39 | -------------------------------------------------------------------------------- /segmentation/mask2former/modeling/pixel_decoder/ops/src/cuda/ms_deform_attn_cuda.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | /*! 12 | * Copyright (c) Facebook, Inc. and its affiliates. 13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 14 | */ 15 | 16 | #pragma once 17 | #include 18 | 19 | at::Tensor ms_deform_attn_cuda_forward( 20 | const at::Tensor &value, 21 | const at::Tensor &spatial_shapes, 22 | const at::Tensor &level_start_index, 23 | const at::Tensor &sampling_loc, 24 | const at::Tensor &attn_weight, 25 | const int im2col_step); 26 | 27 | std::vector ms_deform_attn_cuda_backward( 28 | const at::Tensor &value, 29 | const at::Tensor &spatial_shapes, 30 | const at::Tensor &level_start_index, 31 | const at::Tensor &sampling_loc, 32 | const at::Tensor &attn_weight, 33 | const at::Tensor &grad_output, 34 | const int im2col_step); 35 | 36 | -------------------------------------------------------------------------------- /segmentation/mask2former/modeling/pixel_decoder/ops/src/ms_deform_attn.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | /*! 12 | * Copyright (c) Facebook, Inc. and its affiliates. 13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 14 | */ 15 | 16 | #pragma once 17 | 18 | #include "cpu/ms_deform_attn_cpu.h" 19 | 20 | #ifdef WITH_CUDA 21 | #include "cuda/ms_deform_attn_cuda.h" 22 | #endif 23 | 24 | 25 | at::Tensor 26 | ms_deform_attn_forward( 27 | const at::Tensor &value, 28 | const at::Tensor &spatial_shapes, 29 | const at::Tensor &level_start_index, 30 | const at::Tensor &sampling_loc, 31 | const at::Tensor &attn_weight, 32 | const int im2col_step) 33 | { 34 | if (value.type().is_cuda()) 35 | { 36 | #ifdef WITH_CUDA 37 | return ms_deform_attn_cuda_forward( 38 | value, spatial_shapes, level_start_index, sampling_loc, attn_weight, im2col_step); 39 | #else 40 | AT_ERROR("Not compiled with GPU support"); 41 | #endif 42 | } 43 | AT_ERROR("Not implemented on the CPU"); 44 | } 45 | 46 | std::vector 47 | ms_deform_attn_backward( 48 | const at::Tensor &value, 49 | const at::Tensor &spatial_shapes, 50 | const at::Tensor &level_start_index, 51 | const at::Tensor &sampling_loc, 52 | const at::Tensor &attn_weight, 53 | const at::Tensor &grad_output, 54 | const int im2col_step) 55 | { 56 | if (value.type().is_cuda()) 57 | { 58 | #ifdef WITH_CUDA 59 | return ms_deform_attn_cuda_backward( 60 | value, spatial_shapes, level_start_index, sampling_loc, attn_weight, grad_output, im2col_step); 61 | #else 62 | AT_ERROR("Not compiled with GPU support"); 63 | #endif 64 | } 65 | AT_ERROR("Not implemented on the CPU"); 66 | } 67 | 68 | -------------------------------------------------------------------------------- /segmentation/mask2former/modeling/pixel_decoder/ops/src/vision.cpp: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | /*! 12 | * Copyright (c) Facebook, Inc. and its affiliates. 
13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 14 | */ 15 | 16 | #include "ms_deform_attn.h" 17 | 18 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 19 | m.def("ms_deform_attn_forward", &ms_deform_attn_forward, "ms_deform_attn_forward"); 20 | m.def("ms_deform_attn_backward", &ms_deform_attn_backward, "ms_deform_attn_backward"); 21 | } 22 | -------------------------------------------------------------------------------- /segmentation/mask2former/modeling/pixel_decoder/ops/test.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | # Copyright (c) Facebook, Inc. and its affiliates. 10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 11 | 12 | from __future__ import absolute_import 13 | from __future__ import print_function 14 | from __future__ import division 15 | 16 | import time 17 | import torch 18 | import torch.nn as nn 19 | from torch.autograd import gradcheck 20 | 21 | from functions.ms_deform_attn_func import MSDeformAttnFunction, ms_deform_attn_core_pytorch 22 | 23 | 24 | N, M, D = 1, 2, 2 25 | Lq, L, P = 2, 2, 2 26 | shapes = torch.as_tensor([(6, 4), (3, 2)], dtype=torch.long).cuda() 27 | level_start_index = torch.cat((shapes.new_zeros((1, )), shapes.prod(1).cumsum(0)[:-1])) 28 | S = sum([(H*W).item() for H, W in shapes]) 29 | 30 | 31 | torch.manual_seed(3) 32 | 33 | 34 | @torch.no_grad() 35 | def check_forward_equal_with_pytorch_double(): 36 | value = torch.rand(N, S, M, D).cuda() * 0.01 37 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() 38 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 39 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) 40 | im2col_step = 2 41 | output_pytorch = ms_deform_attn_core_pytorch(value.double(), shapes, sampling_locations.double(), attention_weights.double()).detach().cpu() 42 | output_cuda = MSDeformAttnFunction.apply(value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step).detach().cpu() 43 | fwdok = torch.allclose(output_cuda, output_pytorch) 44 | max_abs_err = (output_cuda - output_pytorch).abs().max() 45 | max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max() 46 | 47 | print(f'* {fwdok} check_forward_equal_with_pytorch_double: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}') 48 | 49 | 50 | @torch.no_grad() 51 | def check_forward_equal_with_pytorch_float(): 52 | value = torch.rand(N, S, M, D).cuda() * 0.01 53 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() 54 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 55 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) 56 | im2col_step = 2 57 | output_pytorch = ms_deform_attn_core_pytorch(value, shapes, sampling_locations, attention_weights).detach().cpu() 58 | output_cuda = MSDeformAttnFunction.apply(value, shapes, 
level_start_index, sampling_locations, attention_weights, im2col_step).detach().cpu() 59 | fwdok = torch.allclose(output_cuda, output_pytorch, rtol=1e-2, atol=1e-3) 60 | max_abs_err = (output_cuda - output_pytorch).abs().max() 61 | max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max() 62 | 63 | print(f'* {fwdok} check_forward_equal_with_pytorch_float: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}') 64 | 65 | 66 | def check_gradient_numerical(channels=4, grad_value=True, grad_sampling_loc=True, grad_attn_weight=True): 67 | 68 | value = torch.rand(N, S, M, channels).cuda() * 0.01 69 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() 70 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 71 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) 72 | im2col_step = 2 73 | func = MSDeformAttnFunction.apply 74 | 75 | value.requires_grad = grad_value 76 | sampling_locations.requires_grad = grad_sampling_loc 77 | attention_weights.requires_grad = grad_attn_weight 78 | 79 | gradok = gradcheck(func, (value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step)) 80 | 81 | print(f'* {gradok} check_gradient_numerical(D={channels})') 82 | 83 | 84 | if __name__ == '__main__': 85 | check_forward_equal_with_pytorch_double() 86 | check_forward_equal_with_pytorch_float() 87 | 88 | for channels in [30, 32, 64, 71, 1025, 2048, 3096]: 89 | check_gradient_numerical(channels, True, True, True) 90 | 91 | 92 | 93 | -------------------------------------------------------------------------------- /segmentation/mask2former/modeling/transformer_decoder/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from .maskformer_transformer_decoder import StandardTransformerDecoder 3 | from .mask2former_transformer_decoder import MultiScaleMaskedTransformerDecoder 4 | -------------------------------------------------------------------------------- /segmentation/mask2former/modeling/transformer_decoder/position_encoding.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # # Modified by Bowen Cheng from: https://github.com/facebookresearch/detr/blob/master/models/position_encoding.py 3 | """ 4 | Various positional encodings for the transformer. 5 | """ 6 | import math 7 | 8 | import torch 9 | from torch import nn 10 | 11 | 12 | class PositionEmbeddingSine(nn.Module): 13 | """ 14 | This is a more standard version of the position embedding, very similar to the one 15 | used by the Attention is all you need paper, generalized to work on images. 
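    For a pixel coordinate p (optionally normalized to [0, scale]) and channel pair
    (2i, 2i + 1), forward() computes sin(p / T^(2i/d)) and cos(p / T^(2i/d)), where
    T = temperature and d = num_pos_feats; the y- and x-axis embeddings are then
    concatenated along the channel dimension.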
16 | """ 17 | 18 | def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None): 19 | super().__init__() 20 | self.num_pos_feats = num_pos_feats 21 | self.temperature = temperature 22 | self.normalize = normalize 23 | if scale is not None and normalize is False: 24 | raise ValueError("normalize should be True if scale is passed") 25 | if scale is None: 26 | scale = 2 * math.pi 27 | self.scale = scale 28 | 29 | def forward(self, x, mask=None): 30 | if mask is None: 31 | mask = torch.zeros((x.size(0), x.size(2), x.size(3)), device=x.device, dtype=torch.bool) 32 | not_mask = ~mask 33 | y_embed = not_mask.cumsum(1, dtype=torch.float32) 34 | x_embed = not_mask.cumsum(2, dtype=torch.float32) 35 | if self.normalize: 36 | eps = 1e-6 37 | y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale 38 | x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale 39 | 40 | dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device) 41 | dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats) 42 | 43 | pos_x = x_embed[:, :, :, None] / dim_t 44 | pos_y = y_embed[:, :, :, None] / dim_t 45 | pos_x = torch.stack( 46 | (pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4 47 | ).flatten(3) 48 | pos_y = torch.stack( 49 | (pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4 50 | ).flatten(3) 51 | pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) 52 | return pos 53 | 54 | def __repr__(self, _repr_indent=4): 55 | head = "Positional encoding " + self.__class__.__name__ 56 | body = [ 57 | "num_pos_feats: {}".format(self.num_pos_feats), 58 | "temperature: {}".format(self.temperature), 59 | "normalize: {}".format(self.normalize), 60 | "scale: {}".format(self.scale), 61 | ] 62 | # _repr_indent = 4 63 | lines = [head] + [" " * _repr_indent + line for line in body] 64 | return "\n".join(lines) 65 | -------------------------------------------------------------------------------- /segmentation/mask2former/test_time_augmentation.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | import copy 3 | import logging 4 | from itertools import count 5 | 6 | import numpy as np 7 | import torch 8 | from fvcore.transforms import HFlipTransform 9 | from torch import nn 10 | from torch.nn.parallel import DistributedDataParallel 11 | 12 | from detectron2.data.detection_utils import read_image 13 | from detectron2.modeling import DatasetMapperTTA 14 | 15 | 16 | __all__ = [ 17 | "SemanticSegmentorWithTTA", 18 | ] 19 | 20 | 21 | class SemanticSegmentorWithTTA(nn.Module): 22 | """ 23 | A SemanticSegmentor with test-time augmentation enabled. 24 | Its :meth:`__call__` method has the same interface as :meth:`SemanticSegmentor.forward`. 25 | """ 26 | 27 | def __init__(self, cfg, model, tta_mapper=None, batch_size=1): 28 | """ 29 | Args: 30 | cfg (CfgNode): 31 | model (SemanticSegmentor): a SemanticSegmentor to apply TTA on. 32 | tta_mapper (callable): takes a dataset dict and returns a list of 33 | augmented versions of the dataset dict. Defaults to 34 | `DatasetMapperTTA(cfg)`. 35 | batch_size (int): batch the augmented images into this batch size for inference. 
36 | """ 37 | super().__init__() 38 | if isinstance(model, DistributedDataParallel): 39 | model = model.module 40 | self.cfg = cfg.clone() 41 | 42 | self.model = model 43 | 44 | if tta_mapper is None: 45 | tta_mapper = DatasetMapperTTA(cfg) 46 | self.tta_mapper = tta_mapper 47 | self.batch_size = batch_size 48 | 49 | def __call__(self, batched_inputs): 50 | """ 51 | Same input/output format as :meth:`SemanticSegmentor.forward` 52 | """ 53 | 54 | def _maybe_read_image(dataset_dict): 55 | ret = copy.copy(dataset_dict) 56 | if "image" not in ret: 57 | image = read_image(ret.pop("file_name"), self.model.input_format) 58 | image = torch.from_numpy(np.ascontiguousarray(image.transpose(2, 0, 1))) # CHW 59 | ret["image"] = image 60 | if "height" not in ret and "width" not in ret: 61 | ret["height"] = image.shape[1] 62 | ret["width"] = image.shape[2] 63 | return ret 64 | 65 | processed_results = [] 66 | for x in batched_inputs: 67 | result = self._inference_one_image(_maybe_read_image(x)) 68 | processed_results.append(result) 69 | return processed_results 70 | 71 | def _inference_one_image(self, input): 72 | """ 73 | Args: 74 | input (dict): one dataset dict with "image" field being a CHW tensor 75 | Returns: 76 | dict: one output dict 77 | """ 78 | orig_shape = (input["height"], input["width"]) 79 | augmented_inputs, tfms = self._get_augmented_inputs(input) 80 | 81 | final_predictions = None 82 | count_predictions = 0 83 | for input, tfm in zip(augmented_inputs, tfms): 84 | count_predictions += 1 85 | with torch.no_grad(): 86 | if final_predictions is None: 87 | if any(isinstance(t, HFlipTransform) for t in tfm.transforms): 88 | final_predictions = self.model([input])[0].pop("sem_seg").flip(dims=[2]) 89 | else: 90 | final_predictions = self.model([input])[0].pop("sem_seg") 91 | else: 92 | if any(isinstance(t, HFlipTransform) for t in tfm.transforms): 93 | final_predictions += self.model([input])[0].pop("sem_seg").flip(dims=[2]) 94 | else: 95 | final_predictions += self.model([input])[0].pop("sem_seg") 96 | 97 | final_predictions = final_predictions / count_predictions 98 | return {"sem_seg": final_predictions} 99 | 100 | def _get_augmented_inputs(self, input): 101 | augmented_inputs = self.tta_mapper(input) 102 | tfms = [x.pop("transforms") for x in augmented_inputs] 103 | return augmented_inputs, tfms 104 | -------------------------------------------------------------------------------- /segmentation/mask2former/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /segmentation/mask2former/utils/misc.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # Modified by Bowen Cheng from https://github.com/facebookresearch/detr/blob/master/util/misc.py 3 | """ 4 | Misc functions, including distributed helpers. 5 | 6 | Mostly copy-paste from torchvision references. 
7 | """ 8 | from typing import List, Optional 9 | 10 | import torch 11 | import torch.distributed as dist 12 | import torchvision 13 | from torch import Tensor 14 | 15 | 16 | def _max_by_axis(the_list): 17 | # type: (List[List[int]]) -> List[int] 18 | maxes = the_list[0] 19 | for sublist in the_list[1:]: 20 | for index, item in enumerate(sublist): 21 | maxes[index] = max(maxes[index], item) 22 | return maxes 23 | 24 | 25 | class NestedTensor(object): 26 | def __init__(self, tensors, mask: Optional[Tensor]): 27 | self.tensors = tensors 28 | self.mask = mask 29 | 30 | def to(self, device): 31 | # type: (Device) -> NestedTensor # noqa 32 | cast_tensor = self.tensors.to(device) 33 | mask = self.mask 34 | if mask is not None: 35 | assert mask is not None 36 | cast_mask = mask.to(device) 37 | else: 38 | cast_mask = None 39 | return NestedTensor(cast_tensor, cast_mask) 40 | 41 | def decompose(self): 42 | return self.tensors, self.mask 43 | 44 | def __repr__(self): 45 | return str(self.tensors) 46 | 47 | 48 | def nested_tensor_from_tensor_list(tensor_list: List[Tensor]): 49 | # TODO make this more general 50 | if tensor_list[0].ndim == 3: 51 | if torchvision._is_tracing(): 52 | # nested_tensor_from_tensor_list() does not export well to ONNX 53 | # call _onnx_nested_tensor_from_tensor_list() instead 54 | return _onnx_nested_tensor_from_tensor_list(tensor_list) 55 | 56 | # TODO make it support different-sized images 57 | max_size = _max_by_axis([list(img.shape) for img in tensor_list]) 58 | # min_size = tuple(min(s) for s in zip(*[img.shape for img in tensor_list])) 59 | batch_shape = [len(tensor_list)] + max_size 60 | b, c, h, w = batch_shape 61 | dtype = tensor_list[0].dtype 62 | device = tensor_list[0].device 63 | tensor = torch.zeros(batch_shape, dtype=dtype, device=device) 64 | mask = torch.ones((b, h, w), dtype=torch.bool, device=device) 65 | for img, pad_img, m in zip(tensor_list, tensor, mask): 66 | pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) 67 | m[: img.shape[1], : img.shape[2]] = False 68 | else: 69 | raise ValueError("not supported") 70 | return NestedTensor(tensor, mask) 71 | 72 | 73 | # _onnx_nested_tensor_from_tensor_list() is an implementation of 74 | # nested_tensor_from_tensor_list() that is supported by ONNX tracing. 
75 | @torch.jit.unused 76 | def _onnx_nested_tensor_from_tensor_list(tensor_list: List[Tensor]) -> NestedTensor: 77 | max_size = [] 78 | for i in range(tensor_list[0].dim()): 79 | max_size_i = torch.max( 80 | torch.stack([img.shape[i] for img in tensor_list]).to(torch.float32) 81 | ).to(torch.int64) 82 | max_size.append(max_size_i) 83 | max_size = tuple(max_size) 84 | 85 | # work around for 86 | # pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) 87 | # m[: img.shape[1], :img.shape[2]] = False 88 | # which is not yet supported in onnx 89 | padded_imgs = [] 90 | padded_masks = [] 91 | for img in tensor_list: 92 | padding = [(s1 - s2) for s1, s2 in zip(max_size, tuple(img.shape))] 93 | padded_img = torch.nn.functional.pad(img, (0, padding[2], 0, padding[1], 0, padding[0])) 94 | padded_imgs.append(padded_img) 95 | 96 | m = torch.zeros_like(img[0], dtype=torch.int, device=img.device) 97 | padded_mask = torch.nn.functional.pad(m, (0, padding[2], 0, padding[1]), "constant", 1) 98 | padded_masks.append(padded_mask.to(torch.bool)) 99 | 100 | tensor = torch.stack(padded_imgs) 101 | mask = torch.stack(padded_masks) 102 | 103 | return NestedTensor(tensor, mask=mask) 104 | 105 | 106 | def is_dist_avail_and_initialized(): 107 | if not dist.is_available(): 108 | return False 109 | if not dist.is_initialized(): 110 | return False 111 | return True 112 | -------------------------------------------------------------------------------- /segmentation/mask2former_video/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from . import modeling 3 | 4 | # config 5 | from .config import add_maskformer2_video_config 6 | 7 | # models 8 | from .video_maskformer_model import VideoMaskFormer 9 | 10 | # video 11 | from .data_video import ( 12 | YTVISDatasetMapper, 13 | YTVISEvaluator, 14 | build_detection_train_loader, 15 | build_detection_test_loader, 16 | get_detection_dataset_dicts, 17 | ) 18 | -------------------------------------------------------------------------------- /segmentation/mask2former_video/config.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | from detectron2.config import CfgNode as CN 4 | 5 | 6 | def add_maskformer2_video_config(cfg): 7 | # video data 8 | # DataLoader 9 | cfg.INPUT.SAMPLING_FRAME_NUM = 2 10 | cfg.INPUT.SAMPLING_FRAME_RANGE = 20 11 | cfg.INPUT.SAMPLING_FRAME_SHUFFLE = False 12 | cfg.INPUT.AUGMENTATIONS = [] # "brightness", "contrast", "saturation", "rotation" 13 | -------------------------------------------------------------------------------- /segmentation/mask2former_video/data_video/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # Modified by Bowen Cheng from https://github.com/sukjunhwang/IFC 3 | 4 | from .dataset_mapper import YTVISDatasetMapper, CocoClipDatasetMapper 5 | from .build import * 6 | 7 | from .datasets import * 8 | from .ytvis_eval import YTVISEvaluator 9 | -------------------------------------------------------------------------------- /segmentation/mask2former_video/data_video/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # Modified by Bowen Cheng from https://github.com/sukjunhwang/IFC 3 | 4 | from . 
import builtin # ensure the builtin datasets are registered 5 | 6 | __all__ = [k for k in globals().keys() if "builtin" not in k and not k.startswith("_")] 7 | -------------------------------------------------------------------------------- /segmentation/mask2former_video/data_video/datasets/builtin.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # Modified by Bowen Cheng from https://github.com/sukjunhwang/IFC 3 | 4 | import os 5 | 6 | from .ytvis import ( 7 | register_ytvis_instances, 8 | _get_ytvis_2019_instances_meta, 9 | _get_ytvis_2021_instances_meta, 10 | ) 11 | 12 | # ==== Predefined splits for YTVIS 2019 =========== 13 | _PREDEFINED_SPLITS_YTVIS_2019 = { 14 | "ytvis_2019_train": ("ytvis_2019/train/JPEGImages", 15 | "ytvis_2019/train.json"), 16 | "ytvis_2019_val": ("ytvis_2019/valid/JPEGImages", 17 | "ytvis_2019/valid.json"), 18 | "ytvis_2019_test": ("ytvis_2019/test/JPEGImages", 19 | "ytvis_2019/test.json"), 20 | } 21 | 22 | 23 | # ==== Predefined splits for YTVIS 2021 =========== 24 | _PREDEFINED_SPLITS_YTVIS_2021 = { 25 | "ytvis_2021_train": ("ytvis_2021/train/JPEGImages", 26 | "ytvis_2021/train.json"), 27 | "ytvis_2021_val": ("ytvis_2021/valid/JPEGImages", 28 | "ytvis_2021/valid.json"), 29 | "ytvis_2021_test": ("ytvis_2021/test/JPEGImages", 30 | "ytvis_2021/test.json"), 31 | } 32 | 33 | 34 | def register_all_ytvis_2019(root): 35 | for key, (image_root, json_file) in _PREDEFINED_SPLITS_YTVIS_2019.items(): 36 | # Assume pre-defined datasets live in `./datasets`. 37 | register_ytvis_instances( 38 | key, 39 | _get_ytvis_2019_instances_meta(), 40 | os.path.join(root, json_file) if "://" not in json_file else json_file, 41 | os.path.join(root, image_root), 42 | ) 43 | 44 | 45 | def register_all_ytvis_2021(root): 46 | for key, (image_root, json_file) in _PREDEFINED_SPLITS_YTVIS_2021.items(): 47 | # Assume pre-defined datasets live in `./datasets`. 48 | register_ytvis_instances( 49 | key, 50 | _get_ytvis_2021_instances_meta(), 51 | os.path.join(root, json_file) if "://" not in json_file else json_file, 52 | os.path.join(root, image_root), 53 | ) 54 | 55 | 56 | if __name__.endswith(".builtin"): 57 | # Assume pre-defined datasets live in `./datasets`. 58 | _root = os.getenv("DETECTRON2_DATASETS", "datasets") 59 | register_all_ytvis_2019(_root) 60 | register_all_ytvis_2021(_root) 61 | -------------------------------------------------------------------------------- /segmentation/mask2former_video/data_video/datasets/ytvis_api/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # Modified by Bowen Cheng from https://github.com/youtubevos/cocoapi 3 | -------------------------------------------------------------------------------- /segmentation/mask2former_video/modeling/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from .transformer_decoder.video_mask2former_transformer_decoder import VideoMultiScaleMaskedTransformerDecoder 3 | -------------------------------------------------------------------------------- /segmentation/mask2former_video/modeling/transformer_decoder/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
2 | from .video_mask2former_transformer_decoder import VideoMultiScaleMaskedTransformerDecoder 3 | -------------------------------------------------------------------------------- /segmentation/mask2former_video/modeling/transformer_decoder/position_encoding.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # # Modified by Bowen Cheng from: https://github.com/facebookresearch/detr/blob/master/models/position_encoding.py 3 | """ 4 | Various positional encodings for the transformer. 5 | """ 6 | import math 7 | 8 | import torch 9 | from torch import nn 10 | 11 | 12 | class PositionEmbeddingSine3D(nn.Module): 13 | """ 14 | This is a more standard version of the position embedding, very similar to the one 15 | used by the Attention is all you need paper, generalized to work on images. 16 | """ 17 | 18 | def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None): 19 | super().__init__() 20 | self.num_pos_feats = num_pos_feats 21 | self.temperature = temperature 22 | self.normalize = normalize 23 | if scale is not None and normalize is False: 24 | raise ValueError("normalize should be True if scale is passed") 25 | if scale is None: 26 | scale = 2 * math.pi 27 | self.scale = scale 28 | 29 | def forward(self, x, mask=None): 30 | # b, t, c, h, w 31 | assert x.dim() == 5, f"{x.shape} should be a 5-dimensional Tensor, got {x.dim()}-dimensional Tensor instead" 32 | if mask is None: 33 | mask = torch.zeros((x.size(0), x.size(1), x.size(3), x.size(4)), device=x.device, dtype=torch.bool) 34 | not_mask = ~mask 35 | z_embed = not_mask.cumsum(1, dtype=torch.float32) 36 | y_embed = not_mask.cumsum(2, dtype=torch.float32) 37 | x_embed = not_mask.cumsum(3, dtype=torch.float32) 38 | if self.normalize: 39 | eps = 1e-6 40 | z_embed = z_embed / (z_embed[:, -1:, :, :] + eps) * self.scale 41 | y_embed = y_embed / (y_embed[:, :, -1:, :] + eps) * self.scale 42 | x_embed = x_embed / (x_embed[:, :, :, -1:] + eps) * self.scale 43 | 44 | dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device) 45 | dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats) 46 | 47 | dim_t_z = torch.arange((self.num_pos_feats * 2), dtype=torch.float32, device=x.device) 48 | dim_t_z = self.temperature ** (2 * (dim_t_z // 2) / (self.num_pos_feats * 2)) 49 | 50 | pos_x = x_embed[:, :, :, :, None] / dim_t 51 | pos_y = y_embed[:, :, :, :, None] / dim_t 52 | pos_z = z_embed[:, :, :, :, None] / dim_t_z 53 | pos_x = torch.stack((pos_x[:, :, :, :, 0::2].sin(), pos_x[:, :, :, :, 1::2].cos()), dim=5).flatten(4) 54 | pos_y = torch.stack((pos_y[:, :, :, :, 0::2].sin(), pos_y[:, :, :, :, 1::2].cos()), dim=5).flatten(4) 55 | pos_z = torch.stack((pos_z[:, :, :, :, 0::2].sin(), pos_z[:, :, :, :, 1::2].cos()), dim=5).flatten(4) 56 | pos = (torch.cat((pos_y, pos_x), dim=4) + pos_z).permute(0, 1, 4, 2, 3) # b, t, c, h, w 57 | return pos 58 | -------------------------------------------------------------------------------- /segmentation/mask2former_video/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /segmentation/mask2former_video/utils/memory.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
2 | 3 | import logging 4 | from contextlib import contextmanager 5 | from functools import wraps 6 | import torch 7 | from torch.cuda.amp import autocast 8 | 9 | __all__ = ["retry_if_cuda_oom"] 10 | 11 | 12 | @contextmanager 13 | def _ignore_torch_cuda_oom(): 14 | """ 15 | A context which ignores CUDA OOM exception from pytorch. 16 | """ 17 | try: 18 | yield 19 | except RuntimeError as e: 20 | # NOTE: the string may change? 21 | if "CUDA out of memory. " in str(e): 22 | pass 23 | else: 24 | raise 25 | 26 | 27 | def retry_if_cuda_oom(func): 28 | """ 29 | Makes a function retry itself after encountering 30 | pytorch's CUDA OOM error. 31 | It will first retry after calling `torch.cuda.empty_cache()`. 32 | If that still fails, it will then retry by trying to convert inputs to CPUs. 33 | In this case, it expects the function to dispatch to CPU implementation. 34 | The return values may become CPU tensors as well and it's user's 35 | responsibility to convert it back to CUDA tensor if needed. 36 | Args: 37 | func: a stateless callable that takes tensor-like objects as arguments 38 | Returns: 39 | a callable which retries `func` if OOM is encountered. 40 | Examples: 41 | :: 42 | output = retry_if_cuda_oom(some_torch_function)(input1, input2) 43 | # output may be on CPU even if inputs are on GPU 44 | Note: 45 | 1. When converting inputs to CPU, it will only look at each argument and check 46 | if it has `.device` and `.to` for conversion. Nested structures of tensors 47 | are not supported. 48 | 2. Since the function might be called more than once, it has to be 49 | stateless. 50 | """ 51 | 52 | def maybe_to_cpu(x): 53 | try: 54 | like_gpu_tensor = x.device.type == "cuda" and hasattr(x, "to") 55 | except AttributeError: 56 | like_gpu_tensor = False 57 | if like_gpu_tensor: 58 | return x.to(device="cpu").to(torch.float32) 59 | else: 60 | return x 61 | 62 | @wraps(func) 63 | def wrapped(*args, **kwargs): 64 | with _ignore_torch_cuda_oom(): 65 | return func(*args, **kwargs) 66 | 67 | # Clear cache and retry 68 | torch.cuda.empty_cache() 69 | with _ignore_torch_cuda_oom(): 70 | return func(*args, **kwargs) 71 | 72 | # Try on CPU. This slows down the code significantly, therefore print a notice. 
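        # Note that maybe_to_cpu also casts tensors to float32, and autocast is disabled
        # below so the CPU fallback runs in full precision.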
73 | logger = logging.getLogger(__name__) 74 | logger.info("Attempting to copy inputs to CPU due to CUDA OOM") 75 | new_args = (maybe_to_cpu(x) for x in args) 76 | new_kwargs = {k: maybe_to_cpu(v) for k, v in kwargs.items()} 77 | with autocast(enabled=False): 78 | return func(*new_args, **new_kwargs) 79 | 80 | return wrapped 81 | -------------------------------------------------------------------------------- /segmentation/predict.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.insert(0, "Mask2Former") 3 | import tempfile 4 | from pathlib import Path 5 | import numpy as np 6 | import cv2 7 | import cog 8 | 9 | # import some common detectron2 utilities 10 | from detectron2.config import CfgNode as CN 11 | from detectron2.engine import DefaultPredictor 12 | from detectron2.config import get_cfg 13 | from detectron2.utils.visualizer import Visualizer, ColorMode 14 | from detectron2.data import MetadataCatalog 15 | from detectron2.projects.deeplab import add_deeplab_config 16 | 17 | # import Mask2Former project 18 | from mask2former import add_maskformer2_config 19 | 20 | 21 | class Predictor(cog.Predictor): 22 | def setup(self): 23 | cfg = get_cfg() 24 | add_deeplab_config(cfg) 25 | add_maskformer2_config(cfg) 26 | cfg.merge_from_file("Mask2Former/configs/coco/panoptic-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_100ep.yaml") 27 | cfg.MODEL.WEIGHTS = 'model_final_f07440.pkl' 28 | cfg.MODEL.MASK_FORMER.TEST.SEMANTIC_ON = True 29 | cfg.MODEL.MASK_FORMER.TEST.INSTANCE_ON = True 30 | cfg.MODEL.MASK_FORMER.TEST.PANOPTIC_ON = True 31 | self.predictor = DefaultPredictor(cfg) 32 | self.coco_metadata = MetadataCatalog.get("coco_2017_val_panoptic") 33 | 34 | 35 | @cog.input( 36 | "image", 37 | type=Path, 38 | help="Input image for segmentation. Output will be the concatenation of Panoptic segmentation (top), " 39 | "instance segmentation (middle), and semantic segmentation (bottom).", 40 | ) 41 | def predict(self, image): 42 | im = cv2.imread(str(image)) 43 | outputs = self.predictor(im) 44 | v = Visualizer(im[:, :, ::-1], self.coco_metadata, scale=1.2, instance_mode=ColorMode.IMAGE_BW) 45 | panoptic_result = v.draw_panoptic_seg(outputs["panoptic_seg"][0].to("cpu"), 46 | outputs["panoptic_seg"][1]).get_image() 47 | v = Visualizer(im[:, :, ::-1], self.coco_metadata, scale=1.2, instance_mode=ColorMode.IMAGE_BW) 48 | instance_result = v.draw_instance_predictions(outputs["instances"].to("cpu")).get_image() 49 | v = Visualizer(im[:, :, ::-1], self.coco_metadata, scale=1.2, instance_mode=ColorMode.IMAGE_BW) 50 | semantic_result = v.draw_sem_seg(outputs["sem_seg"].argmax(0).to("cpu")).get_image() 51 | result = np.concatenate((panoptic_result, instance_result, semantic_result), axis=0)[:, :, ::-1] 52 | out_path = Path(tempfile.mkdtemp()) / "out.png" 53 | cv2.imwrite(str(out_path), result) 54 | return out_path 55 | -------------------------------------------------------------------------------- /segmentation/requirements.txt: -------------------------------------------------------------------------------- 1 | cython 2 | scipy 3 | shapely 4 | timm 5 | h5py 6 | submitit 7 | scikit-image 8 | -------------------------------------------------------------------------------- /segmentation/tools/README.md: -------------------------------------------------------------------------------- 1 | This directory contains few tools for MaskFormer. 2 | 3 | * `convert-torchvision-to-d2.py` 4 | 5 | Tool to convert torchvision pre-trained weights for D2. 
6 | 7 | ``` 8 | wget https://download.pytorch.org/models/resnet101-63fe2227.pth 9 | python tools/convert-torchvision-to-d2.py resnet101-63fe2227.pth R-101.pkl 10 | ``` 11 | 12 | * `convert-pretrained-swin-model-to-d2.py` 13 | 14 | Tool to convert Swin Transformer pre-trained weights for D2. 15 | 16 | ``` 17 | pip install timm 18 | 19 | wget https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_tiny_patch4_window7_224.pth 20 | python tools/convert-pretrained-swin-model-to-d2.py swin_tiny_patch4_window7_224.pth swin_tiny_patch4_window7_224.pkl 21 | 22 | wget https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_small_patch4_window7_224.pth 23 | python tools/convert-pretrained-swin-model-to-d2.py swin_small_patch4_window7_224.pth swin_small_patch4_window7_224.pkl 24 | 25 | wget https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_base_patch4_window12_384_22k.pth 26 | python tools/convert-pretrained-swin-model-to-d2.py swin_base_patch4_window12_384_22k.pth swin_base_patch4_window12_384_22k.pkl 27 | 28 | wget https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_large_patch4_window12_384_22k.pth 29 | python tools/convert-pretrained-swin-model-to-d2.py swin_large_patch4_window12_384_22k.pth swin_large_patch4_window12_384_22k.pkl 30 | ``` 31 | 32 | * `evaluate_pq_for_semantic_segmentation.py` 33 | 34 | Tool to evaluate PQ (PQ-stuff) for semantic segmentation predictions. 35 | 36 | Usage: 37 | 38 | ``` 39 | python tools/evaluate_pq_for_semantic_segmentation.py --dataset-name ade20k_sem_seg_val --json-file OUTPUT_DIR/inference/sem_seg_predictions.json 40 | ``` 41 | 42 | where `OUTPUT_DIR` is set in the config file. 43 | 44 | * `evaluate_coco_boundary_ap.py` 45 | 46 | Tool to evaluate Boundary AP for instance segmentation predictions. 47 | 48 | Usage: 49 | 50 | ``` 51 | python tools/coco_instance_evaluation.py --gt-json-file COCO_GT_JSON --dt-json-file COCO_DT_JSON 52 | ``` 53 | 54 | To install Boundary IoU API, run: 55 | 56 | ``` 57 | pip install git+https://github.com/bowenc0221/boundary-iou-api.git 58 | ``` 59 | 60 | * `analyze_model.py` 61 | 62 | Tool to analyze model parameters and flops. 63 | 64 | Usage for semantic segmentation (ADE20K only, use with caution!): 65 | 66 | ``` 67 | python tools/analyze_model.py --num-inputs 1 --tasks flop --use-fixed-input-size --config-file CONFIG_FILE 68 | ``` 69 | 70 | Note that, for semantic segmentation (ADE20K only), we use a dummy image with fixed size that equals to `cfg.INPUT.CROP.SIZE[0] x cfg.INPUT.CROP.SIZE[0]`. 71 | Please do not use `--use-fixed-input-size` for calculating FLOPs on other datasets like Cityscapes! 72 | 73 | Usage for panoptic and instance segmentation: 74 | 75 | ``` 76 | python tools/analyze_model.py --num-inputs 100 --tasks flop --config-file CONFIG_FILE 77 | ``` 78 | 79 | Note that, for panoptic and instance segmentation, we compute the average flops over 100 real validation images. 80 | -------------------------------------------------------------------------------- /segmentation/tools/convert-pretrained-swin-model-to-d2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved 3 | 4 | import pickle as pkl 5 | import sys 6 | 7 | import torch 8 | 9 | """ 10 | Usage: 11 | # download pretrained swin model: 12 | wget https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_tiny_patch4_window7_224.pth 13 | # run the conversion 14 | ./convert-pretrained-model-to-d2.py swin_tiny_patch4_window7_224.pth swin_tiny_patch4_window7_224.pkl 15 | # Then, use swin_tiny_patch4_window7_224.pkl with the following changes in config: 16 | MODEL: 17 | WEIGHTS: "/path/to/swin_tiny_patch4_window7_224.pkl" 18 | INPUT: 19 | FORMAT: "RGB" 20 | """ 21 | 22 | if __name__ == "__main__": 23 | input = sys.argv[1] 24 | 25 | obj = torch.load(input, map_location="cpu")["model"] 26 | 27 | res = {"model": obj, "__author__": "third_party", "matching_heuristics": True} 28 | 29 | with open(sys.argv[2], "wb") as f: 30 | pkl.dump(res, f) 31 | -------------------------------------------------------------------------------- /segmentation/tools/convert-torchvision-to-d2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | 4 | import pickle as pkl 5 | import sys 6 | 7 | import torch 8 | 9 | """ 10 | Usage: 11 | # download one of the ResNet{18,34,50,101,152} models from torchvision: 12 | wget https://download.pytorch.org/models/resnet50-19c8e357.pth -O r50.pth 13 | # run the conversion 14 | ./convert-torchvision-to-d2.py r50.pth r50.pkl 15 | # Then, use r50.pkl with the following changes in config: 16 | MODEL: 17 | WEIGHTS: "/path/to/r50.pkl" 18 | PIXEL_MEAN: [123.675, 116.280, 103.530] 19 | PIXEL_STD: [58.395, 57.120, 57.375] 20 | RESNETS: 21 | DEPTH: 50 22 | STRIDE_IN_1X1: False 23 | INPUT: 24 | FORMAT: "RGB" 25 | """ 26 | 27 | if __name__ == "__main__": 28 | input = sys.argv[1] 29 | 30 | obj = torch.load(input, map_location="cpu") 31 | 32 | newmodel = {} 33 | for k in list(obj.keys()): 34 | old_k = k 35 | if "layer" not in k: 36 | k = "stem." + k 37 | for t in [1, 2, 3, 4]: 38 | k = k.replace("layer{}".format(t), "res{}".format(t + 1)) 39 | for t in [1, 2, 3]: 40 | k = k.replace("bn{}".format(t), "conv{}.norm".format(t)) 41 | k = k.replace("downsample.0", "shortcut") 42 | k = k.replace("downsample.1", "shortcut.norm") 43 | print(old_k, "->", k) 44 | newmodel[k] = obj.pop(old_k).detach().numpy() 45 | 46 | res = {"model": newmodel, "__author__": "torchvision", "matching_heuristics": True} 47 | 48 | with open(sys.argv[2], "wb") as f: 49 | pkl.dump(res, f) 50 | if obj: 51 | print("Unconverted keys:", obj.keys()) 52 | -------------------------------------------------------------------------------- /segmentation/tools/evaluate_coco_boundary_ap.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved 3 | # Modified by Bowen Cheng from: https://github.com/bowenc0221/boundary-iou-api/blob/master/tools/coco_instance_evaluation.py 4 | 5 | """ 6 | Evaluation for COCO val2017: 7 | python ./tools/coco_instance_evaluation.py \ 8 | --gt-json-file COCO_GT_JSON \ 9 | --dt-json-file COCO_DT_JSON 10 | """ 11 | import argparse 12 | import json 13 | 14 | from boundary_iou.coco_instance_api.coco import COCO 15 | from boundary_iou.coco_instance_api.cocoeval import COCOeval 16 | 17 | 18 | def main(): 19 | parser = argparse.ArgumentParser() 20 | parser.add_argument("--gt-json-file", default="") 21 | parser.add_argument("--dt-json-file", default="") 22 | parser.add_argument("--iou-type", default="boundary") 23 | parser.add_argument("--dilation-ratio", default="0.020", type=float) 24 | args = parser.parse_args() 25 | print(args) 26 | 27 | annFile = args.gt_json_file 28 | resFile = args.dt_json_file 29 | dilation_ratio = args.dilation_ratio 30 | if args.iou_type == "boundary": 31 | get_boundary = True 32 | else: 33 | get_boundary = False 34 | cocoGt = COCO(annFile, get_boundary=get_boundary, dilation_ratio=dilation_ratio) 35 | 36 | # remove box predictions 37 | resFile = json.load(open(resFile)) 38 | for c in resFile: 39 | c.pop("bbox", None) 40 | 41 | cocoDt = cocoGt.loadRes(resFile) 42 | cocoEval = COCOeval(cocoGt, cocoDt, iouType=args.iou_type, dilation_ratio=dilation_ratio) 43 | cocoEval.evaluate() 44 | cocoEval.accumulate() 45 | cocoEval.summarize() 46 | 47 | 48 | if __name__ == '__main__': 49 | main() 50 | --------------------------------------------------------------------------------