├── util
│   ├── __init__.py
│   ├── __pycache__
│   │   ├── logger.cpython-38.pyc
│   │   ├── misc.cpython-38.pyc
│   │   ├── misc.cpython-39.pyc
│   │   ├── __init__.cpython-38.pyc
│   │   ├── __init__.cpython-39.pyc
│   │   ├── box_ops.cpython-38.pyc
│   │   ├── box_ops.cpython-39.pyc
│   │   └── visualization.cpython-38.pyc
│   ├── box_ops.py
│   ├── logger.py
│   └── visualization.py
├── pre_process
│   ├── __init__.py
│   ├── __pycache__
│   │   ├── data.cpython-38.pyc
│   │   ├── data.cpython-39.pyc
│   │   ├── __init__.cpython-38.pyc
│   │   ├── __init__.cpython-39.pyc
│   │   ├── dense_crf.cpython-39.pyc
│   │   ├── sim_model.cpython-38.pyc
│   │   ├── sim_model.cpython-39.pyc
│   │   ├── frozen_batchnorm2d.cpython-38.pyc
│   │   ├── frozen_batchnorm2d.cpython-39.pyc
│   │   └── generate_anno_a2d.cpython-39.pyc
│   ├── dense_crf.py
│   ├── frozen_batchnorm2d.py
│   ├── data.py
│   ├── generate_anno_ytvos.py
│   └── sim_model.py
├── models
│   ├── text_encoder
│   │   ├── __init__.py
│   │   ├── bpe_simple_vocab_16e6.txt.gz
│   │   ├── __pycache__
│   │   │   ├── __init__.cpython-38.pyc
│   │   │   ├── tokenizer.cpython-38.pyc
│   │   │   └── text_encoder.cpython-38.pyc
│   │   └── text_encoder.py
│   ├── ops
│   │   ├── MultiScaleDeformableAttention.egg-info
│   │   │   ├── dependency_links.txt
│   │   │   ├── top_level.txt
│   │   │   ├── PKG-INFO
│   │   │   └── SOURCES.txt
│   │   ├── modules
│   │   │   ├── __pycache__
│   │   │   │   ├── __init__.cpython-38.pyc
│   │   │   │   ├── __init__.cpython-39.pyc
│   │   │   │   ├── ms_deform_attn.cpython-38.pyc
│   │   │   │   └── ms_deform_attn.cpython-39.pyc
│   │   │   ├── __init__.py
│   │   │   └── ms_deform_attn.py
│   │   ├── functions
│   │   │   ├── __pycache__
│   │   │   │   ├── __init__.cpython-38.pyc
│   │   │   │   ├── __init__.cpython-39.pyc
│   │   │   │   ├── ms_deform_attn_func.cpython-38.pyc
│   │   │   │   └── ms_deform_attn_func.cpython-39.pyc
│   │   │   ├── __init__.py
│   │   │   └── ms_deform_attn_func.py
│   │   ├── dist
│   │   │   └── MultiScaleDeformableAttention-1.0-py3.8-linux-x86_64.egg
│   │   ├── build
│   │   │   ├── lib.linux-x86_64-cpython-38
│   │   │   │   ├── MultiScaleDeformableAttention.cpython-38-x86_64-linux-gnu.so
│   │   │   │   ├── modules
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   └── ms_deform_attn.py
│   │   │   │   └── functions
│   │   │   │       ├── __init__.py
│   │   │   │       └── ms_deform_attn_func.py
│   │   │   └── temp.linux-x86_64-cpython-38
│   │   │       └── media
│   │   │           └── HardDisk_B
│   │   │               └── Users
│   │   │                   └── wx
│   │   │                       └── wwk_files
│   │   │                           └── codes
│   │   │                               └── referring_video_segmentation
│   │   │                                   └── sgmg_ablations
│   │   │                                       └── models
│   │   │                                           └── ops
│   │   │                                               └── src
│   │   │                                                   ├── vision.o
│   │   │                                                   ├── cpu
│   │   │                                                   │   └── ms_deform_attn_cpu.o
│   │   │                                                   └── cuda
│   │   │                                                       └── ms_deform_attn_cuda.o
│   │   ├── make.sh
│   │   ├── src
│   │   │   ├── vision.cpp
│   │   │   ├── cuda
│   │   │   │   └── ms_deform_attn_cuda.h
│   │   │   ├── cpu
│   │   │   │   ├── ms_deform_attn_cpu.h
│   │   │   │   └── ms_deform_attn_cpu.cpp
│   │   │   └── ms_deform_attn.h
│   │   ├── setup.py
│   │   └── test.py
│   ├── __pycache__
│   │   ├── ocpg.cpython-38.pyc
│   │   ├── sgmg.cpython-38.pyc
│   │   ├── sgmg.cpython-39.pyc
│   │   ├── __init__.cpython-38.pyc
│   │   ├── __init__.cpython-39.pyc
│   │   ├── backbone.cpython-38.pyc
│   │   ├── backbone.cpython-39.pyc
│   │   ├── criterion.cpython-38.pyc
│   │   ├── decoder.cpython-38.pyc
│   │   ├── matcher.cpython-38.pyc
│   │   ├── modules.cpython-38.pyc
│   │   ├── postprocessors.cpython-38.pyc
│   │   ├── segmentation.cpython-38.pyc
│   │   ├── position_encoding.cpython-38.pyc
│   │   ├── position_encoding.cpython-39.pyc
│   │   ├── deformable_transformer.cpython-38.pyc
│   │   ├── deformable_transformer.cpython-39.pyc
│   │   └── video_swin_transformer.cpython-38.pyc
│   ├── __init__.py
│   ├── decoder.py
│   ├── modules.py
│   ├── backbone.py
│   └── position_encoding.py
├── tools
│   ├── colormap.py
│   └── load_pretrained_weights.py
├── davis2017
│   ├── __init__.py
│   ├── __pycache__
│   │   ├── davis.cpython-38.pyc
│   │   ├── utils.cpython-38.pyc
│   │   ├── __init__.cpython-38.pyc
│   │   ├── metrics.cpython-38.pyc
│   │   ├── results.cpython-38.pyc
│   │   └── evaluation.cpython-38.pyc
│   ├── results.py
│   ├── davis.py
│   ├── evaluation.py
│   └── utils.py
├── README.md
├── datasets
│   ├── __pycache__
│   │   ├── a2d.cpython-38.pyc
│   │   ├── a2d.cpython-39.pyc
│   │   ├── davis.cpython-38.pyc
│   │   ├── davis.cpython-39.pyc
│   │   ├── jhmdb.cpython-38.pyc
│   │   ├── jhmdb.cpython-39.pyc
│   │   ├── refexp.cpython-38.pyc
│   │   ├── refexp.cpython-39.pyc
│   │   ├── ytvos.cpython-38.pyc
│   │   ├── ytvos.cpython-39.pyc
│   │   ├── __init__.cpython-38.pyc
│   │   ├── __init__.cpython-39.pyc
│   │   ├── a2d_eval.cpython-38.pyc
│   │   ├── coco_eval.cpython-38.pyc
│   │   ├── samplers.cpython-38.pyc
│   │   ├── samplers.cpython-39.pyc
│   │   ├── categories.cpython-38.pyc
│   │   ├── categories.cpython-39.pyc
│   │   ├── refexp2seq.cpython-38.pyc
│   │   ├── refexp2seq.cpython-39.pyc
│   │   ├── refexp_eval.cpython-38.pyc
│   │   ├── concat_dataset.cpython-38.pyc
│   │   ├── concat_dataset.cpython-39.pyc
│   │   ├── transforms_image.cpython-38.pyc
│   │   ├── transforms_image.cpython-39.pyc
│   │   ├── transforms_video.cpython-38.pyc
│   │   ├── transforms_video.cpython-39.pyc
│   │   ├── image_to_seq_augmenter.cpython-38.pyc
│   │   └── image_to_seq_augmenter.cpython-39.pyc
│   ├── concat_dataset.py
│   ├── __init__.py
│   ├── refexp_eval.py
│   ├── categories.py
│   ├── a2d_eval.py
│   ├── image_to_seq_augmenter.py
│   ├── coco.py
│   ├── samplers.py
│   └── refexp.py
├── scripts
│   ├── dist_test_ytvos_resnet101_boxsup.sh
│   ├── dist_train_a2d_resnet101.sh
│   ├── dist_train_a2d_resnet101_box.sh
│   ├── dist_train_a2d_resnet101_boxsup.sh
│   ├── dist_train_a2d_resnet101_pointsup.sh
│   ├── dist_train_ytvos_resnet101.sh
│   ├── dist_train_a2d_resnet101_freeze.sh
│   ├── dist_test_ytvos_videoswinb.sh
│   ├── dist_train_a2d_resnet101_boxvos.sh
│   ├── dist_train_a2d_resnet101_boxlevelset.sh
│   ├── dist_train_a2d_resnet101_boxinst_point.sh
│   ├── dist_train_a2d_resnet101_pointsup_partialsup.sh
│   ├── dist_train_a2d_resnet101_boxlevelset_point.sh
│   ├── dist_test_a2d_resnet101.sh
│   ├── dist_test_jhmdb_resnet101.sh
│   ├── dist_train_scratch_ytvos_videoswin.sh
│   ├── dist_train_ytvos_resnet101_boxsup.sh
│   ├── dist_test_a2d_resnet101_boxsup.sh
│   ├── dist_test_a2d_resnet101_pointsup.sh
│   ├── dist_test_jhmdb_resnet101_boxsup.sh
│   ├── dist_test_jhmdb_resnet101_pointsup.sh
│   ├── dist_test_a2d_videoswint.sh
│   ├── dist_train_a2d_videoswinb.sh
│   ├── dist_train_ytvos_videoswin.sh
│   ├── dist_train_ytvos_videoswinb.sh
│   ├── dist_test_davis_resnet.sh
│   └── dist_test_davis_videoswinb.sh
├── eval_davis.py
└── utils.py
/util/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/pre_process/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/models/text_encoder/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/tools/colormap.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | 
3 | 
--------------------------------------------------------------------------------
/models/ops/MultiScaleDeformableAttention.egg-info/dependency_links.txt:
--------------------------------------------------------------------------------
1 | 
2 | 
--------------------------------------------------------------------------------
/davis2017/__init__.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import
2 | 
3 | __version__ = '0.1.0'
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # OCPG
2 | Weakly Supervised Referring Video Object Segmentation with Object-Centric Pseudo-Guidance
3 | 
--------------------------------------------------------------------------------
/models/ops/MultiScaleDeformableAttention.egg-info/top_level.txt:
--------------------------------------------------------------------------------
1 | MultiScaleDeformableAttention
2 | functions
3 | modules
--------------------------------------------------------------------------------
/models/text_encoder/bpe_simple_vocab_16e6.txt.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TJUMMG/OCPG/HEAD/models/text_encoder/bpe_simple_vocab_16e6.txt.gz
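The gzipped file above is the byte-pair-encoding merge table used by the tokenizer in models/text_encoder; the file name matches the vocabulary shipped with CLIP. A quick inspection is sketched below. Only the path comes from this repository; the skip-the-first-line convention is borrowed from CLIP's reference tokenizer and is an assumption about how OCPG's own tokenizer.py parses it.

# Hypothetical inspection of the BPE merge table; the parsing details are assumed
# from CLIP's simple_tokenizer, not taken from this repository.
import gzip

with gzip.open("models/text_encoder/bpe_simple_vocab_16e6.txt.gz", "rt", encoding="utf-8") as f:
    lines = f.read().split("\n")

merges = lines[1:]  # CLIP's vocab file starts with a header line
print(len(merges), "merge rules; first few:", merges[:3])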
--------------------------------------------------------------------------------
/models/__init__.py:
--------------------------------------------------------------------------------
1 | from .ocpg import build
2 | 
3 | def build_model(args):
4 |     print("\n **** BUILD MODEL FOR OCPG. **** \n")
5 |     return build(args)
6 | 
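models/__init__.py exposes build_model, presumably the single factory that main.py and inference_ytvos.py go through. Because it relies on a package-relative import of models/ocpg.py, it has to be imported as part of the models package from the repository root. A minimal wiring check is sketched below, assuming the repository root is the working directory and the dependencies of models/ocpg.py are installed; build_model itself is not called here because it needs the fully populated argument namespace that main.py builds.

# Hypothetical import check for the model factory; nothing here is repo code.
import importlib

models = importlib.import_module("models")
print(models.build_model.__module__, models.build_model.__name__)  # expected: models build_model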
--------------------------------------------------------------------------------
/models/ops/MultiScaleDeformableAttention.egg-info/PKG-INFO:
--------------------------------------------------------------------------------
1 | Metadata-Version: 2.1
2 | Name: MultiScaleDeformableAttention
3 | Version: 1.0
4 | Summary: PyTorch Wrapper for CUDA Functions of Multi-Scale Deformable Attention
5 | Home-page: https://github.com/fundamentalvision/Deformable-DETR
6 | Author: Weijie Su
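The egg-info above is generated when the multi-scale deformable attention extension under models/ops is built (make.sh just runs python setup.py build install). A post-install smoke test is sketched below: the importable module name comes from top_level.txt and the two operator names from models/ops/src/vision.cpp, while the assumption of a CUDA-enabled PyTorch build and the rest of the snippet are illustrative; models/ops/test.py (not shown in this dump) is presumably the fuller check.

# Hypothetical smoke test for the compiled MultiScaleDeformableAttention extension.
# Run it after `cd models/ops && bash make.sh`.
import torch
import MultiScaleDeformableAttention as MSDA  # module name from top_level.txt

print("CUDA available :", torch.cuda.is_available())
print("forward bound  :", hasattr(MSDA, "ms_deform_attn_forward"))   # registered in src/vision.cpp
print("backward bound :", hasattr(MSDA, "ms_deform_attn_backward"))  # registered in src/vision.cpp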
--------------------------------------------------------------------------------
/tools/load_pretrained_weights.py:
--------------------------------------------------------------------------------
1 | import torch
2 | 
3 | def pre_trained_model_to_finetune(checkpoint, args):
4 |     checkpoint = checkpoint['model']
5 |     # only delete the class_embed since the finetuned dataset has different num_classes
6 |     num_layers = args.dec_layers + 1 if args.two_stage else args.dec_layers
7 |     for l in range(num_layers):
8 |         del checkpoint["class_embed.{}.weight".format(l)]
9 |         del checkpoint["class_embed.{}.bias".format(l)]
10 | 
11 |     return checkpoint
12 | 
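A short usage sketch for the helper above: it strips the class_embed heads from a pretrained checkpoint so the remaining weights can be loaded into a model finetuned on a dataset with a different number of classes. The checkpoint path and the model variable are placeholders; dec_layers and two_stage are exactly the fields the helper reads.

# Hypothetical caller of pre_trained_model_to_finetune; "pretrain.pth" and "model"
# are placeholders, not names taken from this repository.
from types import SimpleNamespace
import torch

from tools.load_pretrained_weights import pre_trained_model_to_finetune

args = SimpleNamespace(dec_layers=4, two_stage=False)         # the only fields the helper touches
checkpoint = torch.load("pretrain.pth", map_location="cpu")   # expected layout: {'model': state_dict, ...}
state_dict = pre_trained_model_to_finetune(checkpoint, args)  # removes class_embed.{0..3}.{weight,bias}
# model.load_state_dict(state_dict, strict=False)             # strict=False leaves the new heads randomly initialized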
--------------------------------------------------------------------------------
/models/ops/make.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | # ------------------------------------------------------------------------------------------------
3 | # Deformable DETR
4 | # Copyright (c) 2020 SenseTime. All Rights Reserved.
5 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
6 | # ------------------------------------------------------------------------------------------------
7 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
8 | # ------------------------------------------------------------------------------------------------
9 | 
10 | python setup.py build install
--------------------------------------------------------------------------------
/models/ops/modules/__init__.py:
--------------------------------------------------------------------------------
1 | # ------------------------------------------------------------------------------------------------
2 | # Deformable DETR
3 | # Copyright (c) 2020 SenseTime. All Rights Reserved.
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
5 | # ------------------------------------------------------------------------------------------------
6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
7 | # ------------------------------------------------------------------------------------------------
8 | 
9 | from .ms_deform_attn import MSDeformAttn
10 | 
--------------------------------------------------------------------------------
/models/ops/functions/__init__.py:
--------------------------------------------------------------------------------
1 | # ------------------------------------------------------------------------------------------------
2 | # Deformable DETR
3 | # Copyright (c) 2020 SenseTime. All Rights Reserved.
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
5 | # ------------------------------------------------------------------------------------------------
6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
7 | # ------------------------------------------------------------------------------------------------
8 | 
9 | from .ms_deform_attn_func import MSDeformAttnFunction
10 | 
11 | 
--------------------------------------------------------------------------------
/models/ops/MultiScaleDeformableAttention.egg-info/SOURCES.txt:
--------------------------------------------------------------------------------
1 | setup.py
2 | /media/HardDisk_B/Users/wx/wwk_files/codes/referring_video_segmentation/sgmg_ablations/models/ops/src/vision.cpp
3 | /media/HardDisk_B/Users/wx/wwk_files/codes/referring_video_segmentation/sgmg_ablations/models/ops/src/cpu/ms_deform_attn_cpu.cpp
4 | /media/HardDisk_B/Users/wx/wwk_files/codes/referring_video_segmentation/sgmg_ablations/models/ops/src/cuda/ms_deform_attn_cuda.cu
5 | MultiScaleDeformableAttention.egg-info/PKG-INFO
6 | MultiScaleDeformableAttention.egg-info/SOURCES.txt
7 | MultiScaleDeformableAttention.egg-info/dependency_links.txt
8 | MultiScaleDeformableAttention.egg-info/top_level.txt
9 | functions/__init__.py
10 | functions/ms_deform_attn_func.py
11 | modules/__init__.py
12 | modules/ms_deform_attn.py
--------------------------------------------------------------------------------
/scripts/dist_test_ytvos_resnet101_boxsup.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | set -x
3 | 
4 | GPUS='1'
5 | GPUS_PER_NODE=1
6 | CPUS_PER_TASK=6
7 | PORT=29500
8 | export CUDA_VISIBLE_DEVICES=${GPUS}
9 | echo "using gpus ${GPUS}, master port ${PORT}."
10 | now=$(date +"%T")
11 | echo "Current time : $now"
12 | echo "Current path : $PWD"
13 | 
14 | BACKBONE="resnet101"
15 | # BACKBONE_PRETRAINED="./checkpoints/backbones/swin_base_patch244_window877_kinetics600_22k.pth"
16 | OUTPUT_DIR="./results/SgMg_${BACKBONE}_eval_ytvos_boxsup"
17 | CHECKPOINT="./results/SgMg_resnet101_scratch_ytvos_boxsup/checkpoint0009.pth"
18 | python inference_ytvos.py --with_box_refine --binary --freeze_text_encoder \
19 |     --eval \
20 |     --ngpu=${GPUS_PER_NODE} \
21 |     --output_dir=${OUTPUT_DIR} \
22 |     --resume=${CHECKPOINT} \
23 |     --backbone=${BACKBONE}
24 | 
25 | 
26 | 
--------------------------------------------------------------------------------
/scripts/dist_train_a2d_resnet101.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | set -x
3 | 
4 | GPUS='0'
5 | PORT=25503
6 | GPUS_PER_NODE=1
7 | CPUS_PER_TASK=6
8 | export CUDA_VISIBLE_DEVICES=${GPUS}
9 | echo "using gpus ${GPUS}, master port ${PORT}."
10 | now=$(date +"%T") 11 | echo "Current time : $now" 12 | echo "Current path : $PWD" 13 | 14 | BACKBONE="resnet101" 15 | OUTPUT_DIR="./results/SgMg_${BACKBONE}_scratch_a2d" 16 | EXP_NAME="SgMg_${BACKBONE}_scratch_a2d" 17 | CUDA_VISIBLE_DEVICES=${GPUS} OMP_NUM_THREADS=${CPUS_PER_TASK} torchrun --master_port ${PORT} --nproc_per_node=${GPUS_PER_NODE} main.py \ 18 | --with_box_refine --binary --freeze_text_encoder \ 19 | --exp_name=${EXP_NAME} \ 20 | --output_dir=${OUTPUT_DIR} \ 21 | --backbone=${BACKBONE} \ 22 | --dataset_file a2d \ 23 | --batch_size 2 \ 24 | --epochs 12 --lr_drop 3 5 \ 25 | 26 | 27 | -------------------------------------------------------------------------------- /scripts/dist_train_a2d_resnet101_box.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -x 3 | 4 | GPUS='0' 5 | PORT=25503 6 | GPUS_PER_NODE=1 7 | CPUS_PER_TASK=6 8 | export CUDA_VISIBLE_DEVICES=${GPUS} 9 | echo "using gpus ${GPUS}, master port ${PORT}." 10 | now=$(date +"%T") 11 | echo "Current time : $now" 12 | echo "Current path : $PWD" 13 | 14 | BACKBONE="resnet101" 15 | OUTPUT_DIR="./results/OCPG_${BACKBONE}_scratch_a2d_boxsup" 16 | EXP_NAME="OCPG_${BACKBONE}_scratch_a2d" 17 | CUDA_VISIBLE_DEVICES=${GPUS} OMP_NUM_THREADS=${CPUS_PER_TASK} torchrun --master_port ${PORT} --nproc_per_node=${GPUS_PER_NODE} main.py \ 18 | --with_box_refine --binary --freeze_text_encoder --supervision=box \ 19 | --exp_name=${EXP_NAME} \ 20 | --output_dir=${OUTPUT_DIR} \ 21 | --backbone=${BACKBONE} \ 22 | --dataset_file a2d \ 23 | --batch_size 2 \ 24 | --epochs 12 --lr_drop 3 5 25 | 26 | 27 | -------------------------------------------------------------------------------- /scripts/dist_train_a2d_resnet101_boxsup.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -x 3 | 4 | GPUS='0' 5 | PORT=25503 6 | GPUS_PER_NODE=1 7 | CPUS_PER_TASK=6 8 | export CUDA_VISIBLE_DEVICES=${GPUS} 9 | echo "using gpus ${GPUS}, master port ${PORT}." 10 | now=$(date +"%T") 11 | echo "Current time : $now" 12 | echo "Current path : $PWD" 13 | 14 | BACKBONE="resnet101" 15 | OUTPUT_DIR="./results/SgMg_${BACKBONE}_scratch_a2d_boxsup" 16 | EXP_NAME="SgMg_${BACKBONE}_scratch_a2d" 17 | CUDA_VISIBLE_DEVICES=${GPUS} OMP_NUM_THREADS=${CPUS_PER_TASK} torchrun --master_port ${PORT} --nproc_per_node=${GPUS_PER_NODE} main.py \ 18 | --with_box_refine --binary --freeze_text_encoder --supervision=box \ 19 | --exp_name=${EXP_NAME} \ 20 | --output_dir=${OUTPUT_DIR} \ 21 | --backbone=${BACKBONE} \ 22 | --dataset_file a2d \ 23 | --batch_size 2 \ 24 | --epochs 12 --lr_drop 3 5 \ 25 | 26 | 27 | -------------------------------------------------------------------------------- /models/ops/src/vision.cpp: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | #include "ms_deform_attn.h" 12 | 13 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 14 | m.def("ms_deform_attn_forward", &ms_deform_attn_forward, "ms_deform_attn_forward"); 15 | m.def("ms_deform_attn_backward", &ms_deform_attn_backward, "ms_deform_attn_backward"); 16 | } 17 | -------------------------------------------------------------------------------- /scripts/dist_train_a2d_resnet101_pointsup.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -x 3 | 4 | GPUS='0' 5 | PORT=25503 6 | GPUS_PER_NODE=1 7 | CPUS_PER_TASK=6 8 | export CUDA_VISIBLE_DEVICES=${GPUS} 9 | echo "using gpus ${GPUS}, master port ${PORT}." 10 | now=$(date +"%T") 11 | echo "Current time : $now" 12 | echo "Current path : $PWD" 13 | 14 | BACKBONE="resnet101" 15 | OUTPUT_DIR="./results/SgMg_${BACKBONE}_scratch_a2d_pointsup" 16 | EXP_NAME="SgMg_${BACKBONE}_scratch_a2d" 17 | CUDA_VISIBLE_DEVICES=${GPUS} OMP_NUM_THREADS=${CPUS_PER_TASK} torchrun --master_port ${PORT} --nproc_per_node=${GPUS_PER_NODE} main.py \ 18 | --with_box_refine --binary --freeze_text_encoder --supervision=point \ 19 | --exp_name=${EXP_NAME} \ 20 | --output_dir=${OUTPUT_DIR} \ 21 | --backbone=${BACKBONE} \ 22 | --dataset_file a2d \ 23 | --batch_size 2 \ 24 | --epochs 12 --lr_drop 3 5 \ 25 | 26 | 27 | -------------------------------------------------------------------------------- /scripts/dist_train_ytvos_resnet101.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -x 3 | 4 | GPUS='0' 5 | PORT=25500 6 | GPUS_PER_NODE=1 7 | CPUS_PER_TASK=6 8 | export CUDA_VISIBLE_DEVICES=${GPUS} 9 | echo "using gpus ${GPUS}, master port ${PORT}." 10 | now=$(date +"%T") 11 | echo "Current time : $now" 12 | echo "Current path : $PWD" 13 | 14 | BACKBONE="resnet101" 15 | # BACKBONE_PRETRAINED="./checkpoints/backbones/swin_tiny_patch244_window877_kinetics400_1k.pth" 16 | OUTPUT_DIR="./results/SgMg_${BACKBONE}_scratch_ytvos" 17 | EXP_NAME="SgMg_${BACKBONE}_scratch" 18 | CUDA_VISIBLE_DEVICES=${GPUS} OMP_NUM_THREADS=${CPUS_PER_TASK} torchrun --master_port ${PORT} --nproc_per_node=${GPUS_PER_NODE} main.py \ 19 | --with_box_refine --binary --freeze_text_encoder \ 20 | --output_dir=${OUTPUT_DIR} \ 21 | --exp_name=${EXP_NAME} \ 22 | --backbone=${BACKBONE} \ 23 | --dataset_file ytvos \ 24 | --amp -------------------------------------------------------------------------------- /scripts/dist_train_a2d_resnet101_freeze.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -x 3 | 4 | GPUS='0' 5 | PORT=25503 6 | GPUS_PER_NODE=1 7 | CPUS_PER_TASK=6 8 | export CUDA_VISIBLE_DEVICES=${GPUS} 9 | echo "using gpus ${GPUS}, master port ${PORT}." 
10 | now=$(date +"%T") 11 | echo "Current time : $now" 12 | echo "Current path : $PWD" 13 | 14 | BACKBONE="resnet101" 15 | OUTPUT_DIR="./results/results/SgMg_${BACKBONE}_scratch_a2d_freeze" 16 | EXP_NAME="SgMg_${BACKBONE}_scratch_a2d" 17 | CUDA_VISIBLE_DEVICES=${GPUS} OMP_NUM_THREADS=${CPUS_PER_TASK} torchrun --master_port ${PORT} --nproc_per_node=${GPUS_PER_NODE} main.py \ 18 | --with_box_refine --binary --freeze_text_encoder --freeze_video_encoder \ 19 | --exp_name=${EXP_NAME} \ 20 | --output_dir=${OUTPUT_DIR} \ 21 | --backbone=${BACKBONE} \ 22 | --dataset_file a2d \ 23 | --batch_size 4 \ 24 | --epochs 12 --lr_drop 3 5 \ 25 | 26 | 27 | -------------------------------------------------------------------------------- /scripts/dist_test_ytvos_videoswinb.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -x 3 | cd .. 4 | 5 | GPUS='0,1' 6 | GPUS_PER_NODE=2 7 | CPUS_PER_TASK=6 8 | PORT=29500 9 | export CUDA_VISIBLE_DEVICES=${GPUS} 10 | echo "using gpus ${GPUS}, master port ${PORT}." 11 | now=$(date +"%T") 12 | echo "Current time : $now" 13 | echo "Current path : $PWD" 14 | 15 | BACKBONE="video_swin_b_p4w7" 16 | BACKBONE_PRETRAINED="./checkpoints/backbones/swin_base_patch244_window877_kinetics600_22k.pth" 17 | OUTPUT_DIR="./checkpoints/results/SgMg_${BACKBONE}_eval" 18 | CHECKPOINT="./checkpoints/sgmg_videosiwnb_ytvos.pth" 19 | python inference_ytvos.py --with_box_refine --binary --freeze_text_encoder \ 20 | --eval \ 21 | --ngpu=${GPUS_PER_NODE} \ 22 | --output_dir=${OUTPUT_DIR} \ 23 | --resume=${CHECKPOINT} \ 24 | --backbone=${BACKBONE} \ 25 | --backbone_pretrained=${BACKBONE_PRETRAINED} \ 26 | --amp \ 27 | 28 | 29 | -------------------------------------------------------------------------------- /scripts/dist_train_a2d_resnet101_boxvos.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -x 3 | 4 | GPUS='0' 5 | PORT=25503 6 | GPUS_PER_NODE=1 7 | CPUS_PER_TASK=6 8 | export CUDA_VISIBLE_DEVICES=${GPUS} 9 | echo "using gpus ${GPUS}, master port ${PORT}." 10 | now=$(date +"%T") 11 | echo "Current time : $now" 12 | echo "Current path : $PWD" 13 | 14 | BACKBONE="resnet101" 15 | OUTPUT_DIR="./results/SgMg_${BACKBONE}_scratch_a2d_boxsup_boxvos" 16 | EXP_NAME="SgMg_${BACKBONE}_scratch_a2d" 17 | CUDA_VISIBLE_DEVICES=${GPUS} OMP_NUM_THREADS=${CPUS_PER_TASK} torchrun --master_port ${PORT} --nproc_per_node=${GPUS_PER_NODE} main.py \ 18 | --with_box_refine --binary --freeze_text_encoder --supervision=box \ 19 | --exp_name=${EXP_NAME} \ 20 | --output_dir=${OUTPUT_DIR} \ 21 | --backbone=${BACKBONE} \ 22 | --dataset_file a2d \ 23 | --batch_size 2 \ 24 | --epochs 12 --lr_drop 3 5 \ 25 | --supervision_type boxvos 26 | 27 | 28 | -------------------------------------------------------------------------------- /scripts/dist_train_a2d_resnet101_boxlevelset.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -x 3 | 4 | GPUS='0' 5 | PORT=25503 6 | GPUS_PER_NODE=1 7 | CPUS_PER_TASK=6 8 | export CUDA_VISIBLE_DEVICES=${GPUS} 9 | echo "using gpus ${GPUS}, master port ${PORT}." 
10 | now=$(date +"%T") 11 | echo "Current time : $now" 12 | echo "Current path : $PWD" 13 | 14 | BACKBONE="resnet101" 15 | OUTPUT_DIR="./results/SgMg_${BACKBONE}_scratch_a2d_boxsup_boxlevelset" 16 | EXP_NAME="SgMg_${BACKBONE}_scratch_a2d" 17 | CUDA_VISIBLE_DEVICES=${GPUS} OMP_NUM_THREADS=${CPUS_PER_TASK} torchrun --master_port ${PORT} --nproc_per_node=${GPUS_PER_NODE} main.py \ 18 | --with_box_refine --binary --freeze_text_encoder --supervision=box \ 19 | --exp_name=${EXP_NAME} \ 20 | --output_dir=${OUTPUT_DIR} \ 21 | --backbone=${BACKBONE} \ 22 | --dataset_file a2d \ 23 | --batch_size 2 \ 24 | --epochs 12 --lr_drop 3 5 \ 25 | --supervision_type boxlevelset 26 | 27 | 28 | -------------------------------------------------------------------------------- /scripts/dist_train_a2d_resnet101_boxinst_point.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -x 3 | 4 | GPUS='0' 5 | PORT=25503 6 | GPUS_PER_NODE=1 7 | CPUS_PER_TASK=6 8 | export CUDA_VISIBLE_DEVICES=${GPUS} 9 | echo "using gpus ${GPUS}, master port ${PORT}." 10 | now=$(date +"%T") 11 | echo "Current time : $now" 12 | echo "Current path : $PWD" 13 | 14 | BACKBONE="resnet101" 15 | OUTPUT_DIR="./results/SgMg_${BACKBONE}_scratch_a2d_boxsup_boxinst_point" 16 | EXP_NAME="SgMg_${BACKBONE}_scratch_a2d" 17 | CUDA_VISIBLE_DEVICES=${GPUS} OMP_NUM_THREADS=${CPUS_PER_TASK} torchrun --master_port ${PORT} --nproc_per_node=${GPUS_PER_NODE} main.py \ 18 | --with_box_refine --binary --freeze_text_encoder --supervision=point \ 19 | --exp_name=${EXP_NAME} \ 20 | --output_dir=${OUTPUT_DIR} \ 21 | --backbone=${BACKBONE} \ 22 | --dataset_file a2d \ 23 | --batch_size 2 \ 24 | --epochs 12 --lr_drop 3 5 \ 25 | --supervision_type boxinst 26 | 27 | 28 | -------------------------------------------------------------------------------- /scripts/dist_train_a2d_resnet101_pointsup_partialsup.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -x 3 | 4 | GPUS='1' 5 | PORT=25501 6 | GPUS_PER_NODE=1 7 | CPUS_PER_TASK=6 8 | export CUDA_VISIBLE_DEVICES=${GPUS} 9 | echo "using gpus ${GPUS}, master port ${PORT}." 10 | now=$(date +"%T") 11 | echo "Current time : $now" 12 | echo "Current path : $PWD" 13 | 14 | BACKBONE="resnet101" 15 | OUTPUT_DIR="./results/SgMg_${BACKBONE}_scratch_a2d_pointsup_partialsup" 16 | EXP_NAME="SgMg_${BACKBONE}_scratch_a2d" 17 | CUDA_VISIBLE_DEVICES=${GPUS} OMP_NUM_THREADS=${CPUS_PER_TASK} torchrun --master_port ${PORT} --nproc_per_node=${GPUS_PER_NODE} main.py \ 18 | --with_box_refine --binary --freeze_text_encoder --supervision=point \ 19 | --exp_name=${EXP_NAME} \ 20 | --output_dir=${OUTPUT_DIR} \ 21 | --backbone=${BACKBONE} \ 22 | --dataset_file a2d \ 23 | --batch_size 2 \ 24 | --epochs 12 --lr_drop 3 5 \ 25 | --supervision_type partialsup 26 | 27 | 28 | -------------------------------------------------------------------------------- /scripts/dist_train_a2d_resnet101_boxlevelset_point.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -x 3 | 4 | GPUS='0' 5 | PORT=25503 6 | GPUS_PER_NODE=1 7 | CPUS_PER_TASK=6 8 | export CUDA_VISIBLE_DEVICES=${GPUS} 9 | echo "using gpus ${GPUS}, master port ${PORT}." 
10 | now=$(date +"%T") 11 | echo "Current time : $now" 12 | echo "Current path : $PWD" 13 | 14 | BACKBONE="resnet101" 15 | OUTPUT_DIR="./results/SgMg_${BACKBONE}_scratch_a2d_boxsup_boxlevelset_point" 16 | EXP_NAME="SgMg_${BACKBONE}_scratch_a2d" 17 | CUDA_VISIBLE_DEVICES=${GPUS} OMP_NUM_THREADS=${CPUS_PER_TASK} torchrun --master_port ${PORT} --nproc_per_node=${GPUS_PER_NODE} main.py \ 18 | --with_box_refine --binary --freeze_text_encoder --supervision=point \ 19 | --exp_name=${EXP_NAME} \ 20 | --output_dir=${OUTPUT_DIR} \ 21 | --backbone=${BACKBONE} \ 22 | --dataset_file a2d \ 23 | --batch_size 2 \ 24 | --epochs 12 --lr_drop 3 5 \ 25 | --supervision_type boxlevelset 26 | 27 | 28 | -------------------------------------------------------------------------------- /scripts/dist_test_a2d_resnet101.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -x 3 | 4 | GPUS='0' 5 | PORT=25503 6 | GPUS_PER_NODE=1 7 | CPUS_PER_TASK=6 8 | export CUDA_VISIBLE_DEVICES=${GPUS} 9 | echo "using gpus ${GPUS}, master port ${PORT}." 10 | now=$(date +"%T") 11 | echo "Current time : $now" 12 | echo "Current path : $PWD" 13 | 14 | BACKBONE="resnet101" 15 | OUTPUT_DIR="./results/SgMg_${BACKBONE}_scratch_a2d" 16 | EXP_NAME="SgMg_${BACKBONE}_scratch_a2d" 17 | RESUME="results/SgMg_resnet101_scratch_a2d/checkpoint0011.pth" 18 | CUDA_VISIBLE_DEVICES=${GPUS} OMP_NUM_THREADS=${CPUS_PER_TASK} torchrun --master_port ${PORT} --nproc_per_node=${GPUS_PER_NODE} main.py \ 19 | --with_box_refine --binary --freeze_text_encoder \ 20 | --exp_name=${EXP_NAME} \ 21 | --output_dir=${OUTPUT_DIR} \ 22 | --backbone=${BACKBONE} \ 23 | --dataset_file a2d \ 24 | --batch_size 4 \ 25 | --epochs 12 --lr_drop 3 5 \ 26 | --eval \ 27 | --resume=${RESUME} 28 | 29 | 30 | -------------------------------------------------------------------------------- /scripts/dist_test_jhmdb_resnet101.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -x 3 | 4 | GPUS='0' 5 | PORT=25503 6 | GPUS_PER_NODE=1 7 | CPUS_PER_TASK=6 8 | export CUDA_VISIBLE_DEVICES=${GPUS} 9 | echo "using gpus ${GPUS}, master port ${PORT}." 10 | now=$(date +"%T") 11 | echo "Current time : $now" 12 | echo "Current path : $PWD" 13 | 14 | BACKBONE="resnet101" 15 | OUTPUT_DIR="./results/SgMg_${BACKBONE}_scratch_a2d" 16 | EXP_NAME="SgMg_${BACKBONE}_scratch_a2d" 17 | RESUME="results/SgMg_resnet101_scratch_a2d/checkpoint0011.pth" 18 | CUDA_VISIBLE_DEVICES=${GPUS} OMP_NUM_THREADS=${CPUS_PER_TASK} torchrun --master_port ${PORT} --nproc_per_node=${GPUS_PER_NODE} main.py \ 19 | --with_box_refine --binary --freeze_text_encoder \ 20 | --exp_name=${EXP_NAME} \ 21 | --output_dir=${OUTPUT_DIR} \ 22 | --backbone=${BACKBONE} \ 23 | --dataset_file jhmdb \ 24 | --batch_size 4 \ 25 | --epochs 12 --lr_drop 3 5 \ 26 | --eval \ 27 | --resume=${RESUME} 28 | 29 | 30 | -------------------------------------------------------------------------------- /scripts/dist_train_scratch_ytvos_videoswin.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -x 3 | cd .. 4 | 5 | GPUS='0,1' 6 | PORT=25500 7 | GPUS_PER_NODE=2 8 | CPUS_PER_TASK=6 9 | export CUDA_VISIBLE_DEVICES=${GPUS} 10 | echo "using gpus ${GPUS}, master port ${PORT}." 
11 | now=$(date +"%T") 12 | echo "Current time : $now" 13 | echo "Current path : $PWD" 14 | 15 | BACKBONE="video_swin_t_p4w7" 16 | BACKBONE_PRETRAINED="./checkpoints/backbones/swin_tiny_patch244_window877_kinetics400_1k.pth" 17 | OUTPUT_DIR="./checkpoints/results/SgMg_${BACKBONE}_scratch" 18 | EXP_NAME="SgMg_${BACKBONE}_scratch" 19 | CUDA_VISIBLE_DEVICES=${GPUS} OMP_NUM_THREADS=${CPUS_PER_TASK} torchrun --master_port ${PORT} --nproc_per_node=${GPUS_PER_NODE} main.py \ 20 | --with_box_refine --binary --freeze_text_encoder \ 21 | --output_dir=${OUTPUT_DIR} \ 22 | --exp_name=${EXP_NAME} \ 23 | --backbone=${BACKBONE} \ 24 | --backbone_pretrained=${BACKBONE_PRETRAINED} \ 25 | --dataset_file ytvos \ 26 | --amp 27 | -------------------------------------------------------------------------------- /scripts/dist_train_ytvos_resnet101_boxsup.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -x 3 | 4 | GPUS='0' 5 | PORT=2023 6 | GPUS_PER_NODE=1 7 | CPUS_PER_TASK=6 8 | export CUDA_VISIBLE_DEVICES=${GPUS} 9 | echo "using gpus ${GPUS}, master port ${PORT}." 10 | now=$(date +"%T") 11 | echo "Current time : $now" 12 | echo "Current path : $PWD" 13 | 14 | BACKBONE="resnet101" 15 | # BACKBONE_PRETRAINED="./checkpoints/backbones/swin_tiny_patch244_window877_kinetics400_1k.pth" 16 | OUTPUT_DIR="./results/SgMg_${BACKBONE}_scratch_ytvos_boxsup" 17 | EXP_NAME="SgMg_${BACKBONE}_scratch" 18 | CUDA_VISIBLE_DEVICES=${GPUS} OMP_NUM_THREADS=${CPUS_PER_TASK} torchrun --master_port ${PORT} --nproc_per_node=${GPUS_PER_NODE} main.py \ 19 | --with_box_refine --binary --freeze_text_encoder --supervision=box \ 20 | --output_dir=${OUTPUT_DIR} \ 21 | --exp_name=${EXP_NAME} \ 22 | --backbone=${BACKBONE} \ 23 | --dataset_file ytvos \ 24 | # --resume results/SgMg_resnet101_scratch_ytvos_boxsup/checkpoint0000.pth -------------------------------------------------------------------------------- /scripts/dist_test_a2d_resnet101_boxsup.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -x 3 | 4 | GPUS='1' 5 | PORT=29501 6 | GPUS_PER_NODE=1 7 | CPUS_PER_TASK=6 8 | export CUDA_VISIBLE_DEVICES=${GPUS} 9 | echo "using gpus ${GPUS}, master port ${PORT}." 10 | now=$(date +"%T") 11 | echo "Current time : $now" 12 | echo "Current path : $PWD" 13 | 14 | BACKBONE="resnet101" 15 | OUTPUT_DIR="./results/SgMg_${BACKBONE}_eval_scratch_a2d_boxsup" 16 | EXP_NAME="SgMg_${BACKBONE}_scratch_a2d" 17 | RESUME="results/SgMg_resnet101_scratch_a2d_boxsup_boxlevelset/checkpoint0009.pth" 18 | CUDA_VISIBLE_DEVICES=${GPUS} OMP_NUM_THREADS=${CPUS_PER_TASK} torchrun --master_port ${PORT} --nproc_per_node=${GPUS_PER_NODE} main.py \ 19 | --with_box_refine --binary --freeze_text_encoder --supervision box \ 20 | --exp_name=${EXP_NAME} \ 21 | --output_dir=${OUTPUT_DIR} \ 22 | --backbone=${BACKBONE} \ 23 | --dataset_file a2d \ 24 | --batch_size 4 \ 25 | --epochs 12 --lr_drop 3 5 \ 26 | --eval \ 27 | --resume=${RESUME} 28 | 29 | 30 | -------------------------------------------------------------------------------- /scripts/dist_test_a2d_resnet101_pointsup.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -x 3 | 4 | GPUS='1' 5 | PORT=25505 6 | GPUS_PER_NODE=1 7 | CPUS_PER_TASK=6 8 | export CUDA_VISIBLE_DEVICES=${GPUS} 9 | echo "using gpus ${GPUS}, master port ${PORT}." 
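# Evaluation only (--eval): scores a point-supervised ResNet-101 checkpoint (RESUME below) on A2D-Sentences.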
10 | now=$(date +"%T") 11 | echo "Current time : $now" 12 | echo "Current path : $PWD" 13 | 14 | BACKBONE="resnet101" 15 | OUTPUT_DIR="./results/SgMg_${BACKBONE}_scratch_a2d_pointsup" 16 | EXP_NAME="SgMg_${BACKBONE}_scratch_a2d" 17 | RESUME="results/SgMg_resnet101_scratch_a2d_pointsup_partialsup/checkpoint0001.pth" 18 | CUDA_VISIBLE_DEVICES=${GPUS} OMP_NUM_THREADS=${CPUS_PER_TASK} torchrun --master_port ${PORT} --nproc_per_node=${GPUS_PER_NODE} main.py \ 19 | --with_box_refine --binary --freeze_text_encoder --supervision point \ 20 | --exp_name=${EXP_NAME} \ 21 | --output_dir=${OUTPUT_DIR} \ 22 | --backbone=${BACKBONE} \ 23 | --dataset_file a2d \ 24 | --batch_size 4 \ 25 | --epochs 12 --lr_drop 3 5 \ 26 | --eval \ 27 | --resume=${RESUME} 28 | 29 | 30 | -------------------------------------------------------------------------------- /scripts/dist_test_jhmdb_resnet101_boxsup.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -x 3 | 4 | GPUS='1' 5 | PORT=25509 6 | GPUS_PER_NODE=1 7 | CPUS_PER_TASK=6 8 | export CUDA_VISIBLE_DEVICES=${GPUS} 9 | echo "using gpus ${GPUS}, master port ${PORT}." 10 | now=$(date +"%T") 11 | echo "Current time : $now" 12 | echo "Current path : $PWD" 13 | 14 | BACKBONE="resnet101" 15 | OUTPUT_DIR="./results/SgMg_${BACKBONE}_scratch_jhmdb_boxsup" 16 | EXP_NAME="SgMg_${BACKBONE}_scratch_a2d" 17 | RESUME="results/SgMg_resnet101_scratch_a2d_boxsup_boxlevelset/checkpoint0010.pth" 18 | CUDA_VISIBLE_DEVICES=${GPUS} OMP_NUM_THREADS=${CPUS_PER_TASK} torchrun --master_port ${PORT} --nproc_per_node=${GPUS_PER_NODE} main.py \ 19 | --with_box_refine --binary --freeze_text_encoder --supervision box \ 20 | --exp_name=${EXP_NAME} \ 21 | --output_dir=${OUTPUT_DIR} \ 22 | --backbone=${BACKBONE} \ 23 | --dataset_file jhmdb \ 24 | --batch_size 4 \ 25 | --epochs 12 --lr_drop 3 5 \ 26 | --eval \ 27 | --resume=${RESUME} 28 | 29 | 30 | -------------------------------------------------------------------------------- /scripts/dist_test_jhmdb_resnet101_pointsup.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -x 3 | 4 | GPUS='1' 5 | PORT=25505 6 | GPUS_PER_NODE=1 7 | CPUS_PER_TASK=6 8 | export CUDA_VISIBLE_DEVICES=${GPUS} 9 | echo "using gpus ${GPUS}, master port ${PORT}." 10 | now=$(date +"%T") 11 | echo "Current time : $now" 12 | echo "Current path : $PWD" 13 | 14 | BACKBONE="resnet101" 15 | OUTPUT_DIR="./results/SgMg_${BACKBONE}_scratch_jhmdb_pointsup" 16 | EXP_NAME="SgMg_${BACKBONE}_scratch_a2d" 17 | RESUME="results/SgMg_resnet101_scratch_a2d_pointsup_partialsup/checkpoint0001.pth" 18 | CUDA_VISIBLE_DEVICES=${GPUS} OMP_NUM_THREADS=${CPUS_PER_TASK} torchrun --master_port ${PORT} --nproc_per_node=${GPUS_PER_NODE} main.py \ 19 | --with_box_refine --binary --freeze_text_encoder --supervision point \ 20 | --exp_name=${EXP_NAME} \ 21 | --output_dir=${OUTPUT_DIR} \ 22 | --backbone=${BACKBONE} \ 23 | --dataset_file jhmdb \ 24 | --batch_size 4 \ 25 | --epochs 12 --lr_drop 3 5 \ 26 | --eval \ 27 | --resume=${RESUME} 28 | 29 | 30 | -------------------------------------------------------------------------------- /scripts/dist_test_a2d_videoswint.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -x 3 | 4 | GPUS='0' 5 | PORT=25503 6 | GPUS_PER_NODE=1 7 | CPUS_PER_TASK=6 8 | export CUDA_VISIBLE_DEVICES=${GPUS} 9 | echo "using gpus ${GPUS}, master port ${PORT}." 
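# Evaluation only (--eval): scores the Video-Swin-T SgMg weights given by PRETRAINED_WEIGHTS on A2D-Sentences; TRANSFORMERS_OFFLINE=1 keeps HuggingFace Transformers from reaching the network when loading the text encoder.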
10 | now=$(date +"%T") 11 | echo "Current time : $now" 12 | echo "Current path : $PWD" 13 | 14 | BACKBONE="video_swin_t_p4w7" 15 | BACKBONE_PRETRAINED="./checkpoints/backbones/swin_base_patch244_window877_kinetics600_22k.pth" 16 | OUTPUT_DIR="./results/results/SgMg_${BACKBONE}_finetune_a2d" 17 | EXP_NAME="SgMg_${BACKBONE}_finetune_a2d" 18 | PRETRAINED_WEIGHTS="checkpoints/sgmg_videoswint_a2d.pth" 19 | TRANSFORMERS_OFFLINE=1 CUDA_VISIBLE_DEVICES=${GPUS} OMP_NUM_THREADS=${CPUS_PER_TASK} torchrun --master_port ${PORT} --nproc_per_node=${GPUS_PER_NODE} main.py \ 20 | --with_box_refine --binary --freeze_text_encoder \ 21 | --exp_name=${EXP_NAME} \ 22 | --output_dir=${OUTPUT_DIR} \ 23 | --backbone=${BACKBONE} \ 24 | --dataset_file a2d \ 25 | --batch_size 2 \ 26 | --epochs 6 --lr_drop 3 5 \ 27 | --pretrained_weights=${PRETRAINED_WEIGHTS} \ 28 | --eval \ 29 | 30 | 31 | -------------------------------------------------------------------------------- /scripts/dist_train_a2d_videoswinb.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -x 3 | cd .. 4 | 5 | GPUS='0,1' 6 | PORT=25503 7 | GPUS_PER_NODE=2 8 | CPUS_PER_TASK=6 9 | export CUDA_VISIBLE_DEVICES=${GPUS} 10 | echo "using gpus ${GPUS}, master port ${PORT}." 11 | now=$(date +"%T") 12 | echo "Current time : $now" 13 | echo "Current path : $PWD" 14 | 15 | BACKBONE="video_swin_b_p4w7" 16 | BACKBONE_PRETRAINED="./checkpoints/backbones/swin_base_patch244_window877_kinetics600_22k.pth" 17 | OUTPUT_DIR="./checkpoints/results/SgMg_${BACKBONE}_finetune_a2d" 18 | EXP_NAME="SgMg_${BACKBONE}_finetune_a2d" 19 | PRETRAINED_WEIGHTS="xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx" 20 | CUDA_VISIBLE_DEVICES=${GPUS} OMP_NUM_THREADS=${CPUS_PER_TASK} torchrun --master_port ${PORT} --nproc_per_node=${GPUS_PER_NODE} main.py \ 21 | --with_box_refine --binary --freeze_text_encoder \ 22 | --exp_name=${EXP_NAME} \ 23 | --output_dir=${OUTPUT_DIR} \ 24 | --backbone=${BACKBONE} \ 25 | --backbone_pretrained=${BACKBONE_PRETRAINED} \ 26 | --dataset_file a2d \ 27 | --batch_size 2 \ 28 | --epochs 6 --lr_drop 3 5 \ 29 | --pretrained_weights=${PRETRAINED_WEIGHTS} \ 30 | --use_checkpoint \ 31 | 32 | 33 | -------------------------------------------------------------------------------- /models/ops/src/cuda/ms_deform_attn_cuda.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | #pragma once 12 | #include 13 | 14 | at::Tensor ms_deform_attn_cuda_forward( 15 | const at::Tensor &value, 16 | const at::Tensor &spatial_shapes, 17 | const at::Tensor &level_start_index, 18 | const at::Tensor &sampling_loc, 19 | const at::Tensor &attn_weight, 20 | const int im2col_step); 21 | 22 | std::vector ms_deform_attn_cuda_backward( 23 | const at::Tensor &value, 24 | const at::Tensor &spatial_shapes, 25 | const at::Tensor &level_start_index, 26 | const at::Tensor &sampling_loc, 27 | const at::Tensor &attn_weight, 28 | const at::Tensor &grad_output, 29 | const int im2col_step); 30 | 31 | -------------------------------------------------------------------------------- /models/ops/src/cpu/ms_deform_attn_cpu.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | #pragma once 12 | #include 13 | 14 | at::Tensor 15 | ms_deform_attn_cpu_forward( 16 | const at::Tensor &value, 17 | const at::Tensor &spatial_shapes, 18 | const at::Tensor &level_start_index, 19 | const at::Tensor &sampling_loc, 20 | const at::Tensor &attn_weight, 21 | const int im2col_step); 22 | 23 | std::vector 24 | ms_deform_attn_cpu_backward( 25 | const at::Tensor &value, 26 | const at::Tensor &spatial_shapes, 27 | const at::Tensor &level_start_index, 28 | const at::Tensor &sampling_loc, 29 | const at::Tensor &attn_weight, 30 | const at::Tensor &grad_output, 31 | const int im2col_step); 32 | 33 | 34 | -------------------------------------------------------------------------------- /davis2017/results.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | from PIL import Image 4 | import sys 5 | 6 | 7 | class Results(object): 8 | def __init__(self, root_dir): 9 | self.root_dir = root_dir 10 | 11 | def _read_mask(self, sequence, frame_id): 12 | try: 13 | mask_path = os.path.join(self.root_dir, sequence, f'{frame_id}.png') 14 | return np.array(Image.open(mask_path)) 15 | except IOError as err: 16 | sys.stdout.write(sequence + " frame %s not found!\n" % frame_id) 17 | sys.stdout.write("The frames have to be indexed PNG files placed inside the corespondent sequence " 18 | "folder.\nThe indexes have to match with the initial frame.\n") 19 | sys.stderr.write("IOError: " + err.strerror + "\n") 20 | sys.exit() 21 | 22 | def read_masks(self, sequence, masks_id): 23 | mask_0 = self._read_mask(sequence, masks_id[0]) 24 | masks = np.zeros((len(masks_id), *mask_0.shape)) 25 | for ii, m in enumerate(masks_id): 26 | masks[ii, ...] 
= self._read_mask(sequence, m) 27 | num_objects = int(np.max(masks)) 28 | tmp = np.ones((num_objects, *masks.shape)) 29 | tmp = tmp * np.arange(1, num_objects + 1)[:, None, None, None] 30 | masks = (tmp == masks[None, ...]) > 0 31 | return masks 32 | -------------------------------------------------------------------------------- /models/ops/src/cpu/ms_deform_attn_cpu.cpp: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | #include 12 | 13 | #include 14 | #include 15 | 16 | 17 | at::Tensor 18 | ms_deform_attn_cpu_forward( 19 | const at::Tensor &value, 20 | const at::Tensor &spatial_shapes, 21 | const at::Tensor &level_start_index, 22 | const at::Tensor &sampling_loc, 23 | const at::Tensor &attn_weight, 24 | const int im2col_step) 25 | { 26 | AT_ERROR("Not implement on cpu"); 27 | } 28 | 29 | std::vector 30 | ms_deform_attn_cpu_backward( 31 | const at::Tensor &value, 32 | const at::Tensor &spatial_shapes, 33 | const at::Tensor &level_start_index, 34 | const at::Tensor &sampling_loc, 35 | const at::Tensor &attn_weight, 36 | const at::Tensor &grad_output, 37 | const int im2col_step) 38 | { 39 | AT_ERROR("Not implement on cpu"); 40 | } 41 | 42 | -------------------------------------------------------------------------------- /scripts/dist_train_ytvos_videoswin.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -x 3 | cd .. 4 | 5 | GPUS='0,1' 6 | PORT=25500 7 | GPUS_PER_NODE=2 8 | CPUS_PER_TASK=6 9 | export CUDA_VISIBLE_DEVICES=${GPUS} 10 | echo "using gpus ${GPUS}, master port ${PORT}." 
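# Two-stage recipe: stage 1 pretrains on the joint image datasets (main_pretrain.py, --dataset_file all, --num_frames 1, 11 epochs); stage 2 fine-tunes on Ref-YouTube-VOS (main.py) starting from the stage-1 checkpoint0010.pth.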
11 | now=$(date +"%T") 12 | echo "Current time : $now" 13 | echo "Current path : $PWD" 14 | 15 | BACKBONE="video_swin_t_p4w7" 16 | BACKBONE_PRETRAINED="./checkpoints/backbones/swin_tiny_patch244_window877_kinetics400_1k.pth" 17 | OUTPUT_DIR1="./checkpoints/results/SgMg_${BACKBONE}_pretrain" 18 | EXP_NAME1="SgMg_${BACKBONE}_pretrain" 19 | CUDA_VISIBLE_DEVICES=${GPUS} OMP_NUM_THREADS=${CPUS_PER_TASK} torchrun --master_port ${PORT} --nproc_per_node=${GPUS_PER_NODE} main_pretrain.py \ 20 | --dataset_file all \ 21 | --with_box_refine --binary \ 22 | --output_dir=${OUTPUT_DIR1} \ 23 | --exp_name=${EXP_NAME1} \ 24 | --backbone=${BACKBONE} \ 25 | --backbone_pretrained=${BACKBONE_PRETRAINED} \ 26 | --batch_size 2 \ 27 | --num_frames 1 \ 28 | --epochs 11 --lr_drop 8 10 \ 29 | 30 | 31 | OUTPUT_DIR2="./checkpoints/results/SgMg_${BACKBONE}_finetune" 32 | EXP_NAME2="SgMg_${BACKBONE}_finetune" 33 | CUDA_VISIBLE_DEVICES=${GPUS} OMP_NUM_THREADS=${CPUS_PER_TASK} torchrun --master_port ${PORT} --nproc_per_node=${GPUS_PER_NODE} main.py \ 34 | --with_box_refine --binary --freeze_text_encoder \ 35 | --output_dir=${OUTPUT_DIR2} \ 36 | --exp_name=${EXP_NAME2} \ 37 | --backbone=${BACKBONE} \ 38 | --backbone_pretrained=${BACKBONE_PRETRAINED} \ 39 | --epochs 6 --lr_drop 3 5 \ 40 | --dataset_file ytvos \ 41 | --pretrained_weights ${OUTPUT_DIR1}"/checkpoint0010.pth" \ 42 | -------------------------------------------------------------------------------- /scripts/dist_train_ytvos_videoswinb.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -x 3 | cd .. 4 | 5 | GPUS='0,1' 6 | PORT=25501 7 | GPUS_PER_NODE=2 8 | CPUS_PER_TASK=6 9 | export CUDA_VISIBLE_DEVICES=${GPUS} 10 | echo "using gpus ${GPUS}, master port ${PORT}." 
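# Same two-stage pretrain-then-finetune recipe as dist_train_ytvos_videoswin.sh, but with the Video-Swin-B backbone initialized from the Kinetics-600 (22K) weights.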
11 | now=$(date +"%T") 12 | echo "Current time : $now" 13 | echo "Current path : $PWD" 14 | 15 | BACKBONE="video_swin_b_p4w7" 16 | BACKBONE_PRETRAINED="./checkpoints/backbones/swin_base_patch244_window877_kinetics600_22k.pth" 17 | OUTPUT_DIR1="./checkpoints/results/SgMg_${BACKBONE}_pretrain" 18 | EXP_NAME1="SgMg_${BACKBONE}_pretrain" 19 | CUDA_VISIBLE_DEVICES=${GPUS} OMP_NUM_THREADS=${CPUS_PER_TASK} torchrun --master_port ${PORT} --nproc_per_node=${GPUS_PER_NODE} main_pretrain.py \ 20 | --dataset_file all \ 21 | --with_box_refine --binary \ 22 | --output_dir=${OUTPUT_DIR1} \ 23 | --exp_name=${EXP_NAME1} \ 24 | --backbone=${BACKBONE} \ 25 | --backbone_pretrained=${BACKBONE_PRETRAINED} \ 26 | --batch_size 2 \ 27 | --num_frames 1 \ 28 | --epochs 11 --lr_drop 8 10 \ 29 | 30 | 31 | OUTPUT_DIR2="./checkpoints/results/SgMg_${BACKBONE}_finetune" 32 | EXP_NAME2="SgMg_${BACKBONE}_finetune" 33 | CUDA_VISIBLE_DEVICES=${GPUS} OMP_NUM_THREADS=${CPUS_PER_TASK} torchrun --master_port ${PORT} --nproc_per_node=${GPUS_PER_NODE} main.py \ 34 | --with_box_refine --binary --freeze_text_encoder \ 35 | --output_dir=${OUTPUT_DIR2} \ 36 | --exp_name=${EXP_NAME2} \ 37 | --backbone=${BACKBONE} \ 38 | --backbone_pretrained=${BACKBONE_PRETRAINED} \ 39 | --epochs 6 --lr_drop 3 5 \ 40 | --dataset_file ytvos \ 41 | --pretrained_weights ${OUTPUT_DIR1}"/checkpoint0010.pth" \ 42 | 43 | -------------------------------------------------------------------------------- /pre_process/dense_crf.py: -------------------------------------------------------------------------------- 1 | import pydensecrf.densecrf as dcrf 2 | import numpy as np 3 | from pydensecrf.utils import unary_from_labels 4 | 5 | 6 | def sigmoid(x): 7 | return 1 / (1 + np.exp(-x)) 8 | 9 | 10 | def apply_dense_crf(img, mask): 11 | EPSILON = 1e-8 12 | M = 2 # salient or not 13 | tau = 1.05 14 | # Setup the CRF model 15 | d = dcrf.DenseCRF2D(img.shape[1], img.shape[0], M) 16 | anno_norm = mask / 255.0 17 | n_energy = -np.log((1.0 - anno_norm + EPSILON)) / (tau * sigmoid(1 - anno_norm)) 18 | p_energy = -np.log(anno_norm + EPSILON) / (tau * sigmoid(anno_norm)) 19 | 20 | U = np.zeros((M, img.shape[0] * img.shape[1]), dtype="float32") 21 | U[0, :] = n_energy.flatten() 22 | U[1, :] = p_energy.flatten() 23 | 24 | d.setUnaryEnergy(U) 25 | 26 | d.addPairwiseGaussian(sxy=3, compat=3) 27 | # d.addPairwiseBilateral(sxy=60, srgb=5, rgbim=img, compat=5) 28 | d.addPairwiseBilateral(sxy=(80, 80), srgb=(13, 13, 13), rgbim=img, compat=10) 29 | 30 | # Do the inference 31 | infer = np.array(d.inference(4)).astype("float32") 32 | res = infer[1, :] 33 | 34 | res = res * 255 35 | res = res.reshape(img.shape[:2]).astype("uint8") 36 | return res 37 | 38 | 39 | def crf_inference_label(img, labels, t=10, n_labels=21, gt_prob=0.7): 40 | h, w = img.shape[:2] 41 | 42 | d = dcrf.DenseCRF2D(w, h, n_labels) 43 | 44 | unary = unary_from_labels(labels, n_labels, gt_prob=gt_prob, zero_unsure=False) 45 | 46 | d.setUnaryEnergy(unary) 47 | d.addPairwiseGaussian(sxy=3, compat=3) 48 | d.addPairwiseBilateral( 49 | sxy=50, srgb=5, rgbim=np.ascontiguousarray(np.copy(img)), compat=10 50 | ) 51 | 52 | q = d.inference(t) 53 | 54 | return np.argmax(np.array(q).reshape((n_labels, h, w)), axis=0) 55 | -------------------------------------------------------------------------------- /scripts/dist_test_davis_resnet.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -x 3 | 4 | GPUS='0' 5 | GPUS_PER_NODE=1 6 | CPUS_PER_TASK=6 7 | PORT=29500 8 | export 
CUDA_VISIBLE_DEVICES=${GPUS} 9 | echo "using gpus ${GPUS}, master port ${PORT}." 10 | now=$(date +"%T") 11 | echo "Current time : $now" 12 | echo "Current path : $PWD" 13 | 14 | BACKBONE="resnet101" 15 | # BACKBONE_PRETRAINED="./checkpoints/backbones/swin_base_patch244_window877_kinetics600_22k.pth" 16 | OUTPUT_DIR="./results/SgMg_${BACKBONE}_eval" 17 | CHECKPOINT="./results/SgMg_resnet101_scratch_ytvos_boxsup/checkpoint0009.pth" 18 | python inference_davis.py --with_box_refine --binary --freeze_text_encoder \ 19 | --eval \ 20 | --ngpu=${GPUS_PER_NODE} \ 21 | --output_dir=${OUTPUT_DIR} \ 22 | --resume=${CHECKPOINT} \ 23 | --backbone=${BACKBONE} \ 24 | --backbone_pretrained=${BACKBONE_PRETRAINED} 25 | 26 | 27 | # evaluation 28 | ANNO0_DIR=${OUTPUT_DIR}/"DVS_Annotations"/"anno_0" 29 | ANNO1_DIR=${OUTPUT_DIR}/"DVS_Annotations"/"anno_1" 30 | ANNO2_DIR=${OUTPUT_DIR}/"DVS_Annotations"/"anno_2" 31 | ANNO3_DIR=${OUTPUT_DIR}/"DVS_Annotations"/"anno_3" 32 | echo "Annotations store at : ${ANNO0_DIR}" 33 | rm ${ANNO0_DIR}"/global_results-val.csv" 34 | rm ${ANNO0_DIR}"/per-sequence_results-val.csv" 35 | rm ${ANNO1_DIR}"/global_results-val.csv" 36 | rm ${ANNO1_DIR}"/per-sequence_results-val.csv" 37 | rm ${ANNO2_DIR}"/global_results-val.csv" 38 | rm ${ANNO2_DIR}"/per-sequence_results-val.csv" 39 | rm ${ANNO3_DIR}"/global_results-val.csv" 40 | rm ${ANNO3_DIR}"/per-sequence_results-val.csv" 41 | 42 | python3 eval_davis.py --results_path=${ANNO0_DIR} 43 | python3 eval_davis.py --results_path=${ANNO1_DIR} 44 | python3 eval_davis.py --results_path=${ANNO2_DIR} 45 | python3 eval_davis.py --results_path=${ANNO3_DIR} 46 | 47 | echo "Working path is: ${OUTPUT_DIR}" 48 | 49 | 50 | 51 | -------------------------------------------------------------------------------- /scripts/dist_test_davis_videoswinb.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -x 3 | cd .. 4 | 5 | GPUS='0,1' 6 | GPUS_PER_NODE=2 7 | CPUS_PER_TASK=6 8 | PORT=29500 9 | export CUDA_VISIBLE_DEVICES=${GPUS} 10 | echo "using gpus ${GPUS}, master port ${PORT}." 
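# Runs inference_davis.py with the Video-Swin-B checkpoint under mixed precision, then scores the predictions for each of the four Ref-DAVIS annotators (anno_0 .. anno_3) with eval_davis.py; stale result CSVs are removed first so the metrics are recomputed.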
11 | now=$(date +"%T") 12 | echo "Current time : $now" 13 | echo "Current path : $PWD" 14 | 15 | BACKBONE="video_swin_b_p4w7" 16 | BACKBONE_PRETRAINED="./checkpoints/backbones/swin_base_patch244_window877_kinetics600_22k.pth" 17 | OUTPUT_DIR="./checkpoints/results/SgMg_${BACKBONE}_eval" 18 | CHECKPOINT="./checkpoints/sgmg_videosiwnb_ytvos.pth" 19 | python inference_davis.py --with_box_refine --binary --freeze_text_encoder \ 20 | --eval \ 21 | --ngpu=${GPUS_PER_NODE} \ 22 | --output_dir=${OUTPUT_DIR} \ 23 | --resume=${CHECKPOINT} \ 24 | --backbone=${BACKBONE} \ 25 | --backbone_pretrained=${BACKBONE_PRETRAINED} \ 26 | --amp \ 27 | 28 | 29 | # evaluation 30 | ANNO0_DIR=${OUTPUT_DIR}/"DVS_Annotations"/"anno_0" 31 | ANNO1_DIR=${OUTPUT_DIR}/"DVS_Annotations"/"anno_1" 32 | ANNO2_DIR=${OUTPUT_DIR}/"DVS_Annotations"/"anno_2" 33 | ANNO3_DIR=${OUTPUT_DIR}/"DVS_Annotations"/"anno_3" 34 | echo "Annotations store at : ${ANNO0_DIR}" 35 | rm ${ANNO0_DIR}"/global_results-val.csv" 36 | rm ${ANNO0_DIR}"/per-sequence_results-val.csv" 37 | rm ${ANNO1_DIR}"/global_results-val.csv" 38 | rm ${ANNO1_DIR}"/per-sequence_results-val.csv" 39 | rm ${ANNO2_DIR}"/global_results-val.csv" 40 | rm ${ANNO2_DIR}"/per-sequence_results-val.csv" 41 | rm ${ANNO3_DIR}"/global_results-val.csv" 42 | rm ${ANNO3_DIR}"/per-sequence_results-val.csv" 43 | 44 | python3 eval_davis.py --results_path=${ANNO0_DIR} 45 | python3 eval_davis.py --results_path=${ANNO1_DIR} 46 | python3 eval_davis.py --results_path=${ANNO2_DIR} 47 | python3 eval_davis.py --results_path=${ANNO3_DIR} 48 | 49 | echo "Working path is: ${OUTPUT_DIR}" 50 | 51 | 52 | 53 | -------------------------------------------------------------------------------- /pre_process/frozen_batchnorm2d.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | class FrozenBatchNorm2d(torch.nn.Module): 5 | """ 6 | BatchNorm2d where the batch statistics and the affine parameters are fixed. 7 | 8 | Copy-paste from torchvision.misc.ops with added eps before rqsrt, 9 | without which any other models than torchvision.models.resnet[18,34,50,101] 10 | produce nans. 
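    The frozen statistics are folded into a per-channel affine transform,
    y = weight * (x - running_mean) / sqrt(running_var + eps) + bias,
    which forward() applies as x * scale + bias.

    Illustrative usage (channel count and spatial size are arbitrary):
        >>> import torch
        >>> bn = FrozenBatchNorm2d(64)
        >>> y = bn(torch.randn(2, 64, 32, 32))  # output has the same shape as the input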
11 | """ 12 | 13 | def __init__(self, n): 14 | super(FrozenBatchNorm2d, self).__init__() 15 | self.register_buffer("weight", torch.ones(n)) 16 | self.register_buffer("bias", torch.zeros(n)) 17 | self.register_buffer("running_mean", torch.zeros(n)) 18 | self.register_buffer("running_var", torch.ones(n)) 19 | 20 | def _load_from_state_dict( 21 | self, 22 | state_dict, 23 | prefix, 24 | local_metadata, 25 | strict, 26 | missing_keys, 27 | unexpected_keys, 28 | error_msgs, 29 | ): 30 | num_batches_tracked_key = prefix + "num_batches_tracked" 31 | if num_batches_tracked_key in state_dict: 32 | del state_dict[num_batches_tracked_key] 33 | 34 | super(FrozenBatchNorm2d, self)._load_from_state_dict( 35 | state_dict, 36 | prefix, 37 | local_metadata, 38 | strict, 39 | missing_keys, 40 | unexpected_keys, 41 | error_msgs, 42 | ) 43 | 44 | def forward(self, x): 45 | # move reshapes to the beginning 46 | # to make it fuser-friendly 47 | w = self.weight.reshape(1, -1, 1, 1) 48 | b = self.bias.reshape(1, -1, 1, 1) 49 | rv = self.running_var.reshape(1, -1, 1, 1) 50 | rm = self.running_mean.reshape(1, -1, 1, 1) 51 | eps = 1e-5 52 | scale = w * (rv + eps).rsqrt() 53 | bias = b - rm * scale 54 | return x * scale + bias 55 | -------------------------------------------------------------------------------- /models/decoder.py: -------------------------------------------------------------------------------- 1 | 2 | import copy 3 | from typing import Optional, List 4 | import math 5 | 6 | import torch 7 | import torch.nn.functional as F 8 | from torch import nn, Tensor 9 | from torch.nn.init import xavier_uniform_, constant_, uniform_, normal_ 10 | from util.misc import inverse_sigmoid 11 | from einops import rearrange 12 | 13 | 14 | class MSO(nn.Module): 15 | def __init__(self, mask_dim=16, img_dim=[96, 192], out_dim=16): 16 | super().__init__() 17 | 18 | self.mask_dim = mask_dim 19 | self.img_dim = img_dim 20 | self.out_dim = out_dim 21 | 22 | self.conv1_1div8 = nn.Conv2d(mask_dim+img_dim[1], mask_dim, kernel_size=3, padding=1) 23 | self.conv2_1div8 = nn.Conv2d(mask_dim, mask_dim, kernel_size=3, padding=1) 24 | 25 | self.conv1_1div4 = nn.Conv2d(mask_dim + img_dim[0], mask_dim, kernel_size=3, padding=1) 26 | self.conv2_1div4 = nn.Conv2d(mask_dim, mask_dim, kernel_size=3, padding=1) 27 | self.out_conv = nn.Conv2d(mask_dim, 1, kernel_size=3, padding=1) 28 | 29 | 30 | # TODO: add image on channel. deconv to upsample 31 | def forward(self, pred_masks, image_features): 32 | image_features = [x.tensors for x in image_features] # 1/4 & 1/8 33 | 34 | # merge with 1/8 image 35 | assert pred_masks.shape[-1] == image_features[-1].shape[-1], "First size wrong." 36 | x = torch.cat([pred_masks, image_features[-1]], dim=1) 37 | pred_masks += self.conv2_1div8(F.relu(self.conv1_1div8(F.relu(x)))) 38 | 39 | # merge with 1/4 image 40 | pred_masks = F.interpolate(pred_masks, size=(image_features[-2].shape[-2], image_features[-2].shape[-1]), mode='bilinear', align_corners=False) 41 | assert pred_masks.shape[-1] == image_features[-2].shape[-1], "Second size wrong." 
42 | x = torch.cat([pred_masks, image_features[-2]], dim=1) 43 | pred_masks += self.conv2_1div4(F.relu(self.conv1_1div4(F.relu(x)))) 44 | 45 | pred_masks = self.out_conv(pred_masks) 46 | 47 | return pred_masks 48 | 49 | -------------------------------------------------------------------------------- /datasets/concat_dataset.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR) 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # ------------------------------------------------------------------------ 5 | 6 | from pathlib import Path 7 | 8 | import torch 9 | import torch.utils.data 10 | 11 | from torch.utils.data import Dataset, ConcatDataset 12 | from .refexp2seq import build as build_seq_refexp 13 | from .ytvos import build as build_ytvs 14 | from .davis import build as build_davis 15 | from datasets import ytvos 16 | 17 | 18 | # join ref coco and ytvos 19 | def build(image_set, args): 20 | concat_data = [] 21 | 22 | print('preparing coco2seq dataset ....') 23 | coco_names = ["refcoco", "refcoco+", "refcocog"] 24 | for name in coco_names: 25 | coco_seq = build_seq_refexp(name, image_set, args) 26 | concat_data.append(coco_seq) 27 | 28 | print('preparing ytvos dataset .... ') 29 | ytvos_dataset = build_ytvs(image_set, args) 30 | concat_data.append(ytvos_dataset) 31 | 32 | concat_data = ConcatDataset(concat_data) 33 | 34 | return concat_data 35 | 36 | def build_coco(image_set, args): 37 | concat_data = [] 38 | 39 | print('preparing coco2seq dataset ....') 40 | coco_names = ["refcoco", "refcoco+", "refcocog"] 41 | for name in coco_names: 42 | coco_seq = build_seq_refexp(name, image_set, args) 43 | concat_data.append(coco_seq) 44 | 45 | concat_data = ConcatDataset(concat_data) 46 | return concat_data 47 | 48 | def build_joint_ytb_dvs(image_set, args): 49 | concat_data = [] 50 | 51 | print('preparing davis dataset ....') 52 | dvs_dataset = build_davis(image_set, args) 53 | for i in range(5): 54 | concat_data.append(dvs_dataset) 55 | 56 | print('preparing ytvos dataset .... ') 57 | ytvos_dataset = build_ytvs(image_set, args) 58 | concat_data.append(ytvos_dataset) 59 | 60 | concat_data = ConcatDataset(concat_data) 61 | 62 | return concat_data 63 | -------------------------------------------------------------------------------- /models/ops/src/ms_deform_attn.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | #pragma once 12 | 13 | #include "cpu/ms_deform_attn_cpu.h" 14 | 15 | #ifdef WITH_CUDA 16 | #include "cuda/ms_deform_attn_cuda.h" 17 | #endif 18 | 19 | 20 | at::Tensor 21 | ms_deform_attn_forward( 22 | const at::Tensor &value, 23 | const at::Tensor &spatial_shapes, 24 | const at::Tensor &level_start_index, 25 | const at::Tensor &sampling_loc, 26 | const at::Tensor &attn_weight, 27 | const int im2col_step) 28 | { 29 | if (value.type().is_cuda()) 30 | { 31 | #ifdef WITH_CUDA 32 | return ms_deform_attn_cuda_forward( 33 | value, spatial_shapes, level_start_index, sampling_loc, attn_weight, im2col_step); 34 | #else 35 | AT_ERROR("Not compiled with GPU support"); 36 | #endif 37 | } 38 | AT_ERROR("Not implemented on the CPU"); 39 | } 40 | 41 | std::vector 42 | ms_deform_attn_backward( 43 | const at::Tensor &value, 44 | const at::Tensor &spatial_shapes, 45 | const at::Tensor &level_start_index, 46 | const at::Tensor &sampling_loc, 47 | const at::Tensor &attn_weight, 48 | const at::Tensor &grad_output, 49 | const int im2col_step) 50 | { 51 | if (value.type().is_cuda()) 52 | { 53 | #ifdef WITH_CUDA 54 | return ms_deform_attn_cuda_backward( 55 | value, spatial_shapes, level_start_index, sampling_loc, attn_weight, grad_output, im2col_step); 56 | #else 57 | AT_ERROR("Not compiled with GPU support"); 58 | #endif 59 | } 60 | AT_ERROR("Not implemented on the CPU"); 61 | } 62 | 63 | -------------------------------------------------------------------------------- /pre_process/data.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import os 3 | from PIL import Image 4 | from torchvision import transforms 5 | import numpy as np 6 | 7 | 8 | def img_transform(img, annos): 9 | transform = transforms.Compose( 10 | [ 11 | transforms.ToTensor(), 12 | transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]), 13 | ] 14 | ) 15 | h, w, _ = img.shape 16 | centers = [] 17 | centers_norm = [] 18 | label_list = list(np.unique(annos)) 19 | label_list.remove(0) 20 | for label in label_list: 21 | anno = (annos == label).astype(np.uint8) * 255 22 | dist = cv2.distanceTransform(anno, cv2.DIST_L2, 5, cv2.DIST_LABEL_PIXEL) 23 | _, _, _, center = cv2.minMaxLoc(dist) 24 | center_norm = (center[0] / w, center[1] / h) 25 | centers.append(center) 26 | centers_norm.append(center_norm) 27 | img = Image.fromarray(img) 28 | img = transform(img) 29 | return img, centers, centers_norm 30 | 31 | 32 | def load_img_davis(img_path, anno_path): 33 | imgs = os.listdir(img_path) 34 | out_pairs = {} 35 | for img in imgs: 36 | out_pairs[img.split(".")[0]] = {} 37 | frame = cv2.imread(os.path.join(img_path, img)) 38 | frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) 39 | out_pairs[img.split(".")[0]]["frame"] = frame 40 | anno = cv2.imread( 41 | os.path.join( 42 | anno_path, 43 | img.split(".")[0] + ".png", 44 | ) 45 | ) 46 | out_pairs[img.split(".")[0]]["label"] = anno 47 | return out_pairs 48 | 49 | 50 | def load_video_a2d(video_path, anno_path): 51 | annos = os.listdir(anno_path) 52 | out_pairs = {} 53 | 54 | for anno in annos: 55 | out_pairs[str(int(anno.split(".")[0]))] = {} 56 | ann_img = 
cv2.imread(os.path.join(anno_path, anno)) 57 | out_pairs[str(int(anno.split(".")[0]))]["label"] = ann_img 58 | 59 | cap = cv2.VideoCapture(video_path) 60 | idx = 0 61 | while True: 62 | ret = cap.grab() 63 | if not ret: 64 | break 65 | if str(idx + 1) in out_pairs.keys(): 66 | ret, frame = cap.retrieve() 67 | frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) 68 | out_pairs[str(idx + 1)]["frame"] = frame 69 | idx += 1 70 | cap.release() 71 | return out_pairs 72 | -------------------------------------------------------------------------------- /datasets/__init__.py: -------------------------------------------------------------------------------- 1 | import torch.utils.data 2 | import torchvision 3 | 4 | from .ytvos import build as build_ytvos 5 | from .davis import build as build_davis 6 | from .a2d import build as build_a2d 7 | from .jhmdb import build as build_jhmdb 8 | from .refexp import build as build_refexp 9 | from .concat_dataset import build as build_joint 10 | from .concat_dataset import build_coco as build_joint_coco 11 | from .concat_dataset import build_joint_ytb_dvs 12 | 13 | def get_coco_api_from_dataset(dataset): 14 | for _ in range(10): 15 | # if isinstance(dataset, torchvision.datasets.CocoDetection): 16 | # break 17 | if isinstance(dataset, torch.utils.data.Subset): 18 | dataset = dataset.dataset 19 | if isinstance(dataset, torchvision.datasets.CocoDetection): 20 | return dataset.coco 21 | 22 | 23 | def build_dataset(dataset_file: str, image_set: str, args): 24 | if dataset_file == 'ytvos': 25 | print("\n **** Start to build dataset {}. **** \n".format("build_ytvos")) 26 | return build_ytvos(image_set, args) 27 | if dataset_file == 'davis': 28 | print("\n **** Start to build dataset {}. **** \n".format("build_davis")) 29 | return build_davis(image_set, args) 30 | if dataset_file == 'a2d': 31 | print("\n **** Start to build dataset {}. **** \n".format("build_a2d")) 32 | return build_a2d(image_set, args) 33 | if dataset_file == 'jhmdb': 34 | print("\n **** Start to build dataset {}. **** \n".format("build_jhmdb")) 35 | return build_jhmdb(image_set, args) 36 | # for pretraining 37 | if dataset_file == "refcoco" or dataset_file == "refcoco+" or dataset_file == "refcocog": 38 | print("\n **** Start to build dataset {}. **** \n".format("build_refexp")) 39 | return build_refexp(dataset_file, image_set, args) 40 | 41 | # for joint training of refcoco and ytvos, not used. 42 | if dataset_file == 'joint': 43 | print("\n **** Start to build dataset {}. **** \n".format("build_joint")) 44 | return build_joint(image_set, args) 45 | if dataset_file == 'joint_coco': 46 | print("\n **** Start to build dataset {}. **** \n".format("build_joint_coco")) 47 | return build_joint_coco(image_set, args) 48 | if dataset_file == 'ytvos_joint_davis': 49 | print("\n **** Start to build dataset {}. 
**** \n".format("build_joint_ytb_dvs")) 50 | return build_joint_ytb_dvs(image_set, args) 51 | raise ValueError(f'dataset {dataset_file} not supported') 52 | -------------------------------------------------------------------------------- /models/modules.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from typing import Optional, List 5 | from torch import Tensor 6 | from einops import rearrange 7 | 8 | 9 | class LFMResizeAdaptive(nn.Module): 10 | def __init__(self, num_channels, sigma): 11 | super(LFMResizeAdaptive, self).__init__() 12 | self.conv1 = nn.Conv2d(2 * num_channels, 2 * num_channels, kernel_size=1, stride=1, padding=0) 13 | self.conv2 = nn.Conv2d(2 * num_channels, 2 * num_channels, kernel_size=1, stride=1, padding=0) 14 | self.sigma = sigma 15 | 16 | self.laplace = nn.Conv2d(num_channels, num_channels, kernel_size=3, padding=0) 17 | self.pool = nn.AdaptiveAvgPool2d(1) 18 | self.fc = nn.Sequential( 19 | nn.Linear(num_channels, num_channels, bias=False), 20 | nn.ReLU(inplace=True), 21 | nn.Linear(num_channels, 1, bias=False), 22 | nn.Sigmoid() 23 | ) 24 | 25 | def make_gaussian(self, y_idx, x_idx, height, width, sigma=7, device='cpu'): 26 | yv, xv = torch.meshgrid([torch.arange(0, height), torch.arange(0, width)]) 27 | 28 | yv = yv.unsqueeze(0).float().to(device) 29 | xv = xv.unsqueeze(0).float().to(device) 30 | g = torch.exp(- ((yv - y_idx) ** 2 + (xv - x_idx) ** 2) / (2 * sigma ** 2)) 31 | return g.unsqueeze(0) 32 | 33 | def forward(self, x, gauss_map=None): 34 | b, c, h, w = x.shape 35 | x = x.float() 36 | 37 | # compute coef for gaussian 0~1 38 | coef = self.laplace(x) 39 | coef = self.fc(self.pool(coef).view(b, c)).view(b, 1, 1, 1) 40 | 41 | y = torch.fft.fft2(x) 42 | 43 | h_idx, w_idx = h // 2, w // 2 44 | if gauss_map is None: 45 | high_filter = self.make_gaussian(h_idx, w_idx, h, w, self.sigma, device=x.device) 46 | else: 47 | high_filter = F.interpolate(gauss_map, size=(h, w), mode='bilinear', align_corners=False) 48 | 49 | y = y * (1 - coef * high_filter) 50 | 51 | y_imag = y.imag 52 | y_real = y.real 53 | y_f = torch.cat([y_real, y_imag], dim=1) 54 | y = F.relu(self.conv1(y_f)) 55 | 56 | y = self.conv2(y).float() 57 | y_real, y_imag = torch.chunk(y, 2, dim=1) 58 | y = torch.complex(y_real, y_imag) 59 | 60 | y = torch.fft.ifft2(y, s=(h, w)).float() 61 | return x + y, high_filter 62 | 63 | 64 | if __name__ == "__main__": 65 | model = LFMResizeAdaptive(256, 3) 66 | data = torch.rand(2,256,8,8) 67 | res = model(data) 68 | -------------------------------------------------------------------------------- /models/ops/setup.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | import os 10 | import glob 11 | 12 | import torch 13 | 14 | from torch.utils.cpp_extension import CUDA_HOME 15 | from torch.utils.cpp_extension import CppExtension 16 | from torch.utils.cpp_extension import CUDAExtension 17 | 18 | from setuptools import find_packages 19 | from setuptools import setup 20 | 21 | requirements = ["torch", "torchvision"] 22 | 23 | def get_extensions(): 24 | this_dir = os.path.dirname(os.path.abspath(__file__)) 25 | extensions_dir = os.path.join(this_dir, "src") 26 | 27 | main_file = glob.glob(os.path.join(extensions_dir, "*.cpp")) 28 | source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp")) 29 | source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu")) 30 | 31 | sources = main_file + source_cpu 32 | extension = CppExtension 33 | extra_compile_args = {"cxx": []} 34 | define_macros = [] 35 | 36 | if torch.cuda.is_available() and CUDA_HOME is not None: 37 | extension = CUDAExtension 38 | sources += source_cuda 39 | define_macros += [("WITH_CUDA", None)] 40 | extra_compile_args["nvcc"] = [ 41 | "-DCUDA_HAS_FP16=1", 42 | "-D__CUDA_NO_HALF_OPERATORS__", 43 | "-D__CUDA_NO_HALF_CONVERSIONS__", 44 | "-D__CUDA_NO_HALF2_OPERATORS__", 45 | ] 46 | else: 47 | raise NotImplementedError('Cuda is not availabel') 48 | 49 | sources = [os.path.join(extensions_dir, s) for s in sources] 50 | include_dirs = [extensions_dir] 51 | ext_modules = [ 52 | extension( 53 | "MultiScaleDeformableAttention", 54 | sources, 55 | include_dirs=include_dirs, 56 | define_macros=define_macros, 57 | extra_compile_args=extra_compile_args, 58 | ) 59 | ] 60 | return ext_modules 61 | 62 | setup( 63 | name="MultiScaleDeformableAttention", 64 | version="1.0", 65 | author="Weijie Su", 66 | url="https://github.com/fundamentalvision/Deformable-DETR", 67 | description="PyTorch Wrapper for CUDA Functions of Multi-Scale Deformable Attention", 68 | packages=find_packages(exclude=("configs", "tests",)), 69 | ext_modules=get_extensions(), 70 | cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension}, 71 | ) 72 | -------------------------------------------------------------------------------- /models/ops/functions/ms_deform_attn_func.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | from __future__ import absolute_import 10 | from __future__ import print_function 11 | from __future__ import division 12 | 13 | import torch 14 | import torch.nn.functional as F 15 | from torch.autograd import Function 16 | from torch.autograd.function import once_differentiable 17 | 18 | import MultiScaleDeformableAttention as MSDA 19 | 20 | 21 | class MSDeformAttnFunction(Function): 22 | @staticmethod 23 | def forward(ctx, value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, im2col_step): 24 | ctx.im2col_step = im2col_step 25 | output = MSDA.ms_deform_attn_forward( 26 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, ctx.im2col_step) 27 | ctx.save_for_backward(value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights) 28 | return output 29 | 30 | @staticmethod 31 | @once_differentiable 32 | def backward(ctx, grad_output): 33 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights = ctx.saved_tensors 34 | grad_value, grad_sampling_loc, grad_attn_weight = \ 35 | MSDA.ms_deform_attn_backward( 36 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, grad_output, ctx.im2col_step) 37 | 38 | return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None 39 | 40 | 41 | def ms_deform_attn_core_pytorch(value, value_spatial_shapes, sampling_locations, attention_weights): 42 | # for debug and test only, 43 | # need to use cuda version instead 44 | N_, S_, M_, D_ = value.shape 45 | _, Lq_, M_, L_, P_, _ = sampling_locations.shape 46 | value_list = value.split([H_ * W_ for H_, W_ in value_spatial_shapes], dim=1) 47 | sampling_grids = 2 * sampling_locations - 1 48 | sampling_value_list = [] 49 | for lid_, (H_, W_) in enumerate(value_spatial_shapes): 50 | # N_, H_*W_, M_, D_ -> N_, H_*W_, M_*D_ -> N_, M_*D_, H_*W_ -> N_*M_, D_, H_, W_ 51 | value_l_ = value_list[lid_].flatten(2).transpose(1, 2).reshape(N_*M_, D_, H_, W_) 52 | # N_, Lq_, M_, P_, 2 -> N_, M_, Lq_, P_, 2 -> N_*M_, Lq_, P_, 2 53 | sampling_grid_l_ = sampling_grids[:, :, :, lid_].transpose(1, 2).flatten(0, 1) 54 | # N_*M_, D_, Lq_, P_ 55 | sampling_value_l_ = F.grid_sample(value_l_, sampling_grid_l_, 56 | mode='bilinear', padding_mode='zeros', align_corners=False) 57 | sampling_value_list.append(sampling_value_l_) 58 | # (N_, Lq_, M_, L_, P_) -> (N_, M_, Lq_, L_, P_) -> (N_, M_, 1, Lq_, L_*P_) 59 | attention_weights = attention_weights.transpose(1, 2).reshape(N_*M_, 1, Lq_, L_*P_) 60 | output = (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights).sum(-1).view(N_, M_*D_, Lq_) 61 | return output.transpose(1, 2).contiguous() 62 | -------------------------------------------------------------------------------- /models/ops/build/lib.linux-x86_64-cpython-38/functions/ms_deform_attn_func.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. 
All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | from __future__ import absolute_import 10 | from __future__ import print_function 11 | from __future__ import division 12 | 13 | import torch 14 | import torch.nn.functional as F 15 | from torch.autograd import Function 16 | from torch.autograd.function import once_differentiable 17 | 18 | import MultiScaleDeformableAttention as MSDA 19 | 20 | 21 | class MSDeformAttnFunction(Function): 22 | @staticmethod 23 | def forward(ctx, value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, im2col_step): 24 | ctx.im2col_step = im2col_step 25 | output = MSDA.ms_deform_attn_forward( 26 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, ctx.im2col_step) 27 | ctx.save_for_backward(value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights) 28 | return output 29 | 30 | @staticmethod 31 | @once_differentiable 32 | def backward(ctx, grad_output): 33 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights = ctx.saved_tensors 34 | grad_value, grad_sampling_loc, grad_attn_weight = \ 35 | MSDA.ms_deform_attn_backward( 36 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, grad_output, ctx.im2col_step) 37 | 38 | return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None 39 | 40 | 41 | def ms_deform_attn_core_pytorch(value, value_spatial_shapes, sampling_locations, attention_weights): 42 | # for debug and test only, 43 | # need to use cuda version instead 44 | N_, S_, M_, D_ = value.shape 45 | _, Lq_, M_, L_, P_, _ = sampling_locations.shape 46 | value_list = value.split([H_ * W_ for H_, W_ in value_spatial_shapes], dim=1) 47 | sampling_grids = 2 * sampling_locations - 1 48 | sampling_value_list = [] 49 | for lid_, (H_, W_) in enumerate(value_spatial_shapes): 50 | # N_, H_*W_, M_, D_ -> N_, H_*W_, M_*D_ -> N_, M_*D_, H_*W_ -> N_*M_, D_, H_, W_ 51 | value_l_ = value_list[lid_].flatten(2).transpose(1, 2).reshape(N_*M_, D_, H_, W_) 52 | # N_, Lq_, M_, P_, 2 -> N_, M_, Lq_, P_, 2 -> N_*M_, Lq_, P_, 2 53 | sampling_grid_l_ = sampling_grids[:, :, :, lid_].transpose(1, 2).flatten(0, 1) 54 | # N_*M_, D_, Lq_, P_ 55 | sampling_value_l_ = F.grid_sample(value_l_, sampling_grid_l_, 56 | mode='bilinear', padding_mode='zeros', align_corners=False) 57 | sampling_value_list.append(sampling_value_l_) 58 | # (N_, Lq_, M_, L_, P_) -> (N_, M_, Lq_, L_, P_) -> (N_, M_, 1, Lq_, L_*P_) 59 | attention_weights = attention_weights.transpose(1, 2).reshape(N_*M_, 1, Lq_, L_*P_) 60 | output = (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights).sum(-1).view(N_, M_*D_, Lq_) 61 | return output.transpose(1, 2).contiguous() 62 | -------------------------------------------------------------------------------- /datasets/refexp_eval.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Aishwarya Kamath & Nicolas Carion. Licensed under the Apache License 2.0. 
All Rights Reserved 2 | import copy 3 | from collections import defaultdict 4 | from pathlib import Path 5 | 6 | import torch 7 | import torch.utils.data 8 | 9 | import util.misc as utils 10 | from util.box_ops import generalized_box_iou 11 | 12 | 13 | class RefExpEvaluator(object): 14 | def __init__(self, refexp_gt, iou_types, k=(1, 5, 10), thresh_iou=0.5): 15 | assert isinstance(k, (list, tuple)) 16 | refexp_gt = copy.deepcopy(refexp_gt) 17 | self.refexp_gt = refexp_gt 18 | self.iou_types = iou_types 19 | self.img_ids = self.refexp_gt.imgs.keys() 20 | self.predictions = {} 21 | self.k = k 22 | self.thresh_iou = thresh_iou 23 | 24 | def accumulate(self): 25 | pass 26 | 27 | def update(self, predictions): 28 | self.predictions.update(predictions) 29 | 30 | def synchronize_between_processes(self): 31 | all_predictions = utils.all_gather(self.predictions) 32 | merged_predictions = {} 33 | for p in all_predictions: 34 | merged_predictions.update(p) 35 | self.predictions = merged_predictions 36 | 37 | def summarize(self): 38 | if utils.is_main_process(): 39 | dataset2score = { 40 | "refcoco": {k: 0.0 for k in self.k}, 41 | "refcoco+": {k: 0.0 for k in self.k}, 42 | "refcocog": {k: 0.0 for k in self.k}, 43 | } 44 | dataset2count = {"refcoco": 0.0, "refcoco+": 0.0, "refcocog": 0.0} 45 | for image_id in self.img_ids: 46 | ann_ids = self.refexp_gt.getAnnIds(imgIds=image_id) 47 | assert len(ann_ids) == 1 48 | img_info = self.refexp_gt.loadImgs(image_id)[0] 49 | 50 | target = self.refexp_gt.loadAnns(ann_ids[0]) 51 | prediction = self.predictions[image_id] 52 | assert prediction is not None 53 | sorted_scores_boxes = sorted( 54 | zip(prediction["scores"].tolist(), prediction["boxes"].tolist()), reverse=True 55 | ) 56 | sorted_scores, sorted_boxes = zip(*sorted_scores_boxes) 57 | sorted_boxes = torch.cat([torch.as_tensor(x).view(1, 4) for x in sorted_boxes]) 58 | target_bbox = target[0]["bbox"] 59 | converted_bbox = [ 60 | target_bbox[0], 61 | target_bbox[1], 62 | target_bbox[2] + target_bbox[0], 63 | target_bbox[3] + target_bbox[1], 64 | ] 65 | giou = generalized_box_iou(sorted_boxes, torch.as_tensor(converted_bbox).view(-1, 4)) 66 | for k in self.k: 67 | if max(giou[:k]) >= self.thresh_iou: 68 | dataset2score[img_info["dataset_name"]][k] += 1.0 69 | dataset2count[img_info["dataset_name"]] += 1.0 70 | 71 | for key, value in dataset2score.items(): 72 | for k in self.k: 73 | try: 74 | value[k] /= dataset2count[key] 75 | except: 76 | pass 77 | results = {} 78 | for key, value in dataset2score.items(): 79 | results[key] = sorted([v for k, v in value.items()]) 80 | print(f" Dataset: {key} - Precision @ 1, 5, 10: {results[key]} \n") 81 | 82 | return results 83 | return None 84 | 85 | 86 | -------------------------------------------------------------------------------- /eval_davis.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import os 3 | import sys 4 | from time import time 5 | import argparse 6 | 7 | import numpy as np 8 | import pandas as pd 9 | from davis2017.evaluation import DAVISEvaluation 10 | 11 | default_davis_path = '../datasets/refer_davis/valid' 12 | 13 | time_start = time() 14 | parser = argparse.ArgumentParser() 15 | parser.add_argument('--davis_path', type=str, help='Path to the DAVIS folder containing the JPEGImages, Annotations, ' 16 | 'ImageSets, Annotations_unsupervised folders', 17 | required=False, default=default_davis_path) 18 | parser.add_argument('--set', type=str, help='Subset to evaluate the results', 
default='val') # val subset 19 | parser.add_argument('--task', type=str, help='Task to evaluate the results', default='unsupervised', 20 | choices=['semi-supervised', 'unsupervised']) 21 | parser.add_argument('--results_path', type=str, help='Path to the folder containing the sequences folders', 22 | required=True) 23 | args, _ = parser.parse_known_args() 24 | csv_name_global = f'global_results-{args.set}.csv' 25 | csv_name_per_sequence = f'per-sequence_results-{args.set}.csv' 26 | 27 | # Check if the method has been evaluated before, if so read the results, otherwise compute the results 28 | csv_name_global_path = os.path.join(args.results_path, csv_name_global) 29 | csv_name_per_sequence_path = os.path.join(args.results_path, csv_name_per_sequence) 30 | if os.path.exists(csv_name_global_path) and os.path.exists(csv_name_per_sequence_path): 31 | print('Using precomputed results...') 32 | table_g = pd.read_csv(csv_name_global_path) 33 | table_seq = pd.read_csv(csv_name_per_sequence_path) 34 | else: 35 | print(f'Evaluating sequences for the {args.task} task...') 36 | # Create dataset and evaluate 37 | dataset_eval = DAVISEvaluation(davis_root=args.davis_path, task=args.task, gt_set=args.set) 38 | metrics_res = dataset_eval.evaluate(args.results_path) 39 | J, F = metrics_res['J'], metrics_res['F'] 40 | 41 | # Generate dataframe for the general results 42 | g_measures = ['J&F-Mean', 'J-Mean', 'J-Recall', 'J-Decay', 'F-Mean', 'F-Recall', 'F-Decay'] 43 | final_mean = (np.mean(J["M"]) + np.mean(F["M"])) / 2. 44 | g_res = np.array([final_mean, np.mean(J["M"]), np.mean(J["R"]), np.mean(J["D"]), np.mean(F["M"]), np.mean(F["R"]), 45 | np.mean(F["D"])]) 46 | g_res = np.reshape(g_res, [1, len(g_res)]) 47 | table_g = pd.DataFrame(data=g_res, columns=g_measures) 48 | with open(csv_name_global_path, 'w') as f: 49 | table_g.to_csv(f, index=False, float_format="%.5f") 50 | print(f'Global results saved in {csv_name_global_path}') 51 | 52 | # Generate a dataframe for the per sequence results 53 | seq_names = list(J['M_per_object'].keys()) 54 | seq_measures = ['Sequence', 'J-Mean', 'F-Mean'] 55 | J_per_object = [J['M_per_object'][x] for x in seq_names] 56 | F_per_object = [F['M_per_object'][x] for x in seq_names] 57 | table_seq = pd.DataFrame(data=list(zip(seq_names, J_per_object, F_per_object)), columns=seq_measures) 58 | with open(csv_name_per_sequence_path, 'w') as f: 59 | table_seq.to_csv(f, index=False, float_format="%.5f") 60 | print(f'Per-sequence results saved in {csv_name_per_sequence_path}') 61 | 62 | # Print the results 63 | sys.stdout.write(f"--------------------------- Global results for {args.set} ---------------------------\n") 64 | print(table_g.to_string(index=False)) 65 | sys.stdout.write(f"\n---------- Per sequence results for {args.set} ----------\n") 66 | print(table_seq.to_string(index=False)) 67 | total_time = time() - time_start 68 | sys.stdout.write('\nTotal time:' + str(total_time)) 69 | -------------------------------------------------------------------------------- /models/text_encoder/text_encoder.py: -------------------------------------------------------------------------------- 1 | """ 2 | This file contains a wrapper for Video-Swin-Transformer so it can be properly used as a temporal encoder for MTTR. 
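The module defined below is the RoBERTa-based TextEncoder used by SgMg: it tokenizes the input referring expressions and returns token-level features, a pooled sentence-level feature, and the corresponding padding mask.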
3 | """ 4 | import torch 5 | import os 6 | from torch import nn, Tensor 7 | from einops import rearrange, repeat 8 | 9 | from transformers import RobertaModel, RobertaTokenizerFast 10 | from models.text_encoder.tokenizer import RobertaTokenizer 11 | 12 | import warnings 13 | warnings.filterwarnings("ignore") 14 | 15 | 16 | class FeatureResizer(nn.Module): 17 | def __init__(self, input_feat_size, output_feat_size, dropout, do_ln=True): 18 | super().__init__() 19 | self.do_ln = do_ln 20 | self.fc = nn.Linear(input_feat_size, output_feat_size, bias=True) 21 | self.layer_norm = nn.LayerNorm(output_feat_size, eps=1e-12) 22 | self.dropout = nn.Dropout(dropout) 23 | 24 | def forward(self, encoder_features): 25 | x = self.fc(encoder_features) 26 | if self.do_ln: 27 | x = self.layer_norm(x) 28 | output = self.dropout(x) 29 | return output 30 | 31 | 32 | class TextEncoder(nn.Module): 33 | def __init__(self, args): 34 | super(TextEncoder, self).__init__() 35 | self.args = args 36 | self.hidden_dim = args.hidden_dim 37 | self.text_backbone_name = args.text_backbone 38 | self.token_size = 32 39 | if self.text_backbone_name == "Roberta": 40 | # self.text_backbone = RobertaModel.from_pretrained("roberta-base") 41 | # self.text_backbone.pooler = None # this pooler is never used, this is a hack to avoid DDP problems... 42 | self.tokenizer = RobertaTokenizer() 43 | self.text_backbone = RobertaModel.from_pretrained("checkpoints/roberta-base") 44 | self.feat_dim = 768 45 | else: 46 | assert False, f'error: Text Encoder "{self.text_backbone_name}" is not supported' 47 | 48 | self.freeze_text_encoder = args.freeze_text_encoder 49 | if self.freeze_text_encoder: 50 | # self.text_backbone.eval() 51 | for p in self.text_backbone.parameters(): 52 | p.requires_grad_(False) 53 | for p in self.tokenizer.parameters(): 54 | p.requires_grad_(False) 55 | print("Use {} as text encoder. 
Freeze: {}".format(self.text_backbone_name, self.freeze_text_encoder)) 56 | 57 | self.target_len = None 58 | 59 | def forward(self, texts, device): 60 | if self.freeze_text_encoder: 61 | with torch.no_grad(): 62 | tokenized_queries = self.tokenizer(texts).to(device) 63 | if self.text_backbone_name == "Roberta": 64 | encoded_text = self.text_backbone(**tokenized_queries) 65 | text_pad_mask = tokenized_queries.attention_mask.ne(1).bool() 66 | text_features = encoded_text.last_hidden_state 67 | text_sentence_features = encoded_text.pooler_output 68 | else: 69 | raise NotImplementedError 70 | else: 71 | tokenized_queries = self.tokenizer(texts).to(device) 72 | if self.text_backbone_name == "Roberta": 73 | encoded_text = self.text_backbone(**tokenized_queries) 74 | text_pad_mask = tokenized_queries.attention_mask.ne(1).bool() 75 | text_features = encoded_text.last_hidden_state 76 | text_sentence_features = encoded_text.pooler_output 77 | else: 78 | raise NotImplementedError 79 | 80 | return text_features, text_sentence_features, text_pad_mask 81 | 82 | def num_parameters(self): 83 | return sum(p.numel() for p in self.parameters() if p.requires_grad) 84 | 85 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | 4 | 5 | def pre_trained_model_to_finetune(checkpoint, args): 6 | checkpoint = checkpoint['model'] 7 | # only delete the class_embed since the finetuned dataset has different num_classes 8 | num_layers = args.dec_layers + 1 if args.two_stage else args.dec_layers 9 | for l in range(num_layers): 10 | del checkpoint["class_embed.{}.weight".format(l)] 11 | del checkpoint["class_embed.{}.bias".format(l)] 12 | 13 | return checkpoint 14 | 15 | 16 | 17 | def colormap(rgb=False): 18 | color_list = np.array( 19 | [ 20 | 0.000, 0.447, 0.741, 21 | 0.850, 0.325, 0.098, 22 | 0.929, 0.694, 0.125, 23 | 0.494, 0.184, 0.556, 24 | 0.466, 0.674, 0.188, 25 | 0.301, 0.745, 0.933, 26 | 0.635, 0.078, 0.184, 27 | 0.300, 0.300, 0.300, 28 | 0.600, 0.600, 0.600, 29 | 1.000, 0.000, 0.000, 30 | 1.000, 0.500, 0.000, 31 | 0.749, 0.749, 0.000, 32 | 0.000, 1.000, 0.000, 33 | 0.000, 0.000, 1.000, 34 | 0.667, 0.000, 1.000, 35 | 0.333, 0.333, 0.000, 36 | 0.333, 0.667, 0.000, 37 | 0.333, 1.000, 0.000, 38 | 0.667, 0.333, 0.000, 39 | 0.667, 0.667, 0.000, 40 | 0.667, 1.000, 0.000, 41 | 1.000, 0.333, 0.000, 42 | 1.000, 0.667, 0.000, 43 | 1.000, 1.000, 0.000, 44 | 0.000, 0.333, 0.500, 45 | 0.000, 0.667, 0.500, 46 | 0.000, 1.000, 0.500, 47 | 0.333, 0.000, 0.500, 48 | 0.333, 0.333, 0.500, 49 | 0.333, 0.667, 0.500, 50 | 0.333, 1.000, 0.500, 51 | 0.667, 0.000, 0.500, 52 | 0.667, 0.333, 0.500, 53 | 0.667, 0.667, 0.500, 54 | 0.667, 1.000, 0.500, 55 | 1.000, 0.000, 0.500, 56 | 1.000, 0.333, 0.500, 57 | 1.000, 0.667, 0.500, 58 | 1.000, 1.000, 0.500, 59 | 0.000, 0.333, 1.000, 60 | 0.000, 0.667, 1.000, 61 | 0.000, 1.000, 1.000, 62 | 0.333, 0.000, 1.000, 63 | 0.333, 0.333, 1.000, 64 | 0.333, 0.667, 1.000, 65 | 0.333, 1.000, 1.000, 66 | 0.667, 0.000, 1.000, 67 | 0.667, 0.333, 1.000, 68 | 0.667, 0.667, 1.000, 69 | 0.667, 1.000, 1.000, 70 | 1.000, 0.000, 1.000, 71 | 1.000, 0.333, 1.000, 72 | 1.000, 0.667, 1.000, 73 | 0.167, 0.000, 0.000, 74 | 0.333, 0.000, 0.000, 75 | 0.500, 0.000, 0.000, 76 | 0.667, 0.000, 0.000, 77 | 0.833, 0.000, 0.000, 78 | 1.000, 0.000, 0.000, 79 | 0.000, 0.167, 0.000, 80 | 0.000, 0.333, 0.000, 81 | 0.000, 0.500, 0.000, 82 | 0.000, 0.667, 0.000, 83 | 0.000, 
0.833, 0.000, 84 | 0.000, 1.000, 0.000, 85 | 0.000, 0.000, 0.167, 86 | 0.000, 0.000, 0.333, 87 | 0.000, 0.000, 0.500, 88 | 0.000, 0.000, 0.667, 89 | 0.000, 0.000, 0.833, 90 | 0.000, 0.000, 1.000, 91 | 0.000, 0.000, 0.000, 92 | 0.143, 0.143, 0.143, 93 | 0.286, 0.286, 0.286, 94 | 0.429, 0.429, 0.429, 95 | 0.571, 0.571, 0.571, 96 | 0.714, 0.714, 0.714, 97 | 0.857, 0.857, 0.857, 98 | 1.000, 1.000, 1.000 99 | ] 100 | ).astype(np.float32) 101 | color_list = color_list.reshape((-1, 3)) * 255 102 | if not rgb: 103 | color_list = color_list[:, ::-1] 104 | return color_list -------------------------------------------------------------------------------- /datasets/categories.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------------------------- 2 | # 1. refer_youtube_vos 3 | ytvos_category_dict = { 4 | 'airplane': 0, 'ape': 1, 'bear': 2, 'bike': 3, 'bird': 4, 'boat': 5, 'bucket': 6, 'bus': 7, 'camel': 8, 'cat': 9, 5 | 'cow': 10, 'crocodile': 11, 'deer': 12, 'dog': 13, 'dolphin': 14, 'duck': 15, 'eagle': 16, 'earless_seal': 17, 6 | 'elephant': 18, 'fish': 19, 'fox': 20, 'frisbee': 21, 'frog': 22, 'giant_panda': 23, 'giraffe': 24, 'hand': 25, 7 | 'hat': 26, 'hedgehog': 27, 'horse': 28, 'knife': 29, 'leopard': 30, 'lion': 31, 'lizard': 32, 'monkey': 33, 8 | 'motorbike': 34, 'mouse': 35, 'others': 36, 'owl': 37, 'paddle': 38, 'parachute': 39, 'parrot': 40, 'penguin': 41, 9 | 'person': 42, 'plant': 43, 'rabbit': 44, 'raccoon': 45, 'sedan': 46, 'shark': 47, 'sheep': 48, 'sign': 49, 10 | 'skateboard': 50, 'snail': 51, 'snake': 52, 'snowboard': 53, 'squirrel': 54, 'surfboard': 55, 'tennis_racket': 56, 11 | 'tiger': 57, 'toilet': 58, 'train': 59, 'truck': 60, 'turtle': 61, 'umbrella': 62, 'whale': 63, 'zebra': 64 12 | } 13 | 14 | ytvos_category_list = [ 15 | 'airplane', 'ape', 'bear', 'bike', 'bird', 'boat', 'bucket', 'bus', 'camel', 'cat', 'cow', 'crocodile', 16 | 'deer', 'dog', 'dolphin', 'duck', 'eagle', 'earless_seal', 'elephant', 'fish', 'fox', 'frisbee', 'frog', 17 | 'giant_panda', 'giraffe', 'hand', 'hat', 'hedgehog', 'horse', 'knife', 'leopard', 'lion', 'lizard', 18 | 'monkey', 'motorbike', 'mouse', 'others', 'owl', 'paddle', 'parachute', 'parrot', 'penguin', 'person', 19 | 'plant', 'rabbit', 'raccoon', 'sedan', 'shark', 'sheep', 'sign', 'skateboard', 'snail', 'snake', 'snowboard', 20 | 'squirrel', 'surfboard', 'tennis_racket', 'tiger', 'toilet', 'train', 'truck', 'turtle', 'umbrella', 'whale', 'zebra' 21 | ] 22 | 23 | # ------------------------------------------------------------------------------------------------------------------- 24 | # 2. 
refer_davis17 25 | davis_category_dict = { 26 | 'airplane': 0, 'backpack': 1, 'ball': 2, 'bear': 3, 'bicycle': 4, 'bird': 5, 'boat': 6, 'bottle': 7, 'box': 8, 'bus': 9, 27 | 'camel': 10, 'car': 11, 'carriage': 12, 'cat': 13, 'cellphone': 14, 'chamaleon': 15, 'cow': 16, 'deer': 17, 'dog': 18, 28 | 'dolphin': 19, 'drone': 20, 'elephant': 21, 'excavator': 22, 'fish': 23, 'goat': 24, 'golf cart': 25, 'golf club': 26, 29 | 'grass': 27, 'guitar': 28, 'gun': 29, 'helicopter': 30, 'horse': 31, 'hoverboard': 32, 'kart': 33, 'key': 34, 'kite': 35, 30 | 'koala': 36, 'leash': 37, 'lion': 38, 'lock': 39, 'mask': 40, 'microphone': 41, 'monkey': 42, 'motorcycle': 43, 'oar': 44, 31 | 'paper': 45, 'paraglide': 46, 'person': 47, 'pig': 48, 'pole': 49, 'potted plant': 50, 'puck': 51, 'rack': 52, 'rhino': 53, 32 | 'rope': 54, 'sail': 55, 'scale': 56, 'scooter': 57, 'selfie stick': 58, 'sheep': 59, 'skateboard': 60, 'ski': 61, 'ski poles': 62, 33 | 'snake': 63, 'snowboard': 64, 'stick': 65, 'stroller': 66, 'surfboard': 67, 'swing': 68, 'tennis racket': 69, 'tractor': 70, 34 | 'trailer': 71, 'train': 72, 'truck': 73, 'turtle': 74, 'varanus': 75, 'violin': 76, 'wheelchair': 77 35 | } 36 | 37 | davis_category_list = [ 38 | 'airplane', 'backpack', 'ball', 'bear', 'bicycle', 'bird', 'boat', 'bottle', 'box', 'bus', 'camel', 'car', 'carriage', 39 | 'cat', 'cellphone', 'chamaleon', 'cow', 'deer', 'dog', 'dolphin', 'drone', 'elephant', 'excavator', 'fish', 'goat', 40 | 'golf cart', 'golf club', 'grass', 'guitar', 'gun', 'helicopter', 'horse', 'hoverboard', 'kart', 'key', 'kite', 'koala', 41 | 'leash', 'lion', 'lock', 'mask', 'microphone', 'monkey', 'motorcycle', 'oar', 'paper', 'paraglide', 'person', 'pig', 42 | 'pole', 'potted plant', 'puck', 'rack', 'rhino', 'rope', 'sail', 'scale', 'scooter', 'selfie stick', 'sheep', 'skateboard', 43 | 'ski', 'ski poles', 'snake', 'snowboard', 'stick', 'stroller', 'surfboard', 'swing', 'tennis racket', 'tractor', 'trailer', 44 | 'train', 'truck', 'turtle', 'varanus', 'violin', 'wheelchair' 45 | ] -------------------------------------------------------------------------------- /models/ops/test.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | from __future__ import absolute_import 10 | from __future__ import print_function 11 | from __future__ import division 12 | 13 | import time 14 | import torch 15 | import torch.nn as nn 16 | from torch.autograd import gradcheck 17 | 18 | from functions.ms_deform_attn_func import MSDeformAttnFunction, ms_deform_attn_core_pytorch 19 | 20 | 21 | N, M, D = 1, 2, 2 22 | Lq, L, P = 2, 2, 2 23 | shapes = torch.as_tensor([(6, 4), (3, 2)], dtype=torch.long).cuda() 24 | level_start_index = torch.cat((shapes.new_zeros((1, )), shapes.prod(1).cumsum(0)[:-1])) 25 | S = sum([(H*W).item() for H, W in shapes]) 26 | 27 | 28 | torch.manual_seed(3) 29 | 30 | 31 | @torch.no_grad() 32 | def check_forward_equal_with_pytorch_double(): 33 | value = torch.rand(N, S, M, D).cuda() * 0.01 34 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() 35 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 36 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) 37 | im2col_step = 2 38 | output_pytorch = ms_deform_attn_core_pytorch(value.double(), shapes, sampling_locations.double(), attention_weights.double()).detach().cpu() 39 | output_cuda = MSDeformAttnFunction.apply(value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step).detach().cpu() 40 | fwdok = torch.allclose(output_cuda, output_pytorch) 41 | max_abs_err = (output_cuda - output_pytorch).abs().max() 42 | max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max() 43 | 44 | print(f'* {fwdok} check_forward_equal_with_pytorch_double: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}') 45 | 46 | 47 | @torch.no_grad() 48 | def check_forward_equal_with_pytorch_float(): 49 | value = torch.rand(N, S, M, D).cuda() * 0.01 50 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() 51 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 52 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) 53 | im2col_step = 2 54 | output_pytorch = ms_deform_attn_core_pytorch(value, shapes, sampling_locations, attention_weights).detach().cpu() 55 | output_cuda = MSDeformAttnFunction.apply(value, shapes, level_start_index, sampling_locations, attention_weights, im2col_step).detach().cpu() 56 | fwdok = torch.allclose(output_cuda, output_pytorch, rtol=1e-2, atol=1e-3) 57 | max_abs_err = (output_cuda - output_pytorch).abs().max() 58 | max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max() 59 | 60 | print(f'* {fwdok} check_forward_equal_with_pytorch_float: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}') 61 | 62 | 63 | def check_gradient_numerical(channels=4, grad_value=True, grad_sampling_loc=True, grad_attn_weight=True): 64 | 65 | value = torch.rand(N, S, M, channels).cuda() * 0.01 66 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() 67 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 68 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) 69 | im2col_step = 2 70 | func = MSDeformAttnFunction.apply 71 | 72 | value.requires_grad = 
grad_value 73 | sampling_locations.requires_grad = grad_sampling_loc 74 | attention_weights.requires_grad = grad_attn_weight 75 | 76 | gradok = gradcheck(func, (value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step)) 77 | 78 | print(f'* {gradok} check_gradient_numerical(D={channels})') 79 | 80 | 81 | if __name__ == '__main__': 82 | check_forward_equal_with_pytorch_double() 83 | check_forward_equal_with_pytorch_float() 84 | 85 | for channels in [30, 32, 64, 71, 1025, 2048, 3096]: 86 | check_gradient_numerical(channels, True, True, True) 87 | 88 | 89 | 90 | -------------------------------------------------------------------------------- /datasets/a2d_eval.py: -------------------------------------------------------------------------------- 1 | """ 2 | This file contains implementations for the precision@k and IoU (mean, overall) evaluation metrics. 3 | copy-paste from https://github.com/mttr2021/MTTR/blob/main/metrics.py 4 | """ 5 | import torch 6 | from tqdm import tqdm 7 | from pycocotools.coco import COCO 8 | from pycocotools.mask import decode 9 | import numpy as np 10 | 11 | from torchvision.ops.boxes import box_area 12 | 13 | def compute_bbox_iou(boxes1: torch.Tensor, boxes2: torch.Tensor): 14 | # both boxes: xyxy 15 | area1 = box_area(boxes1) 16 | area2 = box_area(boxes2) 17 | 18 | lt = torch.max(boxes1[:, None, :2], boxes2[:, :2]) # [N,M,2] 19 | rb = torch.min(boxes1[:, None, 2:], boxes2[:, 2:]) # [N,M,2] 20 | 21 | wh = (rb - lt).clamp(min=0) # [N,M,2] 22 | inter = wh[:, :, 0] * wh[:, :, 1] # [N,M] 23 | 24 | union = area1[:, None] + area2 - inter 25 | 26 | iou = (inter+1e-6) / (union+1e-6) 27 | return iou, inter, union 28 | 29 | def compute_mask_iou(outputs: torch.Tensor, labels: torch.Tensor, EPS=1e-6): 30 | outputs = outputs.int() 31 | intersection = (outputs & labels).float().sum((1, 2)) # Will be zero if Truth=0 or Prediction=0 32 | union = (outputs | labels).float().sum((1, 2)) # Will be zero if both are 0 33 | iou = (intersection + EPS) / (union + EPS) # EPS is used to avoid division by zero 34 | return iou, intersection, union 35 | 36 | # mask 37 | def calculate_precision_at_k_and_iou_metrics(coco_gt: COCO, coco_pred: COCO): 38 | print('evaluating mask precision@k & iou metrics...') 39 | counters_by_iou = {iou: 0 for iou in [0.5, 0.6, 0.7, 0.8, 0.9]} 40 | total_intersection_area = 0 41 | total_union_area = 0 42 | ious_list = [] 43 | for instance in tqdm(coco_gt.imgs.keys()): # each image_id contains exactly one instance 44 | gt_annot = coco_gt.imgToAnns[instance][0] 45 | gt_mask = decode(gt_annot['segmentation']) 46 | pred_annots = coco_pred.imgToAnns[instance] 47 | pred_annot = sorted(pred_annots, key=lambda a: a['score'])[-1] # choose pred with highest score 48 | pred_mask = decode(pred_annot['segmentation']) 49 | iou, intersection, union = compute_mask_iou(torch.tensor(pred_mask).unsqueeze(0), 50 | torch.tensor(gt_mask).unsqueeze(0)) 51 | iou, intersection, union = iou.item(), intersection.item(), union.item() 52 | for iou_threshold in counters_by_iou.keys(): 53 | if iou > iou_threshold: 54 | counters_by_iou[iou_threshold] += 1 55 | total_intersection_area += intersection 56 | total_union_area += union 57 | ious_list.append(iou) 58 | num_samples = len(ious_list) 59 | precision_at_k = np.array(list(counters_by_iou.values())) / num_samples 60 | overall_iou = total_intersection_area / total_union_area 61 | mean_iou = np.mean(ious_list) 62 | return precision_at_k, overall_iou, mean_iou 63 | 64 | # bbox 65 | def 
calculate_bbox_precision_at_k_and_iou_metrics(coco_gt: COCO, coco_pred: COCO): 66 | print('evaluating bbox precision@k & iou metrics...') 67 | counters_by_iou = {iou: 0 for iou in [0.5, 0.6, 0.7, 0.8, 0.9]} 68 | total_intersection_area = 0 69 | total_union_area = 0 70 | ious_list = [] 71 | for instance in tqdm(coco_gt.imgs.keys()): # each image_id contains exactly one instance 72 | gt_annot = coco_gt.imgToAnns[instance][0] 73 | gt_bbox = gt_annot['bbox'] # xywh 74 | gt_bbox = [ 75 | gt_bbox[0], 76 | gt_bbox[1], 77 | gt_bbox[2] + gt_bbox[0], 78 | gt_bbox[3] + gt_bbox[1], 79 | ] 80 | pred_annots = coco_pred.imgToAnns[instance] 81 | pred_annot = sorted(pred_annots, key=lambda a: a['score'])[-1] # choose pred with highest score 82 | pred_bbox = pred_annot['bbox'] # xyxy 83 | iou, intersection, union = compute_bbox_iou(torch.tensor(pred_bbox).unsqueeze(0), 84 | torch.tensor(gt_bbox).unsqueeze(0)) 85 | iou, intersection, union = iou.item(), intersection.item(), union.item() 86 | for iou_threshold in counters_by_iou.keys(): 87 | if iou > iou_threshold: 88 | counters_by_iou[iou_threshold] += 1 89 | total_intersection_area += intersection 90 | total_union_area += union 91 | ious_list.append(iou) 92 | num_samples = len(ious_list) 93 | precision_at_k = np.array(list(counters_by_iou.values())) / num_samples 94 | overall_iou = total_intersection_area / total_union_area 95 | mean_iou = np.mean(ious_list) 96 | return precision_at_k, overall_iou, mean_iou 97 | -------------------------------------------------------------------------------- /util/box_ops.py: -------------------------------------------------------------------------------- 1 | """ 2 | Utilities for bounding box manipulation and GIoU. 3 | """ 4 | import torch 5 | from torchvision.ops.boxes import box_area 6 | 7 | def clip_iou(boxes1,boxes2): 8 | area1 = box_area(boxes1) 9 | area2 = box_area(boxes2) 10 | lt = torch.max(boxes1[:, :2], boxes2[:, :2]) 11 | rb = torch.min(boxes1[:, 2:], boxes2[:, 2:]) 12 | wh = (rb - lt).clamp(min=0) 13 | inter = wh[:,0] * wh[:,1] 14 | union = area1 + area2 - inter 15 | iou = (inter + 1e-6) / (union+1e-6) 16 | return iou 17 | 18 | def multi_iou(boxes1, boxes2): 19 | lt = torch.max(boxes1[...,:2], boxes2[...,:2]) 20 | rb = torch.min(boxes1[...,2:], boxes2[...,2:]) 21 | wh = (rb - lt).clamp(min=0) 22 | wh_1 = boxes1[...,2:] - boxes1[...,:2] 23 | wh_2 = boxes2[...,2:] - boxes2[...,:2] 24 | inter = wh[...,0] * wh[...,1] 25 | union = wh_1[...,0] * wh_1[...,1] + wh_2[...,0] * wh_2[...,1] - inter 26 | iou = (inter + 1e-6) / (union + 1e-6) 27 | return iou 28 | 29 | def box_cxcywh_to_xyxy(x): 30 | x_c, y_c, w, h = x.unbind(-1) 31 | b = [(x_c - 0.5 * w), (y_c - 0.5 * h), 32 | (x_c + 0.5 * w), (y_c + 0.5 * h)] 33 | return torch.stack(b, dim=-1) 34 | 35 | 36 | def box_xyxy_to_cxcywh(x): 37 | x0, y0, x1, y1 = x.unbind(-1) 38 | b = [(x0 + x1) / 2, (y0 + y1) / 2, 39 | (x1 - x0), (y1 - y0)] 40 | return torch.stack(b, dim=-1) 41 | 42 | 43 | # modified from torchvision to also return the union 44 | def box_iou(boxes1, boxes2): 45 | area1 = box_area(boxes1) 46 | area2 = box_area(boxes2) 47 | 48 | lt = torch.max(boxes1[:, None, :2], boxes2[:, :2]) # [N,M,2] 49 | rb = torch.min(boxes1[:, None, 2:], boxes2[:, 2:]) # [N,M,2] 50 | 51 | wh = (rb - lt).clamp(min=0) # [N,M,2] 52 | inter = wh[:, :, 0] * wh[:, :, 1] # [N,M] 53 | 54 | union = area1[:, None] + area2 - inter 55 | 56 | iou = (inter+1e-6) / (union+1e-6) 57 | return iou, union 58 | 59 | 60 | def generalized_box_iou(boxes1, boxes2): 61 | """ 62 | Generalized IoU from 
https://giou.stanford.edu/ 63 | 64 | The boxes should be in [x0, y0, x1, y1] format 65 | 66 | Returns a [N, M] pairwise matrix, where N = len(boxes1) 67 | and M = len(boxes2) 68 | """ 69 | # degenerate boxes gives inf / nan results 70 | # so do an early check 71 | # if not (boxes1[:, 2:] >= boxes1[:, :2]).all(): 72 | # for i in range(boxes1.shape[0]): 73 | # if not (boxes1[i, 2:] >= boxes1[i, :2]).all(): 74 | # boxes1[i] = torch.zeros_like(boxes1[i]) 75 | assert (boxes1[:, 2:] >= boxes1[:, :2]).all(), "error boxes: {} vs {}.".format(boxes1, boxes2) 76 | assert (boxes2[:, 2:] >= boxes2[:, :2]).all(), "error boxes: {} vs {}.".format(boxes1, boxes2) 77 | iou, union = box_iou(boxes1, boxes2) 78 | 79 | lt = torch.min(boxes1[:, None, :2], boxes2[:, :2]) 80 | rb = torch.max(boxes1[:, None, 2:], boxes2[:, 2:]) 81 | 82 | wh = (rb - lt).clamp(min=0) # [N,M,2] 83 | area = wh[:, :, 0] * wh[:, :, 1] 84 | 85 | return iou - ((area - union) + 1e-6) / (area + 1e-6) 86 | 87 | 88 | def masks_to_boxes(masks): 89 | """Compute the bounding boxes around the provided masks 90 | 91 | The masks should be in format [N, H, W] where N is the number of masks, (H, W) are the spatial dimensions. 92 | 93 | Returns a [N, 4] tensors, with the boxes in xyxy format 94 | """ 95 | if masks.numel() == 0: 96 | return torch.zeros((0, 4), device=masks.device) 97 | 98 | h, w = masks.shape[-2:] 99 | 100 | y = torch.arange(0, h, dtype=torch.float) 101 | x = torch.arange(0, w, dtype=torch.float) 102 | y, x = torch.meshgrid(y, x) 103 | 104 | x_mask = (masks * x.unsqueeze(0)) 105 | x_max = x_mask.flatten(1).max(-1)[0] 106 | x_min = x_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0] 107 | 108 | y_mask = (masks * y.unsqueeze(0)) 109 | y_max = y_mask.flatten(1).max(-1)[0] 110 | y_min = y_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0] 111 | 112 | return torch.stack([x_min, y_min, x_max, y_max], 1) 113 | 114 | def center_of_mass(bitmasks): 115 | _, h, w = bitmasks.size() 116 | 117 | ys = torch.arange(0, h, dtype=torch.float32, device=bitmasks.device) 118 | xs = torch.arange(0, w, dtype=torch.float32, device=bitmasks.device) 119 | 120 | m00 = bitmasks.sum(dim=-1).sum(dim=-1).clamp(min=1e-6) 121 | m10 = (bitmasks * xs).sum(dim=-1).sum(dim=-1) 122 | m01 = (bitmasks * ys[:, None]).sum(dim=-1).sum(dim=-1) 123 | center_x = m10 / m00 124 | center_y = m01 / m00 125 | return center_x, center_y 126 | -------------------------------------------------------------------------------- /util/logger.py: -------------------------------------------------------------------------------- 1 | """ 2 | Dumps things to tensorboard and console 3 | """ 4 | 5 | import os 6 | import warnings 7 | # import git 8 | 9 | import torchvision.transforms as transforms 10 | from torch.utils.tensorboard import SummaryWriter 11 | 12 | 13 | def tensor_to_numpy(image): 14 | image_np = (image.numpy() * 255).astype('uint8') 15 | return image_np 16 | 17 | def detach_to_cpu(x): 18 | return x.detach().cpu() 19 | 20 | def fix_width_trunc(x): 21 | return ('{:.9s}'.format('{:0.9f}'.format(x))) 22 | 23 | class TensorboardLogger: 24 | def __init__(self, short_id, id, local_rank): 25 | self.short_id = short_id 26 | if self.short_id == 'NULL': 27 | self.short_id = 'DEBUG' 28 | 29 | if id is None: 30 | self.no_log = True 31 | warnings.warn('Logging has been disbaled.') 32 | else: 33 | self.no_log = False 34 | 35 | self.inv_im_trans = transforms.Normalize( 36 | mean=[-0.485/0.229, -0.456/0.224, -0.406/0.225], 37 | std=[1/0.229, 1/0.224, 1/0.225]) 38 | 39 | 
self.inv_seg_trans = transforms.Normalize( 40 | mean=[-0.5/0.5], 41 | std=[1/0.5]) 42 | 43 | log_path = os.path.join('..', 'log', '%s' % id) 44 | os.makedirs(log_path, exist_ok=True) 45 | self.logger = SummaryWriter(log_path) 46 | 47 | self.local_rank = local_rank 48 | self.values = {} 49 | self.counts = {} 50 | 51 | def log_scalar(self, tag, x, step): 52 | if self.no_log: 53 | warnings.warn('Logging has been disabled.') 54 | return 55 | self.logger.add_scalar(tag, x, step) 56 | 57 | def log_metrics(self, l1_tag, l2_tag, val, step, f=None): 58 | tag = l1_tag + '/' + l2_tag 59 | text = '{:s} - It {:6d} [{:5s}] [{:13}]: {:s}'.format(self.short_id, step, l1_tag.upper(), l2_tag, fix_width_trunc(val)) 60 | if f is not None: 61 | f.write(text + '\n') 62 | f.flush() 63 | self.log_scalar(tag, val, step) 64 | 65 | def log_im(self, tag, x, step): 66 | if self.no_log: 67 | warnings.warn('Logging has been disabled.') 68 | return 69 | x = detach_to_cpu(x) 70 | x = self.inv_im_trans(x) 71 | x = tensor_to_numpy(x) 72 | self.logger.add_image(tag, x, step) 73 | 74 | def log_cv2(self, tag, x, step): 75 | if self.no_log: 76 | warnings.warn('Logging has been disabled.') 77 | return 78 | x = x.transpose((2, 0, 1)) 79 | self.logger.add_image(tag, x, step) 80 | 81 | def log_seg(self, tag, x, step): 82 | if self.no_log: 83 | warnings.warn('Logging has been disabled.') 84 | return 85 | x = detach_to_cpu(x) 86 | x = self.inv_seg_trans(x) 87 | x = tensor_to_numpy(x) 88 | self.logger.add_image(tag, x, step) 89 | 90 | def log_gray(self, tag, x, step): 91 | if self.no_log: 92 | warnings.warn('Logging has been disabled.') 93 | return 94 | x = detach_to_cpu(x) 95 | x = tensor_to_numpy(x) 96 | self.logger.add_image(tag, x, step) 97 | 98 | def log_string(self, tag, x): 99 | print(tag, x) 100 | if self.no_log: 101 | warnings.warn('Logging has been disabled.') 102 | return 103 | self.logger.add_text(tag, x) 104 | 105 | def add_dict(self, tensor_dict, itr): 106 | for k, v in tensor_dict.items(): 107 | self.add_tensor(k, v, itr) 108 | 109 | def add_tensor(self, key, tensor, itr): 110 | if len(key.split("_")) == 3: 111 | self.log_scalar("sublayer_loss/" + key, tensor, itr) 112 | else: 113 | self.log_scalar("main_loss/" + key, tensor, itr) 114 | 115 | 116 | # def add_tensor(self, key, tensor, itr): 117 | # if key not in self.values: 118 | # self.counts[key] = 1 119 | # if type(tensor) == float or type(tensor) == int: 120 | # self.values[key] = tensor 121 | # else: 122 | # self.values[key] = tensor.mean().item() 123 | # else: 124 | # self.counts[key] += 1 125 | # if type(tensor) == float or type(tensor) == int: 126 | # self.values[key] += tensor 127 | # else: 128 | # self.values[key] += tensor.mean().item() 129 | # 130 | # for k, v in self.values.items(): 131 | # if len(k.split("_")) == 3: 132 | # self.log_scalar("sublayer_loss/" + k, v, itr) 133 | # else: 134 | # self.log_scalar("main_loss/"+k, v, itr) -------------------------------------------------------------------------------- /datasets/image_to_seq_augmenter.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Modified from SeqFormer (https://github.com/wjf5203/SeqFormer) 3 | # ------------------------------------------------------------------------ 4 | # Modified from STEm-Seg (https://github.com/sabarim/STEm-Seg) 5 | # ------------------------------------------------------------------------ 6 | 7 | 8 | import imgaug 9 | import imgaug.augmenters as iaa 10 | 
import numpy as np 11 | 12 | from datetime import datetime 13 | 14 | from imgaug.augmentables.segmaps import SegmentationMapsOnImage 15 | from imgaug.augmentables.bbs import BoundingBox, BoundingBoxesOnImage 16 | 17 | 18 | class ImageToSeqAugmenter(object): 19 | def __init__(self, perspective=True, affine=True, motion_blur=True, 20 | brightness_range=(-50, 50), hue_saturation_range=(-15, 15), perspective_magnitude=0.12, 21 | scale_range=1.0, translate_range={"x": (-0.15, 0.15), "y": (-0.15, 0.15)}, rotation_range=(-20, 20), 22 | motion_blur_kernel_sizes=(7, 9), motion_blur_prob=0.5): 23 | 24 | self.basic_augmenter = iaa.SomeOf((1, None), [ 25 | iaa.Add(brightness_range), 26 | iaa.AddToHueAndSaturation(hue_saturation_range) 27 | ] 28 | ) 29 | 30 | transforms = [] 31 | if perspective: 32 | transforms.append(iaa.PerspectiveTransform(perspective_magnitude)) 33 | if affine: 34 | transforms.append(iaa.Affine(scale=scale_range, 35 | translate_percent=translate_range, 36 | rotate=rotation_range, 37 | order=1, # cv2.INTER_LINEAR 38 | backend='auto')) 39 | transforms = iaa.Sequential(transforms) 40 | transforms = [transforms] 41 | 42 | if motion_blur: 43 | blur = iaa.Sometimes(motion_blur_prob, iaa.OneOf( 44 | [ 45 | iaa.MotionBlur(ksize) 46 | for ksize in motion_blur_kernel_sizes 47 | ] 48 | )) 49 | transforms.append(blur) 50 | 51 | self.frame_shift_augmenter = iaa.Sequential(transforms) 52 | 53 | @staticmethod 54 | def condense_masks(instance_masks): 55 | condensed_mask = np.zeros_like(instance_masks[0], dtype=np.int8) 56 | for instance_id, mask in enumerate(instance_masks, 1): 57 | condensed_mask = np.where(mask, instance_id, condensed_mask) 58 | 59 | return condensed_mask 60 | 61 | @staticmethod 62 | def expand_masks(condensed_mask, num_instances): 63 | return [(condensed_mask == instance_id).astype(np.uint8) for instance_id in range(1, num_instances + 1)] 64 | 65 | def __call__(self, image, masks=None, boxes=None): 66 | det_augmenter = self.frame_shift_augmenter.to_deterministic() 67 | 68 | 69 | if masks is not None: 70 | masks_np, is_binary_mask = [], [] 71 | boxs_np = [] 72 | 73 | for mask in masks: 74 | 75 | if isinstance(mask, np.ndarray): 76 | masks_np.append(mask.astype(np.bool)) 77 | is_binary_mask.append(False) 78 | else: 79 | raise ValueError("Invalid mask type: {}".format(type(mask))) 80 | 81 | num_instances = len(masks_np) 82 | masks_np = SegmentationMapsOnImage(self.condense_masks(masks_np), shape=image.shape[:2]) 83 | # boxs_np = BoundingBoxesOnImage(boxs_np, shape=image.shape[:2]) 84 | 85 | seed = int(datetime.now().strftime('%M%S%f')[-8:]) 86 | imgaug.seed(seed) 87 | aug_image, aug_masks = det_augmenter(image=self.basic_augmenter(image=image) , segmentation_maps=masks_np) 88 | imgaug.seed(seed) 89 | # invalid_pts_mask = det_augmenter(image=np.ones(image.shape[:2] + (1,), np.uint8)).squeeze(2) 90 | aug_masks = self.expand_masks(aug_masks.get_arr(), num_instances) 91 | # aug_boxes = aug_boxes.remove_out_of_image().clip_out_of_image() 92 | aug_masks = [mask for mask, is_bm in zip(aug_masks, is_binary_mask)] 93 | # (427, 640, 3) (427, 640) 94 | return aug_image, aug_masks #, aug_boxes.to_xyxy_array() 95 | 96 | else: 97 | # if no mask is provided, random generate and delete the mask. 
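# Sketch of what the branch below does: a dummy all-ones segmentation map is warped
# with the same deterministic augmenter, so output pixels that the original image
# does not cover come back as 0; the second return value is therefore a boolean
# mask of invalid points. (Note: np.bool used below is deprecated in recent NumPy
# releases, so bool / np.bool_ may be needed instead.)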
98 | masks = [SegmentationMapsOnImage(np.ones(image.shape[:2], np.bool), shape=image.shape[:2])] 99 | aug_image, invalid_pts_mask = det_augmenter(image=image, segmentation_maps=masks) 100 | return aug_image, invalid_pts_mask.get_arr() == 0 101 | -------------------------------------------------------------------------------- /models/backbone.py: -------------------------------------------------------------------------------- 1 | """ 2 | Backbone modules. 3 | Modified from DETR (https://github.com/facebookresearch/detr) 4 | """ 5 | from collections import OrderedDict 6 | 7 | import torch 8 | import torch.nn.functional as F 9 | import torchvision 10 | from torch import nn 11 | from torchvision.models._utils import IntermediateLayerGetter 12 | from typing import Dict, List 13 | from einops import rearrange 14 | 15 | from util.misc import NestedTensor, is_main_process 16 | 17 | from .position_encoding import build_position_encoding 18 | 19 | 20 | class FrozenBatchNorm2d(torch.nn.Module): 21 | """ 22 | BatchNorm2d where the batch statistics and the affine parameters are fixed. 23 | 24 | Copy-paste from torchvision.misc.ops with added eps before rqsrt, 25 | without which any other models than torchvision.models.resnet[18,34,50,101] 26 | produce nans. 27 | """ 28 | 29 | def __init__(self, n): 30 | super(FrozenBatchNorm2d, self).__init__() 31 | self.register_buffer("weight", torch.ones(n)) 32 | self.register_buffer("bias", torch.zeros(n)) 33 | self.register_buffer("running_mean", torch.zeros(n)) 34 | self.register_buffer("running_var", torch.ones(n)) 35 | 36 | def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, 37 | missing_keys, unexpected_keys, error_msgs): 38 | num_batches_tracked_key = prefix + 'num_batches_tracked' 39 | if num_batches_tracked_key in state_dict: 40 | del state_dict[num_batches_tracked_key] 41 | 42 | super(FrozenBatchNorm2d, self)._load_from_state_dict( 43 | state_dict, prefix, local_metadata, strict, 44 | missing_keys, unexpected_keys, error_msgs) 45 | 46 | def forward(self, x): 47 | # move reshapes to the beginning 48 | # to make it fuser-friendly 49 | w = self.weight.reshape(1, -1, 1, 1) 50 | b = self.bias.reshape(1, -1, 1, 1) 51 | rv = self.running_var.reshape(1, -1, 1, 1) 52 | rm = self.running_mean.reshape(1, -1, 1, 1) 53 | eps = 1e-5 54 | scale = w * (rv + eps).rsqrt() 55 | bias = b - rm * scale 56 | return x * scale + bias 57 | 58 | 59 | class BackboneBase(nn.Module): 60 | 61 | def __init__(self, backbone: nn.Module, train_backbone: bool, return_interm_layers: bool): 62 | super().__init__() 63 | for name, parameter in backbone.named_parameters(): 64 | if not train_backbone or 'layer2' not in name and 'layer3' not in name and 'layer4' not in name: 65 | parameter.requires_grad_(False) 66 | if return_interm_layers: 67 | return_layers = {"layer1": "0", "layer2": "1", "layer3": "2", "layer4": "3"} 68 | # return_layers = {"layer2": "0", "layer3": "1", "layer4": "2"} deformable detr 69 | self.strides = [4, 8, 16, 32] 70 | self.num_channels = [256, 512, 1024, 2048] 71 | else: 72 | return_layers = {'layer4': "0"} 73 | self.strides = [32] 74 | self.num_channels = [2048] 75 | self.body = IntermediateLayerGetter(backbone, return_layers=return_layers) 76 | 77 | def forward(self, tensor_list: NestedTensor): 78 | xs = self.body(tensor_list.tensors) 79 | out: Dict[str, NestedTensor] = {} 80 | for name, x in xs.items(): 81 | m = tensor_list.mask 82 | assert m is not None 83 | mask = F.interpolate(m[None].float(), size=x.shape[-2:]).to(torch.bool)[0] 84 | 
out[name] = NestedTensor(x, mask) 85 | return out 86 | 87 | 88 | class Backbone(BackboneBase): 89 | """ResNet backbone with frozen BatchNorm.""" 90 | def __init__(self, name: str, 91 | train_backbone: bool, 92 | return_interm_layers: bool, 93 | dilation: bool): # True 94 | backbone = getattr(torchvision.models, name)( 95 | replace_stride_with_dilation=[False, False, dilation], 96 | pretrained=is_main_process(), norm_layer=FrozenBatchNorm2d) 97 | assert name not in ('resnet18', 'resnet34'), "number of channels are hard coded" 98 | super().__init__(backbone, train_backbone, return_interm_layers) 99 | if dilation: 100 | self.strides[-1] = self.strides[-1] // 2 101 | 102 | 103 | class Joiner(nn.Sequential): 104 | def __init__(self, backbone, position_embedding): 105 | super().__init__(backbone, position_embedding) 106 | self.strides = backbone.strides 107 | self.num_channels = backbone.num_channels 108 | 109 | 110 | def forward(self, tensor_list: NestedTensor): 111 | tensor_list.tensors = rearrange(tensor_list.tensors, 'b t c h w -> (b t) c h w') 112 | tensor_list.mask = rearrange(tensor_list.mask, 'b t h w -> (b t) h w') 113 | 114 | xs = self[0](tensor_list) 115 | out: List[NestedTensor] = [] 116 | pos = [] 117 | for name, x in xs.items(): 118 | out.append(x) 119 | # position encoding 120 | pos.append(self[1](x).to(x.tensors.dtype)) 121 | return out, pos 122 | 123 | 124 | def build_backbone(args): 125 | position_embedding = build_position_encoding(args) 126 | train_backbone = args.lr_backbone > 0 127 | return_interm_layers = args.masks or (args.num) 128 | backbone = Backbone(args.backbone, train_backbone, return_interm_layers, args.dilation) 129 | model = Joiner(backbone, position_embedding) 130 | model.num_channels = backbone.num_channels 131 | return model 132 | 133 | -------------------------------------------------------------------------------- /pre_process/generate_anno_ytvos.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import cv2 4 | import torch 5 | import torch.nn.functional as F 6 | import h5py 7 | 8 | from data import img_transform, load_img_davis, load_video_a2d 9 | from sim_model import SimModel 10 | import numpy as np 11 | from torchvision import transforms 12 | from tqdm import tqdm 13 | import json 14 | from PIL import Image 15 | 16 | transform = transforms.Compose( 17 | [ 18 | transforms.ToTensor(), 19 | transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]), 20 | ] 21 | ) 22 | 23 | def bounding_box(img): 24 | rows = np.any(img, axis=1) 25 | cols = np.any(img, axis=0) 26 | rmin, rmax = np.where(rows)[0][[0, -1]] 27 | cmin, cmax = np.where(cols)[0][[0, -1]] 28 | return rmin, rmax, cmin, cmax # y1, y2, x1, x2 29 | 30 | def transform_anno_to_each_frame(meta_path, exp_meta_path): 31 | anno_dict = json.load(open(meta_path)) 32 | exp_dict = json.load(open(exp_meta_path)) 33 | # annos = anno_dict['videos'] 34 | annos = exp_dict['videos'] 35 | annos_out = {} 36 | for vid in annos.keys(): 37 | if vid not in annos_out.keys(): 38 | annos_out[vid] = {} 39 | obj_ids = [] 40 | for exp_info_id in annos[vid]['expressions'].keys(): 41 | obj_id = annos[vid]['expressions'][exp_info_id]['obj_id'] 42 | if obj_id not in obj_ids: 43 | obj_ids.append(obj_id) 44 | for frame_id in annos[vid]['frames']: 45 | if frame_id not in annos_out[vid].keys(): 46 | annos_out[vid][frame_id] = [] 47 | annos_out[vid][frame_id] = obj_ids 48 | return annos_out 49 | 50 | @ torch.no_grad() 51 | def generate_mask(anno_dict, video_path, anno_path, 
save_path, model, cuda=True): 52 | for vid in tqdm(anno_dict.keys()): 53 | video_save_path = os.path.join(save_path, vid) 54 | if not os.path.exists(video_save_path): 55 | os.makedirs(video_save_path) 56 | for frame_id in anno_dict[vid].keys(): 57 | if not os.path.exists(os.path.join(video_save_path, "{}.h5".format(frame_id))): 58 | obj_ids = anno_dict[vid][frame_id] 59 | frame = Image.open(os.path.join(video_path, vid, frame_id+'.jpg')).convert('RGB') 60 | mask = Image.open(os.path.join(anno_path, vid, frame_id+'.png')).convert('P') 61 | frame = transform(frame) 62 | mask = np.array(mask) 63 | h, w = mask.shape 64 | 65 | centers = [] 66 | bboxes = [] 67 | centers_norm = [] 68 | instance_valid = [] 69 | obj_ids = [int(id) for id in obj_ids] 70 | for obj_id in obj_ids: 71 | mask_cur = ((mask==obj_id) * 255).astype(np.uint8) 72 | if (mask_cur > 0).any(): 73 | dist = cv2.distanceTransform( 74 | mask_cur, cv2.DIST_L2, 5, cv2.DIST_LABEL_PIXEL 75 | ) 76 | _, _, _, center = cv2.minMaxLoc(dist) 77 | center_norm = (center[0] / w, center[1] / h) 78 | y1, y2, x1, x2 = bounding_box(mask_cur) 79 | bbox = np.array([x1, y1, x2, y2]) 80 | bbox[0::2] = np.clip(bbox[0::2], 0, w) 81 | bbox[1::2] = np.clip(bbox[1::2], 0, h) 82 | bboxes.append(bbox) 83 | centers.append(center) 84 | centers_norm.append(center_norm) 85 | instance_valid.append(1) 86 | else: 87 | bboxes.append(np.array([0, 0, 0, 0])) 88 | centers.append([0, 0]) 89 | centers_norm.append([0, 0]) 90 | instance_valid.append(0) 91 | if cuda: 92 | frame = frame.cuda() 93 | 94 | masks_point = model(frame[None], centers_norm, instance_valid, "point") 95 | masks_bbox = model(frame[None], bboxes, instance_valid, "bbox") 96 | masks_point = masks_point[0].cpu().numpy() 97 | masks_bbox = masks_bbox[0].cpu().numpy() 98 | out_annos = h5py.File( 99 | os.path.join(video_save_path, "{}.h5".format(frame_id)), "w" 100 | ) 101 | out_annos.create_dataset("obj_ids", data=obj_ids) 102 | out_annos.create_dataset("heatBBox", data=masks_bbox) 103 | out_annos.create_dataset("heatPoint", data=masks_point) 104 | out_annos.create_dataset("centerPoint", data=centers) 105 | out_annos.close() 106 | 107 | 108 | 109 | if __name__ == "__main__": 110 | video_path = "/media/HardDisk_B/Users/wx/wwk_files/datasets/referring_video_segmentation/Refer-YouTube-VOS/train/JPEGImages/" 111 | anno_path = "/media/HardDisk_B/Users/wx/wwk_files/datasets/referring_video_segmentation/Refer-YouTube-VOS/train/Annotations/" 112 | save_path = "./anno_weak/ref-youtube-vos/train/AnnotationsWeakly/" 113 | meta_path = "/media/HardDisk_B/Users/wx/wwk_files/datasets/referring_video_segmentation/Refer-YouTube-VOS/train/meta.json" 114 | exp_meta_path = "/media/HardDisk_B/Users/wx/wwk_files/datasets/referring_video_segmentation/Refer-YouTube-VOS/meta_expressions/train/meta_expressions.json" 115 | dilation = False 116 | cuda = True 117 | 118 | model = SimModel("resnet101", dilation) 119 | if cuda: 120 | model.cuda() 121 | 122 | annos_by_frame = transform_anno_to_each_frame(meta_path, exp_meta_path) 123 | generate_mask(annos_by_frame, video_path, anno_path, save_path, model, cuda) 124 | -------------------------------------------------------------------------------- /datasets/coco.py: -------------------------------------------------------------------------------- 1 | """ 2 | COCO dataset which returns image_id for evaluation. 
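When return_masks is set, the COCO polygon annotations are additionally decoded into per-instance binary masks (see convert_coco_poly_to_mask / ConvertCocoPolysToMask below).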
3 | 4 | Mostly copy-paste from https://github.com/pytorch/vision/blob/13b35ff/references/detection/coco_utils.py 5 | """ 6 | from pathlib import Path 7 | 8 | import torch 9 | import torch.utils.data 10 | import torchvision 11 | from pycocotools import mask as coco_mask 12 | 13 | import datasets.transforms as T 14 | 15 | 16 | class CocoDetection(torchvision.datasets.CocoDetection): 17 | def __init__(self, img_folder, ann_file, transforms, return_masks): 18 | super(CocoDetection, self).__init__(img_folder, ann_file) 19 | self._transforms = transforms 20 | self.prepare = ConvertCocoPolysToMask(return_masks) 21 | 22 | def __getitem__(self, idx): 23 | img, target = super(CocoDetection, self).__getitem__(idx) 24 | image_id = self.ids[idx] 25 | target = {'image_id': image_id, 'annotations': target} 26 | 27 | img, target = self.prepare(img, target) 28 | if self._transforms is not None: 29 | img, target = self._transforms(img, target) 30 | return img, target 31 | 32 | 33 | def convert_coco_poly_to_mask(segmentations, height, width): 34 | masks = [] 35 | for polygons in segmentations: 36 | rles = coco_mask.frPyObjects(polygons, height, width) 37 | mask = coco_mask.decode(rles) 38 | if len(mask.shape) < 3: 39 | mask = mask[..., None] 40 | mask = torch.as_tensor(mask, dtype=torch.uint8) 41 | mask = mask.any(dim=2) 42 | masks.append(mask) 43 | if masks: 44 | masks = torch.stack(masks, dim=0) 45 | else: 46 | masks = torch.zeros((0, height, width), dtype=torch.uint8) 47 | return masks 48 | 49 | 50 | class ConvertCocoPolysToMask(object): 51 | def __init__(self, return_masks=False): 52 | self.return_masks = return_masks 53 | 54 | def __call__(self, image, target): 55 | w, h = image.size 56 | 57 | image_id = target["image_id"] 58 | image_id = torch.tensor([image_id]) 59 | 60 | anno = target["annotations"] 61 | 62 | anno = [obj for obj in anno if 'iscrowd' not in obj or obj['iscrowd'] == 0] 63 | 64 | boxes = [obj["bbox"] for obj in anno] 65 | # guard against no boxes via resizing 66 | boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4) 67 | boxes[:, 2:] += boxes[:, :2] 68 | boxes[:, 0::2].clamp_(min=0, max=w) 69 | boxes[:, 1::2].clamp_(min=0, max=h) 70 | 71 | classes = [obj["category_id"] for obj in anno] 72 | classes = torch.tensor(classes, dtype=torch.int64) 73 | 74 | if self.return_masks: 75 | segmentations = [obj["segmentation"] for obj in anno] 76 | masks = convert_coco_poly_to_mask(segmentations, h, w) 77 | 78 | keypoints = None 79 | if anno and "keypoints" in anno[0]: 80 | keypoints = [obj["keypoints"] for obj in anno] 81 | keypoints = torch.as_tensor(keypoints, dtype=torch.float32) 82 | num_keypoints = keypoints.shape[0] 83 | if num_keypoints: 84 | keypoints = keypoints.view(num_keypoints, -1, 3) 85 | 86 | keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0]) 87 | boxes = boxes[keep] 88 | classes = classes[keep] 89 | if self.return_masks: 90 | masks = masks[keep] 91 | if keypoints is not None: 92 | keypoints = keypoints[keep] 93 | 94 | target = {} 95 | target["boxes"] = boxes 96 | target["labels"] = classes 97 | if self.return_masks: 98 | target["masks"] = masks 99 | target["image_id"] = image_id 100 | if keypoints is not None: 101 | target["keypoints"] = keypoints 102 | 103 | # for conversion to coco api 104 | area = torch.tensor([obj["area"] for obj in anno]) 105 | iscrowd = torch.tensor([obj["iscrowd"] if "iscrowd" in obj else 0 for obj in anno]) 106 | target["area"] = area[keep] 107 | target["iscrowd"] = iscrowd[keep] 108 | 109 | target["orig_size"] = 
torch.as_tensor([int(h), int(w)]) 110 | target["size"] = torch.as_tensor([int(h), int(w)]) 111 | 112 | return image, target 113 | 114 | 115 | def make_coco_transforms(image_set): 116 | 117 | normalize = T.Compose([ 118 | T.ToTensor(), 119 | T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) 120 | ]) 121 | 122 | scales = [480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800] 123 | 124 | if image_set == 'train': 125 | return T.Compose([ 126 | T.RandomHorizontalFlip(), 127 | T.RandomSelect( 128 | T.RandomResize(scales, max_size=1333), 129 | T.Compose([ 130 | T.RandomResize([400, 500, 600]), 131 | T.RandomSizeCrop(384, 600), 132 | T.RandomResize(scales, max_size=1333), 133 | ]) 134 | ), 135 | normalize, 136 | ]) 137 | 138 | if image_set == 'val': 139 | return T.Compose([ 140 | T.RandomResize([800], max_size=1333), 141 | normalize, 142 | ]) 143 | 144 | raise ValueError(f'unknown {image_set}') 145 | 146 | 147 | def build(image_set, args): 148 | root = Path(args.coco_path) 149 | assert root.exists(), f'provided COCO path {root} does not exist' 150 | mode = 'instances' 151 | PATHS = { 152 | "train": (root / "train2017", root / "annotations" / f'{mode}_train2017.json'), 153 | "val": (root / "val2017", root / "annotations" / f'{mode}_val2017.json'), 154 | } 155 | img_folder, ann_file = PATHS[image_set] 156 | dataset = CocoDetection(img_folder, ann_file, transforms=make_coco_transforms(image_set), return_masks=args.masks) 157 | return dataset 158 | -------------------------------------------------------------------------------- /davis2017/davis.py: -------------------------------------------------------------------------------- 1 | import os 2 | from glob import glob 3 | from collections import defaultdict 4 | import numpy as np 5 | from PIL import Image 6 | 7 | 8 | class DAVIS(object): 9 | SUBSET_OPTIONS = ['train', 'val', 'test-dev', 'test-challenge'] 10 | TASKS = ['semi-supervised', 'unsupervised'] 11 | DATASET_WEB = 'https://davischallenge.org/davis2017/code.html' 12 | VOID_LABEL = 255 13 | 14 | def __init__(self, root, task='unsupervised', subset='val', sequences='all', resolution='480p', codalab=False): 15 | """ 16 | Class to read the DAVIS dataset 17 | :param root: Path to the DAVIS folder that contains JPEGImages, Annotations, etc. folders. 18 | :param task: Task to load the annotations, choose between semi-supervised or unsupervised. 19 | :param subset: Set to load the annotations 20 | :param sequences: Sequences to consider, 'all' to use all the sequences in a set. 
21 | :param resolution: Specify the resolution to use the dataset, choose between '480' and 'Full-Resolution' 22 | """ 23 | if subset not in self.SUBSET_OPTIONS: 24 | raise ValueError(f'Subset should be in {self.SUBSET_OPTIONS}') 25 | if task not in self.TASKS: 26 | raise ValueError(f'The only tasks that are supported are {self.TASKS}') 27 | 28 | self.task = task 29 | self.subset = subset 30 | self.root = root 31 | self.img_path = os.path.join(self.root, 'JPEGImages', resolution) 32 | annotations_folder = 'Annotations' if task == 'semi-supervised' else 'Annotations_unsupervised' 33 | self.mask_path = os.path.join(self.root, annotations_folder, resolution) 34 | year = '2019' if task == 'unsupervised' and (subset == 'test-dev' or subset == 'test-challenge') else '2017' 35 | self.imagesets_path = os.path.join(self.root, 'ImageSets', year) 36 | 37 | self._check_directories() 38 | 39 | if sequences == 'all': 40 | with open(os.path.join(self.imagesets_path, f'{self.subset}.txt'), 'r') as f: 41 | tmp = f.readlines() 42 | sequences_names = [x.strip() for x in tmp] 43 | else: 44 | sequences_names = sequences if isinstance(sequences, list) else [sequences] 45 | self.sequences = defaultdict(dict) 46 | 47 | for seq in sequences_names: 48 | images = np.sort(glob(os.path.join(self.img_path, seq, '*.jpg'))).tolist() 49 | if len(images) == 0 and not codalab: 50 | raise FileNotFoundError(f'Images for sequence {seq} not found.') 51 | self.sequences[seq]['images'] = images 52 | masks = np.sort(glob(os.path.join(self.mask_path, seq, '*.png'))).tolist() 53 | masks.extend([-1] * (len(images) - len(masks))) 54 | self.sequences[seq]['masks'] = masks 55 | 56 | def _check_directories(self): 57 | if not os.path.exists(self.root): 58 | raise FileNotFoundError(f'DAVIS not found in the specified directory, download it from {self.DATASET_WEB}') 59 | if not os.path.exists(os.path.join(self.imagesets_path, f'{self.subset}.txt')): 60 | raise FileNotFoundError(f'Subset sequences list for {self.subset} not found, download the missing subset ' 61 | f'for the {self.task} task from {self.DATASET_WEB}') 62 | if self.subset in ['train', 'val'] and not os.path.exists(self.mask_path): 63 | raise FileNotFoundError(f'Annotations folder for the {self.task} task not found, download it from {self.DATASET_WEB}') 64 | 65 | def get_frames(self, sequence): 66 | for img, msk in zip(self.sequences[sequence]['images'], self.sequences[sequence]['masks']): 67 | image = np.array(Image.open(img)) 68 | mask = None if msk is None else np.array(Image.open(msk)) 69 | yield image, mask 70 | 71 | def _get_all_elements(self, sequence, obj_type): 72 | obj = np.array(Image.open(self.sequences[sequence][obj_type][0])) 73 | all_objs = np.zeros((len(self.sequences[sequence][obj_type]), *obj.shape)) 74 | obj_id = [] 75 | for i, obj in enumerate(self.sequences[sequence][obj_type]): 76 | all_objs[i, ...] = np.array(Image.open(obj)) 77 | obj_id.append(''.join(obj.split('/')[-1].split('.')[:-1])) 78 | return all_objs, obj_id 79 | 80 | def get_all_images(self, sequence): 81 | return self._get_all_elements(sequence, 'images') 82 | 83 | def get_all_masks(self, sequence, separate_objects_masks=False): 84 | masks, masks_id = self._get_all_elements(sequence, 'masks') 85 | masks_void = np.zeros_like(masks) 86 | 87 | # Separate void and object masks 88 | for i in range(masks.shape[0]): 89 | masks_void[i, ...] = masks[i, ...] == 255 90 | masks[i, masks[i, ...] 
== 255] = 0 91 | 92 | if separate_objects_masks: 93 | num_objects = int(np.max(masks[0, ...])) 94 | tmp = np.ones((num_objects, *masks.shape)) 95 | tmp = tmp * np.arange(1, num_objects + 1)[:, None, None, None] 96 | masks = (tmp == masks[None, ...]) 97 | masks = masks > 0 98 | return masks, masks_void, masks_id 99 | 100 | def get_sequences(self): 101 | for seq in self.sequences: 102 | yield seq 103 | 104 | 105 | if __name__ == '__main__': 106 | from matplotlib import pyplot as plt 107 | 108 | only_first_frame = True 109 | subsets = ['train', 'val'] 110 | 111 | for s in subsets: 112 | dataset = DAVIS(root='/home/csergi/scratch2/Databases/DAVIS2017_private', subset=s) 113 | for seq in dataset.get_sequences(): 114 | g = dataset.get_frames(seq) 115 | img, mask = next(g) 116 | plt.subplot(2, 1, 1) 117 | plt.title(seq) 118 | plt.imshow(img) 119 | plt.subplot(2, 1, 2) 120 | plt.imshow(mask) 121 | plt.show(block=True) 122 | 123 | -------------------------------------------------------------------------------- /util/visualization.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import numpy as np 3 | import torch 4 | 5 | 6 | def generate_ce_weight(heatmap, size, box=None, alpha=0.7, beta=0.3, thres=0.5): 7 | weight = heatmap 8 | weight[weight>alpha] = alpha 9 | weight[weight= alpha).astype(float) 14 | # background = (heatmap <= beta).astype(float) 15 | # uncertain = np.logical_and(heatmap > beta, heatmap < alpha).astype(float) 16 | # uncertain_weight = np.abs(heatmap - thres) + 0.5 17 | if box is not None: 18 | box_regions = np.zeros_like(heatmap) 19 | h, w = size 20 | box = np.array([box[0] - box[2] / 2, box[1] - box[3] / 2, box[0] + box[2] / 2, box[1] + box[3] / 2]) 21 | boxes_scale = (box * np.array([w, h, w, h])).astype(int) 22 | box_regions[ 23 | boxes_scale[1] : boxes_scale[3], boxes_scale[0] : boxes_scale[2] 24 | ] = 1 25 | weight[box_regions==0] = 1 26 | # background[np.where(box_regions == 0)] = 1 27 | # foreground[np.where(box_regions == 0)] = 0 28 | # uncertain[np.where(box_regions == 0)] = 0 29 | # weight = foreground * 1 + background * 1 + uncertain_weight * uncertain 30 | # weight = box_regions 31 | return weight 32 | 33 | 34 | def generate_mask_from_heatmap(heatmap, thres=0.5): 35 | background = np.ones((1, heatmap.shape[-2], heatmap.shape[-1])) * thres 36 | masks_with_bg = np.concatenate([background, heatmap]) 37 | masks = np.zeros_like(masks_with_bg) 38 | max_idx = np.argmax(masks_with_bg, axis=0) 39 | for i in range(masks.shape[0]): 40 | masks[i, max_idx == i] = 1 41 | return masks[1:] 42 | 43 | 44 | def viz_heatmap(heatmap, rgb_img=None): 45 | # heatmap: [h, w] \in [0,1] rgb_img: [h, w, 3] np.array 46 | out_img = rgb_img.copy() 47 | heatmap = (heatmap * 255).astype(np.uint8) 48 | heatmap = cv2.applyColorMap(heatmap, 11) 49 | out_img = cv2.addWeighted(out_img, 1, heatmap, 0.6, 1) 50 | return out_img 51 | 52 | 53 | def viz_bbox(bbox, size, rgb_img): 54 | # bbox: [4] \in [0,1] rgb_img: [h, w, 3] np.array 55 | out_img = rgb_img.copy() 56 | h, w = size 57 | x_c, y_c, bw, bh = bbox 58 | bbox_xyxy = np.array( 59 | [(x_c - 0.5 * bw), (y_c - 0.5 * bh), (x_c + 0.5 * bw), (y_c + 0.5 * bh)] 60 | ) 61 | bbox_scale = (bbox_xyxy * np.array([w, h, w, h])).astype(int) 62 | out_img = cv2.rectangle( 63 | out_img, 64 | (bbox_scale[0], bbox_scale[1]), 65 | (bbox_scale[2], bbox_scale[3]), 66 | (255, 0, 0), 67 | 3, 68 | ) 69 | return out_img 70 | 71 | 72 | def viz_point(point, size, rgb_img): 73 | # point: [2] \in [0,1] rgb_img: [h, w, 3] np.array 
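# The normalized (x, y) point is scaled to pixel coordinates and drawn as a small
# filled circle on a copy of the frame. Hypothetical usage sketch:
#   marked = viz_point(np.array([0.5, 0.5]), (h, w), frame_bgr)  # mark the frame centre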
74 | out_img = rgb_img.copy() 75 | h, w = size 76 | point_scale = (point * np.array([w, h])).astype(int) 77 | out_img = cv2.circle(out_img, (point_scale[0], point_scale[1]), 3, (255, 0, 0), -1) 78 | return out_img 79 | 80 | 81 | def viz_mask(mask, rgb_img=None): 82 | # mask: [h, w] \in {0,1} rgb_img: [h, w, 3] np.array 83 | out_img = rgb_img.copy() 84 | mask_color = np.zeros((mask.shape[0], mask.shape[1], 3)) 85 | mask_color[:, :, 0] = mask * 255 86 | out_img = cv2.addWeighted(out_img, 1, mask_color.astype(np.uint8), 0.6, 1) 87 | return out_img 88 | 89 | 90 | def img_recover(img): 91 | # img: tensor [3, h, w] 92 | mean = torch.tensor([0.485, 0.456, 0.406])[:, None, None].to(img.device) 93 | std = torch.tensor([0.229, 0.224, 0.225])[:, None, None].to(img.device) 94 | img_recovered = (img * std + mean) * 255 95 | img_recovered = img_recovered.byte().permute(1, 2, 0).cpu().numpy() 96 | return img_recovered 97 | 98 | 99 | def visualize(samples, targets): 100 | viz_dict = {} 101 | for i, (frames, target) in enumerate(zip(samples.tensors, targets)): 102 | h, w = target["size"].numpy() 103 | if "valid_indices" in target.keys(): 104 | valid_frame = frames.index_select(0, target["valid_indices"]) # [1, 3, h, w] 105 | frames = valid_frame[:, :, :h, :w] 106 | else: 107 | frames = frames[:, :, :h, :w] 108 | for frame_id, frame in enumerate(frames): 109 | rgb_frame = img_recover(frame) # [h, w, 3] np.array 110 | rgb_frame = cv2.cvtColor(rgb_frame, cv2.COLOR_RGB2BGR) 111 | masks, boxes, weak_masks, weights = ( 112 | target["masks"].cpu().numpy(), 113 | target["boxes"].cpu().numpy(), 114 | target["weak_masks"].cpu().numpy(), 115 | target["weights"].cpu().numpy(), 116 | ) 117 | # weak_mask_box = generate_mask_from_heatmap(heat_bbox) 118 | # weak_mask_point = generate_mask_from_heatmap(heat_point) 119 | for j, (mask, box, weight, weak_m) in enumerate( 120 | zip( 121 | masks, 122 | boxes, 123 | weights, 124 | weak_masks 125 | ) 126 | ): 127 | weight_p = generate_ce_weight(weight, (h, w), box) 128 | img_masked = viz_mask(mask, rgb_frame) 129 | img_bbox = viz_bbox(box, (h, w), rgb_frame) 130 | img_masked_weak = viz_mask(weak_m, rgb_frame) 131 | img_heat = viz_heatmap(weight, rgb_frame) 132 | img_heat_p = viz_heatmap(weight_p, rgb_frame) 133 | final_viz = np.concatenate( 134 | [ 135 | img_masked, 136 | img_bbox, 137 | img_heat, 138 | img_masked_weak, 139 | img_heat_p 140 | ], 141 | axis=1, 142 | ) 143 | viz_dict["batch{}_frame{}_instance{}".format(i, frame_id, j)] = final_viz 144 | 145 | return viz_dict 146 | -------------------------------------------------------------------------------- /datasets/samplers.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------ 6 | # Modified from codes in torch.utils.data.distributed 7 | # ------------------------------------------------------------------------ 8 | 9 | import os 10 | import math 11 | import torch 12 | import torch.distributed as dist 13 | from torch.utils.data.sampler import Sampler 14 | 15 | 16 | class DistributedSampler(Sampler): 17 | """Sampler that restricts data loading to a subset of the dataset. 18 | It is especially useful in conjunction with 19 | :class:`torch.nn.parallel.DistributedDataParallel`. 
In such case, each 20 | process can pass a DistributedSampler instance as a DataLoader sampler, 21 | and load a subset of the original dataset that is exclusive to it. 22 | .. note:: 23 | Dataset is assumed to be of constant size. 24 | Arguments: 25 | dataset: Dataset used for sampling. 26 | num_replicas (optional): Number of processes participating in 27 | distributed training. 28 | rank (optional): Rank of the current process within num_replicas. 29 | """ 30 | 31 | def __init__(self, dataset, num_replicas=None, rank=None, local_rank=None, local_size=None, shuffle=True): 32 | if num_replicas is None: 33 | if not dist.is_available(): 34 | raise RuntimeError("Requires distributed package to be available") 35 | num_replicas = dist.get_world_size() 36 | if rank is None: 37 | if not dist.is_available(): 38 | raise RuntimeError("Requires distributed package to be available") 39 | rank = dist.get_rank() 40 | self.dataset = dataset 41 | self.num_replicas = num_replicas 42 | self.rank = rank 43 | self.epoch = 0 44 | self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.num_replicas)) 45 | self.total_size = self.num_samples * self.num_replicas 46 | self.shuffle = shuffle 47 | 48 | def __iter__(self): 49 | if self.shuffle: 50 | # deterministically shuffle based on epoch 51 | g = torch.Generator() 52 | g.manual_seed(self.epoch) 53 | indices = torch.randperm(len(self.dataset), generator=g).tolist() 54 | else: 55 | indices = torch.arange(len(self.dataset)).tolist() 56 | 57 | # add extra samples to make it evenly divisible 58 | indices += indices[: (self.total_size - len(indices))] 59 | assert len(indices) == self.total_size 60 | 61 | # subsample 62 | offset = self.num_samples * self.rank 63 | indices = indices[offset : offset + self.num_samples] 64 | assert len(indices) == self.num_samples 65 | 66 | return iter(indices) 67 | 68 | def __len__(self): 69 | return self.num_samples 70 | 71 | def set_epoch(self, epoch): 72 | self.epoch = epoch 73 | 74 | 75 | class NodeDistributedSampler(Sampler): 76 | """Sampler that restricts data loading to a subset of the dataset. 77 | It is especially useful in conjunction with 78 | :class:`torch.nn.parallel.DistributedDataParallel`. In such case, each 79 | process can pass a DistributedSampler instance as a DataLoader sampler, 80 | and load a subset of the original dataset that is exclusive to it. 81 | .. note:: 82 | Dataset is assumed to be of constant size. 83 | Arguments: 84 | dataset: Dataset used for sampling. 85 | num_replicas (optional): Number of processes participating in 86 | distributed training. 87 | rank (optional): Rank of the current process within num_replicas. 
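local_rank (optional): Rank of the process within its node; read from the
    LOCAL_RANK environment variable (default 0) when not given.
local_size (optional): Number of processes per node; read from the
    LOCAL_SIZE environment variable (default 1) when not given.
Each process first keeps only the indices i with i % local_size == local_rank
and then subsamples across nodes, so every process receives its own shard.
Usage sketch (assumes torch.distributed is initialized and `dataset` exists):
    sampler = NodeDistributedSampler(dataset, shuffle=True)
    loader = torch.utils.data.DataLoader(dataset, batch_size=2, sampler=sampler)
    sampler.set_epoch(epoch)  # call once per epoch for a new deterministic shuffle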
88 | """ 89 | 90 | def __init__(self, dataset, num_replicas=None, rank=None, local_rank=None, local_size=None, shuffle=True): 91 | if num_replicas is None: 92 | if not dist.is_available(): 93 | raise RuntimeError("Requires distributed package to be available") 94 | num_replicas = dist.get_world_size() 95 | if rank is None: 96 | if not dist.is_available(): 97 | raise RuntimeError("Requires distributed package to be available") 98 | rank = dist.get_rank() 99 | if local_rank is None: 100 | local_rank = int(os.environ.get('LOCAL_RANK', 0)) 101 | if local_size is None: 102 | local_size = int(os.environ.get('LOCAL_SIZE', 1)) 103 | self.dataset = dataset 104 | self.shuffle = shuffle 105 | self.num_replicas = num_replicas 106 | self.num_parts = local_size 107 | self.rank = rank 108 | self.local_rank = local_rank 109 | self.epoch = 0 110 | self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.num_replicas)) 111 | self.total_size = self.num_samples * self.num_replicas 112 | 113 | self.total_size_parts = self.num_samples * self.num_replicas // self.num_parts 114 | 115 | def __iter__(self): 116 | if self.shuffle: 117 | # deterministically shuffle based on epoch 118 | g = torch.Generator() 119 | g.manual_seed(self.epoch) 120 | indices = torch.randperm(len(self.dataset), generator=g).tolist() 121 | else: 122 | indices = torch.arange(len(self.dataset)).tolist() 123 | indices = [i for i in indices if i % self.num_parts == self.local_rank] 124 | 125 | # add extra samples to make it evenly divisible 126 | indices += indices[:(self.total_size_parts - len(indices))] 127 | assert len(indices) == self.total_size_parts 128 | 129 | # subsample 130 | indices = indices[self.rank // self.num_parts:self.total_size_parts:self.num_replicas // self.num_parts] 131 | assert len(indices) == self.num_samples 132 | 133 | return iter(indices) 134 | 135 | def __len__(self): 136 | return self.num_samples 137 | 138 | def set_epoch(self, epoch): 139 | self.epoch = epoch 140 | -------------------------------------------------------------------------------- /davis2017/evaluation.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from tqdm import tqdm 3 | import warnings 4 | warnings.filterwarnings("ignore", category=RuntimeWarning) 5 | 6 | import numpy as np 7 | from davis2017.davis import DAVIS 8 | from davis2017.metrics import db_eval_boundary, db_eval_iou 9 | from davis2017 import utils 10 | from davis2017.results import Results 11 | from scipy.optimize import linear_sum_assignment 12 | 13 | 14 | class DAVISEvaluation(object): 15 | def __init__(self, davis_root, task, gt_set, sequences='all', codalab=False): 16 | """ 17 | Class to evaluate DAVIS sequences from a certain set and for a certain task 18 | :param davis_root: Path to the DAVIS folder that contains JPEGImages, Annotations, etc. folders. 19 | :param task: Task to compute the evaluation, chose between semi-supervised or unsupervised. 20 | :param gt_set: Set to compute the evaluation 21 | :param sequences: Sequences to consider for the evaluation, 'all' to use all the sequences in a set. 
22 | """ 23 | self.davis_root = davis_root 24 | self.task = task 25 | self.dataset = DAVIS(root=davis_root, task=task, subset=gt_set, sequences=sequences, codalab=codalab) 26 | 27 | @staticmethod 28 | def _evaluate_semisupervised(all_gt_masks, all_res_masks, all_void_masks, metric): 29 | if all_res_masks.shape[0] > all_gt_masks.shape[0]: 30 | sys.stdout.write("\nIn your PNG files there is an index higher than the number of objects in the sequence!") 31 | sys.exit() 32 | elif all_res_masks.shape[0] < all_gt_masks.shape[0]: 33 | zero_padding = np.zeros((all_gt_masks.shape[0] - all_res_masks.shape[0], *all_res_masks.shape[1:])) 34 | all_res_masks = np.concatenate([all_res_masks, zero_padding], axis=0) 35 | j_metrics_res, f_metrics_res = np.zeros(all_gt_masks.shape[:2]), np.zeros(all_gt_masks.shape[:2]) 36 | for ii in range(all_gt_masks.shape[0]): 37 | if 'J' in metric: 38 | j_metrics_res[ii, :] = db_eval_iou(all_gt_masks[ii, ...], all_res_masks[ii, ...], all_void_masks) 39 | if 'F' in metric: 40 | f_metrics_res[ii, :] = db_eval_boundary(all_gt_masks[ii, ...], all_res_masks[ii, ...], all_void_masks) 41 | return j_metrics_res, f_metrics_res 42 | 43 | @staticmethod 44 | def _evaluate_unsupervised(all_gt_masks, all_res_masks, all_void_masks, metric, max_n_proposals=20): 45 | if all_res_masks.shape[0] > max_n_proposals: 46 | sys.stdout.write(f"\nIn your PNG files there is an index higher than the maximum number ({max_n_proposals}) of proposals allowed!") 47 | sys.exit() 48 | elif all_res_masks.shape[0] < all_gt_masks.shape[0]: 49 | zero_padding = np.zeros((all_gt_masks.shape[0] - all_res_masks.shape[0], *all_res_masks.shape[1:])) 50 | all_res_masks = np.concatenate([all_res_masks, zero_padding], axis=0) 51 | j_metrics_res = np.zeros((all_res_masks.shape[0], all_gt_masks.shape[0], all_gt_masks.shape[1])) 52 | f_metrics_res = np.zeros((all_res_masks.shape[0], all_gt_masks.shape[0], all_gt_masks.shape[1])) 53 | for ii in range(all_gt_masks.shape[0]): 54 | for jj in range(all_res_masks.shape[0]): 55 | if 'J' in metric: 56 | j_metrics_res[jj, ii, :] = db_eval_iou(all_gt_masks[ii, ...], all_res_masks[jj, ...], all_void_masks) 57 | if 'F' in metric: 58 | f_metrics_res[jj, ii, :] = db_eval_boundary(all_gt_masks[ii, ...], all_res_masks[jj, ...], all_void_masks) 59 | if 'J' in metric and 'F' in metric: 60 | all_metrics = (np.mean(j_metrics_res, axis=2) + np.mean(f_metrics_res, axis=2)) / 2 61 | else: 62 | all_metrics = np.mean(j_metrics_res, axis=2) if 'J' in metric else np.mean(f_metrics_res, axis=2) 63 | row_ind, col_ind = linear_sum_assignment(-all_metrics) 64 | return j_metrics_res[row_ind, col_ind, :], f_metrics_res[row_ind, col_ind, :] 65 | 66 | def evaluate(self, res_path, metric=('J', 'F'), debug=False): 67 | metric = metric if isinstance(metric, tuple) or isinstance(metric, list) else [metric] 68 | if 'T' in metric: 69 | raise ValueError('Temporal metric not supported!') 70 | if 'J' not in metric and 'F' not in metric: 71 | raise ValueError('Metric possible values are J for IoU or F for Boundary') 72 | 73 | # Containers 74 | metrics_res = {} 75 | if 'J' in metric: 76 | metrics_res['J'] = {"M": [], "R": [], "D": [], "M_per_object": {}} 77 | if 'F' in metric: 78 | metrics_res['F'] = {"M": [], "R": [], "D": [], "M_per_object": {}} 79 | 80 | # Sweep all sequences 81 | results = Results(root_dir=res_path) 82 | for seq in tqdm(list(self.dataset.get_sequences())): 83 | all_gt_masks, all_void_masks, all_masks_id = self.dataset.get_all_masks(seq, True) 84 | if self.task == 'semi-supervised': 85 | 
all_gt_masks, all_masks_id = all_gt_masks[:, 1:-1, :, :], all_masks_id[1:-1] 86 | all_res_masks = results.read_masks(seq, all_masks_id) 87 | if self.task == 'unsupervised': 88 | j_metrics_res, f_metrics_res = self._evaluate_unsupervised(all_gt_masks, all_res_masks, all_void_masks, metric) 89 | elif self.task == 'semi-supervised': 90 | j_metrics_res, f_metrics_res = self._evaluate_semisupervised(all_gt_masks, all_res_masks, None, metric) 91 | for ii in range(all_gt_masks.shape[0]): 92 | seq_name = f'{seq}_{ii+1}' 93 | if 'J' in metric: 94 | [JM, JR, JD] = utils.db_statistics(j_metrics_res[ii]) 95 | metrics_res['J']["M"].append(JM) 96 | metrics_res['J']["R"].append(JR) 97 | metrics_res['J']["D"].append(JD) 98 | metrics_res['J']["M_per_object"][seq_name] = JM 99 | if 'F' in metric: 100 | [FM, FR, FD] = utils.db_statistics(f_metrics_res[ii]) 101 | metrics_res['F']["M"].append(FM) 102 | metrics_res['F']["R"].append(FR) 103 | metrics_res['F']["D"].append(FD) 104 | metrics_res['F']["M_per_object"][seq_name] = FM 105 | 106 | # Show progress 107 | if debug: 108 | sys.stdout.write(seq + '\n') 109 | sys.stdout.flush() 110 | return metrics_res 111 | -------------------------------------------------------------------------------- /davis2017/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import errno 3 | import numpy as np 4 | from PIL import Image 5 | import warnings 6 | from davis2017.davis import DAVIS 7 | 8 | 9 | def _pascal_color_map(N=256, normalized=False): 10 | """ 11 | Python implementation of the color map function for the PASCAL VOC data set. 12 | Official Matlab version can be found in the PASCAL VOC devkit 13 | http://host.robots.ox.ac.uk/pascal/VOC/voc2012/index.html#devkit 14 | """ 15 | 16 | def bitget(byteval, idx): 17 | return (byteval & (1 << idx)) != 0 18 | 19 | dtype = 'float32' if normalized else 'uint8' 20 | cmap = np.zeros((N, 3), dtype=dtype) 21 | for i in range(N): 22 | r = g = b = 0 23 | c = i 24 | for j in range(8): 25 | r = r | (bitget(c, 0) << 7 - j) 26 | g = g | (bitget(c, 1) << 7 - j) 27 | b = b | (bitget(c, 2) << 7 - j) 28 | c = c >> 3 29 | 30 | cmap[i] = np.array([r, g, b]) 31 | 32 | cmap = cmap / 255 if normalized else cmap 33 | return cmap 34 | 35 | 36 | def overlay_semantic_mask(im, ann, alpha=0.5, colors=None, contour_thickness=None): 37 | im, ann = np.asarray(im, dtype=np.uint8), np.asarray(ann, dtype=np.int) 38 | if im.shape[:-1] != ann.shape: 39 | raise ValueError('First two dimensions of `im` and `ann` must match') 40 | if im.shape[-1] != 3: 41 | raise ValueError('im must have three channels at the 3 dimension') 42 | 43 | colors = colors or _pascal_color_map() 44 | colors = np.asarray(colors, dtype=np.uint8) 45 | 46 | mask = colors[ann] 47 | fg = im * alpha + (1 - alpha) * mask 48 | 49 | img = im.copy() 50 | img[ann > 0] = fg[ann > 0] 51 | 52 | if contour_thickness: # pragma: no cover 53 | import cv2 54 | for obj_id in np.unique(ann[ann > 0]): 55 | contours = cv2.findContours((ann == obj_id).astype( 56 | np.uint8), cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)[-2:] 57 | cv2.drawContours(img, contours[0], -1, colors[obj_id].tolist(), 58 | contour_thickness) 59 | return img 60 | 61 | 62 | def generate_obj_proposals(davis_root, subset, num_proposals, save_path): 63 | dataset = DAVIS(davis_root, subset=subset, codalab=True) 64 | for seq in dataset.get_sequences(): 65 | save_dir = os.path.join(save_path, seq) 66 | if os.path.exists(save_dir): 67 | continue 68 | all_gt_masks, all_masks_id = 
dataset.get_all_masks(seq, True) 69 | img_size = all_gt_masks.shape[2:] 70 | num_rows = int(np.ceil(np.sqrt(num_proposals))) 71 | proposals = np.zeros((num_proposals, len(all_masks_id), *img_size)) 72 | height_slices = np.floor(np.arange(0, img_size[0] + 1, img_size[0]/num_rows)).astype(np.uint).tolist() 73 | width_slices = np.floor(np.arange(0, img_size[1] + 1, img_size[1]/num_rows)).astype(np.uint).tolist() 74 | ii = 0 75 | prev_h, prev_w = 0, 0 76 | for h in height_slices[1:]: 77 | for w in width_slices[1:]: 78 | proposals[ii, :, prev_h:h, prev_w:w] = 1 79 | prev_w = w 80 | ii += 1 81 | if ii == num_proposals: 82 | break 83 | prev_h, prev_w = h, 0 84 | if ii == num_proposals: 85 | break 86 | 87 | os.makedirs(save_dir, exist_ok=True) 88 | for i, mask_id in enumerate(all_masks_id): 89 | mask = np.sum(proposals[:, i, ...] * np.arange(1, proposals.shape[0] + 1)[:, None, None], axis=0) 90 | save_mask(mask, os.path.join(save_dir, f'{mask_id}.png')) 91 | 92 | 93 | def generate_random_permutation_gt_obj_proposals(davis_root, subset, save_path): 94 | dataset = DAVIS(davis_root, subset=subset, codalab=True) 95 | for seq in dataset.get_sequences(): 96 | gt_masks, all_masks_id = dataset.get_all_masks(seq, True) 97 | obj_swap = np.random.permutation(np.arange(gt_masks.shape[0])) 98 | gt_masks = gt_masks[obj_swap, ...] 99 | save_dir = os.path.join(save_path, seq) 100 | os.makedirs(save_dir, exist_ok=True) 101 | for i, mask_id in enumerate(all_masks_id): 102 | mask = np.sum(gt_masks[:, i, ...] * np.arange(1, gt_masks.shape[0] + 1)[:, None, None], axis=0) 103 | save_mask(mask, os.path.join(save_dir, f'{mask_id}.png')) 104 | 105 | 106 | def color_map(N=256, normalized=False): 107 | def bitget(byteval, idx): 108 | return ((byteval & (1 << idx)) != 0) 109 | 110 | dtype = 'float32' if normalized else 'uint8' 111 | cmap = np.zeros((N, 3), dtype=dtype) 112 | for i in range(N): 113 | r = g = b = 0 114 | c = i 115 | for j in range(8): 116 | r = r | (bitget(c, 0) << 7-j) 117 | g = g | (bitget(c, 1) << 7-j) 118 | b = b | (bitget(c, 2) << 7-j) 119 | c = c >> 3 120 | 121 | cmap[i] = np.array([r, g, b]) 122 | 123 | cmap = cmap/255 if normalized else cmap 124 | return cmap 125 | 126 | 127 | def save_mask(mask, img_path): 128 | if np.max(mask) > 255: 129 | raise ValueError('Maximum id pixel value is 255') 130 | mask_img = Image.fromarray(mask.astype(np.uint8)) 131 | mask_img.putpalette(color_map().flatten().tolist()) 132 | mask_img.save(img_path) 133 | 134 | 135 | def db_statistics(per_frame_values): 136 | """ Compute mean,recall and decay from per-frame evaluation. 137 | Arguments: 138 | per_frame_values (ndarray): per-frame evaluation 139 | 140 | Returns: 141 | M,O,D (float,float,float): 142 | return evaluation statistics: mean,recall,decay. 
143 | """ 144 | 145 | # strip off nan values 146 | with warnings.catch_warnings(): 147 | warnings.simplefilter("ignore", category=RuntimeWarning) 148 | M = np.nanmean(per_frame_values) 149 | O = np.nanmean(per_frame_values > 0.5) 150 | 151 | N_bins = 4 152 | ids = np.round(np.linspace(1, len(per_frame_values), N_bins + 1) + 1e-10) - 1 153 | ids = ids.astype(np.uint8) 154 | 155 | D_bins = [per_frame_values[ids[i]:ids[i + 1] + 1] for i in range(0, 4)] 156 | 157 | with warnings.catch_warnings(): 158 | warnings.simplefilter("ignore", category=RuntimeWarning) 159 | D = np.nanmean(D_bins[0]) - np.nanmean(D_bins[3]) 160 | 161 | return M, O, D 162 | 163 | 164 | def list_files(dir, extension=".png"): 165 | return [os.path.splitext(file_)[0] for file_ in os.listdir(dir) if file_.endswith(extension)] 166 | 167 | 168 | def force_symlink(file1, file2): 169 | try: 170 | os.symlink(file1, file2) 171 | except OSError as e: 172 | if e.errno == errno.EEXIST: 173 | os.remove(file2) 174 | os.symlink(file1, file2) 175 | -------------------------------------------------------------------------------- /pre_process/sim_model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torchvision 4 | try: 5 | from frozen_batchnorm2d import FrozenBatchNorm2d 6 | except: 7 | from .frozen_batchnorm2d import FrozenBatchNorm2d 8 | from torchvision.models._utils import IntermediateLayerGetter 9 | import math 10 | import numpy as np 11 | import torch.nn.functional as F 12 | 13 | 14 | class SimModel(nn.Module): 15 | def __init__(self, backbone, dilation=False, background_thres=0.5): 16 | super().__init__() 17 | self.background_thres = background_thres 18 | backbone = getattr(torchvision.models, backbone)( 19 | replace_stride_with_dilation=[False, False, dilation], 20 | pretrained=True, 21 | norm_layer=FrozenBatchNorm2d, 22 | ) 23 | return_layers = { 24 | "layer1": "feat1", 25 | "layer2": "feat2", 26 | "layer3": "feat3", 27 | "layer4": "feat4", 28 | } 29 | self.backbone = IntermediateLayerGetter(backbone, return_layers=return_layers) 30 | checkpoint_path = "checkpoints/densecl_r101_imagenet_200ep.pth" 31 | self.backbone.load_state_dict( 32 | torch.load(checkpoint_path)["state_dict"], strict=False 33 | ) 34 | 35 | def forward_point(self, x, point_list, valid): 36 | fea = self.backbone(x)["feat4"] 37 | keys = fea ### [B, C3, H3, W3] 38 | b, c, h, w = keys.shape 39 | out_masks = [] 40 | for n_p, point_loc in enumerate(point_list): 41 | if valid[n_p]: 42 | scale_factors = [1.0] 43 | queries_list = [] 44 | for scale_factor in scale_factors: 45 | point_cur = [ 46 | int(point_loc[0] * w * scale_factor), 47 | int(point_loc[1] * h * scale_factor), 48 | ] 49 | cur_queries = keys[:, :, point_cur[1], point_cur[0]] 50 | queries_list.append(cur_queries) 51 | 52 | queries = torch.stack(queries_list, dim=1) 53 | 54 | keys = keys / keys.norm(dim=1, keepdim=True) 55 | queries = queries / queries.norm(dim=-1, keepdim=True) 56 | attn = torch.matmul(queries, keys.view(b, c, -1)) 57 | attn = (attn - attn.min(-1, keepdim=True)[0]) / attn.max( 58 | -1, keepdim=True 59 | )[0] 60 | soft_masks = attn.reshape(b, attn.shape[1], h, w) 61 | out_masks.append(soft_masks) 62 | else: 63 | out_masks.append(torch.zeros((b, 1, h, w)).to(x.device)) 64 | out_masks = torch.cat(out_masks, dim=1) 65 | return out_masks 66 | 67 | def forward_bbox(self, x, bbox_list, valid): 68 | h_ori, w_ori = x.shape[-2:] 69 | fea = self.backbone(x)["feat4"] 70 | keys = fea ### [B, C3, H3, W3] 71 | b, c, 
h, w = keys.shape 72 | out_masks = [] 73 | for n_b, bbox in enumerate(bbox_list): 74 | if valid[n_b]: 75 | 76 | scale_factors = [1.0] 77 | queries_list = [] 78 | bbox_masks = [] 79 | for scale_factor in scale_factors: 80 | box_cur = [ 81 | int(bbox[0] / w_ori * w * scale_factor), 82 | int(bbox[1] / h_ori * h * scale_factor), 83 | int(bbox[2] / w_ori * w * scale_factor), 84 | int(bbox[3] / h_ori * h * scale_factor), 85 | ] 86 | bbox_mask = torch.zeros((h, w)).bool().to(x.device) 87 | bbox_mask[box_cur[1] : box_cur[3], box_cur[0] : box_cur[2]] = True 88 | range_x = list(range(box_cur[0], box_cur[2] + 1)) 89 | range_y = list(range(box_cur[1], box_cur[3] + 1)) 90 | i = 1 91 | while(len(range_x) * len(range_y) > 256): 92 | range_x = list(range(box_cur[0], box_cur[2] + 1, i+1)) 93 | range_y = list(range(box_cur[1], box_cur[3] + 1, i+1)) 94 | i += 1 95 | x_candi = torch.tensor(range_x) 96 | y_candi = torch.tensor(range_y) 97 | gridx, gridy = torch.meshgrid(x_candi, y_candi) 98 | locs = torch.stack([gridx, gridy], dim=-1).flatten(0, 1) # [N, 2] 99 | for loc in locs: 100 | cur_queries = keys[:, :, loc[1], loc[0]] 101 | queries_list.append(cur_queries) 102 | bbox_masks.append(bbox_mask) 103 | queries = torch.stack(queries_list, dim=1) # [b, n, d] 104 | bbox_masks = torch.stack(bbox_masks, dim=0)[None] # [1, n, h, w] 105 | bbox_masks_flatten = bbox_masks.flatten(-2) 106 | 107 | keys = keys / keys.norm(dim=1, keepdim=True) 108 | queries = queries / queries.norm(dim=-1, keepdim=True) 109 | attn = torch.matmul(queries, keys.view(b, c, -1)) 110 | attn = (attn - attn.min(-1, keepdim=True)[0]) / attn.max( 111 | -1, keepdim=True 112 | )[0] 113 | 114 | attn_reshape = attn.reshape(b, attn.shape[1], h, w) 115 | 116 | attn_scale = attn_reshape 117 | attn_x = attn_scale.max(dim=-2)[0] 118 | attn_y = attn_scale.max(dim=-1)[0] 119 | 120 | score_x = (attn_x * bbox_masks.max(dim=-2)[0]).sum(dim=-1) / ((attn_x + bbox_masks.max(dim=-2)[0] - attn_x * bbox_masks.max(dim=-2)[0]).sum(dim=-1) + 1e-5) 121 | score_y = (attn_y * bbox_masks.max(dim=-1)[0]).sum(dim=-1) / ((attn_y + bbox_masks.max(dim=-1)[0] - attn_y * bbox_masks.max(dim=-1)[0]).sum(dim=-1) + 1e-5) 122 | score = (score_x + score_y) / 2 123 | 124 | _, max_loc = torch.topk(score, 1, 1) 125 | attn_selected = torch.gather( 126 | attn, 1, max_loc.unsqueeze(-1).repeat(1, 1, attn.shape[-1]) 127 | ) 128 | 129 | soft_masks = attn_selected.reshape(b, attn_selected.shape[1], h, w) 130 | out_masks.append(soft_masks) 131 | else: 132 | out_masks.append(torch.zeros((b, 1, h, w)).to(x.device)) 133 | out_masks = torch.cat(out_masks, dim=1) 134 | return out_masks 135 | 136 | def forward(self, x, query_list, valid, mode="point"): 137 | if mode == "point": 138 | out_masks = self.forward_point(x, query_list, valid) 139 | elif mode == "bbox": 140 | out_masks = self.forward_bbox(x, query_list, valid) 141 | return out_masks 142 | -------------------------------------------------------------------------------- /models/position_encoding.py: -------------------------------------------------------------------------------- 1 | """ 2 | Various positional encodings for the transformer. 
3 | Modified from DETR (https://github.com/facebookresearch/detr) 4 | """ 5 | import math 6 | import torch 7 | from torch import nn 8 | 9 | from util.misc import NestedTensor 10 | 11 | # dimension == 1 12 | class PositionEmbeddingSine1D(nn.Module): 13 | """ 14 | This is a more standard version of the position embedding, very similar to the one 15 | used by the Attention is all you need paper, generalized to work on images. 16 | """ 17 | def __init__(self, num_pos_feats=256, temperature=10000, normalize=False, scale=None): 18 | super().__init__() 19 | self.num_pos_feats = num_pos_feats 20 | self.temperature = temperature 21 | self.normalize = normalize 22 | if scale is not None and normalize is False: 23 | raise ValueError("normalize should be True if scale is passed") 24 | if scale is None: 25 | scale = 2 * math.pi 26 | self.scale = scale 27 | 28 | def forward(self, tensor_list: NestedTensor): 29 | x = tensor_list.tensors # [B, C, T] 30 | mask = tensor_list.mask # [B, T] 31 | assert mask is not None 32 | not_mask = ~mask 33 | x_embed = not_mask.cumsum(1, dtype=torch.float32) # [B, T] 34 | if self.normalize: 35 | eps = 1e-6 36 | x_embed = x_embed / (x_embed[:, -1:] + eps) * self.scale 37 | 38 | dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device) 39 | dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats) 40 | 41 | pos_x = x_embed[:, :, None] / dim_t # [B, T, C] 42 | # n,c,t 43 | pos_x = torch.stack((pos_x[:, :, 0::2].sin(), pos_x[:, :, 1::2].cos()), dim=3).flatten(2) 44 | pos = pos_x.permute(0, 2, 1) # [B, C, T] 45 | return pos 46 | 47 | # dimension == 2 48 | class PositionEmbeddingSine2D(nn.Module): 49 | """ 50 | This is a more standard version of the position embedding, very similar to the one 51 | used by the Attention is all you need paper, generalized to work on images. 
52 | """ 53 | def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None): 54 | super().__init__() 55 | self.num_pos_feats = num_pos_feats 56 | self.temperature = temperature 57 | self.normalize = normalize 58 | if scale is not None and normalize is False: 59 | raise ValueError("normalize should be True if scale is passed") 60 | if scale is None: 61 | scale = 2 * math.pi 62 | self.scale = scale 63 | 64 | def forward(self, tensor_list: NestedTensor): 65 | x = tensor_list.tensors # [B, C, H, W] 66 | mask = tensor_list.mask # [B, H, W] 67 | assert mask is not None 68 | not_mask = ~mask 69 | y_embed = not_mask.cumsum(1, dtype=torch.float32) 70 | x_embed = not_mask.cumsum(2, dtype=torch.float32) 71 | if self.normalize: 72 | eps = 1e-6 73 | y_embed = (y_embed - 0.5) / (y_embed[:, -1:, :] + eps) * self.scale 74 | x_embed = (x_embed - 0.5) / (x_embed[:, :, -1:] + eps) * self.scale 75 | 76 | dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device) 77 | dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats) 78 | 79 | pos_x = x_embed[:, :, :, None] / dim_t 80 | pos_y = y_embed[:, :, :, None] / dim_t 81 | pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3) 82 | pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3) 83 | pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) 84 | return pos # [B, C, H, W] 85 | 86 | 87 | # dimension == 3 88 | class PositionEmbeddingSine3D(nn.Module): 89 | """ 90 | This is a more standard version of the position embedding, very similar to the one 91 | used by the Attention is all you need paper, generalized to work on images. 92 | """ 93 | def __init__(self, num_pos_feats=64, num_frames=36, temperature=10000, normalize=False, scale=None): 94 | super().__init__() 95 | self.num_pos_feats = num_pos_feats 96 | self.temperature = temperature 97 | self.normalize = normalize 98 | self.frames = num_frames 99 | if scale is not None and normalize is False: 100 | raise ValueError("normalize should be True if scale is passed") 101 | if scale is None: 102 | scale = 2 * math.pi 103 | self.scale = scale 104 | 105 | def forward(self, tensor_list: NestedTensor): 106 | x = tensor_list.tensors # [B*T, C, H, W] 107 | mask = tensor_list.mask # [B*T, H, W] 108 | n,h,w = mask.shape 109 | mask = mask.reshape(n//self.frames, self.frames,h,w) # [B, T, H, W] 110 | assert mask is not None 111 | not_mask = ~mask 112 | z_embed = not_mask.cumsum(1, dtype=torch.float32) # [B, T, H, W] 113 | y_embed = not_mask.cumsum(2, dtype=torch.float32) # [B, T, H, W] 114 | x_embed = not_mask.cumsum(3, dtype=torch.float32) # [B, T, H, W] 115 | if self.normalize: 116 | eps = 1e-6 117 | z_embed = z_embed / (z_embed[:, -1:, :, :] + eps) * self.scale 118 | y_embed = y_embed / (y_embed[:, :, -1:, :] + eps) * self.scale 119 | x_embed = x_embed / (x_embed[:, :, :, -1:] + eps) * self.scale 120 | 121 | dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device) # 122 | dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats) 123 | 124 | pos_x = x_embed[:, :, :, :, None] / dim_t # [B, T, H, W, c] 125 | pos_y = y_embed[:, :, :, :, None] / dim_t 126 | pos_z = z_embed[:, :, :, :, None] / dim_t 127 | pos_x = torch.stack((pos_x[:, :, :, :, 0::2].sin(), pos_x[:, :, :, :, 1::2].cos()), dim=5).flatten(4) # [B, T, H, W, c] 128 | pos_y = torch.stack((pos_y[:, :, :, :, 0::2].sin(), pos_y[:, :, :, :, 1::2].cos()), dim=5).flatten(4) 129 | pos_z = 
torch.stack((pos_z[:, :, :, :, 0::2].sin(), pos_z[:, :, :, :, 1::2].cos()), dim=5).flatten(4) 130 | pos = torch.cat((pos_z, pos_y, pos_x), dim=4).permute(0, 1, 4, 2, 3) # [B, T, C, H, W] 131 | return pos 132 | 133 | 134 | 135 | def build_position_encoding(args): 136 | # build 2D position encoding 137 | N_steps = args.hidden_dim // 2 # 256 / 2 = 128 138 | if args.position_embedding in ('v2', 'sine'): 139 | # TODO find a better way of exposing other arguments 140 | position_embedding = PositionEmbeddingSine2D(N_steps, normalize=True) 141 | else: 142 | raise ValueError(f"not supported {args.position_embedding}") 143 | 144 | return position_embedding 145 | -------------------------------------------------------------------------------- /models/ops/modules/ms_deform_attn.py: -------------------------------------------------------------------------------- 1 | # Modify for sample points visualization 2 | # ------------------------------------------------------------------------------------------------ 3 | # Deformable DETR 4 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | # ------------------------------------------------------------------------------------------------ 7 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | # ------------------------------------------------------------------------------------------------ 9 | 10 | from __future__ import absolute_import 11 | from __future__ import print_function 12 | from __future__ import division 13 | 14 | import warnings 15 | import math 16 | 17 | import torch 18 | from torch import nn 19 | import torch.nn.functional as F 20 | from torch.nn.init import xavier_uniform_, constant_ 21 | 22 | from ..functions import MSDeformAttnFunction 23 | 24 | 25 | def _is_power_of_2(n): 26 | if (not isinstance(n, int)) or (n < 0): 27 | raise ValueError("invalid input for _is_power_of_2: {} (type: {})".format(n, type(n))) 28 | return (n & (n-1) == 0) and n != 0 29 | 30 | 31 | class MSDeformAttn(nn.Module): 32 | def __init__(self, d_model=256, n_levels=4, n_heads=8, n_points=4): 33 | """ 34 | Multi-Scale Deformable Attention Module 35 | :param d_model hidden dimension 36 | :param n_levels number of feature levels 37 | :param n_heads number of attention heads 38 | :param n_points number of sampling points per attention head per feature level 39 | """ 40 | super().__init__() 41 | if d_model % n_heads != 0: 42 | raise ValueError('d_model must be divisible by n_heads, but got {} and {}'.format(d_model, n_heads)) 43 | _d_per_head = d_model // n_heads 44 | # you'd better set _d_per_head to a power of 2 which is more efficient in our CUDA implementation 45 | if not _is_power_of_2(_d_per_head): 46 | warnings.warn("You'd better set d_model in MSDeformAttn to make the dimension of each attention head a power of 2 " 47 | "which is more efficient in our CUDA implementation.") 48 | 49 | self.im2col_step = 64 50 | 51 | self.d_model = d_model 52 | self.n_levels = n_levels 53 | self.n_heads = n_heads 54 | self.n_points = n_points 55 | 56 | # res = sum(attn * W*(delta p)) 57 | self.sampling_offsets = nn.Linear(d_model, n_heads * n_levels * n_points * 2) # delta p 58 | self.attention_weights = nn.Linear(d_model, n_heads * n_levels * n_points) # attn 59 | self.value_proj = nn.Linear(d_model, d_model) 60 | self.output_proj = nn.Linear(d_model, d_model) 61 | 62 | self._reset_parameters() 63 | 64 | def _reset_parameters(self): 65 | 
constant_(self.sampling_offsets.weight.data, 0.) 66 | thetas = torch.arange(self.n_heads, dtype=torch.float32) * (2.0 * math.pi / self.n_heads) 67 | grid_init = torch.stack([thetas.cos(), thetas.sin()], -1) 68 | grid_init = (grid_init / grid_init.abs().max(-1, keepdim=True)[0]).view(self.n_heads, 1, 1, 2).repeat(1, self.n_levels, self.n_points, 1) 69 | for i in range(self.n_points): 70 | grid_init[:, :, i, :] *= i + 1 71 | with torch.no_grad(): 72 | self.sampling_offsets.bias = nn.Parameter(grid_init.view(-1)) 73 | constant_(self.attention_weights.weight.data, 0.) 74 | constant_(self.attention_weights.bias.data, 0.) 75 | xavier_uniform_(self.value_proj.weight.data) 76 | constant_(self.value_proj.bias.data, 0.) 77 | xavier_uniform_(self.output_proj.weight.data) 78 | constant_(self.output_proj.bias.data, 0.) 79 | 80 | def forward(self, query, reference_points, input_flatten, input_spatial_shapes, input_level_start_index, input_padding_mask=None): 81 | """ 82 | :param query (N, Length_{query}, C) 83 | :param reference_points (N, Length_{query}, n_levels, 2), range in [0, 1], top-left (0,0), bottom-right (1, 1), including padding area 84 | or (N, Length_{query}, n_levels, 4), add additional (w, h) to form reference boxes 85 | :param input_flatten (N, \sum_{l=0}^{L-1} H_l \cdot W_l, C) 86 | :param input_spatial_shapes (n_levels, 2), [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})] 87 | :param input_level_start_index (n_levels, ), [0, H_0*W_0, H_0*W_0+H_1*W_1, H_0*W_0+H_1*W_1+H_2*W_2, ..., H_0*W_0+H_1*W_1+...+H_{L-1}*W_{L-1}] 88 | :param input_padding_mask (N, \sum_{l=0}^{L-1} H_l \cdot W_l), True for padding elements, False for non-padding elements 89 | 90 | :return output (N, Length_{query}, C) 91 | """ 92 | N, Len_q, _ = query.shape 93 | N, Len_in, _ = input_flatten.shape 94 | assert (input_spatial_shapes[:, 0] * input_spatial_shapes[:, 1]).sum() == Len_in 95 | 96 | value = self.value_proj(input_flatten) 97 | if input_padding_mask is not None: 98 | value = value.masked_fill(input_padding_mask[..., None], float(0)) 99 | value = value.view(N, Len_in, self.n_heads, self.d_model // self.n_heads) 100 | sampling_offsets = self.sampling_offsets(query).view(N, Len_q, self.n_heads, self.n_levels, self.n_points, 2) 101 | attention_weights = self.attention_weights(query).view(N, Len_q, self.n_heads, self.n_levels * self.n_points) 102 | attention_weights = F.softmax(attention_weights, -1).view(N, Len_q, self.n_heads, self.n_levels, self.n_points) 103 | # N, Len_q, n_heads, n_levels, n_points, 2 104 | if reference_points.shape[-1] == 2: 105 | offset_normalizer = torch.stack([input_spatial_shapes[..., 1], input_spatial_shapes[..., 0]], -1) 106 | sampling_locations = reference_points[:, :, None, :, None, :] \ 107 | + sampling_offsets / offset_normalizer[None, None, None, :, None, :] 108 | elif reference_points.shape[-1] == 4: 109 | sampling_locations = reference_points[:, :, None, :, None, :2] \ 110 | + sampling_offsets / self.n_points * reference_points[:, :, None, :, None, 2:] * 0.5 111 | else: 112 | raise ValueError( 113 | 'Last dim of reference_points must be 2 or 4, but get {} instead.'.format(reference_points.shape[-1])) 114 | output = MSDeformAttnFunction.apply( 115 | value, input_spatial_shapes, input_level_start_index, sampling_locations, attention_weights, self.im2col_step) 116 | output = self.output_proj(output) 117 | 118 | return output, sampling_locations, attention_weights 119 | -------------------------------------------------------------------------------- 
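Usage note (editorial): the sketch below shows one way the MSDeformAttn module defined above can be exercised in isolation. It assumes the compiled MultiScaleDeformableAttention extension from models/ops is installed and a GPU is available, and that models/ops/modules/__init__.py re-exports MSDeformAttn as in Deformable DETR; the batch size, query count and feature-level shapes are made-up placeholders.

import torch
from models.ops.modules import MSDeformAttn  # assumes the package re-exports the class

# Two hypothetical feature levels, flattened into one sequence as the forward()
# docstring above describes: input_flatten is (N, sum_l H_l*W_l, C).
N, Len_q, C = 2, 300, 256
spatial_shapes = torch.as_tensor([[32, 32], [16, 16]], dtype=torch.long, device="cuda")
level_start_index = torch.cat((spatial_shapes.new_zeros(1),
                               (spatial_shapes[:, 0] * spatial_shapes[:, 1]).cumsum(0)[:-1]))
len_in = int((spatial_shapes[:, 0] * spatial_shapes[:, 1]).sum())

attn = MSDeformAttn(d_model=C, n_levels=2, n_heads=8, n_points=4).cuda()
query = torch.rand(N, Len_q, C, device="cuda")
input_flatten = torch.rand(N, len_in, C, device="cuda")
reference_points = torch.rand(N, Len_q, 2, 2, device="cuda")  # normalized (x, y) per level

# This modified forward() also returns the sampling locations and attention weights,
# which is what the "sample points visualization" comment at the top of the file refers to.
output, sampling_locations, attention_weights = attn(
    query, reference_points, input_flatten, spatial_shapes, level_start_index)
print(output.shape)  # torch.Size([2, 300, 256])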
/datasets/refexp.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Aishwarya Kamath & Nicolas Carion. Licensed under the Apache License 2.0. All Rights Reserved 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 3 | """ 4 | COCO dataset which returns image_id for evaluation.
5 | Mostly copy-paste from https://github.com/pytorch/vision/blob/13b35ff/references/detection/coco_utils.py 6 | """ 7 | from pathlib import Path 8 | 9 | import torch 10 | import torch.utils.data 11 | import torchvision 12 | from pycocotools import mask as coco_mask 13 | 14 | import datasets.transforms_image as T 15 | 16 | 17 | class ModulatedDetection(torchvision.datasets.CocoDetection): 18 | def __init__(self, img_folder, ann_file, transforms, return_masks): 19 | super(ModulatedDetection, self).__init__(img_folder, ann_file) 20 | self._transforms = transforms 21 | self.prepare = ConvertCocoPolysToMask(return_masks) 22 | 23 | def __getitem__(self, idx): 24 | instance_check = False 25 | while not instance_check: 26 | img, target = super(ModulatedDetection, self).__getitem__(idx) 27 | image_id = self.ids[idx] 28 | coco_img = self.coco.loadImgs(image_id)[0] 29 | caption = coco_img["caption"] 30 | dataset_name = coco_img["dataset_name"] if "dataset_name" in coco_img else None 31 | target = {"image_id": image_id, "annotations": target, "caption": caption} 32 | img, target = self.prepare(img, target) 33 | if self._transforms is not None: 34 | img, target = self._transforms(img, target) 35 | target["dataset_name"] = dataset_name 36 | for extra_key in ["sentence_id", "original_img_id", "original_id", "task_id"]: 37 | if extra_key in coco_img: 38 | target[extra_key] = coco_img[extra_key] # box xyxy -> cxcywh 39 | # FIXME: handle "valid", since some box may be removed due to random crop 40 | target["valid"] = torch.tensor([1]) if len(target["area"]) != 0 else torch.tensor([0]) 41 | 42 | if torch.any(target['valid'] == 1): # at least one instance 43 | instance_check = True 44 | else: 45 | import random 46 | idx = random.randint(0, self.__len__() - 1) 47 | return img.unsqueeze(0), target 48 | # return img: [1, 3, H, W], the first dimension means T = 1.
49 | 50 | 51 | def convert_coco_poly_to_mask(segmentations, height, width): 52 | masks = [] 53 | for polygons in segmentations: 54 | rles = coco_mask.frPyObjects(polygons, height, width) 55 | mask = coco_mask.decode(rles) 56 | if len(mask.shape) < 3: 57 | mask = mask[..., None] 58 | mask = torch.as_tensor(mask, dtype=torch.uint8) 59 | mask = mask.any(dim=2) 60 | masks.append(mask) 61 | if masks: 62 | masks = torch.stack(masks, dim=0) 63 | else: 64 | masks = torch.zeros((0, height, width), dtype=torch.uint8) 65 | return masks 66 | 67 | 68 | class ConvertCocoPolysToMask(object): 69 | def __init__(self, return_masks=False): 70 | self.return_masks = return_masks 71 | 72 | def __call__(self, image, target): 73 | w, h = image.size 74 | 75 | image_id = target["image_id"] 76 | image_id = torch.tensor([image_id]) 77 | 78 | anno = target["annotations"] 79 | caption = target["caption"] if "caption" in target else None 80 | 81 | anno = [obj for obj in anno if "iscrowd" not in obj or obj["iscrowd"] == 0] 82 | 83 | boxes = [obj["bbox"] for obj in anno] 84 | # guard against no boxes via resizing 85 | boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4) 86 | boxes[:, 2:] += boxes[:, :2] # xminyminwh -> xyxy 87 | boxes[:, 0::2].clamp_(min=0, max=w) 88 | boxes[:, 1::2].clamp_(min=0, max=h) 89 | 90 | classes = [obj["category_id"] for obj in anno] 91 | classes = torch.tensor(classes, dtype=torch.int64) 92 | 93 | if self.return_masks: 94 | segmentations = [obj["segmentation"] for obj in anno] 95 | masks = convert_coco_poly_to_mask(segmentations, h, w) 96 | 97 | # keep the valid boxes 98 | keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0]) 99 | boxes = boxes[keep] 100 | classes = classes[keep] 101 | if self.return_masks: 102 | masks = masks[keep] 103 | 104 | target = {} 105 | target["boxes"] = boxes 106 | target["labels"] = classes 107 | if caption is not None: 108 | target["caption"] = caption 109 | if self.return_masks: 110 | target["masks"] = masks 111 | target["image_id"] = image_id 112 | 113 | # for conversion to coco api 114 | area = torch.tensor([obj["area"] for obj in anno]) 115 | iscrowd = torch.tensor([obj["iscrowd"] if "iscrowd" in obj else 0 for obj in anno]) 116 | target["area"] = area[keep] 117 | target["iscrowd"] = iscrowd[keep] 118 | target["valid"] = torch.tensor([1]) 119 | target["orig_size"] = torch.as_tensor([int(h), int(w)]) 120 | target["size"] = torch.as_tensor([int(h), int(w)]) 121 | return image, target 122 | 123 | 124 | def make_coco_transforms(image_set, cautious): 125 | 126 | normalize = T.Compose([T.ToTensor(), T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])]) 127 | 128 | scales = [480, 512, 544, 576, 608, 640, 672, 704, 736, 768] 129 | final_scales = [296, 328, 360, 392, 416, 448, 480, 512] 130 | 131 | max_size = 800 132 | if image_set == "train": 133 | horizontal = [] if cautious else [T.RandomHorizontalFlip()] 134 | return T.Compose( 135 | horizontal 136 | + [ 137 | T.RandomSelect( 138 | T.RandomResize(scales, max_size=max_size), 139 | T.Compose( 140 | [ 141 | T.RandomResize([400, 500, 600]), 142 | T.RandomSizeCrop(384, 600, respect_boxes=cautious), 143 | T.RandomResize(final_scales, max_size=640), 144 | ] 145 | ), 146 | ), 147 | normalize, 148 | ] 149 | ) 150 | 151 | if image_set == "val": 152 | return T.Compose( 153 | [ 154 | T.RandomResize([360], max_size=640), 155 | normalize, 156 | ] 157 | ) 158 | 159 | raise ValueError(f"unknown {image_set}") 160 | 161 | 162 | def build(dataset_file, image_set, args): 163 | root = Path(args.coco_path) 
164 | assert root.exists(), f"provided COCO path {root} does not exist" 165 | mode = "instances" 166 | dataset = dataset_file 167 | PATHS = { 168 | "train": (root / "train2014", root / dataset / f"{mode}_{dataset}_train.json"), 169 | "val": (root / "train2014", root / dataset / f"{mode}_{dataset}_val.json"), 170 | } 171 | 172 | img_folder, ann_file = PATHS[image_set] 173 | dataset = ModulatedDetection( 174 | img_folder, 175 | ann_file, 176 | transforms=make_coco_transforms(image_set, False), 177 | return_masks=args.masks, 178 | ) 179 | return dataset --------------------------------------------------------------------------------
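Usage note (editorial): a minimal sketch of how the build() entry point in datasets/refexp.py above might be called. The argparse namespace, dataset name and paths are placeholders, not the project's actual training arguments; real COCO/RefCOCO images and annotation files must exist under args.coco_path for the loader to work.

from argparse import Namespace
from datasets.refexp import build

# Hypothetical arguments; build() only reads coco_path and masks here.
args = Namespace(coco_path="data/coco", masks=True)
# With dataset_file="refcoco", build() looks for data/coco/train2014 images and
# data/coco/refcoco/instances_refcoco_train.json annotations (see PATHS above).
dataset = build("refcoco", "train", args)

img, target = dataset[0]
print(img.shape)              # [1, 3, H, W]; T = 1, as noted in ModulatedDetection.__getitem__
print(sorted(target.keys()))  # e.g. boxes, caption, labels, masks, size, valid, ...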