├── util
│   ├── __init__.py
│   ├── __pycache__
│   │   ├── logger.cpython-38.pyc
│   │   ├── misc.cpython-38.pyc
│   │   ├── misc.cpython-39.pyc
│   │   ├── __init__.cpython-38.pyc
│   │   ├── __init__.cpython-39.pyc
│   │   ├── box_ops.cpython-38.pyc
│   │   ├── box_ops.cpython-39.pyc
│   │   └── visualization.cpython-38.pyc
│   ├── box_ops.py
│   ├── logger.py
│   └── visualization.py
├── pre_process
│   ├── __init__.py
│   ├── __pycache__
│   │   ├── data.cpython-38.pyc
│   │   ├── data.cpython-39.pyc
│   │   ├── __init__.cpython-38.pyc
│   │   ├── __init__.cpython-39.pyc
│   │   ├── dense_crf.cpython-39.pyc
│   │   ├── sim_model.cpython-38.pyc
│   │   ├── sim_model.cpython-39.pyc
│   │   ├── frozen_batchnorm2d.cpython-38.pyc
│   │   ├── frozen_batchnorm2d.cpython-39.pyc
│   │   └── generate_anno_a2d.cpython-39.pyc
│   ├── dense_crf.py
│   ├── frozen_batchnorm2d.py
│   ├── data.py
│   ├── generate_anno_ytvos.py
│   └── sim_model.py
├── models
│   ├── text_encoder
│   │   ├── __init__.py
│   │   ├── bpe_simple_vocab_16e6.txt.gz
│   │   ├── __pycache__
│   │   │   ├── __init__.cpython-38.pyc
│   │   │   ├── tokenizer.cpython-38.pyc
│   │   │   └── text_encoder.cpython-38.pyc
│   │   └── text_encoder.py
│   ├── ops
│   │   ├── MultiScaleDeformableAttention.egg-info
│   │   │   ├── dependency_links.txt
│   │   │   ├── top_level.txt
│   │   │   ├── PKG-INFO
│   │   │   └── SOURCES.txt
│   │   ├── modules
│   │   │   ├── __pycache__
│   │   │   │   ├── __init__.cpython-38.pyc
│   │   │   │   ├── __init__.cpython-39.pyc
│   │   │   │   ├── ms_deform_attn.cpython-38.pyc
│   │   │   │   └── ms_deform_attn.cpython-39.pyc
│   │   │   ├── __init__.py
│   │   │   └── ms_deform_attn.py
│   │   ├── functions
│   │   │   ├── __pycache__
│   │   │   │   ├── __init__.cpython-38.pyc
│   │   │   │   ├── __init__.cpython-39.pyc
│   │   │   │   ├── ms_deform_attn_func.cpython-38.pyc
│   │   │   │   └── ms_deform_attn_func.cpython-39.pyc
│   │   │   ├── __init__.py
│   │   │   └── ms_deform_attn_func.py
│   │   ├── dist
│   │   │   └── MultiScaleDeformableAttention-1.0-py3.8-linux-x86_64.egg
│   │   ├── build
│   │   │   ├── lib.linux-x86_64-cpython-38
│   │   │   │   ├── MultiScaleDeformableAttention.cpython-38-x86_64-linux-gnu.so
│   │   │   │   ├── modules
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   └── ms_deform_attn.py
│   │   │   │   └── functions
│   │   │   │       ├── __init__.py
│   │   │   │       └── ms_deform_attn_func.py
│   │   │   └── temp.linux-x86_64-cpython-38
│   │   │       └── media
│   │   │           └── HardDisk_B
│   │   │               └── Users
│   │   │                   └── wx
│   │   │                       └── wwk_files
│   │   │                           └── codes
│   │   │                               └── referring_video_segmentation
│   │   │                                   └── sgmg_ablations
│   │   │                                       └── models
│   │   │                                           └── ops
│   │   │                                               └── src
│   │   │                                                   ├── vision.o
│   │   │                                                   ├── cpu
│   │   │                                                   │   └── ms_deform_attn_cpu.o
│   │   │                                                   └── cuda
│   │   │                                                       └── ms_deform_attn_cuda.o
│   │   ├── make.sh
│   │   ├── src
│   │   │   ├── vision.cpp
│   │   │   ├── cuda
│   │   │   │   └── ms_deform_attn_cuda.h
│   │   │   ├── cpu
│   │   │   │   ├── ms_deform_attn_cpu.h
│   │   │   │   └── ms_deform_attn_cpu.cpp
│   │   │   └── ms_deform_attn.h
│   │   ├── setup.py
│   │   └── test.py
│   ├── __pycache__
│   │   ├── ocpg.cpython-38.pyc
│   │   ├── sgmg.cpython-38.pyc
│   │   ├── sgmg.cpython-39.pyc
│   │   ├── __init__.cpython-38.pyc
│   │   ├── __init__.cpython-39.pyc
│   │   ├── backbone.cpython-38.pyc
│   │   ├── backbone.cpython-39.pyc
│   │   ├── criterion.cpython-38.pyc
│   │   ├── decoder.cpython-38.pyc
│   │   ├── matcher.cpython-38.pyc
│   │   ├── modules.cpython-38.pyc
│   │   ├── postprocessors.cpython-38.pyc
│   │   ├── segmentation.cpython-38.pyc
│   │   ├── position_encoding.cpython-38.pyc
│   │   ├── position_encoding.cpython-39.pyc
│   │   ├── deformable_transformer.cpython-38.pyc
│   │   ├── deformable_transformer.cpython-39.pyc
│   │   └── video_swin_transformer.cpython-38.pyc
│   ├── __init__.py
│   ├── decoder.py
│   ├── modules.py
│   ├── backbone.py
│   └── position_encoding.py
├── tools
│   ├── colormap.py
│   └── load_pretrained_weights.py
├── davis2017
│   ├── __init__.py
│   ├── __pycache__
│   │   ├── davis.cpython-38.pyc
│   │   ├── utils.cpython-38.pyc
│   │   ├── __init__.cpython-38.pyc
│   │   ├── metrics.cpython-38.pyc
│   │   ├── results.cpython-38.pyc
│   │   └── evaluation.cpython-38.pyc
│   ├── results.py
│   ├── davis.py
│   ├── evaluation.py
│   └── utils.py
├── README.md
├── datasets
│   ├── __pycache__
│   │   ├── a2d.cpython-38.pyc
│   │   ├── a2d.cpython-39.pyc
│   │   ├── davis.cpython-38.pyc
│   │   ├── davis.cpython-39.pyc
│   │   ├── jhmdb.cpython-38.pyc
│   │   ├── jhmdb.cpython-39.pyc
│   │   ├── refexp.cpython-38.pyc
│   │   ├── refexp.cpython-39.pyc
│   │   ├── ytvos.cpython-38.pyc
│   │   ├── ytvos.cpython-39.pyc
│   │   ├── __init__.cpython-38.pyc
│   │   ├── __init__.cpython-39.pyc
│   │   ├── a2d_eval.cpython-38.pyc
│   │   ├── coco_eval.cpython-38.pyc
│   │   ├── samplers.cpython-38.pyc
│   │   ├── samplers.cpython-39.pyc
│   │   ├── categories.cpython-38.pyc
│   │   ├── categories.cpython-39.pyc
│   │   ├── refexp2seq.cpython-38.pyc
│   │   ├── refexp2seq.cpython-39.pyc
│   │   ├── refexp_eval.cpython-38.pyc
│   │   ├── concat_dataset.cpython-38.pyc
│   │   ├── concat_dataset.cpython-39.pyc
│   │   ├── transforms_image.cpython-38.pyc
│   │   ├── transforms_image.cpython-39.pyc
│   │   ├── transforms_video.cpython-38.pyc
│   │   ├── transforms_video.cpython-39.pyc
│   │   ├── image_to_seq_augmenter.cpython-38.pyc
│   │   └── image_to_seq_augmenter.cpython-39.pyc
│   ├── concat_dataset.py
│   ├── __init__.py
│   ├── refexp_eval.py
│   ├── categories.py
│   ├── a2d_eval.py
│   ├── image_to_seq_augmenter.py
│   ├── coco.py
│   ├── samplers.py
│   └── refexp.py
├── scripts
│   ├── dist_test_ytvos_resnet101_boxsup.sh
│   ├── dist_train_a2d_resnet101.sh
│   ├── dist_train_a2d_resnet101_box.sh
│   ├── dist_train_a2d_resnet101_boxsup.sh
│   ├── dist_train_a2d_resnet101_pointsup.sh
│   ├── dist_train_ytvos_resnet101.sh
│   ├── dist_train_a2d_resnet101_freeze.sh
│   ├── dist_test_ytvos_videoswinb.sh
│   ├── dist_train_a2d_resnet101_boxvos.sh
│   ├── dist_train_a2d_resnet101_boxlevelset.sh
│   ├── dist_train_a2d_resnet101_boxinst_point.sh
│   ├── dist_train_a2d_resnet101_pointsup_partialsup.sh
│   ├── dist_train_a2d_resnet101_boxlevelset_point.sh
│   ├── dist_test_a2d_resnet101.sh
│   ├── dist_test_jhmdb_resnet101.sh
│   ├── dist_train_scratch_ytvos_videoswin.sh
│   ├── dist_train_ytvos_resnet101_boxsup.sh
│   ├── dist_test_a2d_resnet101_boxsup.sh
│   ├── dist_test_a2d_resnet101_pointsup.sh
│   ├── dist_test_jhmdb_resnet101_boxsup.sh
│   ├── dist_test_jhmdb_resnet101_pointsup.sh
│   ├── dist_test_a2d_videoswint.sh
│   ├── dist_train_a2d_videoswinb.sh
│   ├── dist_train_ytvos_videoswin.sh
│   ├── dist_train_ytvos_videoswinb.sh
│   ├── dist_test_davis_resnet.sh
│   └── dist_test_davis_videoswinb.sh
├── eval_davis.py
└── utils.py
/util/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/pre_process/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/models/text_encoder/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/tools/colormap.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | 
3 | 
--------------------------------------------------------------------------------
/models/ops/MultiScaleDeformableAttention.egg-info/dependency_links.txt:
--------------------------------------------------------------------------------
1 | 
2 | 
--------------------------------------------------------------------------------
/davis2017/__init__.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import
2 | 
3 | __version__ = '0.1.0'
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # OCPG
2 | Weakly Supervised Referring Video Object Segmentation with Object-Centric Pseudo-Guidance
3 | 
--------------------------------------------------------------------------------
/models/ops/MultiScaleDeformableAttention.egg-info/top_level.txt:
--------------------------------------------------------------------------------
1 | MultiScaleDeformableAttention
2 | functions
3 | modules
--------------------------------------------------------------------------------
/models/text_encoder/bpe_simple_vocab_16e6.txt.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TJUMMG/OCPG/HEAD/models/text_encoder/bpe_simple_vocab_16e6.txt.gz
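The gzipped file above is the byte-pair-encoding merge table used by the tokenizer in models/text_encoder; the file name matches the vocabulary shipped with CLIP. A quick inspection is sketched below. Only the path comes from this repository; the skip-the-first-line convention is borrowed from CLIP's reference tokenizer and is an assumption about how OCPG's own tokenizer.py parses it.

# Hypothetical inspection of the BPE merge table; the parsing details are assumed
# from CLIP's simple_tokenizer, not taken from this repository.
import gzip

with gzip.open("models/text_encoder/bpe_simple_vocab_16e6.txt.gz", "rt", encoding="utf-8") as f:
    lines = f.read().split("\n")

merges = lines[1:]  # CLIP's vocab file starts with a header line
print(len(merges), "merge rules; first few:", merges[:3])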
--------------------------------------------------------------------------------
/models/__init__.py:
--------------------------------------------------------------------------------
1 | from .ocpg import build
2 | 
3 | def build_model(args):
4 |     print("\n **** BUILD MODEL FOR OCPG. **** \n")
5 |     return build(args)
6 | 
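models/__init__.py exposes build_model, presumably the single factory that main.py and inference_ytvos.py go through. Because it relies on a package-relative import of models/ocpg.py, it has to be imported as part of the models package from the repository root. A minimal wiring check is sketched below, assuming the repository root is the working directory and the dependencies of models/ocpg.py are installed; build_model itself is not called here because it needs the fully populated argument namespace that main.py builds.

# Hypothetical import check for the model factory; nothing here is repo code.
import importlib

models = importlib.import_module("models")
print(models.build_model.__module__, models.build_model.__name__)  # expected: models build_model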
--------------------------------------------------------------------------------
/models/ops/MultiScaleDeformableAttention.egg-info/PKG-INFO:
--------------------------------------------------------------------------------
1 | Metadata-Version: 2.1
2 | Name: MultiScaleDeformableAttention
3 | Version: 1.0
4 | Summary: PyTorch Wrapper for CUDA Functions of Multi-Scale Deformable Attention
5 | Home-page: https://github.com/fundamentalvision/Deformable-DETR
6 | Author: Weijie Su
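The egg-info above is generated when the multi-scale deformable attention extension under models/ops is built (make.sh just runs python setup.py build install). A post-install smoke test is sketched below: the importable module name comes from top_level.txt and the two operator names from models/ops/src/vision.cpp, while the assumption of a CUDA-enabled PyTorch build and the rest of the snippet are illustrative; models/ops/test.py (not shown in this dump) is presumably the fuller check.

# Hypothetical smoke test for the compiled MultiScaleDeformableAttention extension.
# Run it after `cd models/ops && bash make.sh`.
import torch
import MultiScaleDeformableAttention as MSDA  # module name from top_level.txt

print("CUDA available :", torch.cuda.is_available())
print("forward bound  :", hasattr(MSDA, "ms_deform_attn_forward"))   # registered in src/vision.cpp
print("backward bound :", hasattr(MSDA, "ms_deform_attn_backward"))  # registered in src/vision.cpp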
--------------------------------------------------------------------------------
/tools/load_pretrained_weights.py:
--------------------------------------------------------------------------------
1 | import torch
2 | 
3 | def pre_trained_model_to_finetune(checkpoint, args):
4 |     checkpoint = checkpoint['model']
5 |     # only delete the class_embed since the finetuned dataset has different num_classes
6 |     num_layers = args.dec_layers + 1 if args.two_stage else args.dec_layers
7 |     for l in range(num_layers):
8 |         del checkpoint["class_embed.{}.weight".format(l)]
9 |         del checkpoint["class_embed.{}.bias".format(l)]
10 | 
11 |     return checkpoint
12 | 
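A short usage sketch for the helper above: it strips the class_embed heads from a pretrained checkpoint so the remaining weights can be loaded into a model finetuned on a dataset with a different number of classes. The checkpoint path and the model variable are placeholders; dec_layers and two_stage are exactly the fields the helper reads.

# Hypothetical caller of pre_trained_model_to_finetune; "pretrain.pth" and "model"
# are placeholders, not names taken from this repository.
from types import SimpleNamespace
import torch

from tools.load_pretrained_weights import pre_trained_model_to_finetune

args = SimpleNamespace(dec_layers=4, two_stage=False)         # the only fields the helper touches
checkpoint = torch.load("pretrain.pth", map_location="cpu")   # expected layout: {'model': state_dict, ...}
state_dict = pre_trained_model_to_finetune(checkpoint, args)  # removes class_embed.{0..3}.{weight,bias}
# model.load_state_dict(state_dict, strict=False)             # strict=False leaves the new heads randomly initialized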
--------------------------------------------------------------------------------
/models/ops/make.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | # ------------------------------------------------------------------------------------------------
3 | # Deformable DETR
4 | # Copyright (c) 2020 SenseTime. All Rights Reserved.
5 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
6 | # ------------------------------------------------------------------------------------------------
7 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
8 | # ------------------------------------------------------------------------------------------------
9 | 
10 | python setup.py build install
--------------------------------------------------------------------------------
/models/ops/modules/__init__.py:
--------------------------------------------------------------------------------
1 | # ------------------------------------------------------------------------------------------------
2 | # Deformable DETR
3 | # Copyright (c) 2020 SenseTime. All Rights Reserved.
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
5 | # ------------------------------------------------------------------------------------------------
6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
7 | # ------------------------------------------------------------------------------------------------
8 | 
9 | from .ms_deform_attn import MSDeformAttn
10 | 
--------------------------------------------------------------------------------
/models/ops/functions/__init__.py:
--------------------------------------------------------------------------------
1 | # ------------------------------------------------------------------------------------------------
2 | # Deformable DETR
3 | # Copyright (c) 2020 SenseTime. All Rights Reserved.
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
5 | # ------------------------------------------------------------------------------------------------
6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
7 | # ------------------------------------------------------------------------------------------------
8 | 
9 | from .ms_deform_attn_func import MSDeformAttnFunction
10 | 
11 | 
--------------------------------------------------------------------------------
/models/ops/MultiScaleDeformableAttention.egg-info/SOURCES.txt:
--------------------------------------------------------------------------------
1 | setup.py
2 | /media/HardDisk_B/Users/wx/wwk_files/codes/referring_video_segmentation/sgmg_ablations/models/ops/src/vision.cpp
3 | /media/HardDisk_B/Users/wx/wwk_files/codes/referring_video_segmentation/sgmg_ablations/models/ops/src/cpu/ms_deform_attn_cpu.cpp
4 | /media/HardDisk_B/Users/wx/wwk_files/codes/referring_video_segmentation/sgmg_ablations/models/ops/src/cuda/ms_deform_attn_cuda.cu
5 | MultiScaleDeformableAttention.egg-info/PKG-INFO
6 | MultiScaleDeformableAttention.egg-info/SOURCES.txt
7 | MultiScaleDeformableAttention.egg-info/dependency_links.txt
8 | MultiScaleDeformableAttention.egg-info/top_level.txt
9 | functions/__init__.py
10 | functions/ms_deform_attn_func.py
11 | modules/__init__.py
12 | modules/ms_deform_attn.py
--------------------------------------------------------------------------------
/scripts/dist_test_ytvos_resnet101_boxsup.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | set -x
3 | 
4 | GPUS='1'
5 | GPUS_PER_NODE=1
6 | CPUS_PER_TASK=6
7 | PORT=29500
8 | export CUDA_VISIBLE_DEVICES=${GPUS}
9 | echo "using gpus ${GPUS}, master port ${PORT}."
10 | now=$(date +"%T")
11 | echo "Current time : $now"
12 | echo "Current path : $PWD"
13 | 
14 | BACKBONE="resnet101"
15 | # BACKBONE_PRETRAINED="./checkpoints/backbones/swin_base_patch244_window877_kinetics600_22k.pth"
16 | OUTPUT_DIR="./results/SgMg_${BACKBONE}_eval_ytvos_boxsup"
17 | CHECKPOINT="./results/SgMg_resnet101_scratch_ytvos_boxsup/checkpoint0009.pth"
18 | python inference_ytvos.py --with_box_refine --binary --freeze_text_encoder \
19 |     --eval \
20 |     --ngpu=${GPUS_PER_NODE} \
21 |     --output_dir=${OUTPUT_DIR} \
22 |     --resume=${CHECKPOINT} \
23 |     --backbone=${BACKBONE}
24 | 
25 | 
26 | 
--------------------------------------------------------------------------------
/scripts/dist_train_a2d_resnet101.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | set -x
3 | 
4 | GPUS='0'
5 | PORT=25503
6 | GPUS_PER_NODE=1
7 | CPUS_PER_TASK=6
8 | export CUDA_VISIBLE_DEVICES=${GPUS}
9 | echo "using gpus ${GPUS}, master port ${PORT}."
10 | now=$(date +"%T") 11 | echo "Current time : $now" 12 | echo "Current path : $PWD" 13 | 14 | BACKBONE="resnet101" 15 | OUTPUT_DIR="./results/SgMg_${BACKBONE}_scratch_a2d" 16 | EXP_NAME="SgMg_${BACKBONE}_scratch_a2d" 17 | CUDA_VISIBLE_DEVICES=${GPUS} OMP_NUM_THREADS=${CPUS_PER_TASK} torchrun --master_port ${PORT} --nproc_per_node=${GPUS_PER_NODE} main.py \ 18 | --with_box_refine --binary --freeze_text_encoder \ 19 | --exp_name=${EXP_NAME} \ 20 | --output_dir=${OUTPUT_DIR} \ 21 | --backbone=${BACKBONE} \ 22 | --dataset_file a2d \ 23 | --batch_size 2 \ 24 | --epochs 12 --lr_drop 3 5 \ 25 | 26 | 27 | -------------------------------------------------------------------------------- /scripts/dist_train_a2d_resnet101_box.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -x 3 | 4 | GPUS='0' 5 | PORT=25503 6 | GPUS_PER_NODE=1 7 | CPUS_PER_TASK=6 8 | export CUDA_VISIBLE_DEVICES=${GPUS} 9 | echo "using gpus ${GPUS}, master port ${PORT}." 10 | now=$(date +"%T") 11 | echo "Current time : $now" 12 | echo "Current path : $PWD" 13 | 14 | BACKBONE="resnet101" 15 | OUTPUT_DIR="./results/OCPG_${BACKBONE}_scratch_a2d_boxsup" 16 | EXP_NAME="OCPG_${BACKBONE}_scratch_a2d" 17 | CUDA_VISIBLE_DEVICES=${GPUS} OMP_NUM_THREADS=${CPUS_PER_TASK} torchrun --master_port ${PORT} --nproc_per_node=${GPUS_PER_NODE} main.py \ 18 | --with_box_refine --binary --freeze_text_encoder --supervision=box \ 19 | --exp_name=${EXP_NAME} \ 20 | --output_dir=${OUTPUT_DIR} \ 21 | --backbone=${BACKBONE} \ 22 | --dataset_file a2d \ 23 | --batch_size 2 \ 24 | --epochs 12 --lr_drop 3 5 25 | 26 | 27 | -------------------------------------------------------------------------------- /scripts/dist_train_a2d_resnet101_boxsup.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -x 3 | 4 | GPUS='0' 5 | PORT=25503 6 | GPUS_PER_NODE=1 7 | CPUS_PER_TASK=6 8 | export CUDA_VISIBLE_DEVICES=${GPUS} 9 | echo "using gpus ${GPUS}, master port ${PORT}." 10 | now=$(date +"%T") 11 | echo "Current time : $now" 12 | echo "Current path : $PWD" 13 | 14 | BACKBONE="resnet101" 15 | OUTPUT_DIR="./results/SgMg_${BACKBONE}_scratch_a2d_boxsup" 16 | EXP_NAME="SgMg_${BACKBONE}_scratch_a2d" 17 | CUDA_VISIBLE_DEVICES=${GPUS} OMP_NUM_THREADS=${CPUS_PER_TASK} torchrun --master_port ${PORT} --nproc_per_node=${GPUS_PER_NODE} main.py \ 18 | --with_box_refine --binary --freeze_text_encoder --supervision=box \ 19 | --exp_name=${EXP_NAME} \ 20 | --output_dir=${OUTPUT_DIR} \ 21 | --backbone=${BACKBONE} \ 22 | --dataset_file a2d \ 23 | --batch_size 2 \ 24 | --epochs 12 --lr_drop 3 5 \ 25 | 26 | 27 | -------------------------------------------------------------------------------- /models/ops/src/vision.cpp: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | #include "ms_deform_attn.h" 12 | 13 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 14 | m.def("ms_deform_attn_forward", &ms_deform_attn_forward, "ms_deform_attn_forward"); 15 | m.def("ms_deform_attn_backward", &ms_deform_attn_backward, "ms_deform_attn_backward"); 16 | } 17 | -------------------------------------------------------------------------------- /scripts/dist_train_a2d_resnet101_pointsup.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -x 3 | 4 | GPUS='0' 5 | PORT=25503 6 | GPUS_PER_NODE=1 7 | CPUS_PER_TASK=6 8 | export CUDA_VISIBLE_DEVICES=${GPUS} 9 | echo "using gpus ${GPUS}, master port ${PORT}." 10 | now=$(date +"%T") 11 | echo "Current time : $now" 12 | echo "Current path : $PWD" 13 | 14 | BACKBONE="resnet101" 15 | OUTPUT_DIR="./results/SgMg_${BACKBONE}_scratch_a2d_pointsup" 16 | EXP_NAME="SgMg_${BACKBONE}_scratch_a2d" 17 | CUDA_VISIBLE_DEVICES=${GPUS} OMP_NUM_THREADS=${CPUS_PER_TASK} torchrun --master_port ${PORT} --nproc_per_node=${GPUS_PER_NODE} main.py \ 18 | --with_box_refine --binary --freeze_text_encoder --supervision=point \ 19 | --exp_name=${EXP_NAME} \ 20 | --output_dir=${OUTPUT_DIR} \ 21 | --backbone=${BACKBONE} \ 22 | --dataset_file a2d \ 23 | --batch_size 2 \ 24 | --epochs 12 --lr_drop 3 5 \ 25 | 26 | 27 | -------------------------------------------------------------------------------- /scripts/dist_train_ytvos_resnet101.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -x 3 | 4 | GPUS='0' 5 | PORT=25500 6 | GPUS_PER_NODE=1 7 | CPUS_PER_TASK=6 8 | export CUDA_VISIBLE_DEVICES=${GPUS} 9 | echo "using gpus ${GPUS}, master port ${PORT}." 10 | now=$(date +"%T") 11 | echo "Current time : $now" 12 | echo "Current path : $PWD" 13 | 14 | BACKBONE="resnet101" 15 | # BACKBONE_PRETRAINED="./checkpoints/backbones/swin_tiny_patch244_window877_kinetics400_1k.pth" 16 | OUTPUT_DIR="./results/SgMg_${BACKBONE}_scratch_ytvos" 17 | EXP_NAME="SgMg_${BACKBONE}_scratch" 18 | CUDA_VISIBLE_DEVICES=${GPUS} OMP_NUM_THREADS=${CPUS_PER_TASK} torchrun --master_port ${PORT} --nproc_per_node=${GPUS_PER_NODE} main.py \ 19 | --with_box_refine --binary --freeze_text_encoder \ 20 | --output_dir=${OUTPUT_DIR} \ 21 | --exp_name=${EXP_NAME} \ 22 | --backbone=${BACKBONE} \ 23 | --dataset_file ytvos \ 24 | --amp -------------------------------------------------------------------------------- /scripts/dist_train_a2d_resnet101_freeze.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -x 3 | 4 | GPUS='0' 5 | PORT=25503 6 | GPUS_PER_NODE=1 7 | CPUS_PER_TASK=6 8 | export CUDA_VISIBLE_DEVICES=${GPUS} 9 | echo "using gpus ${GPUS}, master port ${PORT}." 
10 | now=$(date +"%T") 11 | echo "Current time : $now" 12 | echo "Current path : $PWD" 13 | 14 | BACKBONE="resnet101" 15 | OUTPUT_DIR="./results/results/SgMg_${BACKBONE}_scratch_a2d_freeze" 16 | EXP_NAME="SgMg_${BACKBONE}_scratch_a2d" 17 | CUDA_VISIBLE_DEVICES=${GPUS} OMP_NUM_THREADS=${CPUS_PER_TASK} torchrun --master_port ${PORT} --nproc_per_node=${GPUS_PER_NODE} main.py \ 18 | --with_box_refine --binary --freeze_text_encoder --freeze_video_encoder \ 19 | --exp_name=${EXP_NAME} \ 20 | --output_dir=${OUTPUT_DIR} \ 21 | --backbone=${BACKBONE} \ 22 | --dataset_file a2d \ 23 | --batch_size 4 \ 24 | --epochs 12 --lr_drop 3 5 \ 25 | 26 | 27 | -------------------------------------------------------------------------------- /scripts/dist_test_ytvos_videoswinb.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -x 3 | cd .. 4 | 5 | GPUS='0,1' 6 | GPUS_PER_NODE=2 7 | CPUS_PER_TASK=6 8 | PORT=29500 9 | export CUDA_VISIBLE_DEVICES=${GPUS} 10 | echo "using gpus ${GPUS}, master port ${PORT}." 11 | now=$(date +"%T") 12 | echo "Current time : $now" 13 | echo "Current path : $PWD" 14 | 15 | BACKBONE="video_swin_b_p4w7" 16 | BACKBONE_PRETRAINED="./checkpoints/backbones/swin_base_patch244_window877_kinetics600_22k.pth" 17 | OUTPUT_DIR="./checkpoints/results/SgMg_${BACKBONE}_eval" 18 | CHECKPOINT="./checkpoints/sgmg_videosiwnb_ytvos.pth" 19 | python inference_ytvos.py --with_box_refine --binary --freeze_text_encoder \ 20 | --eval \ 21 | --ngpu=${GPUS_PER_NODE} \ 22 | --output_dir=${OUTPUT_DIR} \ 23 | --resume=${CHECKPOINT} \ 24 | --backbone=${BACKBONE} \ 25 | --backbone_pretrained=${BACKBONE_PRETRAINED} \ 26 | --amp \ 27 | 28 | 29 | -------------------------------------------------------------------------------- /scripts/dist_train_a2d_resnet101_boxvos.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -x 3 | 4 | GPUS='0' 5 | PORT=25503 6 | GPUS_PER_NODE=1 7 | CPUS_PER_TASK=6 8 | export CUDA_VISIBLE_DEVICES=${GPUS} 9 | echo "using gpus ${GPUS}, master port ${PORT}." 10 | now=$(date +"%T") 11 | echo "Current time : $now" 12 | echo "Current path : $PWD" 13 | 14 | BACKBONE="resnet101" 15 | OUTPUT_DIR="./results/SgMg_${BACKBONE}_scratch_a2d_boxsup_boxvos" 16 | EXP_NAME="SgMg_${BACKBONE}_scratch_a2d" 17 | CUDA_VISIBLE_DEVICES=${GPUS} OMP_NUM_THREADS=${CPUS_PER_TASK} torchrun --master_port ${PORT} --nproc_per_node=${GPUS_PER_NODE} main.py \ 18 | --with_box_refine --binary --freeze_text_encoder --supervision=box \ 19 | --exp_name=${EXP_NAME} \ 20 | --output_dir=${OUTPUT_DIR} \ 21 | --backbone=${BACKBONE} \ 22 | --dataset_file a2d \ 23 | --batch_size 2 \ 24 | --epochs 12 --lr_drop 3 5 \ 25 | --supervision_type boxvos 26 | 27 | 28 | -------------------------------------------------------------------------------- /scripts/dist_train_a2d_resnet101_boxlevelset.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -x 3 | 4 | GPUS='0' 5 | PORT=25503 6 | GPUS_PER_NODE=1 7 | CPUS_PER_TASK=6 8 | export CUDA_VISIBLE_DEVICES=${GPUS} 9 | echo "using gpus ${GPUS}, master port ${PORT}." 
10 | now=$(date +"%T") 11 | echo "Current time : $now" 12 | echo "Current path : $PWD" 13 | 14 | BACKBONE="resnet101" 15 | OUTPUT_DIR="./results/SgMg_${BACKBONE}_scratch_a2d_boxsup_boxlevelset" 16 | EXP_NAME="SgMg_${BACKBONE}_scratch_a2d" 17 | CUDA_VISIBLE_DEVICES=${GPUS} OMP_NUM_THREADS=${CPUS_PER_TASK} torchrun --master_port ${PORT} --nproc_per_node=${GPUS_PER_NODE} main.py \ 18 | --with_box_refine --binary --freeze_text_encoder --supervision=box \ 19 | --exp_name=${EXP_NAME} \ 20 | --output_dir=${OUTPUT_DIR} \ 21 | --backbone=${BACKBONE} \ 22 | --dataset_file a2d \ 23 | --batch_size 2 \ 24 | --epochs 12 --lr_drop 3 5 \ 25 | --supervision_type boxlevelset 26 | 27 | 28 | -------------------------------------------------------------------------------- /scripts/dist_train_a2d_resnet101_boxinst_point.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -x 3 | 4 | GPUS='0' 5 | PORT=25503 6 | GPUS_PER_NODE=1 7 | CPUS_PER_TASK=6 8 | export CUDA_VISIBLE_DEVICES=${GPUS} 9 | echo "using gpus ${GPUS}, master port ${PORT}." 10 | now=$(date +"%T") 11 | echo "Current time : $now" 12 | echo "Current path : $PWD" 13 | 14 | BACKBONE="resnet101" 15 | OUTPUT_DIR="./results/SgMg_${BACKBONE}_scratch_a2d_boxsup_boxinst_point" 16 | EXP_NAME="SgMg_${BACKBONE}_scratch_a2d" 17 | CUDA_VISIBLE_DEVICES=${GPUS} OMP_NUM_THREADS=${CPUS_PER_TASK} torchrun --master_port ${PORT} --nproc_per_node=${GPUS_PER_NODE} main.py \ 18 | --with_box_refine --binary --freeze_text_encoder --supervision=point \ 19 | --exp_name=${EXP_NAME} \ 20 | --output_dir=${OUTPUT_DIR} \ 21 | --backbone=${BACKBONE} \ 22 | --dataset_file a2d \ 23 | --batch_size 2 \ 24 | --epochs 12 --lr_drop 3 5 \ 25 | --supervision_type boxinst 26 | 27 | 28 | -------------------------------------------------------------------------------- /scripts/dist_train_a2d_resnet101_pointsup_partialsup.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -x 3 | 4 | GPUS='1' 5 | PORT=25501 6 | GPUS_PER_NODE=1 7 | CPUS_PER_TASK=6 8 | export CUDA_VISIBLE_DEVICES=${GPUS} 9 | echo "using gpus ${GPUS}, master port ${PORT}." 10 | now=$(date +"%T") 11 | echo "Current time : $now" 12 | echo "Current path : $PWD" 13 | 14 | BACKBONE="resnet101" 15 | OUTPUT_DIR="./results/SgMg_${BACKBONE}_scratch_a2d_pointsup_partialsup" 16 | EXP_NAME="SgMg_${BACKBONE}_scratch_a2d" 17 | CUDA_VISIBLE_DEVICES=${GPUS} OMP_NUM_THREADS=${CPUS_PER_TASK} torchrun --master_port ${PORT} --nproc_per_node=${GPUS_PER_NODE} main.py \ 18 | --with_box_refine --binary --freeze_text_encoder --supervision=point \ 19 | --exp_name=${EXP_NAME} \ 20 | --output_dir=${OUTPUT_DIR} \ 21 | --backbone=${BACKBONE} \ 22 | --dataset_file a2d \ 23 | --batch_size 2 \ 24 | --epochs 12 --lr_drop 3 5 \ 25 | --supervision_type partialsup 26 | 27 | 28 | -------------------------------------------------------------------------------- /scripts/dist_train_a2d_resnet101_boxlevelset_point.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -x 3 | 4 | GPUS='0' 5 | PORT=25503 6 | GPUS_PER_NODE=1 7 | CPUS_PER_TASK=6 8 | export CUDA_VISIBLE_DEVICES=${GPUS} 9 | echo "using gpus ${GPUS}, master port ${PORT}." 
10 | now=$(date +"%T") 11 | echo "Current time : $now" 12 | echo "Current path : $PWD" 13 | 14 | BACKBONE="resnet101" 15 | OUTPUT_DIR="./results/SgMg_${BACKBONE}_scratch_a2d_boxsup_boxlevelset_point" 16 | EXP_NAME="SgMg_${BACKBONE}_scratch_a2d" 17 | CUDA_VISIBLE_DEVICES=${GPUS} OMP_NUM_THREADS=${CPUS_PER_TASK} torchrun --master_port ${PORT} --nproc_per_node=${GPUS_PER_NODE} main.py \ 18 | --with_box_refine --binary --freeze_text_encoder --supervision=point \ 19 | --exp_name=${EXP_NAME} \ 20 | --output_dir=${OUTPUT_DIR} \ 21 | --backbone=${BACKBONE} \ 22 | --dataset_file a2d \ 23 | --batch_size 2 \ 24 | --epochs 12 --lr_drop 3 5 \ 25 | --supervision_type boxlevelset 26 | 27 | 28 | -------------------------------------------------------------------------------- /scripts/dist_test_a2d_resnet101.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -x 3 | 4 | GPUS='0' 5 | PORT=25503 6 | GPUS_PER_NODE=1 7 | CPUS_PER_TASK=6 8 | export CUDA_VISIBLE_DEVICES=${GPUS} 9 | echo "using gpus ${GPUS}, master port ${PORT}." 10 | now=$(date +"%T") 11 | echo "Current time : $now" 12 | echo "Current path : $PWD" 13 | 14 | BACKBONE="resnet101" 15 | OUTPUT_DIR="./results/SgMg_${BACKBONE}_scratch_a2d" 16 | EXP_NAME="SgMg_${BACKBONE}_scratch_a2d" 17 | RESUME="results/SgMg_resnet101_scratch_a2d/checkpoint0011.pth" 18 | CUDA_VISIBLE_DEVICES=${GPUS} OMP_NUM_THREADS=${CPUS_PER_TASK} torchrun --master_port ${PORT} --nproc_per_node=${GPUS_PER_NODE} main.py \ 19 | --with_box_refine --binary --freeze_text_encoder \ 20 | --exp_name=${EXP_NAME} \ 21 | --output_dir=${OUTPUT_DIR} \ 22 | --backbone=${BACKBONE} \ 23 | --dataset_file a2d \ 24 | --batch_size 4 \ 25 | --epochs 12 --lr_drop 3 5 \ 26 | --eval \ 27 | --resume=${RESUME} 28 | 29 | 30 | -------------------------------------------------------------------------------- /scripts/dist_test_jhmdb_resnet101.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -x 3 | 4 | GPUS='0' 5 | PORT=25503 6 | GPUS_PER_NODE=1 7 | CPUS_PER_TASK=6 8 | export CUDA_VISIBLE_DEVICES=${GPUS} 9 | echo "using gpus ${GPUS}, master port ${PORT}." 10 | now=$(date +"%T") 11 | echo "Current time : $now" 12 | echo "Current path : $PWD" 13 | 14 | BACKBONE="resnet101" 15 | OUTPUT_DIR="./results/SgMg_${BACKBONE}_scratch_a2d" 16 | EXP_NAME="SgMg_${BACKBONE}_scratch_a2d" 17 | RESUME="results/SgMg_resnet101_scratch_a2d/checkpoint0011.pth" 18 | CUDA_VISIBLE_DEVICES=${GPUS} OMP_NUM_THREADS=${CPUS_PER_TASK} torchrun --master_port ${PORT} --nproc_per_node=${GPUS_PER_NODE} main.py \ 19 | --with_box_refine --binary --freeze_text_encoder \ 20 | --exp_name=${EXP_NAME} \ 21 | --output_dir=${OUTPUT_DIR} \ 22 | --backbone=${BACKBONE} \ 23 | --dataset_file jhmdb \ 24 | --batch_size 4 \ 25 | --epochs 12 --lr_drop 3 5 \ 26 | --eval \ 27 | --resume=${RESUME} 28 | 29 | 30 | -------------------------------------------------------------------------------- /scripts/dist_train_scratch_ytvos_videoswin.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -x 3 | cd .. 4 | 5 | GPUS='0,1' 6 | PORT=25500 7 | GPUS_PER_NODE=2 8 | CPUS_PER_TASK=6 9 | export CUDA_VISIBLE_DEVICES=${GPUS} 10 | echo "using gpus ${GPUS}, master port ${PORT}." 
11 | now=$(date +"%T") 12 | echo "Current time : $now" 13 | echo "Current path : $PWD" 14 | 15 | BACKBONE="video_swin_t_p4w7" 16 | BACKBONE_PRETRAINED="./checkpoints/backbones/swin_tiny_patch244_window877_kinetics400_1k.pth" 17 | OUTPUT_DIR="./checkpoints/results/SgMg_${BACKBONE}_scratch" 18 | EXP_NAME="SgMg_${BACKBONE}_scratch" 19 | CUDA_VISIBLE_DEVICES=${GPUS} OMP_NUM_THREADS=${CPUS_PER_TASK} torchrun --master_port ${PORT} --nproc_per_node=${GPUS_PER_NODE} main.py \ 20 | --with_box_refine --binary --freeze_text_encoder \ 21 | --output_dir=${OUTPUT_DIR} \ 22 | --exp_name=${EXP_NAME} \ 23 | --backbone=${BACKBONE} \ 24 | --backbone_pretrained=${BACKBONE_PRETRAINED} \ 25 | --dataset_file ytvos \ 26 | --amp 27 | -------------------------------------------------------------------------------- /scripts/dist_train_ytvos_resnet101_boxsup.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -x 3 | 4 | GPUS='0' 5 | PORT=2023 6 | GPUS_PER_NODE=1 7 | CPUS_PER_TASK=6 8 | export CUDA_VISIBLE_DEVICES=${GPUS} 9 | echo "using gpus ${GPUS}, master port ${PORT}." 10 | now=$(date +"%T") 11 | echo "Current time : $now" 12 | echo "Current path : $PWD" 13 | 14 | BACKBONE="resnet101" 15 | # BACKBONE_PRETRAINED="./checkpoints/backbones/swin_tiny_patch244_window877_kinetics400_1k.pth" 16 | OUTPUT_DIR="./results/SgMg_${BACKBONE}_scratch_ytvos_boxsup" 17 | EXP_NAME="SgMg_${BACKBONE}_scratch" 18 | CUDA_VISIBLE_DEVICES=${GPUS} OMP_NUM_THREADS=${CPUS_PER_TASK} torchrun --master_port ${PORT} --nproc_per_node=${GPUS_PER_NODE} main.py \ 19 | --with_box_refine --binary --freeze_text_encoder --supervision=box \ 20 | --output_dir=${OUTPUT_DIR} \ 21 | --exp_name=${EXP_NAME} \ 22 | --backbone=${BACKBONE} \ 23 | --dataset_file ytvos \ 24 | # --resume results/SgMg_resnet101_scratch_ytvos_boxsup/checkpoint0000.pth -------------------------------------------------------------------------------- /scripts/dist_test_a2d_resnet101_boxsup.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -x 3 | 4 | GPUS='1' 5 | PORT=29501 6 | GPUS_PER_NODE=1 7 | CPUS_PER_TASK=6 8 | export CUDA_VISIBLE_DEVICES=${GPUS} 9 | echo "using gpus ${GPUS}, master port ${PORT}." 10 | now=$(date +"%T") 11 | echo "Current time : $now" 12 | echo "Current path : $PWD" 13 | 14 | BACKBONE="resnet101" 15 | OUTPUT_DIR="./results/SgMg_${BACKBONE}_eval_scratch_a2d_boxsup" 16 | EXP_NAME="SgMg_${BACKBONE}_scratch_a2d" 17 | RESUME="results/SgMg_resnet101_scratch_a2d_boxsup_boxlevelset/checkpoint0009.pth" 18 | CUDA_VISIBLE_DEVICES=${GPUS} OMP_NUM_THREADS=${CPUS_PER_TASK} torchrun --master_port ${PORT} --nproc_per_node=${GPUS_PER_NODE} main.py \ 19 | --with_box_refine --binary --freeze_text_encoder --supervision box \ 20 | --exp_name=${EXP_NAME} \ 21 | --output_dir=${OUTPUT_DIR} \ 22 | --backbone=${BACKBONE} \ 23 | --dataset_file a2d \ 24 | --batch_size 4 \ 25 | --epochs 12 --lr_drop 3 5 \ 26 | --eval \ 27 | --resume=${RESUME} 28 | 29 | 30 | -------------------------------------------------------------------------------- /scripts/dist_test_a2d_resnet101_pointsup.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -x 3 | 4 | GPUS='1' 5 | PORT=25505 6 | GPUS_PER_NODE=1 7 | CPUS_PER_TASK=6 8 | export CUDA_VISIBLE_DEVICES=${GPUS} 9 | echo "using gpus ${GPUS}, master port ${PORT}." 
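# Evaluation only (--eval): scores a point-supervised ResNet-101 checkpoint (RESUME below) on A2D-Sentences.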
10 | now=$(date +"%T") 11 | echo "Current time : $now" 12 | echo "Current path : $PWD" 13 | 14 | BACKBONE="resnet101" 15 | OUTPUT_DIR="./results/SgMg_${BACKBONE}_scratch_a2d_pointsup" 16 | EXP_NAME="SgMg_${BACKBONE}_scratch_a2d" 17 | RESUME="results/SgMg_resnet101_scratch_a2d_pointsup_partialsup/checkpoint0001.pth" 18 | CUDA_VISIBLE_DEVICES=${GPUS} OMP_NUM_THREADS=${CPUS_PER_TASK} torchrun --master_port ${PORT} --nproc_per_node=${GPUS_PER_NODE} main.py \ 19 | --with_box_refine --binary --freeze_text_encoder --supervision point \ 20 | --exp_name=${EXP_NAME} \ 21 | --output_dir=${OUTPUT_DIR} \ 22 | --backbone=${BACKBONE} \ 23 | --dataset_file a2d \ 24 | --batch_size 4 \ 25 | --epochs 12 --lr_drop 3 5 \ 26 | --eval \ 27 | --resume=${RESUME} 28 | 29 | 30 | -------------------------------------------------------------------------------- /scripts/dist_test_jhmdb_resnet101_boxsup.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -x 3 | 4 | GPUS='1' 5 | PORT=25509 6 | GPUS_PER_NODE=1 7 | CPUS_PER_TASK=6 8 | export CUDA_VISIBLE_DEVICES=${GPUS} 9 | echo "using gpus ${GPUS}, master port ${PORT}." 10 | now=$(date +"%T") 11 | echo "Current time : $now" 12 | echo "Current path : $PWD" 13 | 14 | BACKBONE="resnet101" 15 | OUTPUT_DIR="./results/SgMg_${BACKBONE}_scratch_jhmdb_boxsup" 16 | EXP_NAME="SgMg_${BACKBONE}_scratch_a2d" 17 | RESUME="results/SgMg_resnet101_scratch_a2d_boxsup_boxlevelset/checkpoint0010.pth" 18 | CUDA_VISIBLE_DEVICES=${GPUS} OMP_NUM_THREADS=${CPUS_PER_TASK} torchrun --master_port ${PORT} --nproc_per_node=${GPUS_PER_NODE} main.py \ 19 | --with_box_refine --binary --freeze_text_encoder --supervision box \ 20 | --exp_name=${EXP_NAME} \ 21 | --output_dir=${OUTPUT_DIR} \ 22 | --backbone=${BACKBONE} \ 23 | --dataset_file jhmdb \ 24 | --batch_size 4 \ 25 | --epochs 12 --lr_drop 3 5 \ 26 | --eval \ 27 | --resume=${RESUME} 28 | 29 | 30 | -------------------------------------------------------------------------------- /scripts/dist_test_jhmdb_resnet101_pointsup.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -x 3 | 4 | GPUS='1' 5 | PORT=25505 6 | GPUS_PER_NODE=1 7 | CPUS_PER_TASK=6 8 | export CUDA_VISIBLE_DEVICES=${GPUS} 9 | echo "using gpus ${GPUS}, master port ${PORT}." 10 | now=$(date +"%T") 11 | echo "Current time : $now" 12 | echo "Current path : $PWD" 13 | 14 | BACKBONE="resnet101" 15 | OUTPUT_DIR="./results/SgMg_${BACKBONE}_scratch_jhmdb_pointsup" 16 | EXP_NAME="SgMg_${BACKBONE}_scratch_a2d" 17 | RESUME="results/SgMg_resnet101_scratch_a2d_pointsup_partialsup/checkpoint0001.pth" 18 | CUDA_VISIBLE_DEVICES=${GPUS} OMP_NUM_THREADS=${CPUS_PER_TASK} torchrun --master_port ${PORT} --nproc_per_node=${GPUS_PER_NODE} main.py \ 19 | --with_box_refine --binary --freeze_text_encoder --supervision point \ 20 | --exp_name=${EXP_NAME} \ 21 | --output_dir=${OUTPUT_DIR} \ 22 | --backbone=${BACKBONE} \ 23 | --dataset_file jhmdb \ 24 | --batch_size 4 \ 25 | --epochs 12 --lr_drop 3 5 \ 26 | --eval \ 27 | --resume=${RESUME} 28 | 29 | 30 | -------------------------------------------------------------------------------- /scripts/dist_test_a2d_videoswint.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -x 3 | 4 | GPUS='0' 5 | PORT=25503 6 | GPUS_PER_NODE=1 7 | CPUS_PER_TASK=6 8 | export CUDA_VISIBLE_DEVICES=${GPUS} 9 | echo "using gpus ${GPUS}, master port ${PORT}." 
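# Evaluation only (--eval): scores the Video-Swin-T SgMg weights given by PRETRAINED_WEIGHTS on A2D-Sentences; TRANSFORMERS_OFFLINE=1 keeps HuggingFace Transformers from reaching the network when loading the text encoder.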
10 | now=$(date +"%T") 11 | echo "Current time : $now" 12 | echo "Current path : $PWD" 13 | 14 | BACKBONE="video_swin_t_p4w7" 15 | BACKBONE_PRETRAINED="./checkpoints/backbones/swin_base_patch244_window877_kinetics600_22k.pth" 16 | OUTPUT_DIR="./results/results/SgMg_${BACKBONE}_finetune_a2d" 17 | EXP_NAME="SgMg_${BACKBONE}_finetune_a2d" 18 | PRETRAINED_WEIGHTS="checkpoints/sgmg_videoswint_a2d.pth" 19 | TRANSFORMERS_OFFLINE=1 CUDA_VISIBLE_DEVICES=${GPUS} OMP_NUM_THREADS=${CPUS_PER_TASK} torchrun --master_port ${PORT} --nproc_per_node=${GPUS_PER_NODE} main.py \ 20 | --with_box_refine --binary --freeze_text_encoder \ 21 | --exp_name=${EXP_NAME} \ 22 | --output_dir=${OUTPUT_DIR} \ 23 | --backbone=${BACKBONE} \ 24 | --dataset_file a2d \ 25 | --batch_size 2 \ 26 | --epochs 6 --lr_drop 3 5 \ 27 | --pretrained_weights=${PRETRAINED_WEIGHTS} \ 28 | --eval \ 29 | 30 | 31 | -------------------------------------------------------------------------------- /scripts/dist_train_a2d_videoswinb.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -x 3 | cd .. 4 | 5 | GPUS='0,1' 6 | PORT=25503 7 | GPUS_PER_NODE=2 8 | CPUS_PER_TASK=6 9 | export CUDA_VISIBLE_DEVICES=${GPUS} 10 | echo "using gpus ${GPUS}, master port ${PORT}." 11 | now=$(date +"%T") 12 | echo "Current time : $now" 13 | echo "Current path : $PWD" 14 | 15 | BACKBONE="video_swin_b_p4w7" 16 | BACKBONE_PRETRAINED="./checkpoints/backbones/swin_base_patch244_window877_kinetics600_22k.pth" 17 | OUTPUT_DIR="./checkpoints/results/SgMg_${BACKBONE}_finetune_a2d" 18 | EXP_NAME="SgMg_${BACKBONE}_finetune_a2d" 19 | PRETRAINED_WEIGHTS="xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx" 20 | CUDA_VISIBLE_DEVICES=${GPUS} OMP_NUM_THREADS=${CPUS_PER_TASK} torchrun --master_port ${PORT} --nproc_per_node=${GPUS_PER_NODE} main.py \ 21 | --with_box_refine --binary --freeze_text_encoder \ 22 | --exp_name=${EXP_NAME} \ 23 | --output_dir=${OUTPUT_DIR} \ 24 | --backbone=${BACKBONE} \ 25 | --backbone_pretrained=${BACKBONE_PRETRAINED} \ 26 | --dataset_file a2d \ 27 | --batch_size 2 \ 28 | --epochs 6 --lr_drop 3 5 \ 29 | --pretrained_weights=${PRETRAINED_WEIGHTS} \ 30 | --use_checkpoint \ 31 | 32 | 33 | -------------------------------------------------------------------------------- /models/ops/src/cuda/ms_deform_attn_cuda.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | #pragma once 12 | #include 13 | 14 | at::Tensor ms_deform_attn_cuda_forward( 15 | const at::Tensor &value, 16 | const at::Tensor &spatial_shapes, 17 | const at::Tensor &level_start_index, 18 | const at::Tensor &sampling_loc, 19 | const at::Tensor &attn_weight, 20 | const int im2col_step); 21 | 22 | std::vector ms_deform_attn_cuda_backward( 23 | const at::Tensor &value, 24 | const at::Tensor &spatial_shapes, 25 | const at::Tensor &level_start_index, 26 | const at::Tensor &sampling_loc, 27 | const at::Tensor &attn_weight, 28 | const at::Tensor &grad_output, 29 | const int im2col_step); 30 | 31 | -------------------------------------------------------------------------------- /models/ops/src/cpu/ms_deform_attn_cpu.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | #pragma once 12 | #include 13 | 14 | at::Tensor 15 | ms_deform_attn_cpu_forward( 16 | const at::Tensor &value, 17 | const at::Tensor &spatial_shapes, 18 | const at::Tensor &level_start_index, 19 | const at::Tensor &sampling_loc, 20 | const at::Tensor &attn_weight, 21 | const int im2col_step); 22 | 23 | std::vector 24 | ms_deform_attn_cpu_backward( 25 | const at::Tensor &value, 26 | const at::Tensor &spatial_shapes, 27 | const at::Tensor &level_start_index, 28 | const at::Tensor &sampling_loc, 29 | const at::Tensor &attn_weight, 30 | const at::Tensor &grad_output, 31 | const int im2col_step); 32 | 33 | 34 | -------------------------------------------------------------------------------- /davis2017/results.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | from PIL import Image 4 | import sys 5 | 6 | 7 | class Results(object): 8 | def __init__(self, root_dir): 9 | self.root_dir = root_dir 10 | 11 | def _read_mask(self, sequence, frame_id): 12 | try: 13 | mask_path = os.path.join(self.root_dir, sequence, f'{frame_id}.png') 14 | return np.array(Image.open(mask_path)) 15 | except IOError as err: 16 | sys.stdout.write(sequence + " frame %s not found!\n" % frame_id) 17 | sys.stdout.write("The frames have to be indexed PNG files placed inside the corespondent sequence " 18 | "folder.\nThe indexes have to match with the initial frame.\n") 19 | sys.stderr.write("IOError: " + err.strerror + "\n") 20 | sys.exit() 21 | 22 | def read_masks(self, sequence, masks_id): 23 | mask_0 = self._read_mask(sequence, masks_id[0]) 24 | masks = np.zeros((len(masks_id), *mask_0.shape)) 25 | for ii, m in enumerate(masks_id): 26 | masks[ii, ...] 
= self._read_mask(sequence, m) 27 | num_objects = int(np.max(masks)) 28 | tmp = np.ones((num_objects, *masks.shape)) 29 | tmp = tmp * np.arange(1, num_objects + 1)[:, None, None, None] 30 | masks = (tmp == masks[None, ...]) > 0 31 | return masks 32 | -------------------------------------------------------------------------------- /models/ops/src/cpu/ms_deform_attn_cpu.cpp: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | #include 12 | 13 | #include 14 | #include 15 | 16 | 17 | at::Tensor 18 | ms_deform_attn_cpu_forward( 19 | const at::Tensor &value, 20 | const at::Tensor &spatial_shapes, 21 | const at::Tensor &level_start_index, 22 | const at::Tensor &sampling_loc, 23 | const at::Tensor &attn_weight, 24 | const int im2col_step) 25 | { 26 | AT_ERROR("Not implement on cpu"); 27 | } 28 | 29 | std::vector 30 | ms_deform_attn_cpu_backward( 31 | const at::Tensor &value, 32 | const at::Tensor &spatial_shapes, 33 | const at::Tensor &level_start_index, 34 | const at::Tensor &sampling_loc, 35 | const at::Tensor &attn_weight, 36 | const at::Tensor &grad_output, 37 | const int im2col_step) 38 | { 39 | AT_ERROR("Not implement on cpu"); 40 | } 41 | 42 | -------------------------------------------------------------------------------- /scripts/dist_train_ytvos_videoswin.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -x 3 | cd .. 4 | 5 | GPUS='0,1' 6 | PORT=25500 7 | GPUS_PER_NODE=2 8 | CPUS_PER_TASK=6 9 | export CUDA_VISIBLE_DEVICES=${GPUS} 10 | echo "using gpus ${GPUS}, master port ${PORT}." 
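# Two-stage recipe: stage 1 pretrains on the joint image datasets (main_pretrain.py, --dataset_file all, --num_frames 1, 11 epochs); stage 2 fine-tunes on Ref-YouTube-VOS (main.py) starting from the stage-1 checkpoint0010.pth.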
11 | now=$(date +"%T") 12 | echo "Current time : $now" 13 | echo "Current path : $PWD" 14 | 15 | BACKBONE="video_swin_t_p4w7" 16 | BACKBONE_PRETRAINED="./checkpoints/backbones/swin_tiny_patch244_window877_kinetics400_1k.pth" 17 | OUTPUT_DIR1="./checkpoints/results/SgMg_${BACKBONE}_pretrain" 18 | EXP_NAME1="SgMg_${BACKBONE}_pretrain" 19 | CUDA_VISIBLE_DEVICES=${GPUS} OMP_NUM_THREADS=${CPUS_PER_TASK} torchrun --master_port ${PORT} --nproc_per_node=${GPUS_PER_NODE} main_pretrain.py \ 20 | --dataset_file all \ 21 | --with_box_refine --binary \ 22 | --output_dir=${OUTPUT_DIR1} \ 23 | --exp_name=${EXP_NAME1} \ 24 | --backbone=${BACKBONE} \ 25 | --backbone_pretrained=${BACKBONE_PRETRAINED} \ 26 | --batch_size 2 \ 27 | --num_frames 1 \ 28 | --epochs 11 --lr_drop 8 10 \ 29 | 30 | 31 | OUTPUT_DIR2="./checkpoints/results/SgMg_${BACKBONE}_finetune" 32 | EXP_NAME2="SgMg_${BACKBONE}_finetune" 33 | CUDA_VISIBLE_DEVICES=${GPUS} OMP_NUM_THREADS=${CPUS_PER_TASK} torchrun --master_port ${PORT} --nproc_per_node=${GPUS_PER_NODE} main.py \ 34 | --with_box_refine --binary --freeze_text_encoder \ 35 | --output_dir=${OUTPUT_DIR2} \ 36 | --exp_name=${EXP_NAME2} \ 37 | --backbone=${BACKBONE} \ 38 | --backbone_pretrained=${BACKBONE_PRETRAINED} \ 39 | --epochs 6 --lr_drop 3 5 \ 40 | --dataset_file ytvos \ 41 | --pretrained_weights ${OUTPUT_DIR1}"/checkpoint0010.pth" \ 42 | -------------------------------------------------------------------------------- /scripts/dist_train_ytvos_videoswinb.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -x 3 | cd .. 4 | 5 | GPUS='0,1' 6 | PORT=25501 7 | GPUS_PER_NODE=2 8 | CPUS_PER_TASK=6 9 | export CUDA_VISIBLE_DEVICES=${GPUS} 10 | echo "using gpus ${GPUS}, master port ${PORT}." 
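# Same two-stage pretrain-then-finetune recipe as dist_train_ytvos_videoswin.sh, but with the Video-Swin-B backbone initialized from the Kinetics-600 (22K) weights.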
11 | now=$(date +"%T") 12 | echo "Current time : $now" 13 | echo "Current path : $PWD" 14 | 15 | BACKBONE="video_swin_b_p4w7" 16 | BACKBONE_PRETRAINED="./checkpoints/backbones/swin_base_patch244_window877_kinetics600_22k.pth" 17 | OUTPUT_DIR1="./checkpoints/results/SgMg_${BACKBONE}_pretrain" 18 | EXP_NAME1="SgMg_${BACKBONE}_pretrain" 19 | CUDA_VISIBLE_DEVICES=${GPUS} OMP_NUM_THREADS=${CPUS_PER_TASK} torchrun --master_port ${PORT} --nproc_per_node=${GPUS_PER_NODE} main_pretrain.py \ 20 | --dataset_file all \ 21 | --with_box_refine --binary \ 22 | --output_dir=${OUTPUT_DIR1} \ 23 | --exp_name=${EXP_NAME1} \ 24 | --backbone=${BACKBONE} \ 25 | --backbone_pretrained=${BACKBONE_PRETRAINED} \ 26 | --batch_size 2 \ 27 | --num_frames 1 \ 28 | --epochs 11 --lr_drop 8 10 \ 29 | 30 | 31 | OUTPUT_DIR2="./checkpoints/results/SgMg_${BACKBONE}_finetune" 32 | EXP_NAME2="SgMg_${BACKBONE}_finetune" 33 | CUDA_VISIBLE_DEVICES=${GPUS} OMP_NUM_THREADS=${CPUS_PER_TASK} torchrun --master_port ${PORT} --nproc_per_node=${GPUS_PER_NODE} main.py \ 34 | --with_box_refine --binary --freeze_text_encoder \ 35 | --output_dir=${OUTPUT_DIR2} \ 36 | --exp_name=${EXP_NAME2} \ 37 | --backbone=${BACKBONE} \ 38 | --backbone_pretrained=${BACKBONE_PRETRAINED} \ 39 | --epochs 6 --lr_drop 3 5 \ 40 | --dataset_file ytvos \ 41 | --pretrained_weights ${OUTPUT_DIR1}"/checkpoint0010.pth" \ 42 | 43 | -------------------------------------------------------------------------------- /pre_process/dense_crf.py: -------------------------------------------------------------------------------- 1 | import pydensecrf.densecrf as dcrf 2 | import numpy as np 3 | from pydensecrf.utils import unary_from_labels 4 | 5 | 6 | def sigmoid(x): 7 | return 1 / (1 + np.exp(-x)) 8 | 9 | 10 | def apply_dense_crf(img, mask): 11 | EPSILON = 1e-8 12 | M = 2 # salient or not 13 | tau = 1.05 14 | # Setup the CRF model 15 | d = dcrf.DenseCRF2D(img.shape[1], img.shape[0], M) 16 | anno_norm = mask / 255.0 17 | n_energy = -np.log((1.0 - anno_norm + EPSILON)) / (tau * sigmoid(1 - anno_norm)) 18 | p_energy = -np.log(anno_norm + EPSILON) / (tau * sigmoid(anno_norm)) 19 | 20 | U = np.zeros((M, img.shape[0] * img.shape[1]), dtype="float32") 21 | U[0, :] = n_energy.flatten() 22 | U[1, :] = p_energy.flatten() 23 | 24 | d.setUnaryEnergy(U) 25 | 26 | d.addPairwiseGaussian(sxy=3, compat=3) 27 | # d.addPairwiseBilateral(sxy=60, srgb=5, rgbim=img, compat=5) 28 | d.addPairwiseBilateral(sxy=(80, 80), srgb=(13, 13, 13), rgbim=img, compat=10) 29 | 30 | # Do the inference 31 | infer = np.array(d.inference(4)).astype("float32") 32 | res = infer[1, :] 33 | 34 | res = res * 255 35 | res = res.reshape(img.shape[:2]).astype("uint8") 36 | return res 37 | 38 | 39 | def crf_inference_label(img, labels, t=10, n_labels=21, gt_prob=0.7): 40 | h, w = img.shape[:2] 41 | 42 | d = dcrf.DenseCRF2D(w, h, n_labels) 43 | 44 | unary = unary_from_labels(labels, n_labels, gt_prob=gt_prob, zero_unsure=False) 45 | 46 | d.setUnaryEnergy(unary) 47 | d.addPairwiseGaussian(sxy=3, compat=3) 48 | d.addPairwiseBilateral( 49 | sxy=50, srgb=5, rgbim=np.ascontiguousarray(np.copy(img)), compat=10 50 | ) 51 | 52 | q = d.inference(t) 53 | 54 | return np.argmax(np.array(q).reshape((n_labels, h, w)), axis=0) 55 | -------------------------------------------------------------------------------- /scripts/dist_test_davis_resnet.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -x 3 | 4 | GPUS='0' 5 | GPUS_PER_NODE=1 6 | CPUS_PER_TASK=6 7 | PORT=29500 8 | export 
CUDA_VISIBLE_DEVICES=${GPUS} 9 | echo "using gpus ${GPUS}, master port ${PORT}." 10 | now=$(date +"%T") 11 | echo "Current time : $now" 12 | echo "Current path : $PWD" 13 | 14 | BACKBONE="resnet101" 15 | # BACKBONE_PRETRAINED="./checkpoints/backbones/swin_base_patch244_window877_kinetics600_22k.pth" 16 | OUTPUT_DIR="./results/SgMg_${BACKBONE}_eval" 17 | CHECKPOINT="./results/SgMg_resnet101_scratch_ytvos_boxsup/checkpoint0009.pth" 18 | python inference_davis.py --with_box_refine --binary --freeze_text_encoder \ 19 | --eval \ 20 | --ngpu=${GPUS_PER_NODE} \ 21 | --output_dir=${OUTPUT_DIR} \ 22 | --resume=${CHECKPOINT} \ 23 | --backbone=${BACKBONE} \ 24 | --backbone_pretrained=${BACKBONE_PRETRAINED} 25 | 26 | 27 | # evaluation 28 | ANNO0_DIR=${OUTPUT_DIR}/"DVS_Annotations"/"anno_0" 29 | ANNO1_DIR=${OUTPUT_DIR}/"DVS_Annotations"/"anno_1" 30 | ANNO2_DIR=${OUTPUT_DIR}/"DVS_Annotations"/"anno_2" 31 | ANNO3_DIR=${OUTPUT_DIR}/"DVS_Annotations"/"anno_3" 32 | echo "Annotations store at : ${ANNO0_DIR}" 33 | rm ${ANNO0_DIR}"/global_results-val.csv" 34 | rm ${ANNO0_DIR}"/per-sequence_results-val.csv" 35 | rm ${ANNO1_DIR}"/global_results-val.csv" 36 | rm ${ANNO1_DIR}"/per-sequence_results-val.csv" 37 | rm ${ANNO2_DIR}"/global_results-val.csv" 38 | rm ${ANNO2_DIR}"/per-sequence_results-val.csv" 39 | rm ${ANNO3_DIR}"/global_results-val.csv" 40 | rm ${ANNO3_DIR}"/per-sequence_results-val.csv" 41 | 42 | python3 eval_davis.py --results_path=${ANNO0_DIR} 43 | python3 eval_davis.py --results_path=${ANNO1_DIR} 44 | python3 eval_davis.py --results_path=${ANNO2_DIR} 45 | python3 eval_davis.py --results_path=${ANNO3_DIR} 46 | 47 | echo "Working path is: ${OUTPUT_DIR}" 48 | 49 | 50 | 51 | -------------------------------------------------------------------------------- /scripts/dist_test_davis_videoswinb.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -x 3 | cd .. 4 | 5 | GPUS='0,1' 6 | GPUS_PER_NODE=2 7 | CPUS_PER_TASK=6 8 | PORT=29500 9 | export CUDA_VISIBLE_DEVICES=${GPUS} 10 | echo "using gpus ${GPUS}, master port ${PORT}." 
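# Runs inference_davis.py with the Video-Swin-B checkpoint under mixed precision, then scores the predictions for each of the four Ref-DAVIS annotators (anno_0 .. anno_3) with eval_davis.py; stale result CSVs are removed first so the metrics are recomputed.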
11 | now=$(date +"%T") 12 | echo "Current time : $now" 13 | echo "Current path : $PWD" 14 | 15 | BACKBONE="video_swin_b_p4w7" 16 | BACKBONE_PRETRAINED="./checkpoints/backbones/swin_base_patch244_window877_kinetics600_22k.pth" 17 | OUTPUT_DIR="./checkpoints/results/SgMg_${BACKBONE}_eval" 18 | CHECKPOINT="./checkpoints/sgmg_videosiwnb_ytvos.pth" 19 | python inference_davis.py --with_box_refine --binary --freeze_text_encoder \ 20 | --eval \ 21 | --ngpu=${GPUS_PER_NODE} \ 22 | --output_dir=${OUTPUT_DIR} \ 23 | --resume=${CHECKPOINT} \ 24 | --backbone=${BACKBONE} \ 25 | --backbone_pretrained=${BACKBONE_PRETRAINED} \ 26 | --amp \ 27 | 28 | 29 | # evaluation 30 | ANNO0_DIR=${OUTPUT_DIR}/"DVS_Annotations"/"anno_0" 31 | ANNO1_DIR=${OUTPUT_DIR}/"DVS_Annotations"/"anno_1" 32 | ANNO2_DIR=${OUTPUT_DIR}/"DVS_Annotations"/"anno_2" 33 | ANNO3_DIR=${OUTPUT_DIR}/"DVS_Annotations"/"anno_3" 34 | echo "Annotations store at : ${ANNO0_DIR}" 35 | rm ${ANNO0_DIR}"/global_results-val.csv" 36 | rm ${ANNO0_DIR}"/per-sequence_results-val.csv" 37 | rm ${ANNO1_DIR}"/global_results-val.csv" 38 | rm ${ANNO1_DIR}"/per-sequence_results-val.csv" 39 | rm ${ANNO2_DIR}"/global_results-val.csv" 40 | rm ${ANNO2_DIR}"/per-sequence_results-val.csv" 41 | rm ${ANNO3_DIR}"/global_results-val.csv" 42 | rm ${ANNO3_DIR}"/per-sequence_results-val.csv" 43 | 44 | python3 eval_davis.py --results_path=${ANNO0_DIR} 45 | python3 eval_davis.py --results_path=${ANNO1_DIR} 46 | python3 eval_davis.py --results_path=${ANNO2_DIR} 47 | python3 eval_davis.py --results_path=${ANNO3_DIR} 48 | 49 | echo "Working path is: ${OUTPUT_DIR}" 50 | 51 | 52 | 53 | -------------------------------------------------------------------------------- /pre_process/frozen_batchnorm2d.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | class FrozenBatchNorm2d(torch.nn.Module): 5 | """ 6 | BatchNorm2d where the batch statistics and the affine parameters are fixed. 7 | 8 | Copy-paste from torchvision.misc.ops with added eps before rqsrt, 9 | without which any other models than torchvision.models.resnet[18,34,50,101] 10 | produce nans. 
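    The frozen statistics are folded into a per-channel affine transform,
    y = weight * (x - running_mean) / sqrt(running_var + eps) + bias,
    which forward() applies as x * scale + bias.

    Illustrative usage (channel count and spatial size are arbitrary):
        >>> import torch
        >>> bn = FrozenBatchNorm2d(64)
        >>> y = bn(torch.randn(2, 64, 32, 32))  # output has the same shape as the input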
11 | """ 12 | 13 | def __init__(self, n): 14 | super(FrozenBatchNorm2d, self).__init__() 15 | self.register_buffer("weight", torch.ones(n)) 16 | self.register_buffer("bias", torch.zeros(n)) 17 | self.register_buffer("running_mean", torch.zeros(n)) 18 | self.register_buffer("running_var", torch.ones(n)) 19 | 20 | def _load_from_state_dict( 21 | self, 22 | state_dict, 23 | prefix, 24 | local_metadata, 25 | strict, 26 | missing_keys, 27 | unexpected_keys, 28 | error_msgs, 29 | ): 30 | num_batches_tracked_key = prefix + "num_batches_tracked" 31 | if num_batches_tracked_key in state_dict: 32 | del state_dict[num_batches_tracked_key] 33 | 34 | super(FrozenBatchNorm2d, self)._load_from_state_dict( 35 | state_dict, 36 | prefix, 37 | local_metadata, 38 | strict, 39 | missing_keys, 40 | unexpected_keys, 41 | error_msgs, 42 | ) 43 | 44 | def forward(self, x): 45 | # move reshapes to the beginning 46 | # to make it fuser-friendly 47 | w = self.weight.reshape(1, -1, 1, 1) 48 | b = self.bias.reshape(1, -1, 1, 1) 49 | rv = self.running_var.reshape(1, -1, 1, 1) 50 | rm = self.running_mean.reshape(1, -1, 1, 1) 51 | eps = 1e-5 52 | scale = w * (rv + eps).rsqrt() 53 | bias = b - rm * scale 54 | return x * scale + bias 55 | -------------------------------------------------------------------------------- /models/decoder.py: -------------------------------------------------------------------------------- 1 | 2 | import copy 3 | from typing import Optional, List 4 | import math 5 | 6 | import torch 7 | import torch.nn.functional as F 8 | from torch import nn, Tensor 9 | from torch.nn.init import xavier_uniform_, constant_, uniform_, normal_ 10 | from util.misc import inverse_sigmoid 11 | from einops import rearrange 12 | 13 | 14 | class MSO(nn.Module): 15 | def __init__(self, mask_dim=16, img_dim=[96, 192], out_dim=16): 16 | super().__init__() 17 | 18 | self.mask_dim = mask_dim 19 | self.img_dim = img_dim 20 | self.out_dim = out_dim 21 | 22 | self.conv1_1div8 = nn.Conv2d(mask_dim+img_dim[1], mask_dim, kernel_size=3, padding=1) 23 | self.conv2_1div8 = nn.Conv2d(mask_dim, mask_dim, kernel_size=3, padding=1) 24 | 25 | self.conv1_1div4 = nn.Conv2d(mask_dim + img_dim[0], mask_dim, kernel_size=3, padding=1) 26 | self.conv2_1div4 = nn.Conv2d(mask_dim, mask_dim, kernel_size=3, padding=1) 27 | self.out_conv = nn.Conv2d(mask_dim, 1, kernel_size=3, padding=1) 28 | 29 | 30 | # TODO: add image on channel. deconv to upsample 31 | def forward(self, pred_masks, image_features): 32 | image_features = [x.tensors for x in image_features] # 1/4 & 1/8 33 | 34 | # merge with 1/8 image 35 | assert pred_masks.shape[-1] == image_features[-1].shape[-1], "First size wrong." 36 | x = torch.cat([pred_masks, image_features[-1]], dim=1) 37 | pred_masks += self.conv2_1div8(F.relu(self.conv1_1div8(F.relu(x)))) 38 | 39 | # merge with 1/4 image 40 | pred_masks = F.interpolate(pred_masks, size=(image_features[-2].shape[-2], image_features[-2].shape[-1]), mode='bilinear', align_corners=False) 41 | assert pred_masks.shape[-1] == image_features[-2].shape[-1], "Second size wrong." 
42 | x = torch.cat([pred_masks, image_features[-2]], dim=1) 43 | pred_masks += self.conv2_1div4(F.relu(self.conv1_1div4(F.relu(x)))) 44 | 45 | pred_masks = self.out_conv(pred_masks) 46 | 47 | return pred_masks 48 | 49 | -------------------------------------------------------------------------------- /datasets/concat_dataset.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR) 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # ------------------------------------------------------------------------ 5 | 6 | from pathlib import Path 7 | 8 | import torch 9 | import torch.utils.data 10 | 11 | from torch.utils.data import Dataset, ConcatDataset 12 | from .refexp2seq import build as build_seq_refexp 13 | from .ytvos import build as build_ytvs 14 | from .davis import build as build_davis 15 | from datasets import ytvos 16 | 17 | 18 | # join ref coco and ytvos 19 | def build(image_set, args): 20 | concat_data = [] 21 | 22 | print('preparing coco2seq dataset ....') 23 | coco_names = ["refcoco", "refcoco+", "refcocog"] 24 | for name in coco_names: 25 | coco_seq = build_seq_refexp(name, image_set, args) 26 | concat_data.append(coco_seq) 27 | 28 | print('preparing ytvos dataset .... ') 29 | ytvos_dataset = build_ytvs(image_set, args) 30 | concat_data.append(ytvos_dataset) 31 | 32 | concat_data = ConcatDataset(concat_data) 33 | 34 | return concat_data 35 | 36 | def build_coco(image_set, args): 37 | concat_data = [] 38 | 39 | print('preparing coco2seq dataset ....') 40 | coco_names = ["refcoco", "refcoco+", "refcocog"] 41 | for name in coco_names: 42 | coco_seq = build_seq_refexp(name, image_set, args) 43 | concat_data.append(coco_seq) 44 | 45 | concat_data = ConcatDataset(concat_data) 46 | return concat_data 47 | 48 | def build_joint_ytb_dvs(image_set, args): 49 | concat_data = [] 50 | 51 | print('preparing davis dataset ....') 52 | dvs_dataset = build_davis(image_set, args) 53 | for i in range(5): 54 | concat_data.append(dvs_dataset) 55 | 56 | print('preparing ytvos dataset .... ') 57 | ytvos_dataset = build_ytvs(image_set, args) 58 | concat_data.append(ytvos_dataset) 59 | 60 | concat_data = ConcatDataset(concat_data) 61 | 62 | return concat_data 63 | -------------------------------------------------------------------------------- /models/ops/src/ms_deform_attn.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | #pragma once 12 | 13 | #include "cpu/ms_deform_attn_cpu.h" 14 | 15 | #ifdef WITH_CUDA 16 | #include "cuda/ms_deform_attn_cuda.h" 17 | #endif 18 | 19 | 20 | at::Tensor 21 | ms_deform_attn_forward( 22 | const at::Tensor &value, 23 | const at::Tensor &spatial_shapes, 24 | const at::Tensor &level_start_index, 25 | const at::Tensor &sampling_loc, 26 | const at::Tensor &attn_weight, 27 | const int im2col_step) 28 | { 29 | if (value.type().is_cuda()) 30 | { 31 | #ifdef WITH_CUDA 32 | return ms_deform_attn_cuda_forward( 33 | value, spatial_shapes, level_start_index, sampling_loc, attn_weight, im2col_step); 34 | #else 35 | AT_ERROR("Not compiled with GPU support"); 36 | #endif 37 | } 38 | AT_ERROR("Not implemented on the CPU"); 39 | } 40 | 41 | std::vector 42 | ms_deform_attn_backward( 43 | const at::Tensor &value, 44 | const at::Tensor &spatial_shapes, 45 | const at::Tensor &level_start_index, 46 | const at::Tensor &sampling_loc, 47 | const at::Tensor &attn_weight, 48 | const at::Tensor &grad_output, 49 | const int im2col_step) 50 | { 51 | if (value.type().is_cuda()) 52 | { 53 | #ifdef WITH_CUDA 54 | return ms_deform_attn_cuda_backward( 55 | value, spatial_shapes, level_start_index, sampling_loc, attn_weight, grad_output, im2col_step); 56 | #else 57 | AT_ERROR("Not compiled with GPU support"); 58 | #endif 59 | } 60 | AT_ERROR("Not implemented on the CPU"); 61 | } 62 | 63 | -------------------------------------------------------------------------------- /pre_process/data.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import os 3 | from PIL import Image 4 | from torchvision import transforms 5 | import numpy as np 6 | 7 | 8 | def img_transform(img, annos): 9 | transform = transforms.Compose( 10 | [ 11 | transforms.ToTensor(), 12 | transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]), 13 | ] 14 | ) 15 | h, w, _ = img.shape 16 | centers = [] 17 | centers_norm = [] 18 | label_list = list(np.unique(annos)) 19 | label_list.remove(0) 20 | for label in label_list: 21 | anno = (annos == label).astype(np.uint8) * 255 22 | dist = cv2.distanceTransform(anno, cv2.DIST_L2, 5, cv2.DIST_LABEL_PIXEL) 23 | _, _, _, center = cv2.minMaxLoc(dist) 24 | center_norm = (center[0] / w, center[1] / h) 25 | centers.append(center) 26 | centers_norm.append(center_norm) 27 | img = Image.fromarray(img) 28 | img = transform(img) 29 | return img, centers, centers_norm 30 | 31 | 32 | def load_img_davis(img_path, anno_path): 33 | imgs = os.listdir(img_path) 34 | out_pairs = {} 35 | for img in imgs: 36 | out_pairs[img.split(".")[0]] = {} 37 | frame = cv2.imread(os.path.join(img_path, img)) 38 | frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) 39 | out_pairs[img.split(".")[0]]["frame"] = frame 40 | anno = cv2.imread( 41 | os.path.join( 42 | anno_path, 43 | img.split(".")[0] + ".png", 44 | ) 45 | ) 46 | out_pairs[img.split(".")[0]]["label"] = anno 47 | return out_pairs 48 | 49 | 50 | def load_video_a2d(video_path, anno_path): 51 | annos = os.listdir(anno_path) 52 | out_pairs = {} 53 | 54 | for anno in annos: 55 | out_pairs[str(int(anno.split(".")[0]))] = {} 56 | ann_img = 
cv2.imread(os.path.join(anno_path, anno)) 57 | out_pairs[str(int(anno.split(".")[0]))]["label"] = ann_img 58 | 59 | cap = cv2.VideoCapture(video_path) 60 | idx = 0 61 | while True: 62 | ret = cap.grab() 63 | if not ret: 64 | break 65 | if str(idx + 1) in out_pairs.keys(): 66 | ret, frame = cap.retrieve() 67 | frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) 68 | out_pairs[str(idx + 1)]["frame"] = frame 69 | idx += 1 70 | cap.release() 71 | return out_pairs 72 | -------------------------------------------------------------------------------- /datasets/__init__.py: -------------------------------------------------------------------------------- 1 | import torch.utils.data 2 | import torchvision 3 | 4 | from .ytvos import build as build_ytvos 5 | from .davis import build as build_davis 6 | from .a2d import build as build_a2d 7 | from .jhmdb import build as build_jhmdb 8 | from .refexp import build as build_refexp 9 | from .concat_dataset import build as build_joint 10 | from .concat_dataset import build_coco as build_joint_coco 11 | from .concat_dataset import build_joint_ytb_dvs 12 | 13 | def get_coco_api_from_dataset(dataset): 14 | for _ in range(10): 15 | # if isinstance(dataset, torchvision.datasets.CocoDetection): 16 | # break 17 | if isinstance(dataset, torch.utils.data.Subset): 18 | dataset = dataset.dataset 19 | if isinstance(dataset, torchvision.datasets.CocoDetection): 20 | return dataset.coco 21 | 22 | 23 | def build_dataset(dataset_file: str, image_set: str, args): 24 | if dataset_file == 'ytvos': 25 | print("\n **** Start to build dataset {}. **** \n".format("build_ytvos")) 26 | return build_ytvos(image_set, args) 27 | if dataset_file == 'davis': 28 | print("\n **** Start to build dataset {}. **** \n".format("build_davis")) 29 | return build_davis(image_set, args) 30 | if dataset_file == 'a2d': 31 | print("\n **** Start to build dataset {}. **** \n".format("build_a2d")) 32 | return build_a2d(image_set, args) 33 | if dataset_file == 'jhmdb': 34 | print("\n **** Start to build dataset {}. **** \n".format("build_jhmdb")) 35 | return build_jhmdb(image_set, args) 36 | # for pretraining 37 | if dataset_file == "refcoco" or dataset_file == "refcoco+" or dataset_file == "refcocog": 38 | print("\n **** Start to build dataset {}. **** \n".format("build_refexp")) 39 | return build_refexp(dataset_file, image_set, args) 40 | 41 | # for joint training of refcoco and ytvos, not used. 42 | if dataset_file == 'joint': 43 | print("\n **** Start to build dataset {}. **** \n".format("build_joint")) 44 | return build_joint(image_set, args) 45 | if dataset_file == 'joint_coco': 46 | print("\n **** Start to build dataset {}. **** \n".format("build_joint_coco")) 47 | return build_joint_coco(image_set, args) 48 | if dataset_file == 'ytvos_joint_davis': 49 | print("\n **** Start to build dataset {}. 
**** \n".format("build_joint_ytb_dvs")) 50 | return build_joint_ytb_dvs(image_set, args) 51 | raise ValueError(f'dataset {dataset_file} not supported') 52 | -------------------------------------------------------------------------------- /models/modules.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from typing import Optional, List 5 | from torch import Tensor 6 | from einops import rearrange 7 | 8 | 9 | class LFMResizeAdaptive(nn.Module): 10 | def __init__(self, num_channels, sigma): 11 | super(LFMResizeAdaptive, self).__init__() 12 | self.conv1 = nn.Conv2d(2 * num_channels, 2 * num_channels, kernel_size=1, stride=1, padding=0) 13 | self.conv2 = nn.Conv2d(2 * num_channels, 2 * num_channels, kernel_size=1, stride=1, padding=0) 14 | self.sigma = sigma 15 | 16 | self.laplace = nn.Conv2d(num_channels, num_channels, kernel_size=3, padding=0) 17 | self.pool = nn.AdaptiveAvgPool2d(1) 18 | self.fc = nn.Sequential( 19 | nn.Linear(num_channels, num_channels, bias=False), 20 | nn.ReLU(inplace=True), 21 | nn.Linear(num_channels, 1, bias=False), 22 | nn.Sigmoid() 23 | ) 24 | 25 | def make_gaussian(self, y_idx, x_idx, height, width, sigma=7, device='cpu'): 26 | yv, xv = torch.meshgrid([torch.arange(0, height), torch.arange(0, width)]) 27 | 28 | yv = yv.unsqueeze(0).float().to(device) 29 | xv = xv.unsqueeze(0).float().to(device) 30 | g = torch.exp(- ((yv - y_idx) ** 2 + (xv - x_idx) ** 2) / (2 * sigma ** 2)) 31 | return g.unsqueeze(0) 32 | 33 | def forward(self, x, gauss_map=None): 34 | b, c, h, w = x.shape 35 | x = x.float() 36 | 37 | # compute coef for gaussian 0~1 38 | coef = self.laplace(x) 39 | coef = self.fc(self.pool(coef).view(b, c)).view(b, 1, 1, 1) 40 | 41 | y = torch.fft.fft2(x) 42 | 43 | h_idx, w_idx = h // 2, w // 2 44 | if gauss_map is None: 45 | high_filter = self.make_gaussian(h_idx, w_idx, h, w, self.sigma, device=x.device) 46 | else: 47 | high_filter = F.interpolate(gauss_map, size=(h, w), mode='bilinear', align_corners=False) 48 | 49 | y = y * (1 - coef * high_filter) 50 | 51 | y_imag = y.imag 52 | y_real = y.real 53 | y_f = torch.cat([y_real, y_imag], dim=1) 54 | y = F.relu(self.conv1(y_f)) 55 | 56 | y = self.conv2(y).float() 57 | y_real, y_imag = torch.chunk(y, 2, dim=1) 58 | y = torch.complex(y_real, y_imag) 59 | 60 | y = torch.fft.ifft2(y, s=(h, w)).float() 61 | return x + y, high_filter 62 | 63 | 64 | if __name__ == "__main__": 65 | model = LFMResizeAdaptive(256, 3) 66 | data = torch.rand(2,256,8,8) 67 | res = model(data) 68 | -------------------------------------------------------------------------------- /models/ops/setup.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | import os 10 | import glob 11 | 12 | import torch 13 | 14 | from torch.utils.cpp_extension import CUDA_HOME 15 | from torch.utils.cpp_extension import CppExtension 16 | from torch.utils.cpp_extension import CUDAExtension 17 | 18 | from setuptools import find_packages 19 | from setuptools import setup 20 | 21 | requirements = ["torch", "torchvision"] 22 | 23 | def get_extensions(): 24 | this_dir = os.path.dirname(os.path.abspath(__file__)) 25 | extensions_dir = os.path.join(this_dir, "src") 26 | 27 | main_file = glob.glob(os.path.join(extensions_dir, "*.cpp")) 28 | source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp")) 29 | source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu")) 30 | 31 | sources = main_file + source_cpu 32 | extension = CppExtension 33 | extra_compile_args = {"cxx": []} 34 | define_macros = [] 35 | 36 | if torch.cuda.is_available() and CUDA_HOME is not None: 37 | extension = CUDAExtension 38 | sources += source_cuda 39 | define_macros += [("WITH_CUDA", None)] 40 | extra_compile_args["nvcc"] = [ 41 | "-DCUDA_HAS_FP16=1", 42 | "-D__CUDA_NO_HALF_OPERATORS__", 43 | "-D__CUDA_NO_HALF_CONVERSIONS__", 44 | "-D__CUDA_NO_HALF2_OPERATORS__", 45 | ] 46 | else: 47 | raise NotImplementedError('Cuda is not availabel') 48 | 49 | sources = [os.path.join(extensions_dir, s) for s in sources] 50 | include_dirs = [extensions_dir] 51 | ext_modules = [ 52 | extension( 53 | "MultiScaleDeformableAttention", 54 | sources, 55 | include_dirs=include_dirs, 56 | define_macros=define_macros, 57 | extra_compile_args=extra_compile_args, 58 | ) 59 | ] 60 | return ext_modules 61 | 62 | setup( 63 | name="MultiScaleDeformableAttention", 64 | version="1.0", 65 | author="Weijie Su", 66 | url="https://github.com/fundamentalvision/Deformable-DETR", 67 | description="PyTorch Wrapper for CUDA Functions of Multi-Scale Deformable Attention", 68 | packages=find_packages(exclude=("configs", "tests",)), 69 | ext_modules=get_extensions(), 70 | cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension}, 71 | ) 72 | -------------------------------------------------------------------------------- /models/ops/functions/ms_deform_attn_func.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | from __future__ import absolute_import 10 | from __future__ import print_function 11 | from __future__ import division 12 | 13 | import torch 14 | import torch.nn.functional as F 15 | from torch.autograd import Function 16 | from torch.autograd.function import once_differentiable 17 | 18 | import MultiScaleDeformableAttention as MSDA 19 | 20 | 21 | class MSDeformAttnFunction(Function): 22 | @staticmethod 23 | def forward(ctx, value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, im2col_step): 24 | ctx.im2col_step = im2col_step 25 | output = MSDA.ms_deform_attn_forward( 26 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, ctx.im2col_step) 27 | ctx.save_for_backward(value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights) 28 | return output 29 | 30 | @staticmethod 31 | @once_differentiable 32 | def backward(ctx, grad_output): 33 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights = ctx.saved_tensors 34 | grad_value, grad_sampling_loc, grad_attn_weight = \ 35 | MSDA.ms_deform_attn_backward( 36 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, grad_output, ctx.im2col_step) 37 | 38 | return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None 39 | 40 | 41 | def ms_deform_attn_core_pytorch(value, value_spatial_shapes, sampling_locations, attention_weights): 42 | # for debug and test only, 43 | # need to use cuda version instead 44 | N_, S_, M_, D_ = value.shape 45 | _, Lq_, M_, L_, P_, _ = sampling_locations.shape 46 | value_list = value.split([H_ * W_ for H_, W_ in value_spatial_shapes], dim=1) 47 | sampling_grids = 2 * sampling_locations - 1 48 | sampling_value_list = [] 49 | for lid_, (H_, W_) in enumerate(value_spatial_shapes): 50 | # N_, H_*W_, M_, D_ -> N_, H_*W_, M_*D_ -> N_, M_*D_, H_*W_ -> N_*M_, D_, H_, W_ 51 | value_l_ = value_list[lid_].flatten(2).transpose(1, 2).reshape(N_*M_, D_, H_, W_) 52 | # N_, Lq_, M_, P_, 2 -> N_, M_, Lq_, P_, 2 -> N_*M_, Lq_, P_, 2 53 | sampling_grid_l_ = sampling_grids[:, :, :, lid_].transpose(1, 2).flatten(0, 1) 54 | # N_*M_, D_, Lq_, P_ 55 | sampling_value_l_ = F.grid_sample(value_l_, sampling_grid_l_, 56 | mode='bilinear', padding_mode='zeros', align_corners=False) 57 | sampling_value_list.append(sampling_value_l_) 58 | # (N_, Lq_, M_, L_, P_) -> (N_, M_, Lq_, L_, P_) -> (N_, M_, 1, Lq_, L_*P_) 59 | attention_weights = attention_weights.transpose(1, 2).reshape(N_*M_, 1, Lq_, L_*P_) 60 | output = (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights).sum(-1).view(N_, M_*D_, Lq_) 61 | return output.transpose(1, 2).contiguous() 62 | -------------------------------------------------------------------------------- /models/ops/build/lib.linux-x86_64-cpython-38/functions/ms_deform_attn_func.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. 
All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | from __future__ import absolute_import 10 | from __future__ import print_function 11 | from __future__ import division 12 | 13 | import torch 14 | import torch.nn.functional as F 15 | from torch.autograd import Function 16 | from torch.autograd.function import once_differentiable 17 | 18 | import MultiScaleDeformableAttention as MSDA 19 | 20 | 21 | class MSDeformAttnFunction(Function): 22 | @staticmethod 23 | def forward(ctx, value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, im2col_step): 24 | ctx.im2col_step = im2col_step 25 | output = MSDA.ms_deform_attn_forward( 26 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, ctx.im2col_step) 27 | ctx.save_for_backward(value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights) 28 | return output 29 | 30 | @staticmethod 31 | @once_differentiable 32 | def backward(ctx, grad_output): 33 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights = ctx.saved_tensors 34 | grad_value, grad_sampling_loc, grad_attn_weight = \ 35 | MSDA.ms_deform_attn_backward( 36 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, grad_output, ctx.im2col_step) 37 | 38 | return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None 39 | 40 | 41 | def ms_deform_attn_core_pytorch(value, value_spatial_shapes, sampling_locations, attention_weights): 42 | # for debug and test only, 43 | # need to use cuda version instead 44 | N_, S_, M_, D_ = value.shape 45 | _, Lq_, M_, L_, P_, _ = sampling_locations.shape 46 | value_list = value.split([H_ * W_ for H_, W_ in value_spatial_shapes], dim=1) 47 | sampling_grids = 2 * sampling_locations - 1 48 | sampling_value_list = [] 49 | for lid_, (H_, W_) in enumerate(value_spatial_shapes): 50 | # N_, H_*W_, M_, D_ -> N_, H_*W_, M_*D_ -> N_, M_*D_, H_*W_ -> N_*M_, D_, H_, W_ 51 | value_l_ = value_list[lid_].flatten(2).transpose(1, 2).reshape(N_*M_, D_, H_, W_) 52 | # N_, Lq_, M_, P_, 2 -> N_, M_, Lq_, P_, 2 -> N_*M_, Lq_, P_, 2 53 | sampling_grid_l_ = sampling_grids[:, :, :, lid_].transpose(1, 2).flatten(0, 1) 54 | # N_*M_, D_, Lq_, P_ 55 | sampling_value_l_ = F.grid_sample(value_l_, sampling_grid_l_, 56 | mode='bilinear', padding_mode='zeros', align_corners=False) 57 | sampling_value_list.append(sampling_value_l_) 58 | # (N_, Lq_, M_, L_, P_) -> (N_, M_, Lq_, L_, P_) -> (N_, M_, 1, Lq_, L_*P_) 59 | attention_weights = attention_weights.transpose(1, 2).reshape(N_*M_, 1, Lq_, L_*P_) 60 | output = (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights).sum(-1).view(N_, M_*D_, Lq_) 61 | return output.transpose(1, 2).contiguous() 62 | -------------------------------------------------------------------------------- /datasets/refexp_eval.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Aishwarya Kamath & Nicolas Carion. Licensed under the Apache License 2.0. 
All Rights Reserved 2 | import copy 3 | from collections import defaultdict 4 | from pathlib import Path 5 | 6 | import torch 7 | import torch.utils.data 8 | 9 | import util.misc as utils 10 | from util.box_ops import generalized_box_iou 11 | 12 | 13 | class RefExpEvaluator(object): 14 | def __init__(self, refexp_gt, iou_types, k=(1, 5, 10), thresh_iou=0.5): 15 | assert isinstance(k, (list, tuple)) 16 | refexp_gt = copy.deepcopy(refexp_gt) 17 | self.refexp_gt = refexp_gt 18 | self.iou_types = iou_types 19 | self.img_ids = self.refexp_gt.imgs.keys() 20 | self.predictions = {} 21 | self.k = k 22 | self.thresh_iou = thresh_iou 23 | 24 | def accumulate(self): 25 | pass 26 | 27 | def update(self, predictions): 28 | self.predictions.update(predictions) 29 | 30 | def synchronize_between_processes(self): 31 | all_predictions = utils.all_gather(self.predictions) 32 | merged_predictions = {} 33 | for p in all_predictions: 34 | merged_predictions.update(p) 35 | self.predictions = merged_predictions 36 | 37 | def summarize(self): 38 | if utils.is_main_process(): 39 | dataset2score = { 40 | "refcoco": {k: 0.0 for k in self.k}, 41 | "refcoco+": {k: 0.0 for k in self.k}, 42 | "refcocog": {k: 0.0 for k in self.k}, 43 | } 44 | dataset2count = {"refcoco": 0.0, "refcoco+": 0.0, "refcocog": 0.0} 45 | for image_id in self.img_ids: 46 | ann_ids = self.refexp_gt.getAnnIds(imgIds=image_id) 47 | assert len(ann_ids) == 1 48 | img_info = self.refexp_gt.loadImgs(image_id)[0] 49 | 50 | target = self.refexp_gt.loadAnns(ann_ids[0]) 51 | prediction = self.predictions[image_id] 52 | assert prediction is not None 53 | sorted_scores_boxes = sorted( 54 | zip(prediction["scores"].tolist(), prediction["boxes"].tolist()), reverse=True 55 | ) 56 | sorted_scores, sorted_boxes = zip(*sorted_scores_boxes) 57 | sorted_boxes = torch.cat([torch.as_tensor(x).view(1, 4) for x in sorted_boxes]) 58 | target_bbox = target[0]["bbox"] 59 | converted_bbox = [ 60 | target_bbox[0], 61 | target_bbox[1], 62 | target_bbox[2] + target_bbox[0], 63 | target_bbox[3] + target_bbox[1], 64 | ] 65 | giou = generalized_box_iou(sorted_boxes, torch.as_tensor(converted_bbox).view(-1, 4)) 66 | for k in self.k: 67 | if max(giou[:k]) >= self.thresh_iou: 68 | dataset2score[img_info["dataset_name"]][k] += 1.0 69 | dataset2count[img_info["dataset_name"]] += 1.0 70 | 71 | for key, value in dataset2score.items(): 72 | for k in self.k: 73 | try: 74 | value[k] /= dataset2count[key] 75 | except: 76 | pass 77 | results = {} 78 | for key, value in dataset2score.items(): 79 | results[key] = sorted([v for k, v in value.items()]) 80 | print(f" Dataset: {key} - Precision @ 1, 5, 10: {results[key]} \n") 81 | 82 | return results 83 | return None 84 | 85 | 86 | -------------------------------------------------------------------------------- /eval_davis.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import os 3 | import sys 4 | from time import time 5 | import argparse 6 | 7 | import numpy as np 8 | import pandas as pd 9 | from davis2017.evaluation import DAVISEvaluation 10 | 11 | default_davis_path = '../datasets/refer_davis/valid' 12 | 13 | time_start = time() 14 | parser = argparse.ArgumentParser() 15 | parser.add_argument('--davis_path', type=str, help='Path to the DAVIS folder containing the JPEGImages, Annotations, ' 16 | 'ImageSets, Annotations_unsupervised folders', 17 | required=False, default=default_davis_path) 18 | parser.add_argument('--set', type=str, help='Subset to evaluate the results', 
default='val') # val subset 19 | parser.add_argument('--task', type=str, help='Task to evaluate the results', default='unsupervised', 20 | choices=['semi-supervised', 'unsupervised']) 21 | parser.add_argument('--results_path', type=str, help='Path to the folder containing the sequences folders', 22 | required=True) 23 | args, _ = parser.parse_known_args() 24 | csv_name_global = f'global_results-{args.set}.csv' 25 | csv_name_per_sequence = f'per-sequence_results-{args.set}.csv' 26 | 27 | # Check if the method has been evaluated before, if so read the results, otherwise compute the results 28 | csv_name_global_path = os.path.join(args.results_path, csv_name_global) 29 | csv_name_per_sequence_path = os.path.join(args.results_path, csv_name_per_sequence) 30 | if os.path.exists(csv_name_global_path) and os.path.exists(csv_name_per_sequence_path): 31 | print('Using precomputed results...') 32 | table_g = pd.read_csv(csv_name_global_path) 33 | table_seq = pd.read_csv(csv_name_per_sequence_path) 34 | else: 35 | print(f'Evaluating sequences for the {args.task} task...') 36 | # Create dataset and evaluate 37 | dataset_eval = DAVISEvaluation(davis_root=args.davis_path, task=args.task, gt_set=args.set) 38 | metrics_res = dataset_eval.evaluate(args.results_path) 39 | J, F = metrics_res['J'], metrics_res['F'] 40 | 41 | # Generate dataframe for the general results 42 | g_measures = ['J&F-Mean', 'J-Mean', 'J-Recall', 'J-Decay', 'F-Mean', 'F-Recall', 'F-Decay'] 43 | final_mean = (np.mean(J["M"]) + np.mean(F["M"])) / 2. 44 | g_res = np.array([final_mean, np.mean(J["M"]), np.mean(J["R"]), np.mean(J["D"]), np.mean(F["M"]), np.mean(F["R"]), 45 | np.mean(F["D"])]) 46 | g_res = np.reshape(g_res, [1, len(g_res)]) 47 | table_g = pd.DataFrame(data=g_res, columns=g_measures) 48 | with open(csv_name_global_path, 'w') as f: 49 | table_g.to_csv(f, index=False, float_format="%.5f") 50 | print(f'Global results saved in {csv_name_global_path}') 51 | 52 | # Generate a dataframe for the per sequence results 53 | seq_names = list(J['M_per_object'].keys()) 54 | seq_measures = ['Sequence', 'J-Mean', 'F-Mean'] 55 | J_per_object = [J['M_per_object'][x] for x in seq_names] 56 | F_per_object = [F['M_per_object'][x] for x in seq_names] 57 | table_seq = pd.DataFrame(data=list(zip(seq_names, J_per_object, F_per_object)), columns=seq_measures) 58 | with open(csv_name_per_sequence_path, 'w') as f: 59 | table_seq.to_csv(f, index=False, float_format="%.5f") 60 | print(f'Per-sequence results saved in {csv_name_per_sequence_path}') 61 | 62 | # Print the results 63 | sys.stdout.write(f"--------------------------- Global results for {args.set} ---------------------------\n") 64 | print(table_g.to_string(index=False)) 65 | sys.stdout.write(f"\n---------- Per sequence results for {args.set} ----------\n") 66 | print(table_seq.to_string(index=False)) 67 | total_time = time() - time_start 68 | sys.stdout.write('\nTotal time:' + str(total_time)) 69 | -------------------------------------------------------------------------------- /models/text_encoder/text_encoder.py: -------------------------------------------------------------------------------- 1 | """ 2 | This file contains a wrapper for Video-Swin-Transformer so it can be properly used as a temporal encoder for MTTR. 
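The module defined below is the RoBERTa-based TextEncoder used by SgMg: it tokenizes the input referring expressions and returns token-level features, a pooled sentence-level feature, and the corresponding padding mask.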
3 | """ 4 | import torch 5 | import os 6 | from torch import nn, Tensor 7 | from einops import rearrange, repeat 8 | 9 | from transformers import RobertaModel, RobertaTokenizerFast 10 | from models.text_encoder.tokenizer import RobertaTokenizer 11 | 12 | import warnings 13 | warnings.filterwarnings("ignore") 14 | 15 | 16 | class FeatureResizer(nn.Module): 17 | def __init__(self, input_feat_size, output_feat_size, dropout, do_ln=True): 18 | super().__init__() 19 | self.do_ln = do_ln 20 | self.fc = nn.Linear(input_feat_size, output_feat_size, bias=True) 21 | self.layer_norm = nn.LayerNorm(output_feat_size, eps=1e-12) 22 | self.dropout = nn.Dropout(dropout) 23 | 24 | def forward(self, encoder_features): 25 | x = self.fc(encoder_features) 26 | if self.do_ln: 27 | x = self.layer_norm(x) 28 | output = self.dropout(x) 29 | return output 30 | 31 | 32 | class TextEncoder(nn.Module): 33 | def __init__(self, args): 34 | super(TextEncoder, self).__init__() 35 | self.args = args 36 | self.hidden_dim = args.hidden_dim 37 | self.text_backbone_name = args.text_backbone 38 | self.token_size = 32 39 | if self.text_backbone_name == "Roberta": 40 | # self.text_backbone = RobertaModel.from_pretrained("roberta-base") 41 | # self.text_backbone.pooler = None # this pooler is never used, this is a hack to avoid DDP problems... 42 | self.tokenizer = RobertaTokenizer() 43 | self.text_backbone = RobertaModel.from_pretrained("checkpoints/roberta-base") 44 | self.feat_dim = 768 45 | else: 46 | assert False, f'error: Text Encoder "{self.text_backbone_name}" is not supported' 47 | 48 | self.freeze_text_encoder = args.freeze_text_encoder 49 | if self.freeze_text_encoder: 50 | # self.text_backbone.eval() 51 | for p in self.text_backbone.parameters(): 52 | p.requires_grad_(False) 53 | for p in self.tokenizer.parameters(): 54 | p.requires_grad_(False) 55 | print("Use {} as text encoder. 
Freeze: {}".format(self.text_backbone_name, self.freeze_text_encoder)) 56 | 57 | self.target_len = None 58 | 59 | def forward(self, texts, device): 60 | if self.freeze_text_encoder: 61 | with torch.no_grad(): 62 | tokenized_queries = self.tokenizer(texts).to(device) 63 | if self.text_backbone_name == "Roberta": 64 | encoded_text = self.text_backbone(**tokenized_queries) 65 | text_pad_mask = tokenized_queries.attention_mask.ne(1).bool() 66 | text_features = encoded_text.last_hidden_state 67 | text_sentence_features = encoded_text.pooler_output 68 | else: 69 | raise NotImplementedError 70 | else: 71 | tokenized_queries = self.tokenizer(texts).to(device) 72 | if self.text_backbone_name == "Roberta": 73 | encoded_text = self.text_backbone(**tokenized_queries) 74 | text_pad_mask = tokenized_queries.attention_mask.ne(1).bool() 75 | text_features = encoded_text.last_hidden_state 76 | text_sentence_features = encoded_text.pooler_output 77 | else: 78 | raise NotImplementedError 79 | 80 | return text_features, text_sentence_features, text_pad_mask 81 | 82 | def num_parameters(self): 83 | return sum(p.numel() for p in self.parameters() if p.requires_grad) 84 | 85 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | 4 | 5 | def pre_trained_model_to_finetune(checkpoint, args): 6 | checkpoint = checkpoint['model'] 7 | # only delete the class_embed since the finetuned dataset has different num_classes 8 | num_layers = args.dec_layers + 1 if args.two_stage else args.dec_layers 9 | for l in range(num_layers): 10 | del checkpoint["class_embed.{}.weight".format(l)] 11 | del checkpoint["class_embed.{}.bias".format(l)] 12 | 13 | return checkpoint 14 | 15 | 16 | 17 | def colormap(rgb=False): 18 | color_list = np.array( 19 | [ 20 | 0.000, 0.447, 0.741, 21 | 0.850, 0.325, 0.098, 22 | 0.929, 0.694, 0.125, 23 | 0.494, 0.184, 0.556, 24 | 0.466, 0.674, 0.188, 25 | 0.301, 0.745, 0.933, 26 | 0.635, 0.078, 0.184, 27 | 0.300, 0.300, 0.300, 28 | 0.600, 0.600, 0.600, 29 | 1.000, 0.000, 0.000, 30 | 1.000, 0.500, 0.000, 31 | 0.749, 0.749, 0.000, 32 | 0.000, 1.000, 0.000, 33 | 0.000, 0.000, 1.000, 34 | 0.667, 0.000, 1.000, 35 | 0.333, 0.333, 0.000, 36 | 0.333, 0.667, 0.000, 37 | 0.333, 1.000, 0.000, 38 | 0.667, 0.333, 0.000, 39 | 0.667, 0.667, 0.000, 40 | 0.667, 1.000, 0.000, 41 | 1.000, 0.333, 0.000, 42 | 1.000, 0.667, 0.000, 43 | 1.000, 1.000, 0.000, 44 | 0.000, 0.333, 0.500, 45 | 0.000, 0.667, 0.500, 46 | 0.000, 1.000, 0.500, 47 | 0.333, 0.000, 0.500, 48 | 0.333, 0.333, 0.500, 49 | 0.333, 0.667, 0.500, 50 | 0.333, 1.000, 0.500, 51 | 0.667, 0.000, 0.500, 52 | 0.667, 0.333, 0.500, 53 | 0.667, 0.667, 0.500, 54 | 0.667, 1.000, 0.500, 55 | 1.000, 0.000, 0.500, 56 | 1.000, 0.333, 0.500, 57 | 1.000, 0.667, 0.500, 58 | 1.000, 1.000, 0.500, 59 | 0.000, 0.333, 1.000, 60 | 0.000, 0.667, 1.000, 61 | 0.000, 1.000, 1.000, 62 | 0.333, 0.000, 1.000, 63 | 0.333, 0.333, 1.000, 64 | 0.333, 0.667, 1.000, 65 | 0.333, 1.000, 1.000, 66 | 0.667, 0.000, 1.000, 67 | 0.667, 0.333, 1.000, 68 | 0.667, 0.667, 1.000, 69 | 0.667, 1.000, 1.000, 70 | 1.000, 0.000, 1.000, 71 | 1.000, 0.333, 1.000, 72 | 1.000, 0.667, 1.000, 73 | 0.167, 0.000, 0.000, 74 | 0.333, 0.000, 0.000, 75 | 0.500, 0.000, 0.000, 76 | 0.667, 0.000, 0.000, 77 | 0.833, 0.000, 0.000, 78 | 1.000, 0.000, 0.000, 79 | 0.000, 0.167, 0.000, 80 | 0.000, 0.333, 0.000, 81 | 0.000, 0.500, 0.000, 82 | 0.000, 0.667, 0.000, 83 | 0.000, 
0.833, 0.000, 84 | 0.000, 1.000, 0.000, 85 | 0.000, 0.000, 0.167, 86 | 0.000, 0.000, 0.333, 87 | 0.000, 0.000, 0.500, 88 | 0.000, 0.000, 0.667, 89 | 0.000, 0.000, 0.833, 90 | 0.000, 0.000, 1.000, 91 | 0.000, 0.000, 0.000, 92 | 0.143, 0.143, 0.143, 93 | 0.286, 0.286, 0.286, 94 | 0.429, 0.429, 0.429, 95 | 0.571, 0.571, 0.571, 96 | 0.714, 0.714, 0.714, 97 | 0.857, 0.857, 0.857, 98 | 1.000, 1.000, 1.000 99 | ] 100 | ).astype(np.float32) 101 | color_list = color_list.reshape((-1, 3)) * 255 102 | if not rgb: 103 | color_list = color_list[:, ::-1] 104 | return color_list -------------------------------------------------------------------------------- /datasets/categories.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------------------------- 2 | # 1. refer_youtube_vos 3 | ytvos_category_dict = { 4 | 'airplane': 0, 'ape': 1, 'bear': 2, 'bike': 3, 'bird': 4, 'boat': 5, 'bucket': 6, 'bus': 7, 'camel': 8, 'cat': 9, 5 | 'cow': 10, 'crocodile': 11, 'deer': 12, 'dog': 13, 'dolphin': 14, 'duck': 15, 'eagle': 16, 'earless_seal': 17, 6 | 'elephant': 18, 'fish': 19, 'fox': 20, 'frisbee': 21, 'frog': 22, 'giant_panda': 23, 'giraffe': 24, 'hand': 25, 7 | 'hat': 26, 'hedgehog': 27, 'horse': 28, 'knife': 29, 'leopard': 30, 'lion': 31, 'lizard': 32, 'monkey': 33, 8 | 'motorbike': 34, 'mouse': 35, 'others': 36, 'owl': 37, 'paddle': 38, 'parachute': 39, 'parrot': 40, 'penguin': 41, 9 | 'person': 42, 'plant': 43, 'rabbit': 44, 'raccoon': 45, 'sedan': 46, 'shark': 47, 'sheep': 48, 'sign': 49, 10 | 'skateboard': 50, 'snail': 51, 'snake': 52, 'snowboard': 53, 'squirrel': 54, 'surfboard': 55, 'tennis_racket': 56, 11 | 'tiger': 57, 'toilet': 58, 'train': 59, 'truck': 60, 'turtle': 61, 'umbrella': 62, 'whale': 63, 'zebra': 64 12 | } 13 | 14 | ytvos_category_list = [ 15 | 'airplane', 'ape', 'bear', 'bike', 'bird', 'boat', 'bucket', 'bus', 'camel', 'cat', 'cow', 'crocodile', 16 | 'deer', 'dog', 'dolphin', 'duck', 'eagle', 'earless_seal', 'elephant', 'fish', 'fox', 'frisbee', 'frog', 17 | 'giant_panda', 'giraffe', 'hand', 'hat', 'hedgehog', 'horse', 'knife', 'leopard', 'lion', 'lizard', 18 | 'monkey', 'motorbike', 'mouse', 'others', 'owl', 'paddle', 'parachute', 'parrot', 'penguin', 'person', 19 | 'plant', 'rabbit', 'raccoon', 'sedan', 'shark', 'sheep', 'sign', 'skateboard', 'snail', 'snake', 'snowboard', 20 | 'squirrel', 'surfboard', 'tennis_racket', 'tiger', 'toilet', 'train', 'truck', 'turtle', 'umbrella', 'whale', 'zebra' 21 | ] 22 | 23 | # ------------------------------------------------------------------------------------------------------------------- 24 | # 2. 
refer_davis17 25 | davis_category_dict = { 26 | 'airplane': 0, 'backpack': 1, 'ball': 2, 'bear': 3, 'bicycle': 4, 'bird': 5, 'boat': 6, 'bottle': 7, 'box': 8, 'bus': 9, 27 | 'camel': 10, 'car': 11, 'carriage': 12, 'cat': 13, 'cellphone': 14, 'chamaleon': 15, 'cow': 16, 'deer': 17, 'dog': 18, 28 | 'dolphin': 19, 'drone': 20, 'elephant': 21, 'excavator': 22, 'fish': 23, 'goat': 24, 'golf cart': 25, 'golf club': 26, 29 | 'grass': 27, 'guitar': 28, 'gun': 29, 'helicopter': 30, 'horse': 31, 'hoverboard': 32, 'kart': 33, 'key': 34, 'kite': 35, 30 | 'koala': 36, 'leash': 37, 'lion': 38, 'lock': 39, 'mask': 40, 'microphone': 41, 'monkey': 42, 'motorcycle': 43, 'oar': 44, 31 | 'paper': 45, 'paraglide': 46, 'person': 47, 'pig': 48, 'pole': 49, 'potted plant': 50, 'puck': 51, 'rack': 52, 'rhino': 53, 32 | 'rope': 54, 'sail': 55, 'scale': 56, 'scooter': 57, 'selfie stick': 58, 'sheep': 59, 'skateboard': 60, 'ski': 61, 'ski poles': 62, 33 | 'snake': 63, 'snowboard': 64, 'stick': 65, 'stroller': 66, 'surfboard': 67, 'swing': 68, 'tennis racket': 69, 'tractor': 70, 34 | 'trailer': 71, 'train': 72, 'truck': 73, 'turtle': 74, 'varanus': 75, 'violin': 76, 'wheelchair': 77 35 | } 36 | 37 | davis_category_list = [ 38 | 'airplane', 'backpack', 'ball', 'bear', 'bicycle', 'bird', 'boat', 'bottle', 'box', 'bus', 'camel', 'car', 'carriage', 39 | 'cat', 'cellphone', 'chamaleon', 'cow', 'deer', 'dog', 'dolphin', 'drone', 'elephant', 'excavator', 'fish', 'goat', 40 | 'golf cart', 'golf club', 'grass', 'guitar', 'gun', 'helicopter', 'horse', 'hoverboard', 'kart', 'key', 'kite', 'koala', 41 | 'leash', 'lion', 'lock', 'mask', 'microphone', 'monkey', 'motorcycle', 'oar', 'paper', 'paraglide', 'person', 'pig', 42 | 'pole', 'potted plant', 'puck', 'rack', 'rhino', 'rope', 'sail', 'scale', 'scooter', 'selfie stick', 'sheep', 'skateboard', 43 | 'ski', 'ski poles', 'snake', 'snowboard', 'stick', 'stroller', 'surfboard', 'swing', 'tennis racket', 'tractor', 'trailer', 44 | 'train', 'truck', 'turtle', 'varanus', 'violin', 'wheelchair' 45 | ] -------------------------------------------------------------------------------- /models/ops/test.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | from __future__ import absolute_import 10 | from __future__ import print_function 11 | from __future__ import division 12 | 13 | import time 14 | import torch 15 | import torch.nn as nn 16 | from torch.autograd import gradcheck 17 | 18 | from functions.ms_deform_attn_func import MSDeformAttnFunction, ms_deform_attn_core_pytorch 19 | 20 | 21 | N, M, D = 1, 2, 2 22 | Lq, L, P = 2, 2, 2 23 | shapes = torch.as_tensor([(6, 4), (3, 2)], dtype=torch.long).cuda() 24 | level_start_index = torch.cat((shapes.new_zeros((1, )), shapes.prod(1).cumsum(0)[:-1])) 25 | S = sum([(H*W).item() for H, W in shapes]) 26 | 27 | 28 | torch.manual_seed(3) 29 | 30 | 31 | @torch.no_grad() 32 | def check_forward_equal_with_pytorch_double(): 33 | value = torch.rand(N, S, M, D).cuda() * 0.01 34 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() 35 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 36 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) 37 | im2col_step = 2 38 | output_pytorch = ms_deform_attn_core_pytorch(value.double(), shapes, sampling_locations.double(), attention_weights.double()).detach().cpu() 39 | output_cuda = MSDeformAttnFunction.apply(value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step).detach().cpu() 40 | fwdok = torch.allclose(output_cuda, output_pytorch) 41 | max_abs_err = (output_cuda - output_pytorch).abs().max() 42 | max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max() 43 | 44 | print(f'* {fwdok} check_forward_equal_with_pytorch_double: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}') 45 | 46 | 47 | @torch.no_grad() 48 | def check_forward_equal_with_pytorch_float(): 49 | value = torch.rand(N, S, M, D).cuda() * 0.01 50 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() 51 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 52 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) 53 | im2col_step = 2 54 | output_pytorch = ms_deform_attn_core_pytorch(value, shapes, sampling_locations, attention_weights).detach().cpu() 55 | output_cuda = MSDeformAttnFunction.apply(value, shapes, level_start_index, sampling_locations, attention_weights, im2col_step).detach().cpu() 56 | fwdok = torch.allclose(output_cuda, output_pytorch, rtol=1e-2, atol=1e-3) 57 | max_abs_err = (output_cuda - output_pytorch).abs().max() 58 | max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max() 59 | 60 | print(f'* {fwdok} check_forward_equal_with_pytorch_float: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}') 61 | 62 | 63 | def check_gradient_numerical(channels=4, grad_value=True, grad_sampling_loc=True, grad_attn_weight=True): 64 | 65 | value = torch.rand(N, S, M, channels).cuda() * 0.01 66 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() 67 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 68 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) 69 | im2col_step = 2 70 | func = MSDeformAttnFunction.apply 71 | 72 | value.requires_grad = 
grad_value 73 | sampling_locations.requires_grad = grad_sampling_loc 74 | attention_weights.requires_grad = grad_attn_weight 75 | 76 | gradok = gradcheck(func, (value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step)) 77 | 78 | print(f'* {gradok} check_gradient_numerical(D={channels})') 79 | 80 | 81 | if __name__ == '__main__': 82 | check_forward_equal_with_pytorch_double() 83 | check_forward_equal_with_pytorch_float() 84 | 85 | for channels in [30, 32, 64, 71, 1025, 2048, 3096]: 86 | check_gradient_numerical(channels, True, True, True) 87 | 88 | 89 | 90 | -------------------------------------------------------------------------------- /datasets/a2d_eval.py: -------------------------------------------------------------------------------- 1 | """ 2 | This file contains implementations for the precision@k and IoU (mean, overall) evaluation metrics. 3 | copy-paste from https://github.com/mttr2021/MTTR/blob/main/metrics.py 4 | """ 5 | import torch 6 | from tqdm import tqdm 7 | from pycocotools.coco import COCO 8 | from pycocotools.mask import decode 9 | import numpy as np 10 | 11 | from torchvision.ops.boxes import box_area 12 | 13 | def compute_bbox_iou(boxes1: torch.Tensor, boxes2: torch.Tensor): 14 | # both boxes: xyxy 15 | area1 = box_area(boxes1) 16 | area2 = box_area(boxes2) 17 | 18 | lt = torch.max(boxes1[:, None, :2], boxes2[:, :2]) # [N,M,2] 19 | rb = torch.min(boxes1[:, None, 2:], boxes2[:, 2:]) # [N,M,2] 20 | 21 | wh = (rb - lt).clamp(min=0) # [N,M,2] 22 | inter = wh[:, :, 0] * wh[:, :, 1] # [N,M] 23 | 24 | union = area1[:, None] + area2 - inter 25 | 26 | iou = (inter+1e-6) / (union+1e-6) 27 | return iou, inter, union 28 | 29 | def compute_mask_iou(outputs: torch.Tensor, labels: torch.Tensor, EPS=1e-6): 30 | outputs = outputs.int() 31 | intersection = (outputs & labels).float().sum((1, 2)) # Will be zero if Truth=0 or Prediction=0 32 | union = (outputs | labels).float().sum((1, 2)) # Will be zero if both are 0 33 | iou = (intersection + EPS) / (union + EPS) # EPS is used to avoid division by zero 34 | return iou, intersection, union 35 | 36 | # mask 37 | def calculate_precision_at_k_and_iou_metrics(coco_gt: COCO, coco_pred: COCO): 38 | print('evaluating mask precision@k & iou metrics...') 39 | counters_by_iou = {iou: 0 for iou in [0.5, 0.6, 0.7, 0.8, 0.9]} 40 | total_intersection_area = 0 41 | total_union_area = 0 42 | ious_list = [] 43 | for instance in tqdm(coco_gt.imgs.keys()): # each image_id contains exactly one instance 44 | gt_annot = coco_gt.imgToAnns[instance][0] 45 | gt_mask = decode(gt_annot['segmentation']) 46 | pred_annots = coco_pred.imgToAnns[instance] 47 | pred_annot = sorted(pred_annots, key=lambda a: a['score'])[-1] # choose pred with highest score 48 | pred_mask = decode(pred_annot['segmentation']) 49 | iou, intersection, union = compute_mask_iou(torch.tensor(pred_mask).unsqueeze(0), 50 | torch.tensor(gt_mask).unsqueeze(0)) 51 | iou, intersection, union = iou.item(), intersection.item(), union.item() 52 | for iou_threshold in counters_by_iou.keys(): 53 | if iou > iou_threshold: 54 | counters_by_iou[iou_threshold] += 1 55 | total_intersection_area += intersection 56 | total_union_area += union 57 | ious_list.append(iou) 58 | num_samples = len(ious_list) 59 | precision_at_k = np.array(list(counters_by_iou.values())) / num_samples 60 | overall_iou = total_intersection_area / total_union_area 61 | mean_iou = np.mean(ious_list) 62 | return precision_at_k, overall_iou, mean_iou 63 | 64 | # bbox 65 | def 
calculate_bbox_precision_at_k_and_iou_metrics(coco_gt: COCO, coco_pred: COCO): 66 | print('evaluating bbox precision@k & iou metrics...') 67 | counters_by_iou = {iou: 0 for iou in [0.5, 0.6, 0.7, 0.8, 0.9]} 68 | total_intersection_area = 0 69 | total_union_area = 0 70 | ious_list = [] 71 | for instance in tqdm(coco_gt.imgs.keys()): # each image_id contains exactly one instance 72 | gt_annot = coco_gt.imgToAnns[instance][0] 73 | gt_bbox = gt_annot['bbox'] # xywh 74 | gt_bbox = [ 75 | gt_bbox[0], 76 | gt_bbox[1], 77 | gt_bbox[2] + gt_bbox[0], 78 | gt_bbox[3] + gt_bbox[1], 79 | ] 80 | pred_annots = coco_pred.imgToAnns[instance] 81 | pred_annot = sorted(pred_annots, key=lambda a: a['score'])[-1] # choose pred with highest score 82 | pred_bbox = pred_annot['bbox'] # xyxy 83 | iou, intersection, union = compute_bbox_iou(torch.tensor(pred_bbox).unsqueeze(0), 84 | torch.tensor(gt_bbox).unsqueeze(0)) 85 | iou, intersection, union = iou.item(), intersection.item(), union.item() 86 | for iou_threshold in counters_by_iou.keys(): 87 | if iou > iou_threshold: 88 | counters_by_iou[iou_threshold] += 1 89 | total_intersection_area += intersection 90 | total_union_area += union 91 | ious_list.append(iou) 92 | num_samples = len(ious_list) 93 | precision_at_k = np.array(list(counters_by_iou.values())) / num_samples 94 | overall_iou = total_intersection_area / total_union_area 95 | mean_iou = np.mean(ious_list) 96 | return precision_at_k, overall_iou, mean_iou 97 | -------------------------------------------------------------------------------- /util/box_ops.py: -------------------------------------------------------------------------------- 1 | """ 2 | Utilities for bounding box manipulation and GIoU. 3 | """ 4 | import torch 5 | from torchvision.ops.boxes import box_area 6 | 7 | def clip_iou(boxes1,boxes2): 8 | area1 = box_area(boxes1) 9 | area2 = box_area(boxes2) 10 | lt = torch.max(boxes1[:, :2], boxes2[:, :2]) 11 | rb = torch.min(boxes1[:, 2:], boxes2[:, 2:]) 12 | wh = (rb - lt).clamp(min=0) 13 | inter = wh[:,0] * wh[:,1] 14 | union = area1 + area2 - inter 15 | iou = (inter + 1e-6) / (union+1e-6) 16 | return iou 17 | 18 | def multi_iou(boxes1, boxes2): 19 | lt = torch.max(boxes1[...,:2], boxes2[...,:2]) 20 | rb = torch.min(boxes1[...,2:], boxes2[...,2:]) 21 | wh = (rb - lt).clamp(min=0) 22 | wh_1 = boxes1[...,2:] - boxes1[...,:2] 23 | wh_2 = boxes2[...,2:] - boxes2[...,:2] 24 | inter = wh[...,0] * wh[...,1] 25 | union = wh_1[...,0] * wh_1[...,1] + wh_2[...,0] * wh_2[...,1] - inter 26 | iou = (inter + 1e-6) / (union + 1e-6) 27 | return iou 28 | 29 | def box_cxcywh_to_xyxy(x): 30 | x_c, y_c, w, h = x.unbind(-1) 31 | b = [(x_c - 0.5 * w), (y_c - 0.5 * h), 32 | (x_c + 0.5 * w), (y_c + 0.5 * h)] 33 | return torch.stack(b, dim=-1) 34 | 35 | 36 | def box_xyxy_to_cxcywh(x): 37 | x0, y0, x1, y1 = x.unbind(-1) 38 | b = [(x0 + x1) / 2, (y0 + y1) / 2, 39 | (x1 - x0), (y1 - y0)] 40 | return torch.stack(b, dim=-1) 41 | 42 | 43 | # modified from torchvision to also return the union 44 | def box_iou(boxes1, boxes2): 45 | area1 = box_area(boxes1) 46 | area2 = box_area(boxes2) 47 | 48 | lt = torch.max(boxes1[:, None, :2], boxes2[:, :2]) # [N,M,2] 49 | rb = torch.min(boxes1[:, None, 2:], boxes2[:, 2:]) # [N,M,2] 50 | 51 | wh = (rb - lt).clamp(min=0) # [N,M,2] 52 | inter = wh[:, :, 0] * wh[:, :, 1] # [N,M] 53 | 54 | union = area1[:, None] + area2 - inter 55 | 56 | iou = (inter+1e-6) / (union+1e-6) 57 | return iou, union 58 | 59 | 60 | def generalized_box_iou(boxes1, boxes2): 61 | """ 62 | Generalized IoU from 
https://giou.stanford.edu/ 63 | 64 | The boxes should be in [x0, y0, x1, y1] format 65 | 66 | Returns a [N, M] pairwise matrix, where N = len(boxes1) 67 | and M = len(boxes2) 68 | """ 69 | # degenerate boxes gives inf / nan results 70 | # so do an early check 71 | # if not (boxes1[:, 2:] >= boxes1[:, :2]).all(): 72 | # for i in range(boxes1.shape[0]): 73 | # if not (boxes1[i, 2:] >= boxes1[i, :2]).all(): 74 | # boxes1[i] = torch.zeros_like(boxes1[i]) 75 | assert (boxes1[:, 2:] >= boxes1[:, :2]).all(), "error boxes: {} vs {}.".format(boxes1, boxes2) 76 | assert (boxes2[:, 2:] >= boxes2[:, :2]).all(), "error boxes: {} vs {}.".format(boxes1, boxes2) 77 | iou, union = box_iou(boxes1, boxes2) 78 | 79 | lt = torch.min(boxes1[:, None, :2], boxes2[:, :2]) 80 | rb = torch.max(boxes1[:, None, 2:], boxes2[:, 2:]) 81 | 82 | wh = (rb - lt).clamp(min=0) # [N,M,2] 83 | area = wh[:, :, 0] * wh[:, :, 1] 84 | 85 | return iou - ((area - union) + 1e-6) / (area + 1e-6) 86 | 87 | 88 | def masks_to_boxes(masks): 89 | """Compute the bounding boxes around the provided masks 90 | 91 | The masks should be in format [N, H, W] where N is the number of masks, (H, W) are the spatial dimensions. 92 | 93 | Returns a [N, 4] tensors, with the boxes in xyxy format 94 | """ 95 | if masks.numel() == 0: 96 | return torch.zeros((0, 4), device=masks.device) 97 | 98 | h, w = masks.shape[-2:] 99 | 100 | y = torch.arange(0, h, dtype=torch.float) 101 | x = torch.arange(0, w, dtype=torch.float) 102 | y, x = torch.meshgrid(y, x) 103 | 104 | x_mask = (masks * x.unsqueeze(0)) 105 | x_max = x_mask.flatten(1).max(-1)[0] 106 | x_min = x_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0] 107 | 108 | y_mask = (masks * y.unsqueeze(0)) 109 | y_max = y_mask.flatten(1).max(-1)[0] 110 | y_min = y_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0] 111 | 112 | return torch.stack([x_min, y_min, x_max, y_max], 1) 113 | 114 | def center_of_mass(bitmasks): 115 | _, h, w = bitmasks.size() 116 | 117 | ys = torch.arange(0, h, dtype=torch.float32, device=bitmasks.device) 118 | xs = torch.arange(0, w, dtype=torch.float32, device=bitmasks.device) 119 | 120 | m00 = bitmasks.sum(dim=-1).sum(dim=-1).clamp(min=1e-6) 121 | m10 = (bitmasks * xs).sum(dim=-1).sum(dim=-1) 122 | m01 = (bitmasks * ys[:, None]).sum(dim=-1).sum(dim=-1) 123 | center_x = m10 / m00 124 | center_y = m01 / m00 125 | return center_x, center_y 126 | -------------------------------------------------------------------------------- /util/logger.py: -------------------------------------------------------------------------------- 1 | """ 2 | Dumps things to tensorboard and console 3 | """ 4 | 5 | import os 6 | import warnings 7 | # import git 8 | 9 | import torchvision.transforms as transforms 10 | from torch.utils.tensorboard import SummaryWriter 11 | 12 | 13 | def tensor_to_numpy(image): 14 | image_np = (image.numpy() * 255).astype('uint8') 15 | return image_np 16 | 17 | def detach_to_cpu(x): 18 | return x.detach().cpu() 19 | 20 | def fix_width_trunc(x): 21 | return ('{:.9s}'.format('{:0.9f}'.format(x))) 22 | 23 | class TensorboardLogger: 24 | def __init__(self, short_id, id, local_rank): 25 | self.short_id = short_id 26 | if self.short_id == 'NULL': 27 | self.short_id = 'DEBUG' 28 | 29 | if id is None: 30 | self.no_log = True 31 | warnings.warn('Logging has been disbaled.') 32 | else: 33 | self.no_log = False 34 | 35 | self.inv_im_trans = transforms.Normalize( 36 | mean=[-0.485/0.229, -0.456/0.224, -0.406/0.225], 37 | std=[1/0.229, 1/0.224, 1/0.225]) 38 | 39 | 
self.inv_seg_trans = transforms.Normalize( 40 | mean=[-0.5/0.5], 41 | std=[1/0.5]) 42 | 43 | log_path = os.path.join('..', 'log', '%s' % id) 44 | os.makedirs(log_path, exist_ok=True) 45 | self.logger = SummaryWriter(log_path) 46 | 47 | self.local_rank = local_rank 48 | self.values = {} 49 | self.counts = {} 50 | 51 | def log_scalar(self, tag, x, step): 52 | if self.no_log: 53 | warnings.warn('Logging has been disabled.') 54 | return 55 | self.logger.add_scalar(tag, x, step) 56 | 57 | def log_metrics(self, l1_tag, l2_tag, val, step, f=None): 58 | tag = l1_tag + '/' + l2_tag 59 | text = '{:s} - It {:6d} [{:5s}] [{:13}]: {:s}'.format(self.short_id, step, l1_tag.upper(), l2_tag, fix_width_trunc(val)) 60 | if f is not None: 61 | f.write(text + '\n') 62 | f.flush() 63 | self.log_scalar(tag, val, step) 64 | 65 | def log_im(self, tag, x, step): 66 | if self.no_log: 67 | warnings.warn('Logging has been disabled.') 68 | return 69 | x = detach_to_cpu(x) 70 | x = self.inv_im_trans(x) 71 | x = tensor_to_numpy(x) 72 | self.logger.add_image(tag, x, step) 73 | 74 | def log_cv2(self, tag, x, step): 75 | if self.no_log: 76 | warnings.warn('Logging has been disabled.') 77 | return 78 | x = x.transpose((2, 0, 1)) 79 | self.logger.add_image(tag, x, step) 80 | 81 | def log_seg(self, tag, x, step): 82 | if self.no_log: 83 | warnings.warn('Logging has been disabled.') 84 | return 85 | x = detach_to_cpu(x) 86 | x = self.inv_seg_trans(x) 87 | x = tensor_to_numpy(x) 88 | self.logger.add_image(tag, x, step) 89 | 90 | def log_gray(self, tag, x, step): 91 | if self.no_log: 92 | warnings.warn('Logging has been disabled.') 93 | return 94 | x = detach_to_cpu(x) 95 | x = tensor_to_numpy(x) 96 | self.logger.add_image(tag, x, step) 97 | 98 | def log_string(self, tag, x): 99 | print(tag, x) 100 | if self.no_log: 101 | warnings.warn('Logging has been disabled.') 102 | return 103 | self.logger.add_text(tag, x) 104 | 105 | def add_dict(self, tensor_dict, itr): 106 | for k, v in tensor_dict.items(): 107 | self.add_tensor(k, v, itr) 108 | 109 | def add_tensor(self, key, tensor, itr): 110 | if len(key.split("_")) == 3: 111 | self.log_scalar("sublayer_loss/" + key, tensor, itr) 112 | else: 113 | self.log_scalar("main_loss/" + key, tensor, itr) 114 | 115 | 116 | # def add_tensor(self, key, tensor, itr): 117 | # if key not in self.values: 118 | # self.counts[key] = 1 119 | # if type(tensor) == float or type(tensor) == int: 120 | # self.values[key] = tensor 121 | # else: 122 | # self.values[key] = tensor.mean().item() 123 | # else: 124 | # self.counts[key] += 1 125 | # if type(tensor) == float or type(tensor) == int: 126 | # self.values[key] += tensor 127 | # else: 128 | # self.values[key] += tensor.mean().item() 129 | # 130 | # for k, v in self.values.items(): 131 | # if len(k.split("_")) == 3: 132 | # self.log_scalar("sublayer_loss/" + k, v, itr) 133 | # else: 134 | # self.log_scalar("main_loss/"+k, v, itr) -------------------------------------------------------------------------------- /datasets/image_to_seq_augmenter.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Modified from SeqFormer (https://github.com/wjf5203/SeqFormer) 3 | # ------------------------------------------------------------------------ 4 | # Modified from STEm-Seg (https://github.com/sabarim/STEm-Seg) 5 | # ------------------------------------------------------------------------ 6 | 7 | 8 | import imgaug 9 | import imgaug.augmenters as iaa 10 | 
import numpy as np 11 | 12 | from datetime import datetime 13 | 14 | from imgaug.augmentables.segmaps import SegmentationMapsOnImage 15 | from imgaug.augmentables.bbs import BoundingBox, BoundingBoxesOnImage 16 | 17 | 18 | class ImageToSeqAugmenter(object): 19 | def __init__(self, perspective=True, affine=True, motion_blur=True, 20 | brightness_range=(-50, 50), hue_saturation_range=(-15, 15), perspective_magnitude=0.12, 21 | scale_range=1.0, translate_range={"x": (-0.15, 0.15), "y": (-0.15, 0.15)}, rotation_range=(-20, 20), 22 | motion_blur_kernel_sizes=(7, 9), motion_blur_prob=0.5): 23 | 24 | self.basic_augmenter = iaa.SomeOf((1, None), [ 25 | iaa.Add(brightness_range), 26 | iaa.AddToHueAndSaturation(hue_saturation_range) 27 | ] 28 | ) 29 | 30 | transforms = [] 31 | if perspective: 32 | transforms.append(iaa.PerspectiveTransform(perspective_magnitude)) 33 | if affine: 34 | transforms.append(iaa.Affine(scale=scale_range, 35 | translate_percent=translate_range, 36 | rotate=rotation_range, 37 | order=1, # cv2.INTER_LINEAR 38 | backend='auto')) 39 | transforms = iaa.Sequential(transforms) 40 | transforms = [transforms] 41 | 42 | if motion_blur: 43 | blur = iaa.Sometimes(motion_blur_prob, iaa.OneOf( 44 | [ 45 | iaa.MotionBlur(ksize) 46 | for ksize in motion_blur_kernel_sizes 47 | ] 48 | )) 49 | transforms.append(blur) 50 | 51 | self.frame_shift_augmenter = iaa.Sequential(transforms) 52 | 53 | @staticmethod 54 | def condense_masks(instance_masks): 55 | condensed_mask = np.zeros_like(instance_masks[0], dtype=np.int8) 56 | for instance_id, mask in enumerate(instance_masks, 1): 57 | condensed_mask = np.where(mask, instance_id, condensed_mask) 58 | 59 | return condensed_mask 60 | 61 | @staticmethod 62 | def expand_masks(condensed_mask, num_instances): 63 | return [(condensed_mask == instance_id).astype(np.uint8) for instance_id in range(1, num_instances + 1)] 64 | 65 | def __call__(self, image, masks=None, boxes=None): 66 | det_augmenter = self.frame_shift_augmenter.to_deterministic() 67 | 68 | 69 | if masks is not None: 70 | masks_np, is_binary_mask = [], [] 71 | boxs_np = [] 72 | 73 | for mask in masks: 74 | 75 | if isinstance(mask, np.ndarray): 76 | masks_np.append(mask.astype(np.bool)) 77 | is_binary_mask.append(False) 78 | else: 79 | raise ValueError("Invalid mask type: {}".format(type(mask))) 80 | 81 | num_instances = len(masks_np) 82 | masks_np = SegmentationMapsOnImage(self.condense_masks(masks_np), shape=image.shape[:2]) 83 | # boxs_np = BoundingBoxesOnImage(boxs_np, shape=image.shape[:2]) 84 | 85 | seed = int(datetime.now().strftime('%M%S%f')[-8:]) 86 | imgaug.seed(seed) 87 | aug_image, aug_masks = det_augmenter(image=self.basic_augmenter(image=image) , segmentation_maps=masks_np) 88 | imgaug.seed(seed) 89 | # invalid_pts_mask = det_augmenter(image=np.ones(image.shape[:2] + (1,), np.uint8)).squeeze(2) 90 | aug_masks = self.expand_masks(aug_masks.get_arr(), num_instances) 91 | # aug_boxes = aug_boxes.remove_out_of_image().clip_out_of_image() 92 | aug_masks = [mask for mask, is_bm in zip(aug_masks, is_binary_mask)] 93 | # (427, 640, 3) (427, 640) 94 | return aug_image, aug_masks #, aug_boxes.to_xyxy_array() 95 | 96 | else: 97 | # if no mask is provided, random generate and delete the mask. 
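# Sketch of what the branch below does: a dummy all-ones segmentation map is warped
# with the same deterministic augmenter, so output pixels that the original image
# does not cover come back as 0; the second return value is therefore a boolean
# mask of invalid points. (Note: np.bool used below is deprecated in recent NumPy
# releases, so bool / np.bool_ may be needed instead.)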
98 | masks = [SegmentationMapsOnImage(np.ones(image.shape[:2], np.bool), shape=image.shape[:2])] 99 | aug_image, invalid_pts_mask = det_augmenter(image=image, segmentation_maps=masks) 100 | return aug_image, invalid_pts_mask.get_arr() == 0 101 | -------------------------------------------------------------------------------- /models/backbone.py: -------------------------------------------------------------------------------- 1 | """ 2 | Backbone modules. 3 | Modified from DETR (https://github.com/facebookresearch/detr) 4 | """ 5 | from collections import OrderedDict 6 | 7 | import torch 8 | import torch.nn.functional as F 9 | import torchvision 10 | from torch import nn 11 | from torchvision.models._utils import IntermediateLayerGetter 12 | from typing import Dict, List 13 | from einops import rearrange 14 | 15 | from util.misc import NestedTensor, is_main_process 16 | 17 | from .position_encoding import build_position_encoding 18 | 19 | 20 | class FrozenBatchNorm2d(torch.nn.Module): 21 | """ 22 | BatchNorm2d where the batch statistics and the affine parameters are fixed. 23 | 24 | Copy-paste from torchvision.misc.ops with added eps before rqsrt, 25 | without which any other models than torchvision.models.resnet[18,34,50,101] 26 | produce nans. 27 | """ 28 | 29 | def __init__(self, n): 30 | super(FrozenBatchNorm2d, self).__init__() 31 | self.register_buffer("weight", torch.ones(n)) 32 | self.register_buffer("bias", torch.zeros(n)) 33 | self.register_buffer("running_mean", torch.zeros(n)) 34 | self.register_buffer("running_var", torch.ones(n)) 35 | 36 | def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, 37 | missing_keys, unexpected_keys, error_msgs): 38 | num_batches_tracked_key = prefix + 'num_batches_tracked' 39 | if num_batches_tracked_key in state_dict: 40 | del state_dict[num_batches_tracked_key] 41 | 42 | super(FrozenBatchNorm2d, self)._load_from_state_dict( 43 | state_dict, prefix, local_metadata, strict, 44 | missing_keys, unexpected_keys, error_msgs) 45 | 46 | def forward(self, x): 47 | # move reshapes to the beginning 48 | # to make it fuser-friendly 49 | w = self.weight.reshape(1, -1, 1, 1) 50 | b = self.bias.reshape(1, -1, 1, 1) 51 | rv = self.running_var.reshape(1, -1, 1, 1) 52 | rm = self.running_mean.reshape(1, -1, 1, 1) 53 | eps = 1e-5 54 | scale = w * (rv + eps).rsqrt() 55 | bias = b - rm * scale 56 | return x * scale + bias 57 | 58 | 59 | class BackboneBase(nn.Module): 60 | 61 | def __init__(self, backbone: nn.Module, train_backbone: bool, return_interm_layers: bool): 62 | super().__init__() 63 | for name, parameter in backbone.named_parameters(): 64 | if not train_backbone or 'layer2' not in name and 'layer3' not in name and 'layer4' not in name: 65 | parameter.requires_grad_(False) 66 | if return_interm_layers: 67 | return_layers = {"layer1": "0", "layer2": "1", "layer3": "2", "layer4": "3"} 68 | # return_layers = {"layer2": "0", "layer3": "1", "layer4": "2"} deformable detr 69 | self.strides = [4, 8, 16, 32] 70 | self.num_channels = [256, 512, 1024, 2048] 71 | else: 72 | return_layers = {'layer4': "0"} 73 | self.strides = [32] 74 | self.num_channels = [2048] 75 | self.body = IntermediateLayerGetter(backbone, return_layers=return_layers) 76 | 77 | def forward(self, tensor_list: NestedTensor): 78 | xs = self.body(tensor_list.tensors) 79 | out: Dict[str, NestedTensor] = {} 80 | for name, x in xs.items(): 81 | m = tensor_list.mask 82 | assert m is not None 83 | mask = F.interpolate(m[None].float(), size=x.shape[-2:]).to(torch.bool)[0] 84 | 
out[name] = NestedTensor(x, mask) 85 | return out 86 | 87 | 88 | class Backbone(BackboneBase): 89 | """ResNet backbone with frozen BatchNorm.""" 90 | def __init__(self, name: str, 91 | train_backbone: bool, 92 | return_interm_layers: bool, 93 | dilation: bool): # True 94 | backbone = getattr(torchvision.models, name)( 95 | replace_stride_with_dilation=[False, False, dilation], 96 | pretrained=is_main_process(), norm_layer=FrozenBatchNorm2d) 97 | assert name not in ('resnet18', 'resnet34'), "number of channels are hard coded" 98 | super().__init__(backbone, train_backbone, return_interm_layers) 99 | if dilation: 100 | self.strides[-1] = self.strides[-1] // 2 101 | 102 | 103 | class Joiner(nn.Sequential): 104 | def __init__(self, backbone, position_embedding): 105 | super().__init__(backbone, position_embedding) 106 | self.strides = backbone.strides 107 | self.num_channels = backbone.num_channels 108 | 109 | 110 | def forward(self, tensor_list: NestedTensor): 111 | tensor_list.tensors = rearrange(tensor_list.tensors, 'b t c h w -> (b t) c h w') 112 | tensor_list.mask = rearrange(tensor_list.mask, 'b t h w -> (b t) h w') 113 | 114 | xs = self[0](tensor_list) 115 | out: List[NestedTensor] = [] 116 | pos = [] 117 | for name, x in xs.items(): 118 | out.append(x) 119 | # position encoding 120 | pos.append(self[1](x).to(x.tensors.dtype)) 121 | return out, pos 122 | 123 | 124 | def build_backbone(args): 125 | position_embedding = build_position_encoding(args) 126 | train_backbone = args.lr_backbone > 0 127 | return_interm_layers = args.masks or (args.num) 128 | backbone = Backbone(args.backbone, train_backbone, return_interm_layers, args.dilation) 129 | model = Joiner(backbone, position_embedding) 130 | model.num_channels = backbone.num_channels 131 | return model 132 | 133 | -------------------------------------------------------------------------------- /pre_process/generate_anno_ytvos.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import cv2 4 | import torch 5 | import torch.nn.functional as F 6 | import h5py 7 | 8 | from data import img_transform, load_img_davis, load_video_a2d 9 | from sim_model import SimModel 10 | import numpy as np 11 | from torchvision import transforms 12 | from tqdm import tqdm 13 | import json 14 | from PIL import Image 15 | 16 | transform = transforms.Compose( 17 | [ 18 | transforms.ToTensor(), 19 | transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]), 20 | ] 21 | ) 22 | 23 | def bounding_box(img): 24 | rows = np.any(img, axis=1) 25 | cols = np.any(img, axis=0) 26 | rmin, rmax = np.where(rows)[0][[0, -1]] 27 | cmin, cmax = np.where(cols)[0][[0, -1]] 28 | return rmin, rmax, cmin, cmax # y1, y2, x1, x2 29 | 30 | def transform_anno_to_each_frame(meta_path, exp_meta_path): 31 | anno_dict = json.load(open(meta_path)) 32 | exp_dict = json.load(open(exp_meta_path)) 33 | # annos = anno_dict['videos'] 34 | annos = exp_dict['videos'] 35 | annos_out = {} 36 | for vid in annos.keys(): 37 | if vid not in annos_out.keys(): 38 | annos_out[vid] = {} 39 | obj_ids = [] 40 | for exp_info_id in annos[vid]['expressions'].keys(): 41 | obj_id = annos[vid]['expressions'][exp_info_id]['obj_id'] 42 | if obj_id not in obj_ids: 43 | obj_ids.append(obj_id) 44 | for frame_id in annos[vid]['frames']: 45 | if frame_id not in annos_out[vid].keys(): 46 | annos_out[vid][frame_id] = [] 47 | annos_out[vid][frame_id] = obj_ids 48 | return annos_out 49 | 50 | @ torch.no_grad() 51 | def generate_mask(anno_dict, video_path, anno_path, 
save_path, model, cuda=True): 52 | for vid in tqdm(anno_dict.keys()): 53 | video_save_path = os.path.join(save_path, vid) 54 | if not os.path.exists(video_save_path): 55 | os.makedirs(video_save_path) 56 | for frame_id in anno_dict[vid].keys(): 57 | if not os.path.exists(os.path.join(video_save_path, "{}.h5".format(frame_id))): 58 | obj_ids = anno_dict[vid][frame_id] 59 | frame = Image.open(os.path.join(video_path, vid, frame_id+'.jpg')).convert('RGB') 60 | mask = Image.open(os.path.join(anno_path, vid, frame_id+'.png')).convert('P') 61 | frame = transform(frame) 62 | mask = np.array(mask) 63 | h, w = mask.shape 64 | 65 | centers = [] 66 | bboxes = [] 67 | centers_norm = [] 68 | instance_valid = [] 69 | obj_ids = [int(id) for id in obj_ids] 70 | for obj_id in obj_ids: 71 | mask_cur = ((mask==obj_id) * 255).astype(np.uint8) 72 | if (mask_cur > 0).any(): 73 | dist = cv2.distanceTransform( 74 | mask_cur, cv2.DIST_L2, 5, cv2.DIST_LABEL_PIXEL 75 | ) 76 | _, _, _, center = cv2.minMaxLoc(dist) 77 | center_norm = (center[0] / w, center[1] / h) 78 | y1, y2, x1, x2 = bounding_box(mask_cur) 79 | bbox = np.array([x1, y1, x2, y2]) 80 | bbox[0::2] = np.clip(bbox[0::2], 0, w) 81 | bbox[1::2] = np.clip(bbox[1::2], 0, h) 82 | bboxes.append(bbox) 83 | centers.append(center) 84 | centers_norm.append(center_norm) 85 | instance_valid.append(1) 86 | else: 87 | bboxes.append(np.array([0, 0, 0, 0])) 88 | centers.append([0, 0]) 89 | centers_norm.append([0, 0]) 90 | instance_valid.append(0) 91 | if cuda: 92 | frame = frame.cuda() 93 | 94 | masks_point = model(frame[None], centers_norm, instance_valid, "point") 95 | masks_bbox = model(frame[None], bboxes, instance_valid, "bbox") 96 | masks_point = masks_point[0].cpu().numpy() 97 | masks_bbox = masks_bbox[0].cpu().numpy() 98 | out_annos = h5py.File( 99 | os.path.join(video_save_path, "{}.h5".format(frame_id)), "w" 100 | ) 101 | out_annos.create_dataset("obj_ids", data=obj_ids) 102 | out_annos.create_dataset("heatBBox", data=masks_bbox) 103 | out_annos.create_dataset("heatPoint", data=masks_point) 104 | out_annos.create_dataset("centerPoint", data=centers) 105 | out_annos.close() 106 | 107 | 108 | 109 | if __name__ == "__main__": 110 | video_path = "/media/HardDisk_B/Users/wx/wwk_files/datasets/referring_video_segmentation/Refer-YouTube-VOS/train/JPEGImages/" 111 | anno_path = "/media/HardDisk_B/Users/wx/wwk_files/datasets/referring_video_segmentation/Refer-YouTube-VOS/train/Annotations/" 112 | save_path = "./anno_weak/ref-youtube-vos/train/AnnotationsWeakly/" 113 | meta_path = "/media/HardDisk_B/Users/wx/wwk_files/datasets/referring_video_segmentation/Refer-YouTube-VOS/train/meta.json" 114 | exp_meta_path = "/media/HardDisk_B/Users/wx/wwk_files/datasets/referring_video_segmentation/Refer-YouTube-VOS/meta_expressions/train/meta_expressions.json" 115 | dilation = False 116 | cuda = True 117 | 118 | model = SimModel("resnet101", dilation) 119 | if cuda: 120 | model.cuda() 121 | 122 | annos_by_frame = transform_anno_to_each_frame(meta_path, exp_meta_path) 123 | generate_mask(annos_by_frame, video_path, anno_path, save_path, model, cuda) 124 | -------------------------------------------------------------------------------- /datasets/coco.py: -------------------------------------------------------------------------------- 1 | """ 2 | COCO dataset which returns image_id for evaluation. 
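When return_masks is set, the COCO polygon annotations are additionally decoded into per-instance binary masks (see convert_coco_poly_to_mask / ConvertCocoPolysToMask below).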
3 | 4 | Mostly copy-paste from https://github.com/pytorch/vision/blob/13b35ff/references/detection/coco_utils.py 5 | """ 6 | from pathlib import Path 7 | 8 | import torch 9 | import torch.utils.data 10 | import torchvision 11 | from pycocotools import mask as coco_mask 12 | 13 | import datasets.transforms as T 14 | 15 | 16 | class CocoDetection(torchvision.datasets.CocoDetection): 17 | def __init__(self, img_folder, ann_file, transforms, return_masks): 18 | super(CocoDetection, self).__init__(img_folder, ann_file) 19 | self._transforms = transforms 20 | self.prepare = ConvertCocoPolysToMask(return_masks) 21 | 22 | def __getitem__(self, idx): 23 | img, target = super(CocoDetection, self).__getitem__(idx) 24 | image_id = self.ids[idx] 25 | target = {'image_id': image_id, 'annotations': target} 26 | 27 | img, target = self.prepare(img, target) 28 | if self._transforms is not None: 29 | img, target = self._transforms(img, target) 30 | return img, target 31 | 32 | 33 | def convert_coco_poly_to_mask(segmentations, height, width): 34 | masks = [] 35 | for polygons in segmentations: 36 | rles = coco_mask.frPyObjects(polygons, height, width) 37 | mask = coco_mask.decode(rles) 38 | if len(mask.shape) < 3: 39 | mask = mask[..., None] 40 | mask = torch.as_tensor(mask, dtype=torch.uint8) 41 | mask = mask.any(dim=2) 42 | masks.append(mask) 43 | if masks: 44 | masks = torch.stack(masks, dim=0) 45 | else: 46 | masks = torch.zeros((0, height, width), dtype=torch.uint8) 47 | return masks 48 | 49 | 50 | class ConvertCocoPolysToMask(object): 51 | def __init__(self, return_masks=False): 52 | self.return_masks = return_masks 53 | 54 | def __call__(self, image, target): 55 | w, h = image.size 56 | 57 | image_id = target["image_id"] 58 | image_id = torch.tensor([image_id]) 59 | 60 | anno = target["annotations"] 61 | 62 | anno = [obj for obj in anno if 'iscrowd' not in obj or obj['iscrowd'] == 0] 63 | 64 | boxes = [obj["bbox"] for obj in anno] 65 | # guard against no boxes via resizing 66 | boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4) 67 | boxes[:, 2:] += boxes[:, :2] 68 | boxes[:, 0::2].clamp_(min=0, max=w) 69 | boxes[:, 1::2].clamp_(min=0, max=h) 70 | 71 | classes = [obj["category_id"] for obj in anno] 72 | classes = torch.tensor(classes, dtype=torch.int64) 73 | 74 | if self.return_masks: 75 | segmentations = [obj["segmentation"] for obj in anno] 76 | masks = convert_coco_poly_to_mask(segmentations, h, w) 77 | 78 | keypoints = None 79 | if anno and "keypoints" in anno[0]: 80 | keypoints = [obj["keypoints"] for obj in anno] 81 | keypoints = torch.as_tensor(keypoints, dtype=torch.float32) 82 | num_keypoints = keypoints.shape[0] 83 | if num_keypoints: 84 | keypoints = keypoints.view(num_keypoints, -1, 3) 85 | 86 | keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0]) 87 | boxes = boxes[keep] 88 | classes = classes[keep] 89 | if self.return_masks: 90 | masks = masks[keep] 91 | if keypoints is not None: 92 | keypoints = keypoints[keep] 93 | 94 | target = {} 95 | target["boxes"] = boxes 96 | target["labels"] = classes 97 | if self.return_masks: 98 | target["masks"] = masks 99 | target["image_id"] = image_id 100 | if keypoints is not None: 101 | target["keypoints"] = keypoints 102 | 103 | # for conversion to coco api 104 | area = torch.tensor([obj["area"] for obj in anno]) 105 | iscrowd = torch.tensor([obj["iscrowd"] if "iscrowd" in obj else 0 for obj in anno]) 106 | target["area"] = area[keep] 107 | target["iscrowd"] = iscrowd[keep] 108 | 109 | target["orig_size"] = 
torch.as_tensor([int(h), int(w)]) 110 | target["size"] = torch.as_tensor([int(h), int(w)]) 111 | 112 | return image, target 113 | 114 | 115 | def make_coco_transforms(image_set): 116 | 117 | normalize = T.Compose([ 118 | T.ToTensor(), 119 | T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) 120 | ]) 121 | 122 | scales = [480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800] 123 | 124 | if image_set == 'train': 125 | return T.Compose([ 126 | T.RandomHorizontalFlip(), 127 | T.RandomSelect( 128 | T.RandomResize(scales, max_size=1333), 129 | T.Compose([ 130 | T.RandomResize([400, 500, 600]), 131 | T.RandomSizeCrop(384, 600), 132 | T.RandomResize(scales, max_size=1333), 133 | ]) 134 | ), 135 | normalize, 136 | ]) 137 | 138 | if image_set == 'val': 139 | return T.Compose([ 140 | T.RandomResize([800], max_size=1333), 141 | normalize, 142 | ]) 143 | 144 | raise ValueError(f'unknown {image_set}') 145 | 146 | 147 | def build(image_set, args): 148 | root = Path(args.coco_path) 149 | assert root.exists(), f'provided COCO path {root} does not exist' 150 | mode = 'instances' 151 | PATHS = { 152 | "train": (root / "train2017", root / "annotations" / f'{mode}_train2017.json'), 153 | "val": (root / "val2017", root / "annotations" / f'{mode}_val2017.json'), 154 | } 155 | img_folder, ann_file = PATHS[image_set] 156 | dataset = CocoDetection(img_folder, ann_file, transforms=make_coco_transforms(image_set), return_masks=args.masks) 157 | return dataset 158 | -------------------------------------------------------------------------------- /davis2017/davis.py: -------------------------------------------------------------------------------- 1 | import os 2 | from glob import glob 3 | from collections import defaultdict 4 | import numpy as np 5 | from PIL import Image 6 | 7 | 8 | class DAVIS(object): 9 | SUBSET_OPTIONS = ['train', 'val', 'test-dev', 'test-challenge'] 10 | TASKS = ['semi-supervised', 'unsupervised'] 11 | DATASET_WEB = 'https://davischallenge.org/davis2017/code.html' 12 | VOID_LABEL = 255 13 | 14 | def __init__(self, root, task='unsupervised', subset='val', sequences='all', resolution='480p', codalab=False): 15 | """ 16 | Class to read the DAVIS dataset 17 | :param root: Path to the DAVIS folder that contains JPEGImages, Annotations, etc. folders. 18 | :param task: Task to load the annotations, choose between semi-supervised or unsupervised. 19 | :param subset: Set to load the annotations 20 | :param sequences: Sequences to consider, 'all' to use all the sequences in a set. 
21 | :param resolution: Specify the resolution to use the dataset, choose between '480' and 'Full-Resolution' 22 | """ 23 | if subset not in self.SUBSET_OPTIONS: 24 | raise ValueError(f'Subset should be in {self.SUBSET_OPTIONS}') 25 | if task not in self.TASKS: 26 | raise ValueError(f'The only tasks that are supported are {self.TASKS}') 27 | 28 | self.task = task 29 | self.subset = subset 30 | self.root = root 31 | self.img_path = os.path.join(self.root, 'JPEGImages', resolution) 32 | annotations_folder = 'Annotations' if task == 'semi-supervised' else 'Annotations_unsupervised' 33 | self.mask_path = os.path.join(self.root, annotations_folder, resolution) 34 | year = '2019' if task == 'unsupervised' and (subset == 'test-dev' or subset == 'test-challenge') else '2017' 35 | self.imagesets_path = os.path.join(self.root, 'ImageSets', year) 36 | 37 | self._check_directories() 38 | 39 | if sequences == 'all': 40 | with open(os.path.join(self.imagesets_path, f'{self.subset}.txt'), 'r') as f: 41 | tmp = f.readlines() 42 | sequences_names = [x.strip() for x in tmp] 43 | else: 44 | sequences_names = sequences if isinstance(sequences, list) else [sequences] 45 | self.sequences = defaultdict(dict) 46 | 47 | for seq in sequences_names: 48 | images = np.sort(glob(os.path.join(self.img_path, seq, '*.jpg'))).tolist() 49 | if len(images) == 0 and not codalab: 50 | raise FileNotFoundError(f'Images for sequence {seq} not found.') 51 | self.sequences[seq]['images'] = images 52 | masks = np.sort(glob(os.path.join(self.mask_path, seq, '*.png'))).tolist() 53 | masks.extend([-1] * (len(images) - len(masks))) 54 | self.sequences[seq]['masks'] = masks 55 | 56 | def _check_directories(self): 57 | if not os.path.exists(self.root): 58 | raise FileNotFoundError(f'DAVIS not found in the specified directory, download it from {self.DATASET_WEB}') 59 | if not os.path.exists(os.path.join(self.imagesets_path, f'{self.subset}.txt')): 60 | raise FileNotFoundError(f'Subset sequences list for {self.subset} not found, download the missing subset ' 61 | f'for the {self.task} task from {self.DATASET_WEB}') 62 | if self.subset in ['train', 'val'] and not os.path.exists(self.mask_path): 63 | raise FileNotFoundError(f'Annotations folder for the {self.task} task not found, download it from {self.DATASET_WEB}') 64 | 65 | def get_frames(self, sequence): 66 | for img, msk in zip(self.sequences[sequence]['images'], self.sequences[sequence]['masks']): 67 | image = np.array(Image.open(img)) 68 | mask = None if msk is None else np.array(Image.open(msk)) 69 | yield image, mask 70 | 71 | def _get_all_elements(self, sequence, obj_type): 72 | obj = np.array(Image.open(self.sequences[sequence][obj_type][0])) 73 | all_objs = np.zeros((len(self.sequences[sequence][obj_type]), *obj.shape)) 74 | obj_id = [] 75 | for i, obj in enumerate(self.sequences[sequence][obj_type]): 76 | all_objs[i, ...] = np.array(Image.open(obj)) 77 | obj_id.append(''.join(obj.split('/')[-1].split('.')[:-1])) 78 | return all_objs, obj_id 79 | 80 | def get_all_images(self, sequence): 81 | return self._get_all_elements(sequence, 'images') 82 | 83 | def get_all_masks(self, sequence, separate_objects_masks=False): 84 | masks, masks_id = self._get_all_elements(sequence, 'masks') 85 | masks_void = np.zeros_like(masks) 86 | 87 | # Separate void and object masks 88 | for i in range(masks.shape[0]): 89 | masks_void[i, ...] = masks[i, ...] == 255 90 | masks[i, masks[i, ...] 
== 255] = 0 91 | 92 | if separate_objects_masks: 93 | num_objects = int(np.max(masks[0, ...])) 94 | tmp = np.ones((num_objects, *masks.shape)) 95 | tmp = tmp * np.arange(1, num_objects + 1)[:, None, None, None] 96 | masks = (tmp == masks[None, ...]) 97 | masks = masks > 0 98 | return masks, masks_void, masks_id 99 | 100 | def get_sequences(self): 101 | for seq in self.sequences: 102 | yield seq 103 | 104 | 105 | if __name__ == '__main__': 106 | from matplotlib import pyplot as plt 107 | 108 | only_first_frame = True 109 | subsets = ['train', 'val'] 110 | 111 | for s in subsets: 112 | dataset = DAVIS(root='/home/csergi/scratch2/Databases/DAVIS2017_private', subset=s) 113 | for seq in dataset.get_sequences(): 114 | g = dataset.get_frames(seq) 115 | img, mask = next(g) 116 | plt.subplot(2, 1, 1) 117 | plt.title(seq) 118 | plt.imshow(img) 119 | plt.subplot(2, 1, 2) 120 | plt.imshow(mask) 121 | plt.show(block=True) 122 | 123 | -------------------------------------------------------------------------------- /util/visualization.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import numpy as np 3 | import torch 4 | 5 | 6 | def generate_ce_weight(heatmap, size, box=None, alpha=0.7, beta=0.3, thres=0.5): 7 | weight = heatmap 8 | weight[weight>alpha] = alpha 9 | weight[weight= alpha).astype(float) 14 | # background = (heatmap <= beta).astype(float) 15 | # uncertain = np.logical_and(heatmap > beta, heatmap < alpha).astype(float) 16 | # uncertain_weight = np.abs(heatmap - thres) + 0.5 17 | if box is not None: 18 | box_regions = np.zeros_like(heatmap) 19 | h, w = size 20 | box = np.array([box[0] - box[2] / 2, box[1] - box[3] / 2, box[0] + box[2] / 2, box[1] + box[3] / 2]) 21 | boxes_scale = (box * np.array([w, h, w, h])).astype(int) 22 | box_regions[ 23 | boxes_scale[1] : boxes_scale[3], boxes_scale[0] : boxes_scale[2] 24 | ] = 1 25 | weight[box_regions==0] = 1 26 | # background[np.where(box_regions == 0)] = 1 27 | # foreground[np.where(box_regions == 0)] = 0 28 | # uncertain[np.where(box_regions == 0)] = 0 29 | # weight = foreground * 1 + background * 1 + uncertain_weight * uncertain 30 | # weight = box_regions 31 | return weight 32 | 33 | 34 | def generate_mask_from_heatmap(heatmap, thres=0.5): 35 | background = np.ones((1, heatmap.shape[-2], heatmap.shape[-1])) * thres 36 | masks_with_bg = np.concatenate([background, heatmap]) 37 | masks = np.zeros_like(masks_with_bg) 38 | max_idx = np.argmax(masks_with_bg, axis=0) 39 | for i in range(masks.shape[0]): 40 | masks[i, max_idx == i] = 1 41 | return masks[1:] 42 | 43 | 44 | def viz_heatmap(heatmap, rgb_img=None): 45 | # heatmap: [h, w] \in [0,1] rgb_img: [h, w, 3] np.array 46 | out_img = rgb_img.copy() 47 | heatmap = (heatmap * 255).astype(np.uint8) 48 | heatmap = cv2.applyColorMap(heatmap, 11) 49 | out_img = cv2.addWeighted(out_img, 1, heatmap, 0.6, 1) 50 | return out_img 51 | 52 | 53 | def viz_bbox(bbox, size, rgb_img): 54 | # bbox: [4] \in [0,1] rgb_img: [h, w, 3] np.array 55 | out_img = rgb_img.copy() 56 | h, w = size 57 | x_c, y_c, bw, bh = bbox 58 | bbox_xyxy = np.array( 59 | [(x_c - 0.5 * bw), (y_c - 0.5 * bh), (x_c + 0.5 * bw), (y_c + 0.5 * bh)] 60 | ) 61 | bbox_scale = (bbox_xyxy * np.array([w, h, w, h])).astype(int) 62 | out_img = cv2.rectangle( 63 | out_img, 64 | (bbox_scale[0], bbox_scale[1]), 65 | (bbox_scale[2], bbox_scale[3]), 66 | (255, 0, 0), 67 | 3, 68 | ) 69 | return out_img 70 | 71 | 72 | def viz_point(point, size, rgb_img): 73 | # point: [2] \in [0,1] rgb_img: [h, w, 3] np.array 
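# The normalized (x, y) point is scaled to pixel coordinates and drawn as a small
# filled circle on a copy of the frame. Hypothetical usage sketch:
#   marked = viz_point(np.array([0.5, 0.5]), (h, w), frame_bgr)  # mark the frame centre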
74 | out_img = rgb_img.copy() 75 | h, w = size 76 | point_scale = (point * np.array([w, h])).astype(int) 77 | out_img = cv2.circle(out_img, (point_scale[0], point_scale[1]), 3, (255, 0, 0), -1) 78 | return out_img 79 | 80 | 81 | def viz_mask(mask, rgb_img=None): 82 | # mask: [h, w] \in {0,1} rgb_img: [h, w, 3] np.array 83 | out_img = rgb_img.copy() 84 | mask_color = np.zeros((mask.shape[0], mask.shape[1], 3)) 85 | mask_color[:, :, 0] = mask * 255 86 | out_img = cv2.addWeighted(out_img, 1, mask_color.astype(np.uint8), 0.6, 1) 87 | return out_img 88 | 89 | 90 | def img_recover(img): 91 | # img: tensor [3, h, w] 92 | mean = torch.tensor([0.485, 0.456, 0.406])[:, None, None].to(img.device) 93 | std = torch.tensor([0.229, 0.224, 0.225])[:, None, None].to(img.device) 94 | img_recovered = (img * std + mean) * 255 95 | img_recovered = img_recovered.byte().permute(1, 2, 0).cpu().numpy() 96 | return img_recovered 97 | 98 | 99 | def visualize(samples, targets): 100 | viz_dict = {} 101 | for i, (frames, target) in enumerate(zip(samples.tensors, targets)): 102 | h, w = target["size"].numpy() 103 | if "valid_indices" in target.keys(): 104 | valid_frame = frames.index_select(0, target["valid_indices"]) # [1, 3, h, w] 105 | frames = valid_frame[:, :, :h, :w] 106 | else: 107 | frames = frames[:, :, :h, :w] 108 | for frame_id, frame in enumerate(frames): 109 | rgb_frame = img_recover(frame) # [h, w, 3] np.array 110 | rgb_frame = cv2.cvtColor(rgb_frame, cv2.COLOR_RGB2BGR) 111 | masks, boxes, weak_masks, weights = ( 112 | target["masks"].cpu().numpy(), 113 | target["boxes"].cpu().numpy(), 114 | target["weak_masks"].cpu().numpy(), 115 | target["weights"].cpu().numpy(), 116 | ) 117 | # weak_mask_box = generate_mask_from_heatmap(heat_bbox) 118 | # weak_mask_point = generate_mask_from_heatmap(heat_point) 119 | for j, (mask, box, weight, weak_m) in enumerate( 120 | zip( 121 | masks, 122 | boxes, 123 | weights, 124 | weak_masks 125 | ) 126 | ): 127 | weight_p = generate_ce_weight(weight, (h, w), box) 128 | img_masked = viz_mask(mask, rgb_frame) 129 | img_bbox = viz_bbox(box, (h, w), rgb_frame) 130 | img_masked_weak = viz_mask(weak_m, rgb_frame) 131 | img_heat = viz_heatmap(weight, rgb_frame) 132 | img_heat_p = viz_heatmap(weight_p, rgb_frame) 133 | final_viz = np.concatenate( 134 | [ 135 | img_masked, 136 | img_bbox, 137 | img_heat, 138 | img_masked_weak, 139 | img_heat_p 140 | ], 141 | axis=1, 142 | ) 143 | viz_dict["batch{}_frame{}_instance{}".format(i, frame_id, j)] = final_viz 144 | 145 | return viz_dict 146 | -------------------------------------------------------------------------------- /datasets/samplers.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------ 6 | # Modified from codes in torch.utils.data.distributed 7 | # ------------------------------------------------------------------------ 8 | 9 | import os 10 | import math 11 | import torch 12 | import torch.distributed as dist 13 | from torch.utils.data.sampler import Sampler 14 | 15 | 16 | class DistributedSampler(Sampler): 17 | """Sampler that restricts data loading to a subset of the dataset. 18 | It is especially useful in conjunction with 19 | :class:`torch.nn.parallel.DistributedDataParallel`. 
In such case, each 20 | process can pass a DistributedSampler instance as a DataLoader sampler, 21 | and load a subset of the original dataset that is exclusive to it. 22 | .. note:: 23 | Dataset is assumed to be of constant size. 24 | Arguments: 25 | dataset: Dataset used for sampling. 26 | num_replicas (optional): Number of processes participating in 27 | distributed training. 28 | rank (optional): Rank of the current process within num_replicas. 29 | """ 30 | 31 | def __init__(self, dataset, num_replicas=None, rank=None, local_rank=None, local_size=None, shuffle=True): 32 | if num_replicas is None: 33 | if not dist.is_available(): 34 | raise RuntimeError("Requires distributed package to be available") 35 | num_replicas = dist.get_world_size() 36 | if rank is None: 37 | if not dist.is_available(): 38 | raise RuntimeError("Requires distributed package to be available") 39 | rank = dist.get_rank() 40 | self.dataset = dataset 41 | self.num_replicas = num_replicas 42 | self.rank = rank 43 | self.epoch = 0 44 | self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.num_replicas)) 45 | self.total_size = self.num_samples * self.num_replicas 46 | self.shuffle = shuffle 47 | 48 | def __iter__(self): 49 | if self.shuffle: 50 | # deterministically shuffle based on epoch 51 | g = torch.Generator() 52 | g.manual_seed(self.epoch) 53 | indices = torch.randperm(len(self.dataset), generator=g).tolist() 54 | else: 55 | indices = torch.arange(len(self.dataset)).tolist() 56 | 57 | # add extra samples to make it evenly divisible 58 | indices += indices[: (self.total_size - len(indices))] 59 | assert len(indices) == self.total_size 60 | 61 | # subsample 62 | offset = self.num_samples * self.rank 63 | indices = indices[offset : offset + self.num_samples] 64 | assert len(indices) == self.num_samples 65 | 66 | return iter(indices) 67 | 68 | def __len__(self): 69 | return self.num_samples 70 | 71 | def set_epoch(self, epoch): 72 | self.epoch = epoch 73 | 74 | 75 | class NodeDistributedSampler(Sampler): 76 | """Sampler that restricts data loading to a subset of the dataset. 77 | It is especially useful in conjunction with 78 | :class:`torch.nn.parallel.DistributedDataParallel`. In such case, each 79 | process can pass a DistributedSampler instance as a DataLoader sampler, 80 | and load a subset of the original dataset that is exclusive to it. 81 | .. note:: 82 | Dataset is assumed to be of constant size. 83 | Arguments: 84 | dataset: Dataset used for sampling. 85 | num_replicas (optional): Number of processes participating in 86 | distributed training. 87 | rank (optional): Rank of the current process within num_replicas. 
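local_rank (optional): Rank of the process within its node; read from the
    LOCAL_RANK environment variable (default 0) when not given.
local_size (optional): Number of processes per node; read from the
    LOCAL_SIZE environment variable (default 1) when not given.
Each process first keeps only the indices i with i % local_size == local_rank
and then subsamples across nodes, so every process receives its own shard.
Usage sketch (assumes torch.distributed is initialized and `dataset` exists):
    sampler = NodeDistributedSampler(dataset, shuffle=True)
    loader = torch.utils.data.DataLoader(dataset, batch_size=2, sampler=sampler)
    sampler.set_epoch(epoch)  # call once per epoch for a new deterministic shuffle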
88 | """ 89 | 90 | def __init__(self, dataset, num_replicas=None, rank=None, local_rank=None, local_size=None, shuffle=True): 91 | if num_replicas is None: 92 | if not dist.is_available(): 93 | raise RuntimeError("Requires distributed package to be available") 94 | num_replicas = dist.get_world_size() 95 | if rank is None: 96 | if not dist.is_available(): 97 | raise RuntimeError("Requires distributed package to be available") 98 | rank = dist.get_rank() 99 | if local_rank is None: 100 | local_rank = int(os.environ.get('LOCAL_RANK', 0)) 101 | if local_size is None: 102 | local_size = int(os.environ.get('LOCAL_SIZE', 1)) 103 | self.dataset = dataset 104 | self.shuffle = shuffle 105 | self.num_replicas = num_replicas 106 | self.num_parts = local_size 107 | self.rank = rank 108 | self.local_rank = local_rank 109 | self.epoch = 0 110 | self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.num_replicas)) 111 | self.total_size = self.num_samples * self.num_replicas 112 | 113 | self.total_size_parts = self.num_samples * self.num_replicas // self.num_parts 114 | 115 | def __iter__(self): 116 | if self.shuffle: 117 | # deterministically shuffle based on epoch 118 | g = torch.Generator() 119 | g.manual_seed(self.epoch) 120 | indices = torch.randperm(len(self.dataset), generator=g).tolist() 121 | else: 122 | indices = torch.arange(len(self.dataset)).tolist() 123 | indices = [i for i in indices if i % self.num_parts == self.local_rank] 124 | 125 | # add extra samples to make it evenly divisible 126 | indices += indices[:(self.total_size_parts - len(indices))] 127 | assert len(indices) == self.total_size_parts 128 | 129 | # subsample 130 | indices = indices[self.rank // self.num_parts:self.total_size_parts:self.num_replicas // self.num_parts] 131 | assert len(indices) == self.num_samples 132 | 133 | return iter(indices) 134 | 135 | def __len__(self): 136 | return self.num_samples 137 | 138 | def set_epoch(self, epoch): 139 | self.epoch = epoch 140 | -------------------------------------------------------------------------------- /davis2017/evaluation.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from tqdm import tqdm 3 | import warnings 4 | warnings.filterwarnings("ignore", category=RuntimeWarning) 5 | 6 | import numpy as np 7 | from davis2017.davis import DAVIS 8 | from davis2017.metrics import db_eval_boundary, db_eval_iou 9 | from davis2017 import utils 10 | from davis2017.results import Results 11 | from scipy.optimize import linear_sum_assignment 12 | 13 | 14 | class DAVISEvaluation(object): 15 | def __init__(self, davis_root, task, gt_set, sequences='all', codalab=False): 16 | """ 17 | Class to evaluate DAVIS sequences from a certain set and for a certain task 18 | :param davis_root: Path to the DAVIS folder that contains JPEGImages, Annotations, etc. folders. 19 | :param task: Task to compute the evaluation, chose between semi-supervised or unsupervised. 20 | :param gt_set: Set to compute the evaluation 21 | :param sequences: Sequences to consider for the evaluation, 'all' to use all the sequences in a set. 
22 | """ 23 | self.davis_root = davis_root 24 | self.task = task 25 | self.dataset = DAVIS(root=davis_root, task=task, subset=gt_set, sequences=sequences, codalab=codalab) 26 | 27 | @staticmethod 28 | def _evaluate_semisupervised(all_gt_masks, all_res_masks, all_void_masks, metric): 29 | if all_res_masks.shape[0] > all_gt_masks.shape[0]: 30 | sys.stdout.write("\nIn your PNG files there is an index higher than the number of objects in the sequence!") 31 | sys.exit() 32 | elif all_res_masks.shape[0] < all_gt_masks.shape[0]: 33 | zero_padding = np.zeros((all_gt_masks.shape[0] - all_res_masks.shape[0], *all_res_masks.shape[1:])) 34 | all_res_masks = np.concatenate([all_res_masks, zero_padding], axis=0) 35 | j_metrics_res, f_metrics_res = np.zeros(all_gt_masks.shape[:2]), np.zeros(all_gt_masks.shape[:2]) 36 | for ii in range(all_gt_masks.shape[0]): 37 | if 'J' in metric: 38 | j_metrics_res[ii, :] = db_eval_iou(all_gt_masks[ii, ...], all_res_masks[ii, ...], all_void_masks) 39 | if 'F' in metric: 40 | f_metrics_res[ii, :] = db_eval_boundary(all_gt_masks[ii, ...], all_res_masks[ii, ...], all_void_masks) 41 | return j_metrics_res, f_metrics_res 42 | 43 | @staticmethod 44 | def _evaluate_unsupervised(all_gt_masks, all_res_masks, all_void_masks, metric, max_n_proposals=20): 45 | if all_res_masks.shape[0] > max_n_proposals: 46 | sys.stdout.write(f"\nIn your PNG files there is an index higher than the maximum number ({max_n_proposals}) of proposals allowed!") 47 | sys.exit() 48 | elif all_res_masks.shape[0] < all_gt_masks.shape[0]: 49 | zero_padding = np.zeros((all_gt_masks.shape[0] - all_res_masks.shape[0], *all_res_masks.shape[1:])) 50 | all_res_masks = np.concatenate([all_res_masks, zero_padding], axis=0) 51 | j_metrics_res = np.zeros((all_res_masks.shape[0], all_gt_masks.shape[0], all_gt_masks.shape[1])) 52 | f_metrics_res = np.zeros((all_res_masks.shape[0], all_gt_masks.shape[0], all_gt_masks.shape[1])) 53 | for ii in range(all_gt_masks.shape[0]): 54 | for jj in range(all_res_masks.shape[0]): 55 | if 'J' in metric: 56 | j_metrics_res[jj, ii, :] = db_eval_iou(all_gt_masks[ii, ...], all_res_masks[jj, ...], all_void_masks) 57 | if 'F' in metric: 58 | f_metrics_res[jj, ii, :] = db_eval_boundary(all_gt_masks[ii, ...], all_res_masks[jj, ...], all_void_masks) 59 | if 'J' in metric and 'F' in metric: 60 | all_metrics = (np.mean(j_metrics_res, axis=2) + np.mean(f_metrics_res, axis=2)) / 2 61 | else: 62 | all_metrics = np.mean(j_metrics_res, axis=2) if 'J' in metric else np.mean(f_metrics_res, axis=2) 63 | row_ind, col_ind = linear_sum_assignment(-all_metrics) 64 | return j_metrics_res[row_ind, col_ind, :], f_metrics_res[row_ind, col_ind, :] 65 | 66 | def evaluate(self, res_path, metric=('J', 'F'), debug=False): 67 | metric = metric if isinstance(metric, tuple) or isinstance(metric, list) else [metric] 68 | if 'T' in metric: 69 | raise ValueError('Temporal metric not supported!') 70 | if 'J' not in metric and 'F' not in metric: 71 | raise ValueError('Metric possible values are J for IoU or F for Boundary') 72 | 73 | # Containers 74 | metrics_res = {} 75 | if 'J' in metric: 76 | metrics_res['J'] = {"M": [], "R": [], "D": [], "M_per_object": {}} 77 | if 'F' in metric: 78 | metrics_res['F'] = {"M": [], "R": [], "D": [], "M_per_object": {}} 79 | 80 | # Sweep all sequences 81 | results = Results(root_dir=res_path) 82 | for seq in tqdm(list(self.dataset.get_sequences())): 83 | all_gt_masks, all_void_masks, all_masks_id = self.dataset.get_all_masks(seq, True) 84 | if self.task == 'semi-supervised': 85 | 
all_gt_masks, all_masks_id = all_gt_masks[:, 1:-1, :, :], all_masks_id[1:-1] 86 | all_res_masks = results.read_masks(seq, all_masks_id) 87 | if self.task == 'unsupervised': 88 | j_metrics_res, f_metrics_res = self._evaluate_unsupervised(all_gt_masks, all_res_masks, all_void_masks, metric) 89 | elif self.task == 'semi-supervised': 90 | j_metrics_res, f_metrics_res = self._evaluate_semisupervised(all_gt_masks, all_res_masks, None, metric) 91 | for ii in range(all_gt_masks.shape[0]): 92 | seq_name = f'{seq}_{ii+1}' 93 | if 'J' in metric: 94 | [JM, JR, JD] = utils.db_statistics(j_metrics_res[ii]) 95 | metrics_res['J']["M"].append(JM) 96 | metrics_res['J']["R"].append(JR) 97 | metrics_res['J']["D"].append(JD) 98 | metrics_res['J']["M_per_object"][seq_name] = JM 99 | if 'F' in metric: 100 | [FM, FR, FD] = utils.db_statistics(f_metrics_res[ii]) 101 | metrics_res['F']["M"].append(FM) 102 | metrics_res['F']["R"].append(FR) 103 | metrics_res['F']["D"].append(FD) 104 | metrics_res['F']["M_per_object"][seq_name] = FM 105 | 106 | # Show progress 107 | if debug: 108 | sys.stdout.write(seq + '\n') 109 | sys.stdout.flush() 110 | return metrics_res 111 | -------------------------------------------------------------------------------- /davis2017/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import errno 3 | import numpy as np 4 | from PIL import Image 5 | import warnings 6 | from davis2017.davis import DAVIS 7 | 8 | 9 | def _pascal_color_map(N=256, normalized=False): 10 | """ 11 | Python implementation of the color map function for the PASCAL VOC data set. 12 | Official Matlab version can be found in the PASCAL VOC devkit 13 | http://host.robots.ox.ac.uk/pascal/VOC/voc2012/index.html#devkit 14 | """ 15 | 16 | def bitget(byteval, idx): 17 | return (byteval & (1 << idx)) != 0 18 | 19 | dtype = 'float32' if normalized else 'uint8' 20 | cmap = np.zeros((N, 3), dtype=dtype) 21 | for i in range(N): 22 | r = g = b = 0 23 | c = i 24 | for j in range(8): 25 | r = r | (bitget(c, 0) << 7 - j) 26 | g = g | (bitget(c, 1) << 7 - j) 27 | b = b | (bitget(c, 2) << 7 - j) 28 | c = c >> 3 29 | 30 | cmap[i] = np.array([r, g, b]) 31 | 32 | cmap = cmap / 255 if normalized else cmap 33 | return cmap 34 | 35 | 36 | def overlay_semantic_mask(im, ann, alpha=0.5, colors=None, contour_thickness=None): 37 | im, ann = np.asarray(im, dtype=np.uint8), np.asarray(ann, dtype=np.int) 38 | if im.shape[:-1] != ann.shape: 39 | raise ValueError('First two dimensions of `im` and `ann` must match') 40 | if im.shape[-1] != 3: 41 | raise ValueError('im must have three channels at the 3 dimension') 42 | 43 | colors = colors or _pascal_color_map() 44 | colors = np.asarray(colors, dtype=np.uint8) 45 | 46 | mask = colors[ann] 47 | fg = im * alpha + (1 - alpha) * mask 48 | 49 | img = im.copy() 50 | img[ann > 0] = fg[ann > 0] 51 | 52 | if contour_thickness: # pragma: no cover 53 | import cv2 54 | for obj_id in np.unique(ann[ann > 0]): 55 | contours = cv2.findContours((ann == obj_id).astype( 56 | np.uint8), cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)[-2:] 57 | cv2.drawContours(img, contours[0], -1, colors[obj_id].tolist(), 58 | contour_thickness) 59 | return img 60 | 61 | 62 | def generate_obj_proposals(davis_root, subset, num_proposals, save_path): 63 | dataset = DAVIS(davis_root, subset=subset, codalab=True) 64 | for seq in dataset.get_sequences(): 65 | save_dir = os.path.join(save_path, seq) 66 | if os.path.exists(save_dir): 67 | continue 68 | all_gt_masks, all_masks_id = 
dataset.get_all_masks(seq, True) 69 | img_size = all_gt_masks.shape[2:] 70 | num_rows = int(np.ceil(np.sqrt(num_proposals))) 71 | proposals = np.zeros((num_proposals, len(all_masks_id), *img_size)) 72 | height_slices = np.floor(np.arange(0, img_size[0] + 1, img_size[0]/num_rows)).astype(np.uint).tolist() 73 | width_slices = np.floor(np.arange(0, img_size[1] + 1, img_size[1]/num_rows)).astype(np.uint).tolist() 74 | ii = 0 75 | prev_h, prev_w = 0, 0 76 | for h in height_slices[1:]: 77 | for w in width_slices[1:]: 78 | proposals[ii, :, prev_h:h, prev_w:w] = 1 79 | prev_w = w 80 | ii += 1 81 | if ii == num_proposals: 82 | break 83 | prev_h, prev_w = h, 0 84 | if ii == num_proposals: 85 | break 86 | 87 | os.makedirs(save_dir, exist_ok=True) 88 | for i, mask_id in enumerate(all_masks_id): 89 | mask = np.sum(proposals[:, i, ...] * np.arange(1, proposals.shape[0] + 1)[:, None, None], axis=0) 90 | save_mask(mask, os.path.join(save_dir, f'{mask_id}.png')) 91 | 92 | 93 | def generate_random_permutation_gt_obj_proposals(davis_root, subset, save_path): 94 | dataset = DAVIS(davis_root, subset=subset, codalab=True) 95 | for seq in dataset.get_sequences(): 96 | gt_masks, all_masks_id = dataset.get_all_masks(seq, True) 97 | obj_swap = np.random.permutation(np.arange(gt_masks.shape[0])) 98 | gt_masks = gt_masks[obj_swap, ...] 99 | save_dir = os.path.join(save_path, seq) 100 | os.makedirs(save_dir, exist_ok=True) 101 | for i, mask_id in enumerate(all_masks_id): 102 | mask = np.sum(gt_masks[:, i, ...] * np.arange(1, gt_masks.shape[0] + 1)[:, None, None], axis=0) 103 | save_mask(mask, os.path.join(save_dir, f'{mask_id}.png')) 104 | 105 | 106 | def color_map(N=256, normalized=False): 107 | def bitget(byteval, idx): 108 | return ((byteval & (1 << idx)) != 0) 109 | 110 | dtype = 'float32' if normalized else 'uint8' 111 | cmap = np.zeros((N, 3), dtype=dtype) 112 | for i in range(N): 113 | r = g = b = 0 114 | c = i 115 | for j in range(8): 116 | r = r | (bitget(c, 0) << 7-j) 117 | g = g | (bitget(c, 1) << 7-j) 118 | b = b | (bitget(c, 2) << 7-j) 119 | c = c >> 3 120 | 121 | cmap[i] = np.array([r, g, b]) 122 | 123 | cmap = cmap/255 if normalized else cmap 124 | return cmap 125 | 126 | 127 | def save_mask(mask, img_path): 128 | if np.max(mask) > 255: 129 | raise ValueError('Maximum id pixel value is 255') 130 | mask_img = Image.fromarray(mask.astype(np.uint8)) 131 | mask_img.putpalette(color_map().flatten().tolist()) 132 | mask_img.save(img_path) 133 | 134 | 135 | def db_statistics(per_frame_values): 136 | """ Compute mean,recall and decay from per-frame evaluation. 137 | Arguments: 138 | per_frame_values (ndarray): per-frame evaluation 139 | 140 | Returns: 141 | M,O,D (float,float,float): 142 | return evaluation statistics: mean,recall,decay. 
143 | """ 144 | 145 | # strip off nan values 146 | with warnings.catch_warnings(): 147 | warnings.simplefilter("ignore", category=RuntimeWarning) 148 | M = np.nanmean(per_frame_values) 149 | O = np.nanmean(per_frame_values > 0.5) 150 | 151 | N_bins = 4 152 | ids = np.round(np.linspace(1, len(per_frame_values), N_bins + 1) + 1e-10) - 1 153 | ids = ids.astype(np.uint8) 154 | 155 | D_bins = [per_frame_values[ids[i]:ids[i + 1] + 1] for i in range(0, 4)] 156 | 157 | with warnings.catch_warnings(): 158 | warnings.simplefilter("ignore", category=RuntimeWarning) 159 | D = np.nanmean(D_bins[0]) - np.nanmean(D_bins[3]) 160 | 161 | return M, O, D 162 | 163 | 164 | def list_files(dir, extension=".png"): 165 | return [os.path.splitext(file_)[0] for file_ in os.listdir(dir) if file_.endswith(extension)] 166 | 167 | 168 | def force_symlink(file1, file2): 169 | try: 170 | os.symlink(file1, file2) 171 | except OSError as e: 172 | if e.errno == errno.EEXIST: 173 | os.remove(file2) 174 | os.symlink(file1, file2) 175 | -------------------------------------------------------------------------------- /pre_process/sim_model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torchvision 4 | try: 5 | from frozen_batchnorm2d import FrozenBatchNorm2d 6 | except: 7 | from .frozen_batchnorm2d import FrozenBatchNorm2d 8 | from torchvision.models._utils import IntermediateLayerGetter 9 | import math 10 | import numpy as np 11 | import torch.nn.functional as F 12 | 13 | 14 | class SimModel(nn.Module): 15 | def __init__(self, backbone, dilation=False, background_thres=0.5): 16 | super().__init__() 17 | self.background_thres = background_thres 18 | backbone = getattr(torchvision.models, backbone)( 19 | replace_stride_with_dilation=[False, False, dilation], 20 | pretrained=True, 21 | norm_layer=FrozenBatchNorm2d, 22 | ) 23 | return_layers = { 24 | "layer1": "feat1", 25 | "layer2": "feat2", 26 | "layer3": "feat3", 27 | "layer4": "feat4", 28 | } 29 | self.backbone = IntermediateLayerGetter(backbone, return_layers=return_layers) 30 | checkpoint_path = "checkpoints/densecl_r101_imagenet_200ep.pth" 31 | self.backbone.load_state_dict( 32 | torch.load(checkpoint_path)["state_dict"], strict=False 33 | ) 34 | 35 | def forward_point(self, x, point_list, valid): 36 | fea = self.backbone(x)["feat4"] 37 | keys = fea ### [B, C3, H3, W3] 38 | b, c, h, w = keys.shape 39 | out_masks = [] 40 | for n_p, point_loc in enumerate(point_list): 41 | if valid[n_p]: 42 | scale_factors = [1.0] 43 | queries_list = [] 44 | for scale_factor in scale_factors: 45 | point_cur = [ 46 | int(point_loc[0] * w * scale_factor), 47 | int(point_loc[1] * h * scale_factor), 48 | ] 49 | cur_queries = keys[:, :, point_cur[1], point_cur[0]] 50 | queries_list.append(cur_queries) 51 | 52 | queries = torch.stack(queries_list, dim=1) 53 | 54 | keys = keys / keys.norm(dim=1, keepdim=True) 55 | queries = queries / queries.norm(dim=-1, keepdim=True) 56 | attn = torch.matmul(queries, keys.view(b, c, -1)) 57 | attn = (attn - attn.min(-1, keepdim=True)[0]) / attn.max( 58 | -1, keepdim=True 59 | )[0] 60 | soft_masks = attn.reshape(b, attn.shape[1], h, w) 61 | out_masks.append(soft_masks) 62 | else: 63 | out_masks.append(torch.zeros((b, 1, h, w)).to(x.device)) 64 | out_masks = torch.cat(out_masks, dim=1) 65 | return out_masks 66 | 67 | def forward_bbox(self, x, bbox_list, valid): 68 | h_ori, w_ori = x.shape[-2:] 69 | fea = self.backbone(x)["feat4"] 70 | keys = fea ### [B, C3, H3, W3] 71 | b, c, 
h, w = keys.shape 72 | out_masks = [] 73 | for n_b, bbox in enumerate(bbox_list): 74 | if valid[n_b]: 75 | 76 | scale_factors = [1.0] 77 | queries_list = [] 78 | bbox_masks = [] 79 | for scale_factor in scale_factors: 80 | box_cur = [ 81 | int(bbox[0] / w_ori * w * scale_factor), 82 | int(bbox[1] / h_ori * h * scale_factor), 83 | int(bbox[2] / w_ori * w * scale_factor), 84 | int(bbox[3] / h_ori * h * scale_factor), 85 | ] 86 | bbox_mask = torch.zeros((h, w)).bool().to(x.device) 87 | bbox_mask[box_cur[1] : box_cur[3], box_cur[0] : box_cur[2]] = True 88 | range_x = list(range(box_cur[0], box_cur[2] + 1)) 89 | range_y = list(range(box_cur[1], box_cur[3] + 1)) 90 | i = 1 91 | while(len(range_x) * len(range_y) > 256): 92 | range_x = list(range(box_cur[0], box_cur[2] + 1, i+1)) 93 | range_y = list(range(box_cur[1], box_cur[3] + 1, i+1)) 94 | i += 1 95 | x_candi = torch.tensor(range_x) 96 | y_candi = torch.tensor(range_y) 97 | gridx, gridy = torch.meshgrid(x_candi, y_candi) 98 | locs = torch.stack([gridx, gridy], dim=-1).flatten(0, 1) # [N, 2] 99 | for loc in locs: 100 | cur_queries = keys[:, :, loc[1], loc[0]] 101 | queries_list.append(cur_queries) 102 | bbox_masks.append(bbox_mask) 103 | queries = torch.stack(queries_list, dim=1) # [b, n, d] 104 | bbox_masks = torch.stack(bbox_masks, dim=0)[None] # [1, n, h, w] 105 | bbox_masks_flatten = bbox_masks.flatten(-2) 106 | 107 | keys = keys / keys.norm(dim=1, keepdim=True) 108 | queries = queries / queries.norm(dim=-1, keepdim=True) 109 | attn = torch.matmul(queries, keys.view(b, c, -1)) 110 | attn = (attn - attn.min(-1, keepdim=True)[0]) / attn.max( 111 | -1, keepdim=True 112 | )[0] 113 | 114 | attn_reshape = attn.reshape(b, attn.shape[1], h, w) 115 | 116 | attn_scale = attn_reshape 117 | attn_x = attn_scale.max(dim=-2)[0] 118 | attn_y = attn_scale.max(dim=-1)[0] 119 | 120 | score_x = (attn_x * bbox_masks.max(dim=-2)[0]).sum(dim=-1) / ((attn_x + bbox_masks.max(dim=-2)[0] - attn_x * bbox_masks.max(dim=-2)[0]).sum(dim=-1) + 1e-5) 121 | score_y = (attn_y * bbox_masks.max(dim=-1)[0]).sum(dim=-1) / ((attn_y + bbox_masks.max(dim=-1)[0] - attn_y * bbox_masks.max(dim=-1)[0]).sum(dim=-1) + 1e-5) 122 | score = (score_x + score_y) / 2 123 | 124 | _, max_loc = torch.topk(score, 1, 1) 125 | attn_selected = torch.gather( 126 | attn, 1, max_loc.unsqueeze(-1).repeat(1, 1, attn.shape[-1]) 127 | ) 128 | 129 | soft_masks = attn_selected.reshape(b, attn_selected.shape[1], h, w) 130 | out_masks.append(soft_masks) 131 | else: 132 | out_masks.append(torch.zeros((b, 1, h, w)).to(x.device)) 133 | out_masks = torch.cat(out_masks, dim=1) 134 | return out_masks 135 | 136 | def forward(self, x, query_list, valid, mode="point"): 137 | if mode == "point": 138 | out_masks = self.forward_point(x, query_list, valid) 139 | elif mode == "bbox": 140 | out_masks = self.forward_bbox(x, query_list, valid) 141 | return out_masks 142 | -------------------------------------------------------------------------------- /models/position_encoding.py: -------------------------------------------------------------------------------- 1 | """ 2 | Various positional encodings for the transformer. 
3 | Modified from DETR (https://github.com/facebookresearch/detr) 4 | """ 5 | import math 6 | import torch 7 | from torch import nn 8 | 9 | from util.misc import NestedTensor 10 | 11 | # dimension == 1 12 | class PositionEmbeddingSine1D(nn.Module): 13 | """ 14 | This is a more standard version of the position embedding, very similar to the one 15 | used by the Attention is all you need paper, generalized to work on images. 16 | """ 17 | def __init__(self, num_pos_feats=256, temperature=10000, normalize=False, scale=None): 18 | super().__init__() 19 | self.num_pos_feats = num_pos_feats 20 | self.temperature = temperature 21 | self.normalize = normalize 22 | if scale is not None and normalize is False: 23 | raise ValueError("normalize should be True if scale is passed") 24 | if scale is None: 25 | scale = 2 * math.pi 26 | self.scale = scale 27 | 28 | def forward(self, tensor_list: NestedTensor): 29 | x = tensor_list.tensors # [B, C, T] 30 | mask = tensor_list.mask # [B, T] 31 | assert mask is not None 32 | not_mask = ~mask 33 | x_embed = not_mask.cumsum(1, dtype=torch.float32) # [B, T] 34 | if self.normalize: 35 | eps = 1e-6 36 | x_embed = x_embed / (x_embed[:, -1:] + eps) * self.scale 37 | 38 | dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device) 39 | dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats) 40 | 41 | pos_x = x_embed[:, :, None] / dim_t # [B, T, C] 42 | # n,c,t 43 | pos_x = torch.stack((pos_x[:, :, 0::2].sin(), pos_x[:, :, 1::2].cos()), dim=3).flatten(2) 44 | pos = pos_x.permute(0, 2, 1) # [B, C, T] 45 | return pos 46 | 47 | # dimension == 2 48 | class PositionEmbeddingSine2D(nn.Module): 49 | """ 50 | This is a more standard version of the position embedding, very similar to the one 51 | used by the Attention is all you need paper, generalized to work on images. 
52 | """ 53 | def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None): 54 | super().__init__() 55 | self.num_pos_feats = num_pos_feats 56 | self.temperature = temperature 57 | self.normalize = normalize 58 | if scale is not None and normalize is False: 59 | raise ValueError("normalize should be True if scale is passed") 60 | if scale is None: 61 | scale = 2 * math.pi 62 | self.scale = scale 63 | 64 | def forward(self, tensor_list: NestedTensor): 65 | x = tensor_list.tensors # [B, C, H, W] 66 | mask = tensor_list.mask # [B, H, W] 67 | assert mask is not None 68 | not_mask = ~mask 69 | y_embed = not_mask.cumsum(1, dtype=torch.float32) 70 | x_embed = not_mask.cumsum(2, dtype=torch.float32) 71 | if self.normalize: 72 | eps = 1e-6 73 | y_embed = (y_embed - 0.5) / (y_embed[:, -1:, :] + eps) * self.scale 74 | x_embed = (x_embed - 0.5) / (x_embed[:, :, -1:] + eps) * self.scale 75 | 76 | dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device) 77 | dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats) 78 | 79 | pos_x = x_embed[:, :, :, None] / dim_t 80 | pos_y = y_embed[:, :, :, None] / dim_t 81 | pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3) 82 | pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3) 83 | pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) 84 | return pos # [B, C, H, W] 85 | 86 | 87 | # dimension == 3 88 | class PositionEmbeddingSine3D(nn.Module): 89 | """ 90 | This is a more standard version of the position embedding, very similar to the one 91 | used by the Attention is all you need paper, generalized to work on images. 92 | """ 93 | def __init__(self, num_pos_feats=64, num_frames=36, temperature=10000, normalize=False, scale=None): 94 | super().__init__() 95 | self.num_pos_feats = num_pos_feats 96 | self.temperature = temperature 97 | self.normalize = normalize 98 | self.frames = num_frames 99 | if scale is not None and normalize is False: 100 | raise ValueError("normalize should be True if scale is passed") 101 | if scale is None: 102 | scale = 2 * math.pi 103 | self.scale = scale 104 | 105 | def forward(self, tensor_list: NestedTensor): 106 | x = tensor_list.tensors # [B*T, C, H, W] 107 | mask = tensor_list.mask # [B*T, H, W] 108 | n,h,w = mask.shape 109 | mask = mask.reshape(n//self.frames, self.frames,h,w) # [B, T, H, W] 110 | assert mask is not None 111 | not_mask = ~mask 112 | z_embed = not_mask.cumsum(1, dtype=torch.float32) # [B, T, H, W] 113 | y_embed = not_mask.cumsum(2, dtype=torch.float32) # [B, T, H, W] 114 | x_embed = not_mask.cumsum(3, dtype=torch.float32) # [B, T, H, W] 115 | if self.normalize: 116 | eps = 1e-6 117 | z_embed = z_embed / (z_embed[:, -1:, :, :] + eps) * self.scale 118 | y_embed = y_embed / (y_embed[:, :, -1:, :] + eps) * self.scale 119 | x_embed = x_embed / (x_embed[:, :, :, -1:] + eps) * self.scale 120 | 121 | dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device) # 122 | dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats) 123 | 124 | pos_x = x_embed[:, :, :, :, None] / dim_t # [B, T, H, W, c] 125 | pos_y = y_embed[:, :, :, :, None] / dim_t 126 | pos_z = z_embed[:, :, :, :, None] / dim_t 127 | pos_x = torch.stack((pos_x[:, :, :, :, 0::2].sin(), pos_x[:, :, :, :, 1::2].cos()), dim=5).flatten(4) # [B, T, H, W, c] 128 | pos_y = torch.stack((pos_y[:, :, :, :, 0::2].sin(), pos_y[:, :, :, :, 1::2].cos()), dim=5).flatten(4) 129 | pos_z = 
torch.stack((pos_z[:, :, :, :, 0::2].sin(), pos_z[:, :, :, :, 1::2].cos()), dim=5).flatten(4) 130 | pos = torch.cat((pos_z, pos_y, pos_x), dim=4).permute(0, 1, 4, 2, 3) # [B, T, C, H, W] 131 | return pos 132 | 133 | 134 | 135 | def build_position_encoding(args): 136 | # build 2D position encoding 137 | N_steps = args.hidden_dim // 2 # 256 / 2 = 128 138 | if args.position_embedding in ('v2', 'sine'): 139 | # TODO find a better way of exposing other arguments 140 | position_embedding = PositionEmbeddingSine2D(N_steps, normalize=True) 141 | else: 142 | raise ValueError(f"not supported {args.position_embedding}") 143 | 144 | return position_embedding 145 | -------------------------------------------------------------------------------- /models/ops/modules/ms_deform_attn.py: -------------------------------------------------------------------------------- 1 | # Modify for sample points visualization 2 | # ------------------------------------------------------------------------------------------------ 3 | # Deformable DETR 4 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | # ------------------------------------------------------------------------------------------------ 7 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | # ------------------------------------------------------------------------------------------------ 9 | 10 | from __future__ import absolute_import 11 | from __future__ import print_function 12 | from __future__ import division 13 | 14 | import warnings 15 | import math 16 | 17 | import torch 18 | from torch import nn 19 | import torch.nn.functional as F 20 | from torch.nn.init import xavier_uniform_, constant_ 21 | 22 | from ..functions import MSDeformAttnFunction 23 | 24 | 25 | def _is_power_of_2(n): 26 | if (not isinstance(n, int)) or (n < 0): 27 | raise ValueError("invalid input for _is_power_of_2: {} (type: {})".format(n, type(n))) 28 | return (n & (n-1) == 0) and n != 0 29 | 30 | 31 | class MSDeformAttn(nn.Module): 32 | def __init__(self, d_model=256, n_levels=4, n_heads=8, n_points=4): 33 | """ 34 | Multi-Scale Deformable Attention Module 35 | :param d_model hidden dimension 36 | :param n_levels number of feature levels 37 | :param n_heads number of attention heads 38 | :param n_points number of sampling points per attention head per feature level 39 | """ 40 | super().__init__() 41 | if d_model % n_heads != 0: 42 | raise ValueError('d_model must be divisible by n_heads, but got {} and {}'.format(d_model, n_heads)) 43 | _d_per_head = d_model // n_heads 44 | # you'd better set _d_per_head to a power of 2 which is more efficient in our CUDA implementation 45 | if not _is_power_of_2(_d_per_head): 46 | warnings.warn("You'd better set d_model in MSDeformAttn to make the dimension of each attention head a power of 2 " 47 | "which is more efficient in our CUDA implementation.") 48 | 49 | self.im2col_step = 64 50 | 51 | self.d_model = d_model 52 | self.n_levels = n_levels 53 | self.n_heads = n_heads 54 | self.n_points = n_points 55 | 56 | # res = sum(attn * W*(delta p)) 57 | self.sampling_offsets = nn.Linear(d_model, n_heads * n_levels * n_points * 2) # delta p 58 | self.attention_weights = nn.Linear(d_model, n_heads * n_levels * n_points) # attn 59 | self.value_proj = nn.Linear(d_model, d_model) 60 | self.output_proj = nn.Linear(d_model, d_model) 61 | 62 | self._reset_parameters() 63 | 64 | def _reset_parameters(self): 65 | 
constant_(self.sampling_offsets.weight.data, 0.) 66 | thetas = torch.arange(self.n_heads, dtype=torch.float32) * (2.0 * math.pi / self.n_heads) 67 | grid_init = torch.stack([thetas.cos(), thetas.sin()], -1) 68 | grid_init = (grid_init / grid_init.abs().max(-1, keepdim=True)[0]).view(self.n_heads, 1, 1, 2).repeat(1, self.n_levels, self.n_points, 1) 69 | for i in range(self.n_points): 70 | grid_init[:, :, i, :] *= i + 1 71 | with torch.no_grad(): 72 | self.sampling_offsets.bias = nn.Parameter(grid_init.view(-1)) 73 | constant_(self.attention_weights.weight.data, 0.) 74 | constant_(self.attention_weights.bias.data, 0.) 75 | xavier_uniform_(self.value_proj.weight.data) 76 | constant_(self.value_proj.bias.data, 0.) 77 | xavier_uniform_(self.output_proj.weight.data) 78 | constant_(self.output_proj.bias.data, 0.) 79 | 80 | def forward(self, query, reference_points, input_flatten, input_spatial_shapes, input_level_start_index, input_padding_mask=None): 81 | """ 82 | :param query (N, Length_{query}, C) 83 | :param reference_points (N, Length_{query}, n_levels, 2), range in [0, 1], top-left (0,0), bottom-right (1, 1), including padding area 84 | or (N, Length_{query}, n_levels, 4), add additional (w, h) to form reference boxes 85 | :param input_flatten (N, \sum_{l=0}^{L-1} H_l \cdot W_l, C) 86 | :param input_spatial_shapes (n_levels, 2), [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})] 87 | :param input_level_start_index (n_levels, ), [0, H_0*W_0, H_0*W_0+H_1*W_1, H_0*W_0+H_1*W_1+H_2*W_2, ..., H_0*W_0+H_1*W_1+...+H_{L-1}*W_{L-1}] 88 | :param input_padding_mask (N, \sum_{l=0}^{L-1} H_l \cdot W_l), True for padding elements, False for non-padding elements 89 | 90 | :return output (N, Length_{query}, C) 91 | """ 92 | N, Len_q, _ = query.shape 93 | N, Len_in, _ = input_flatten.shape 94 | assert (input_spatial_shapes[:, 0] * input_spatial_shapes[:, 1]).sum() == Len_in 95 | 96 | value = self.value_proj(input_flatten) 97 | if input_padding_mask is not None: 98 | value = value.masked_fill(input_padding_mask[..., None], float(0)) 99 | value = value.view(N, Len_in, self.n_heads, self.d_model // self.n_heads) 100 | sampling_offsets = self.sampling_offsets(query).view(N, Len_q, self.n_heads, self.n_levels, self.n_points, 2) 101 | attention_weights = self.attention_weights(query).view(N, Len_q, self.n_heads, self.n_levels * self.n_points) 102 | attention_weights = F.softmax(attention_weights, -1).view(N, Len_q, self.n_heads, self.n_levels, self.n_points) 103 | # N, Len_q, n_heads, n_levels, n_points, 2 104 | if reference_points.shape[-1] == 2: 105 | offset_normalizer = torch.stack([input_spatial_shapes[..., 1], input_spatial_shapes[..., 0]], -1) 106 | sampling_locations = reference_points[:, :, None, :, None, :] \ 107 | + sampling_offsets / offset_normalizer[None, None, None, :, None, :] 108 | elif reference_points.shape[-1] == 4: 109 | sampling_locations = reference_points[:, :, None, :, None, :2] \ 110 | + sampling_offsets / self.n_points * reference_points[:, :, None, :, None, 2:] * 0.5 111 | else: 112 | raise ValueError( 113 | 'Last dim of reference_points must be 2 or 4, but get {} instead.'.format(reference_points.shape[-1])) 114 | output = MSDeformAttnFunction.apply( 115 | value, input_spatial_shapes, input_level_start_index, sampling_locations, attention_weights, self.im2col_step) 116 | output = self.output_proj(output) 117 | 118 | return output, sampling_locations, attention_weights 119 | -------------------------------------------------------------------------------- 
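Usage note (editorial): the sketch below shows one way the MSDeformAttn module defined above can be exercised in isolation. It assumes the compiled MultiScaleDeformableAttention extension from models/ops is installed and a GPU is available, and that models/ops/modules/__init__.py re-exports MSDeformAttn as in Deformable DETR; the batch size, query count and feature-level shapes are made-up placeholders.

import torch
from models.ops.modules import MSDeformAttn  # assumes the package re-exports the class

# Two hypothetical feature levels, flattened into one sequence as the forward()
# docstring above describes: input_flatten is (N, sum_l H_l*W_l, C).
N, Len_q, C = 2, 300, 256
spatial_shapes = torch.as_tensor([[32, 32], [16, 16]], dtype=torch.long, device="cuda")
level_start_index = torch.cat((spatial_shapes.new_zeros(1),
                               (spatial_shapes[:, 0] * spatial_shapes[:, 1]).cumsum(0)[:-1]))
len_in = int((spatial_shapes[:, 0] * spatial_shapes[:, 1]).sum())

attn = MSDeformAttn(d_model=C, n_levels=2, n_heads=8, n_points=4).cuda()
query = torch.rand(N, Len_q, C, device="cuda")
input_flatten = torch.rand(N, len_in, C, device="cuda")
reference_points = torch.rand(N, Len_q, 2, 2, device="cuda")  # normalized (x, y) per level

# This modified forward() also returns the sampling locations and attention weights,
# which is what the "sample points visualization" comment at the top of the file refers to.
output, sampling_locations, attention_weights = attn(
    query, reference_points, input_flatten, spatial_shapes, level_start_index)
print(output.shape)  # torch.Size([2, 300, 256])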
/datasets/refexp.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Aishwarya Kamath & Nicolas Carion. Licensed under the Apache License 2.0. All Rights Reserved 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 3 | """ 4 | COCO dataset which returns image_id for evaluation.
5 | Mostly copy-paste from https://github.com/pytorch/vision/blob/13b35ff/references/detection/coco_utils.py 6 | """ 7 | from pathlib import Path 8 | 9 | import torch 10 | import torch.utils.data 11 | import torchvision 12 | from pycocotools import mask as coco_mask 13 | 14 | import datasets.transforms_image as T 15 | 16 | 17 | class ModulatedDetection(torchvision.datasets.CocoDetection): 18 | def __init__(self, img_folder, ann_file, transforms, return_masks): 19 | super(ModulatedDetection, self).__init__(img_folder, ann_file) 20 | self._transforms = transforms 21 | self.prepare = ConvertCocoPolysToMask(return_masks) 22 | 23 | def __getitem__(self, idx): 24 | instance_check = False 25 | while not instance_check: 26 | img, target = super(ModulatedDetection, self).__getitem__(idx) 27 | image_id = self.ids[idx] 28 | coco_img = self.coco.loadImgs(image_id)[0] 29 | caption = coco_img["caption"] 30 | dataset_name = coco_img["dataset_name"] if "dataset_name" in coco_img else None 31 | target = {"image_id": image_id, "annotations": target, "caption": caption} 32 | img, target = self.prepare(img, target) 33 | if self._transforms is not None: 34 | img, target = self._transforms(img, target) 35 | target["dataset_name"] = dataset_name 36 | for extra_key in ["sentence_id", "original_img_id", "original_id", "task_id"]: 37 | if extra_key in coco_img: 38 | target[extra_key] = coco_img[extra_key] # box xyxy -> cxcywh 39 | # FIXME: handle "valid", since some box may be removed due to random crop 40 | target["valid"] = torch.tensor([1]) if len(target["area"]) != 0 else torch.tensor([0]) 41 | 42 | if torch.any(target['valid'] == 1): # at least one instance 43 | instance_check = True 44 | else: 45 | import random 46 | idx = random.randint(0, self.__len__() - 1) 47 | return img.unsqueeze(0), target 48 | # return img: [1, 3, H, W], the first dimension means T = 1.
49 | 50 | 51 | def convert_coco_poly_to_mask(segmentations, height, width): 52 | masks = [] 53 | for polygons in segmentations: 54 | rles = coco_mask.frPyObjects(polygons, height, width) 55 | mask = coco_mask.decode(rles) 56 | if len(mask.shape) < 3: 57 | mask = mask[..., None] 58 | mask = torch.as_tensor(mask, dtype=torch.uint8) 59 | mask = mask.any(dim=2) 60 | masks.append(mask) 61 | if masks: 62 | masks = torch.stack(masks, dim=0) 63 | else: 64 | masks = torch.zeros((0, height, width), dtype=torch.uint8) 65 | return masks 66 | 67 | 68 | class ConvertCocoPolysToMask(object): 69 | def __init__(self, return_masks=False): 70 | self.return_masks = return_masks 71 | 72 | def __call__(self, image, target): 73 | w, h = image.size 74 | 75 | image_id = target["image_id"] 76 | image_id = torch.tensor([image_id]) 77 | 78 | anno = target["annotations"] 79 | caption = target["caption"] if "caption" in target else None 80 | 81 | anno = [obj for obj in anno if "iscrowd" not in obj or obj["iscrowd"] == 0] 82 | 83 | boxes = [obj["bbox"] for obj in anno] 84 | # guard against no boxes via resizing 85 | boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4) 86 | boxes[:, 2:] += boxes[:, :2] # xminyminwh -> xyxy 87 | boxes[:, 0::2].clamp_(min=0, max=w) 88 | boxes[:, 1::2].clamp_(min=0, max=h) 89 | 90 | classes = [obj["category_id"] for obj in anno] 91 | classes = torch.tensor(classes, dtype=torch.int64) 92 | 93 | if self.return_masks: 94 | segmentations = [obj["segmentation"] for obj in anno] 95 | masks = convert_coco_poly_to_mask(segmentations, h, w) 96 | 97 | # keep the valid boxes 98 | keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0]) 99 | boxes = boxes[keep] 100 | classes = classes[keep] 101 | if self.return_masks: 102 | masks = masks[keep] 103 | 104 | target = {} 105 | target["boxes"] = boxes 106 | target["labels"] = classes 107 | if caption is not None: 108 | target["caption"] = caption 109 | if self.return_masks: 110 | target["masks"] = masks 111 | target["image_id"] = image_id 112 | 113 | # for conversion to coco api 114 | area = torch.tensor([obj["area"] for obj in anno]) 115 | iscrowd = torch.tensor([obj["iscrowd"] if "iscrowd" in obj else 0 for obj in anno]) 116 | target["area"] = area[keep] 117 | target["iscrowd"] = iscrowd[keep] 118 | target["valid"] = torch.tensor([1]) 119 | target["orig_size"] = torch.as_tensor([int(h), int(w)]) 120 | target["size"] = torch.as_tensor([int(h), int(w)]) 121 | return image, target 122 | 123 | 124 | def make_coco_transforms(image_set, cautious): 125 | 126 | normalize = T.Compose([T.ToTensor(), T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])]) 127 | 128 | scales = [480, 512, 544, 576, 608, 640, 672, 704, 736, 768] 129 | final_scales = [296, 328, 360, 392, 416, 448, 480, 512] 130 | 131 | max_size = 800 132 | if image_set == "train": 133 | horizontal = [] if cautious else [T.RandomHorizontalFlip()] 134 | return T.Compose( 135 | horizontal 136 | + [ 137 | T.RandomSelect( 138 | T.RandomResize(scales, max_size=max_size), 139 | T.Compose( 140 | [ 141 | T.RandomResize([400, 500, 600]), 142 | T.RandomSizeCrop(384, 600, respect_boxes=cautious), 143 | T.RandomResize(final_scales, max_size=640), 144 | ] 145 | ), 146 | ), 147 | normalize, 148 | ] 149 | ) 150 | 151 | if image_set == "val": 152 | return T.Compose( 153 | [ 154 | T.RandomResize([360], max_size=640), 155 | normalize, 156 | ] 157 | ) 158 | 159 | raise ValueError(f"unknown {image_set}") 160 | 161 | 162 | def build(dataset_file, image_set, args): 163 | root = Path(args.coco_path) 
164 | assert root.exists(), f"provided COCO path {root} does not exist" 165 | mode = "instances" 166 | dataset = dataset_file 167 | PATHS = { 168 | "train": (root / "train2014", root / dataset / f"{mode}_{dataset}_train.json"), 169 | "val": (root / "train2014", root / dataset / f"{mode}_{dataset}_val.json"), 170 | } 171 | 172 | img_folder, ann_file = PATHS[image_set] 173 | dataset = ModulatedDetection( 174 | img_folder, 175 | ann_file, 176 | transforms=make_coco_transforms(image_set, False), 177 | return_masks=args.masks, 178 | ) 179 | return dataset --------------------------------------------------------------------------------
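Usage note (editorial): a minimal sketch of how the build() entry point in datasets/refexp.py above might be called. The argparse namespace, dataset name and paths are placeholders, not the project's actual training arguments; real COCO/RefCOCO images and annotation files must exist under args.coco_path for the loader to work.

from argparse import Namespace
from datasets.refexp import build

# Hypothetical arguments; build() only reads coco_path and masks here.
args = Namespace(coco_path="data/coco", masks=True)
# With dataset_file="refcoco", build() looks for data/coco/train2014 images and
# data/coco/refcoco/instances_refcoco_train.json annotations (see PATHS above).
dataset = build("refcoco", "train", args)

img, target = dataset[0]
print(img.shape)              # [1, 3, H, W]; T = 1, as noted in ModulatedDetection.__getitem__
print(sorted(target.keys()))  # e.g. boxes, caption, labels, masks, size, valid, ...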