├── .gitignore ├── Adabins ├── README.md ├── dataloader.py ├── demo.py ├── evaluate.py ├── infer.py ├── loss.py ├── model_io.py ├── models │ ├── __init__.py │ ├── layers.py │ ├── miniViT.py │ └── unet_adaptive_bins.py ├── train.py └── utils.py ├── DPT ├── README.md ├── dpt │ ├── __init__.py │ ├── base_model.py │ ├── blocks.py │ ├── midas_net.py │ ├── models.py │ ├── transforms.py │ └── vit.py ├── requirements.txt ├── run_monodepth.py └── util │ ├── __init__.py │ ├── io.py │ ├── misc.py │ └── pallete.py ├── Decompose ├── README.md ├── dataloader.py ├── demo.py ├── eval.py ├── evaluation.py ├── models │ ├── ViT.py │ ├── models.py │ └── upsample.py ├── split_files.txt └── utils.py ├── DepthAnything ├── LICENSE ├── README.md ├── app.py ├── controlnet │ ├── README.md │ └── config.json ├── depth_anything │ ├── blocks.py │ ├── dpt.py │ └── util │ │ └── transform.py ├── metric_depth │ ├── checkpoints │ │ └── .placeholder │ ├── demo.py │ ├── environment.yml │ ├── evaluate.py │ ├── train_mix.py │ ├── train_mono.py │ ├── train_test_inputs │ │ ├── kitti_eigen_test_files_with_gt.txt │ │ ├── kitti_eigen_train_files_with_gt.txt │ │ ├── nyudepthv2_test_files_with_gt.txt │ │ └── nyudepthv2_train_files_with_gt.txt │ └── zoedepth │ │ ├── data │ │ ├── __init__.py │ │ ├── data_mono.py │ │ ├── ddad.py │ │ ├── diml_indoor_test.py │ │ ├── diml_outdoor_test.py │ │ ├── diode.py │ │ ├── hypersim.py │ │ ├── ibims.py │ │ ├── preprocess.py │ │ ├── sun_rgbd_loader.py │ │ ├── transforms.py │ │ ├── vkitti.py │ │ └── vkitti2.py │ │ ├── models │ │ ├── __init__.py │ │ ├── base_models │ │ │ ├── __init__.py │ │ │ ├── depth_anything.py │ │ │ ├── dpt_dinov2 │ │ │ │ ├── blocks.py │ │ │ │ └── dpt.py │ │ │ └── midas.py │ │ ├── builder.py │ │ ├── depth_model.py │ │ ├── layers │ │ │ ├── attractor.py │ │ │ ├── dist_layers.py │ │ │ ├── localbins_layers.py │ │ │ └── patch_transformer.py │ │ ├── model_io.py │ │ ├── zoedepth │ │ │ ├── __init__.py │ │ │ ├── config_zoedepth.json │ │ │ ├── config_zoedepth_kitti.json │ │ │ └── zoedepth_v1.py │ │ └── zoedepth_nk │ │ │ ├── __init__.py │ │ │ ├── config_zoedepth_nk.json │ │ │ └── zoedepth_nk_v1.py │ │ ├── trainers │ │ ├── base_trainer.py │ │ ├── builder.py │ │ ├── loss.py │ │ ├── zoedepth_nk_trainer.py │ │ └── zoedepth_trainer.py │ │ └── utils │ │ ├── __init__.py │ │ ├── arg_utils.py │ │ ├── config.py │ │ ├── easydict │ │ └── __init__.py │ │ ├── geometry.py │ │ └── misc.py ├── requirements.txt ├── run.py ├── run_video.py ├── semseg │ ├── README.md │ ├── config │ │ └── depth_anything │ │ │ ├── depth_anything_large_mask2former_16xb1_160k_ade20k_896x896.py │ │ │ ├── depth_anything_large_mask2former_16xb1_80k_cityscapes_896x896.py │ │ │ └── depth_anything_large_mask2former_16xb1_80k_cityscapes_896x896_ms.py │ └── dinov2.py └── torchhub │ ├── README.md │ └── facebookresearch_dinov2_main │ ├── CODE_OF_CONDUCT.md │ ├── CONTRIBUTING.md │ ├── LICENSE │ ├── MODEL_CARD.md │ ├── README.md │ ├── conda.yaml │ ├── dinov2 │ ├── __init__.py │ ├── configs │ │ ├── __init__.py │ │ ├── eval │ │ │ ├── vitb14_pretrain.yaml │ │ │ ├── vitg14_pretrain.yaml │ │ │ ├── vitl14_pretrain.yaml │ │ │ └── vits14_pretrain.yaml │ │ ├── ssl_default_config.yaml │ │ └── train │ │ │ ├── vitg14.yaml │ │ │ ├── vitl14.yaml │ │ │ └── vitl16_short.yaml │ ├── data │ │ ├── __init__.py │ │ ├── adapters.py │ │ ├── augmentations.py │ │ ├── collate.py │ │ ├── datasets │ │ │ ├── __init__.py │ │ │ ├── decoders.py │ │ │ ├── extended.py │ │ │ ├── image_net.py │ │ │ └── image_net_22k.py │ │ ├── loaders.py │ │ ├── masking.py │ │ ├── samplers.py │ │ └── 
transforms.py │ ├── distributed │ │ └── __init__.py │ ├── eval │ │ ├── __init__.py │ │ ├── knn.py │ │ ├── linear.py │ │ ├── log_regression.py │ │ ├── metrics.py │ │ ├── setup.py │ │ └── utils.py │ ├── fsdp │ │ └── __init__.py │ ├── layers │ │ ├── __init__.py │ │ ├── attention.py │ │ ├── block.py │ │ ├── dino_head.py │ │ ├── drop_path.py │ │ ├── layer_scale.py │ │ ├── mlp.py │ │ ├── patch_embed.py │ │ └── swiglu_ffn.py │ ├── logging │ │ ├── __init__.py │ │ └── helpers.py │ ├── loss │ │ ├── __init__.py │ │ ├── dino_clstoken_loss.py │ │ ├── ibot_patch_loss.py │ │ └── koleo_loss.py │ ├── models │ │ ├── __init__.py │ │ └── vision_transformer.py │ ├── run │ │ ├── __init__.py │ │ ├── eval │ │ │ ├── knn.py │ │ │ ├── linear.py │ │ │ └── log_regression.py │ │ ├── submit.py │ │ └── train │ │ │ └── train.py │ ├── train │ │ ├── __init__.py │ │ ├── ssl_meta_arch.py │ │ └── train.py │ └── utils │ │ ├── __init__.py │ │ ├── cluster.py │ │ ├── config.py │ │ ├── dtype.py │ │ ├── param_groups.py │ │ └── utils.py │ ├── hubconf.py │ ├── pyproject.toml │ ├── requirements-dev.txt │ ├── requirements.txt │ ├── scripts │ └── lint.sh │ ├── setup.cfg │ ├── setup.py │ ├── utils.py │ └── vision_transformer.py ├── DistDepth ├── README.md ├── demo.py ├── layers.py ├── networks │ ├── __init__.py │ ├── depth_decoder.py │ ├── pose_decoder.py │ └── resnet_encoder.py ├── options.py ├── split_files.txt └── utils.py ├── GLPDepth ├── README.md └── demo_glpn.py ├── InSpaceType_meta.csv ├── IronDepth ├── README.md ├── data │ ├── dataloader_custom.py │ └── dataloader_custom_rev.py ├── models │ ├── IronDepth.py │ └── submodules │ │ ├── DNET.py │ │ ├── D_submodules.py │ │ └── Dr_submodules.py ├── models_normal │ ├── NNET.py │ └── submodules.py ├── requirements.txt ├── test.py └── utils │ └── utils.py ├── MIM ├── README.md ├── configs │ ├── base_options.py │ ├── test_options.py │ └── train_options.py ├── dataset │ ├── base_dataset.py │ ├── filenames │ │ ├── eigen_benchmark │ │ │ ├── test_list.txt │ │ │ └── train_list.txt │ │ └── nyudepthv2 │ │ │ ├── split_files.txt │ │ │ ├── test_list.txt │ │ │ └── train_list.txt │ ├── imagepath.py │ ├── kitti.py │ └── nyudepthv2.py ├── models │ ├── checkpoint.py │ ├── model.py │ ├── optimizer.py │ └── swin_transformer_v2.py ├── requirements.txt ├── test.py └── utils │ ├── criterion.py │ ├── extract_official_train_test_set_from_mat.py │ ├── logging.py │ └── metrics.py ├── NeWCRFs ├── README.md ├── configs │ ├── arguments_eval_kittieigen.txt │ ├── arguments_eval_nyu.txt │ ├── arguments_train_kittieigen.txt │ └── arguments_train_nyu.txt ├── data_splits │ ├── eigen_test_files_with_gt.txt │ ├── eigen_train_files_with_gt.txt │ ├── kitti_depth_prediction_train.txt │ ├── kitti_official_test.txt │ ├── kitti_official_valid.txt │ ├── nyudepthv2_test_files_with_gt.txt │ ├── nyudepthv2_train_files_with_gt_dense.txt │ ├── split_files.txt │ └── test.txt └── newcrfs │ ├── dataloaders │ ├── __init__.py │ ├── dataloader.py │ └── dataloader_kittipred.py │ ├── demo.py │ ├── eval.py │ ├── networks │ ├── NewCRFDepth.py │ ├── __init__.py │ ├── newcrf_layers.py │ ├── newcrf_utils.py │ ├── swin_transformer.py │ └── uper_crf_head.py │ ├── test.py │ ├── train.py │ └── utils.py ├── PixelFormer ├── README.md ├── configs │ ├── arguments_eval_kittieigen.txt │ ├── arguments_eval_nyu.txt │ ├── arguments_train_kittieigen.txt │ └── arguments_train_nyu.txt ├── data_splits │ ├── eigen_test_files_with_gt.txt │ ├── eigen_train_files_with_gt.txt │ ├── kitti_depth_prediction_train.txt │ ├── kitti_official_test.txt │ ├── 
kitti_official_valid.txt │ ├── nyudepthv2_test_files_with_gt.txt │ ├── nyudepthv2_train_files_with_gt_dense.txt │ ├── split_files.txt │ └── test.txt └── pixelformer │ ├── dataloaders │ ├── __init__.py │ ├── dataloader.py │ └── dataloader_kittipred.py │ ├── demo.py │ ├── eval.py │ ├── load.py │ ├── networks │ ├── PQI.py │ ├── PixelFormer.py │ ├── SAM.py │ ├── __init__.py │ ├── swin_transformer.py │ └── utils.py │ ├── test.py │ ├── train.py │ └── utils.py ├── README.md ├── Unidepth ├── LICENSE ├── README.md ├── assets │ ├── demo │ │ ├── depth.png │ │ ├── intrinsics.npy │ │ ├── output.png │ │ └── rgb.png │ └── docs │ │ ├── nuscenes_surround.gif │ │ ├── theoffice.gif │ │ └── unidepth-banner.png ├── configs │ ├── config_v1_cnvnxtl.json │ └── config_v1_vitl14.json ├── demo.py ├── hubconf.py ├── install.sh ├── pyproject.toml ├── requirements.txt └── unidepth │ ├── __init__.py │ ├── layers │ ├── __init__.py │ ├── activation.py │ ├── attention.py │ ├── convnext.py │ ├── drop_path.py │ ├── layer_scale.py │ ├── mlp.py │ ├── nystrom_attention.py │ ├── positional_encoding.py │ └── upsample.py │ ├── models │ ├── __init__.py │ ├── backbones │ │ ├── __init__.py │ │ ├── convnext.py │ │ ├── convnext2.py │ │ ├── dinov2.py │ │ └── metadinov2 │ │ │ ├── __init__.py │ │ │ ├── attention.py │ │ │ ├── block.py │ │ │ ├── dino_head.py │ │ │ ├── drop_path.py │ │ │ ├── layer_scale.py │ │ │ ├── mlp.py │ │ │ ├── patch_embed.py │ │ │ └── swiglu_ffn.py │ ├── encoder.py │ └── unidepthv1 │ │ ├── __init__.py │ │ ├── decoder.py │ │ └── unidepthv1.py │ ├── ops │ ├── __init__.py │ ├── losses.py │ └── scheduler.py │ └── utils │ ├── __init__.py │ ├── constants.py │ ├── distributed.py │ ├── ema_torch.py │ ├── evaluation_depth.py │ ├── geometric.py │ ├── misc.py │ ├── positional_embedding.py │ ├── sht.py │ └── visualization.py ├── VPD ├── LICENSE ├── README.md ├── depth │ ├── README.md │ ├── configs │ │ ├── base_options.py │ │ ├── test_options.py │ │ └── train_options.py │ ├── dataset │ │ ├── base_dataset.py │ │ ├── filenames │ │ │ └── nyudepthv2 │ │ │ │ ├── split_files.txt │ │ │ │ └── split_files_sml.txt │ │ ├── imagepath.py │ │ └── nyudepthv2.py │ ├── dump_nyu_text_embeddings.py │ ├── extract_official_train_test_set_from_mat.py │ ├── models_depth │ │ ├── checkpoint.py │ │ ├── model.py │ │ └── optimizer.py │ ├── nyu_class_embeddings.pth │ ├── nyu_class_list.json │ ├── requirements.txt │ ├── splits.mat │ ├── src │ │ └── clip │ │ │ ├── .gitignore │ │ │ ├── LICENSE │ │ │ ├── MANIFEST.in │ │ │ ├── README.md │ │ │ ├── clip │ │ │ ├── __init__.py │ │ │ ├── bpe_simple_vocab_16e6.txt.gz │ │ │ ├── clip.py │ │ │ ├── model.py │ │ │ └── simple_tokenizer.py │ │ │ ├── data │ │ │ ├── country211.md │ │ │ ├── prompts.md │ │ │ ├── rendered-sst2.md │ │ │ └── yfcc100m.md │ │ │ ├── hubconf.py │ │ │ ├── model-card.md │ │ │ ├── notebooks │ │ │ ├── Interacting_with_CLIP.ipynb │ │ │ └── Prompt_Engineering_for_ImageNet.ipynb │ │ │ ├── requirements.txt │ │ │ ├── setup.py │ │ │ └── tests │ │ │ └── test_consistency.py │ ├── test.py │ ├── test.sh │ ├── train.py │ ├── train.sh │ ├── utils.py │ ├── utils_depth │ │ ├── criterion.py │ │ ├── logging.py │ │ └── metrics.py │ └── v1-inference.yaml ├── refer │ ├── README.md │ ├── args.py │ ├── data │ │ └── dataset_refer_clip.py │ ├── models_refer │ │ ├── __init__.py │ │ └── model.py │ ├── refer │ │ ├── LICENSE │ │ ├── Makefile │ │ ├── README.md │ │ ├── evaluation │ │ │ ├── __init__.py │ │ │ ├── bleu │ │ │ │ ├── LICENSE │ │ │ │ ├── __init__.py │ │ │ │ ├── bleu.py │ │ │ │ └── bleu_scorer.py │ │ │ ├── cider │ │ │ │ ├── 
__init__.py │ │ │ │ ├── cider.py │ │ │ │ └── cider_scorer.py │ │ │ ├── meteor │ │ │ │ ├── __init__.py │ │ │ │ └── meteor.py │ │ │ ├── readme.txt │ │ │ ├── refEvaluation.py │ │ │ ├── rouge │ │ │ │ ├── __init__.py │ │ │ │ └── rouge.py │ │ │ └── tokenizer │ │ │ │ ├── __init__.py │ │ │ │ ├── ptbtokenizer.py │ │ │ │ └── stanford-corenlp-3.4.1.jar │ │ ├── external │ │ │ ├── README.md │ │ │ ├── __init__.py │ │ │ ├── _mask.pyx │ │ │ ├── mask.py │ │ │ ├── maskApi.c │ │ │ └── maskApi.h │ │ ├── pyEvalDemo.ipynb │ │ ├── pyReferDemo.ipynb │ │ ├── refer.py │ │ ├── setup.py │ │ └── test │ │ │ ├── sample_expressions_testA.json │ │ │ └── sample_expressions_testB.json │ ├── requirements.txt │ ├── test.py │ ├── test.sh │ ├── train.py │ ├── train.sh │ ├── transforms.py │ ├── utils.py │ └── v1-inference.yaml ├── segmentation │ ├── README.md │ ├── class_embeddings.pth │ ├── configs │ │ ├── _base_ │ │ │ ├── datasets │ │ │ │ └── ade20k_vpd.py │ │ │ ├── default_runtime.py │ │ │ ├── models │ │ │ │ └── fpn_r50.py │ │ │ └── schedules │ │ │ │ ├── schedule_160k.py │ │ │ │ └── schedule_80k.py │ │ └── fpn_vpd_sd1-5_512x512_gpu8x2.py │ ├── dist_test.sh │ ├── dist_train.sh │ ├── models │ │ ├── __init__.py │ │ └── vpd_seg.py │ ├── test.py │ └── train.py ├── stable-diffusion │ ├── LICENSE │ ├── README.md │ ├── Stable_Diffusion_v1_Model_Card.md │ ├── configs │ │ ├── autoencoder │ │ │ ├── autoencoder_kl_16x16x16.yaml │ │ │ ├── autoencoder_kl_32x32x4.yaml │ │ │ ├── autoencoder_kl_64x64x3.yaml │ │ │ └── autoencoder_kl_8x8x64.yaml │ │ ├── latent-diffusion │ │ │ ├── celebahq-ldm-vq-4.yaml │ │ │ ├── cin-ldm-vq-f8.yaml │ │ │ ├── cin256-v2.yaml │ │ │ ├── ffhq-ldm-vq-4.yaml │ │ │ ├── lsun_bedrooms-ldm-vq-4.yaml │ │ │ ├── lsun_churches-ldm-kl-8.yaml │ │ │ └── txt2img-1p4B-eval.yaml │ │ ├── retrieval-augmented-diffusion │ │ │ └── 768x768.yaml │ │ └── stable-diffusion │ │ │ └── v1-inference.yaml │ ├── environment.yaml │ ├── ldm │ │ ├── data │ │ │ ├── __init__.py │ │ │ ├── base.py │ │ │ ├── imagenet.py │ │ │ └── lsun.py │ │ ├── lr_scheduler.py │ │ ├── models │ │ │ ├── autoencoder.py │ │ │ └── diffusion │ │ │ │ ├── __init__.py │ │ │ │ ├── classifier.py │ │ │ │ ├── ddim.py │ │ │ │ ├── ddpm.py │ │ │ │ ├── dpm_solver │ │ │ │ ├── __init__.py │ │ │ │ ├── dpm_solver.py │ │ │ │ └── sampler.py │ │ │ │ └── plms.py │ │ ├── modules │ │ │ ├── attention.py │ │ │ ├── diffusionmodules │ │ │ │ ├── __init__.py │ │ │ │ ├── model.py │ │ │ │ ├── openaimodel.py │ │ │ │ └── util.py │ │ │ ├── distributions │ │ │ │ ├── __init__.py │ │ │ │ └── distributions.py │ │ │ ├── ema.py │ │ │ ├── encoders │ │ │ │ ├── __init__.py │ │ │ │ └── modules.py │ │ │ ├── image_degradation │ │ │ │ ├── __init__.py │ │ │ │ ├── bsrgan.py │ │ │ │ ├── bsrgan_light.py │ │ │ │ ├── utils │ │ │ │ │ └── test.png │ │ │ │ └── utils_image.py │ │ │ ├── losses │ │ │ │ ├── __init__.py │ │ │ │ ├── contperceptual.py │ │ │ │ └── vqperceptual.py │ │ │ └── x_transformer.py │ │ └── util.py │ ├── main.py │ ├── models │ │ ├── first_stage_models │ │ │ ├── kl-f16 │ │ │ │ └── config.yaml │ │ │ ├── kl-f32 │ │ │ │ └── config.yaml │ │ │ ├── kl-f4 │ │ │ │ └── config.yaml │ │ │ ├── kl-f8 │ │ │ │ └── config.yaml │ │ │ ├── vq-f16 │ │ │ │ └── config.yaml │ │ │ ├── vq-f4-noattn │ │ │ │ └── config.yaml │ │ │ ├── vq-f4 │ │ │ │ └── config.yaml │ │ │ ├── vq-f8-n256 │ │ │ │ └── config.yaml │ │ │ └── vq-f8 │ │ │ │ └── config.yaml │ │ └── ldm │ │ │ ├── bsr_sr │ │ │ └── config.yaml │ │ │ ├── celeba256 │ │ │ └── config.yaml │ │ │ ├── cin256 │ │ │ └── config.yaml │ │ │ ├── ffhq256 │ │ │ └── config.yaml │ │ │ ├── inpainting_big │ 
│ │ └── config.yaml │ │ │ ├── layout2img-openimages256 │ │ │ └── config.yaml │ │ │ ├── lsun_beds256 │ │ │ └── config.yaml │ │ │ ├── lsun_churches256 │ │ │ └── config.yaml │ │ │ ├── semantic_synthesis256 │ │ │ └── config.yaml │ │ │ ├── semantic_synthesis512 │ │ │ └── config.yaml │ │ │ └── text2img256 │ │ │ └── config.yaml │ ├── notebook_helpers.py │ ├── scripts │ │ ├── download_first_stages.sh │ │ ├── download_models.sh │ │ ├── img2img.py │ │ ├── inpaint.py │ │ ├── knn2img.py │ │ ├── sample_diffusion.py │ │ ├── tests │ │ │ └── test_watermark.py │ │ ├── train_searcher.py │ │ └── txt2img.py │ ├── setup.py │ └── src │ │ └── taming-transformers │ │ ├── License.txt │ │ ├── README.md │ │ ├── configs │ │ ├── coco_cond_stage.yaml │ │ ├── coco_scene_images_transformer.yaml │ │ ├── custom_vqgan.yaml │ │ ├── drin_transformer.yaml │ │ ├── faceshq_transformer.yaml │ │ ├── faceshq_vqgan.yaml │ │ ├── imagenet_vqgan.yaml │ │ ├── imagenetdepth_vqgan.yaml │ │ ├── open_images_scene_images_transformer.yaml │ │ └── sflckr_cond_stage.yaml │ │ ├── environment.yaml │ │ ├── main.py │ │ ├── scripts │ │ ├── extract_depth.py │ │ ├── extract_segmentation.py │ │ ├── extract_submodel.py │ │ ├── make_samples.py │ │ ├── make_scene_samples.py │ │ ├── sample_conditional.py │ │ └── sample_fast.py │ │ ├── setup.py │ │ └── taming │ │ ├── data │ │ ├── ade20k.py │ │ ├── annotated_objects_coco.py │ │ ├── annotated_objects_dataset.py │ │ ├── annotated_objects_open_images.py │ │ ├── base.py │ │ ├── coco.py │ │ ├── conditional_builder │ │ │ ├── objects_bbox.py │ │ │ ├── objects_center_points.py │ │ │ └── utils.py │ │ ├── custom.py │ │ ├── faceshq.py │ │ ├── helper_types.py │ │ ├── image_transforms.py │ │ ├── imagenet.py │ │ ├── open_images_helper.py │ │ ├── sflckr.py │ │ └── utils.py │ │ ├── lr_scheduler.py │ │ ├── models │ │ ├── cond_transformer.py │ │ ├── dummy_cond_stage.py │ │ └── vqgan.py │ │ ├── modules │ │ ├── diffusionmodules │ │ │ └── model.py │ │ ├── discriminator │ │ │ └── model.py │ │ ├── losses │ │ │ ├── __init__.py │ │ │ ├── lpips.py │ │ │ ├── segmentation.py │ │ │ └── vqperceptual.py │ │ ├── misc │ │ │ └── coord.py │ │ ├── transformer │ │ │ ├── mingpt.py │ │ │ └── permuter.py │ │ ├── util.py │ │ └── vqvae │ │ │ └── quantize.py │ │ └── util.py └── vpd │ ├── __init__.py │ └── models.py ├── ZoeDepth ├── LICENSE ├── README.md ├── assets │ └── zoedepth-teaser.png ├── demo.py ├── demo_local.py ├── environment.yml ├── evaluate.py ├── hubconf.py ├── notebooks │ └── ZoeDepth_quickstart.ipynb ├── sanity.py ├── sanity_hub.py ├── split_files.txt ├── train_mix.py ├── train_mono.py ├── train_test_inputs │ ├── kitti_eigen_test_files_with_gt.txt │ ├── kitti_eigen_train_files_with_gt.txt │ ├── nyudepthv2_test_files_with_gt.txt │ └── nyudepthv2_train_files_with_gt.txt ├── ui │ ├── app.py │ ├── gradio_depth_pred.py │ ├── gradio_im_to_3d.py │ ├── gradio_pano_to_3d.py │ └── ui_requirements.txt └── zoedepth │ ├── data │ ├── __init__.py │ ├── data_mono.py │ ├── ddad.py │ ├── diml_indoor_test.py │ ├── diml_outdoor_test.py │ ├── diode.py │ ├── hypersim.py │ ├── ibims.py │ ├── preprocess.py │ ├── sun_rgbd_loader.py │ ├── transforms.py │ ├── vkitti.py │ └── vkitti2.py │ ├── models │ ├── __init__.py │ ├── base_models │ │ ├── __init__.py │ │ └── midas.py │ ├── builder.py │ ├── depth_model.py │ ├── layers │ │ ├── attractor.py │ │ ├── dist_layers.py │ │ ├── localbins_layers.py │ │ └── patch_transformer.py │ ├── model_io.py │ ├── zoedepth │ │ ├── __init__.py │ │ ├── config_zoedepth.json │ │ ├── config_zoedepth_kitti.json │ │ └── zoedepth_v1.py │ └── 
zoedepth_nk │ │ ├── __init__.py │ │ ├── config_zoedepth_nk.json │ │ └── zoedepth_nk_v1.py │ ├── trainers │ ├── base_trainer.py │ ├── builder.py │ ├── loss.py │ ├── zoedepth_nk_trainer.py │ └── zoedepth_trainer.py │ └── utils │ ├── __init__.py │ ├── arg_utils.py │ ├── config.py │ ├── easydict │ └── __init__.py │ ├── geometry.py │ └── misc.py ├── bts ├── README.md ├── pytorch │ ├── bts.py │ ├── bts_dataloader.py │ ├── bts_eval.py │ ├── bts_live_3d.py │ ├── bts_main.py │ ├── bts_test.py │ ├── distributed_sampler_no_evenly_divisible.py │ ├── run_bts_eval_schedule.py │ └── run_bts_live_3d.sh ├── train_test_inputs │ ├── eigen_test_files_with_gt.txt │ ├── eigen_train_files_with_gt.txt │ ├── nyudepthv2_test_files_with_gt.txt │ ├── nyudepthv2_train_files_with_gt.txt │ └── split_files.txt └── utils │ ├── download_from_gdrive.py │ ├── eval_with_pngs.py │ ├── extract_official_train_test_set_from_mat.py │ ├── kitti_archives_to_download.txt │ ├── nyudepthv2_archives_to_download.txt │ ├── splits.mat │ ├── sync_project_frames_multi_threads.m │ └── train_scenes.txt ├── pics ├── dataset-1.png ├── dataset-2.png ├── fitting.png ├── group.png ├── heirarchy.png ├── mitigation.png ├── overall.png └── type.png └── space_type_def.yml
/Adabins/README.md:
--------------------------------------------------------------------------------
1 | # Benchmark on paper AdaBins: Depth Estimation using Adaptive Bins
2 | 
3 | 1. Download the InSpaceType eval set. Install [torch and torchvision](https://pytorch.org/get-started/previous-versions/) and the packages matplotlib, tqdm, pandas, opencv-python, and tensorboardX.
4 | 
5 | 2. Download the pretrained model 'AdaBins_nyu.pt' from the [Official Link](https://drive.google.com/drive/folders/1nYyaQXOBjNdUJDsmJpcRpu6oE55aQoLA?usp=sharing) and put it under the folder 'pretrained'.
6 | 
7 | 3. Run:
8 | 
9 | ```
10 | python demo.py -i ../InSpaceType
11 | ```
12 | 
13 | The command generates report files for each hierarchy level (H0-H2). *-all means overall, H0-H2 means the hierarchy level, and H1_xx means the scene space type number. See [space_type_def.yml](https://github.com/DepthComputation/InSpaceType_Benchmark/blob/main/space_type_def.yml) for the space type number definitions.
14 | 
--------------------------------------------------------------------------------
/Adabins/models/__init__.py:
--------------------------------------------------------------------------------
1 | from .unet_adaptive_bins import UnetAdaptiveBins
2 | 
--------------------------------------------------------------------------------
/Adabins/models/layers.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | 
4 | 
5 | class PatchTransformerEncoder(nn.Module):
6 |     def __init__(self, in_channels, patch_size=10, embedding_dim=128, num_heads=4):
7 |         super(PatchTransformerEncoder, self).__init__()
8 |         encoder_layers = nn.TransformerEncoderLayer(embedding_dim, num_heads, dim_feedforward=1024)
9 |         self.transformer_encoder = nn.TransformerEncoder(encoder_layers, num_layers=4)  # takes shape S,N,E
10 | 
11 |         self.embedding_convPxP = nn.Conv2d(in_channels, embedding_dim,
12 |                                            kernel_size=patch_size, stride=patch_size, padding=0)
13 | 
14 |         self.positional_encodings = nn.Parameter(torch.rand(500, embedding_dim), requires_grad=True)
15 | 
16 |     def forward(self, x):
17 |         embeddings = self.embedding_convPxP(x).flatten(2)  # .shape = n,c,s = n, embedding_dim, s
18 |         # embeddings = nn.functional.pad(embeddings, (1,0))  # extra special token at start ?
19 |         embeddings = embeddings + self.positional_encodings[:embeddings.shape[2], :].T.unsqueeze(0)
20 | 
21 |         # change to S,N,E format required by transformer
22 |         embeddings = embeddings.permute(2, 0, 1)
23 |         x = self.transformer_encoder(embeddings)  # .shape = S, N, E
24 |         return x
25 | 
26 | 
27 | class PixelWiseDotProduct(nn.Module):
28 |     def __init__(self):
29 |         super(PixelWiseDotProduct, self).__init__()
30 | 
31 |     def forward(self, x, K):
32 |         n, c, h, w = x.size()
33 |         _, cout, ck = K.size()
34 |         assert c == ck, "Number of channels in x and Embedding dimension (at dim 2) of K matrix must match"
35 |         y = torch.matmul(x.view(n, c, h * w).permute(0, 2, 1), K.permute(0, 2, 1))  # .shape = n, hw, cout
36 |         return y.permute(0, 2, 1).view(n, cout, h, w)
37 | 
--------------------------------------------------------------------------------
/DPT/README.md:
--------------------------------------------------------------------------------
1 | # Benchmark on paper Vision Transformers for Dense Prediction
2 | 
3 | 1. Download the InSpaceType eval set. Install the requirements with `pip install -r requirements.txt`.
4 | 
5 | 2. Download the pretrained model 'dpt_hybrid_nyu-2ce69ec7.pt' from the [Official Link](https://github.com/intel-isl/DPT/releases/download/1_0/dpt_hybrid_nyu-2ce69ec7.pt) and put it under the folder 'weights'.
6 | 
7 | 3. Run:
8 | 
9 | ```
10 | python run_monodepth.py -t dpt_hybrid_nyu -i ../InSpaceType
11 | ```
12 | 
13 | The command generates report files for each hierarchy level (H0-H2). *-all means overall, H0-H2 means the hierarchy level, and H1_xx means the scene space type number. See [space_type_def.yml](https://github.com/DepthComputation/InSpaceType_Benchmark/blob/main/space_type_def.yml) for the space type number definitions.
14 | 
--------------------------------------------------------------------------------
/DPT/dpt/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DepthComputation/InSpaceType_Benchmark/4ab9760dbdf9153fc2845afd584762509f9fe0e0/DPT/dpt/__init__.py
--------------------------------------------------------------------------------
/DPT/dpt/base_model.py:
--------------------------------------------------------------------------------
1 | import torch
2 | 
3 | 
4 | class BaseModel(torch.nn.Module):
5 |     def load(self, path):
6 |         """Load model from file.
7 | 
8 |         Args:
9 |             path (str): file path
10 |         """
11 |         parameters = torch.load(path, map_location=torch.device("cpu"))
12 | 
13 |         if "optimizer" in parameters:
14 |             parameters = parameters["model"]
15 | 
16 |         self.load_state_dict(parameters)
17 | 
--------------------------------------------------------------------------------
/DPT/requirements.txt:
--------------------------------------------------------------------------------
1 | torch==1.8.1
2 | torchvision==0.9.1
3 | opencv-python==4.5.2.54
4 | timm==0.4.5
5 | 
--------------------------------------------------------------------------------
/DPT/util/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DepthComputation/InSpaceType_Benchmark/4ab9760dbdf9153fc2845afd584762509f9fe0e0/DPT/util/__init__.py
--------------------------------------------------------------------------------
/Decompose/README.md:
--------------------------------------------------------------------------------
1 | # Benchmark on paper Depth Map Decomposition for Monocular Depth Estimation
2 | 
3 | 1. Download the InSpaceType eval set. Install [torch and torchvision](https://pytorch.org/get-started/previous-versions/) and the packages matplotlib, pandas, opencv-python, tqdm, and efficientnet_pytorch.
4 | 
5 | 
6 | 2. Download the pretrained model '51k_HRWSI.pth' from the [Official Link](https://drive.google.com/drive/folders/1zsgT_5AO89WxzlFI53gwjomisb_Gkcox?usp=sharing) and put it in this folder.
7 | 
8 | 3. Run:
9 | 
10 | ```
11 | python demo.py --ckpt 51k_HRWSI.pth --filenames_file split_files.txt
12 | ```
13 | 
14 | The command generates report files for each hierarchy level (H0-H2). *-all means overall, H0-H2 means the hierarchy level, and H1_xx means the scene space type number. See [space_type_def.yml](https://github.com/DepthComputation/InSpaceType_Benchmark/blob/main/space_type_def.yml) for the space type number definitions.
15 | 
--------------------------------------------------------------------------------
/DepthAnything/README.md:
--------------------------------------------------------------------------------
1 | # Benchmark on paper Depth-Anything
2 | 
3 | 1. Download the InSpaceType eval set. Install [torch and torchvision](https://pytorch.org/get-started/previous-versions/) and the packages matplotlib, tqdm, and opencv-python.
4 | 
5 | 2. Download the Depth-Anything NYUv2 finetuned model (depth_anything_metric_depth_indoor.pt) from the [official link](https://huggingface.co/spaces/LiheYoung/Depth-Anything/tree/main/checkpoints_metric_depth) and place it under 'metric_depth/checkpoints'.
6 | 
7 | 3. Run:
8 | 
9 | ```
10 | cd metric_depth
11 | 
12 | python demo.py --img-path ../../InSpaceType --outdir ./vis_depth
13 | ```
14 | 
15 | The command generates report files for each hierarchy level (H0-H2). *-all means overall, H0-H2 means the hierarchy level, and H1_xx means the scene space type number. See [space_type_def.yml](https://github.com/DepthComputation/InSpaceType_Benchmark/blob/main/space_type_def.yml) for the space type number definitions.
16 | Colored visualizations of metric depth are saved under --outdir.
17 | 
--------------------------------------------------------------------------------
/DepthAnything/controlnet/README.md:
--------------------------------------------------------------------------------
1 | ## Depth-Conditioned ControlNet based on Depth Anything
2 | 
3 | We use [Diffusers](https://github.com/huggingface/diffusers/tree/main) to re-train a better depth-conditioned ControlNet based on our Depth Anything.
4 | 
5 | Please download our [config file](./config.json) and [pre-trained weights](https://huggingface.co/spaces/LiheYoung/Depth-Anything/tree/main/checkpoints_controlnet), then follow the [instructions](https://github.com/huggingface/diffusers/tree/main/examples/controlnet) in Diffusers for inference.
6 | 
7 | ## Depth-to-Image Synthesis
8 | 
9 | ![demo2](../assets/controlnet_demo1.png)
10 | ![demo1](../assets/controlnet_demo2.png)
11 | 
12 | 
13 | ## Video Editing
14 | 
15 | Please refer to our [project page](https://depth-anything.github.io/). We use [MagicEdit](https://github.com/magic-research/magic-edit) to show demos of video editing based on depth information.
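
A minimal sketch of the Diffusers inference loop that the instructions above refer to; the local checkpoint folder, the base Stable Diffusion model id, and the depth-map path are assumptions for illustration, not files shipped with this repository.

```python
import torch
from diffusers import ControlNetModel, StableDiffusionControlNetPipeline, UniPCMultistepScheduler
from diffusers.utils import load_image

# Folder containing the downloaded config.json and ControlNet weights (assumed path).
controlnet = ControlNetModel.from_pretrained("./checkpoints_controlnet", torch_dtype=torch.float16)

# Plug the depth-conditioned ControlNet into a Stable Diffusion v1.5 pipeline.
pipe = StableDiffusionControlNetPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", controlnet=controlnet, torch_dtype=torch.float16
).to("cuda")
pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)

# Conditioning image: a depth map predicted by Depth Anything (assumed path).
depth_map = load_image("depth_condition.png")
image = pipe("a cozy living room, photorealistic", image=depth_map, num_inference_steps=30).images[0]
image.save("depth2img.png")
```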
16 | -------------------------------------------------------------------------------- /DepthAnything/controlnet/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "_class_name": "ControlNetModel", 3 | "_diffusers_version": "0.26.0.dev0", 4 | "act_fn": "silu", 5 | "addition_embed_type": null, 6 | "addition_embed_type_num_heads": 64, 7 | "addition_time_embed_dim": null, 8 | "attention_head_dim": 8, 9 | "block_out_channels": [ 10 | 320, 11 | 640, 12 | 1280, 13 | 1280 14 | ], 15 | "class_embed_type": null, 16 | "conditioning_channels": 3, 17 | "conditioning_embedding_out_channels": [ 18 | 16, 19 | 32, 20 | 96, 21 | 256 22 | ], 23 | "controlnet_conditioning_channel_order": "rgb", 24 | "cross_attention_dim": 768, 25 | "down_block_types": [ 26 | "CrossAttnDownBlock2D", 27 | "CrossAttnDownBlock2D", 28 | "CrossAttnDownBlock2D", 29 | "DownBlock2D" 30 | ], 31 | "downsample_padding": 1, 32 | "encoder_hid_dim": null, 33 | "encoder_hid_dim_type": null, 34 | "flip_sin_to_cos": true, 35 | "freq_shift": 0, 36 | "global_pool_conditions": false, 37 | "in_channels": 4, 38 | "layers_per_block": 2, 39 | "mid_block_scale_factor": 1, 40 | "mid_block_type": "UNetMidBlock2DCrossAttn", 41 | "norm_eps": 1e-05, 42 | "norm_num_groups": 32, 43 | "num_attention_heads": null, 44 | "num_class_embeds": null, 45 | "only_cross_attention": false, 46 | "projection_class_embeddings_input_dim": null, 47 | "resnet_time_scale_shift": "default", 48 | "transformer_layers_per_block": 1, 49 | "upcast_attention": false, 50 | "use_linear_projection": false 51 | } 52 | -------------------------------------------------------------------------------- /DepthAnything/metric_depth/checkpoints/.placeholder: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DepthComputation/InSpaceType_Benchmark/4ab9760dbdf9153fc2845afd584762509f9fe0e0/DepthAnything/metric_depth/checkpoints/.placeholder -------------------------------------------------------------------------------- /DepthAnything/metric_depth/environment.yml: -------------------------------------------------------------------------------- 1 | name: zoe 2 | channels: 3 | - pytorch 4 | - nvidia 5 | - conda-forge 6 | dependencies: 7 | - cuda=11.7.1 8 | - h5py=3.7.0 9 | - hdf5=1.12.2 10 | - matplotlib=3.6.2 11 | - matplotlib-base=3.6.2 12 | - numpy=1.24.1 13 | - opencv=4.6.0 14 | - pip=22.3.1 15 | - python=3.9.7 16 | - pytorch=1.13.1 17 | - pytorch-cuda=11.7 18 | - pytorch-mutex=1.0 19 | - scipy=1.10.0 20 | - torchaudio=0.13.1 21 | - torchvision=0.14.1 22 | - pip: 23 | - huggingface-hub==0.11.1 24 | - timm==0.6.12 25 | - tqdm==4.64.1 26 | - wandb==0.13.9 27 | -------------------------------------------------------------------------------- /DepthAnything/metric_depth/zoedepth/data/__init__.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | # Copyright (c) 2022 Intelligent Systems Lab Org 4 | 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | 12 | # The above copyright notice and this permission notice shall 
be included in all 13 | # copies or substantial portions of the Software. 14 | 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | # File author: Shariq Farooq Bhat 24 | 25 | -------------------------------------------------------------------------------- /DepthAnything/metric_depth/zoedepth/models/__init__.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | # Copyright (c) 2022 Intelligent Systems Lab Org 4 | 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | # File author: Shariq Farooq Bhat 24 | 25 | -------------------------------------------------------------------------------- /DepthAnything/metric_depth/zoedepth/models/base_models/__init__.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | # Copyright (c) 2022 Intelligent Systems Lab Org 4 | 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | # File author: Shariq Farooq Bhat 24 | 25 | -------------------------------------------------------------------------------- /DepthAnything/metric_depth/zoedepth/models/zoedepth/__init__.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | # Copyright (c) 2022 Intelligent Systems Lab Org 4 | 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 
22 | 23 | # File author: Shariq Farooq Bhat 24 | 25 | from .zoedepth_v1 import ZoeDepth 26 | 27 | all_versions = { 28 | "v1": ZoeDepth, 29 | } 30 | 31 | get_version = lambda v : all_versions[v] -------------------------------------------------------------------------------- /DepthAnything/metric_depth/zoedepth/models/zoedepth/config_zoedepth_kitti.json: -------------------------------------------------------------------------------- 1 | { 2 | "model": { 3 | "bin_centers_type": "normed", 4 | "img_size": [384, 768] 5 | }, 6 | 7 | "train": { 8 | }, 9 | 10 | "infer":{ 11 | "train_midas": false, 12 | "use_pretrained_midas": false, 13 | "pretrained_resource" : "url::https://github.com/isl-org/ZoeDepth/releases/download/v1.0/ZoeD_M12_K.pt", 14 | "force_keep_ar": true 15 | }, 16 | 17 | "eval":{ 18 | "train_midas": false, 19 | "use_pretrained_midas": false, 20 | "pretrained_resource" : "url::https://github.com/isl-org/ZoeDepth/releases/download/v1.0/ZoeD_M12_K.pt" 21 | } 22 | } -------------------------------------------------------------------------------- /DepthAnything/metric_depth/zoedepth/models/zoedepth_nk/__init__.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | # Copyright (c) 2022 Intelligent Systems Lab Org 4 | 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | # File author: Shariq Farooq Bhat 24 | 25 | from .zoedepth_nk_v1 import ZoeDepthNK 26 | 27 | all_versions = { 28 | "v1": ZoeDepthNK, 29 | } 30 | 31 | get_version = lambda v : all_versions[v] -------------------------------------------------------------------------------- /DepthAnything/metric_depth/zoedepth/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | # Copyright (c) 2022 Intelligent Systems Lab Org 4 | 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 
14 | 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | # File author: Shariq Farooq Bhat 24 | 25 | -------------------------------------------------------------------------------- /DepthAnything/metric_depth/zoedepth/utils/arg_utils.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | def infer_type(x): # hacky way to infer type from string args 4 | if not isinstance(x, str): 5 | return x 6 | 7 | try: 8 | x = int(x) 9 | return x 10 | except ValueError: 11 | pass 12 | 13 | try: 14 | x = float(x) 15 | return x 16 | except ValueError: 17 | pass 18 | 19 | return x 20 | 21 | 22 | def parse_unknown(unknown_args): 23 | clean = [] 24 | for a in unknown_args: 25 | if "=" in a: 26 | k, v = a.split("=") 27 | clean.extend([k, v]) 28 | else: 29 | clean.append(a) 30 | 31 | keys = clean[::2] 32 | values = clean[1::2] 33 | return {k.replace("--", ""): infer_type(v) for k, v in zip(keys, values)} 34 | -------------------------------------------------------------------------------- /DepthAnything/requirements.txt: -------------------------------------------------------------------------------- 1 | gradio_imageslider 2 | gradio==4.14.0 3 | torch 4 | torchvision 5 | opencv-python 6 | huggingface_hub -------------------------------------------------------------------------------- /DepthAnything/semseg/dinov2.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from mmengine.model import BaseModule 3 | from torch import nn 4 | 5 | from mmseg.registry import MODELS 6 | 7 | 8 | 9 | @MODELS.register_module() 10 | class DINOv2(nn.Module): 11 | """Use DINOv2 pre-trained models 12 | """ 13 | 14 | def __init__(self, version='large', freeze=False, load_from=None): 15 | super().__init__() 16 | 17 | if version == 'large': 18 | self.dinov2 = torch.hub.load('torchhub/facebookresearch_dinov2_main', 'dinov2_vit14', source='local', pretrained=False) 19 | else: 20 | raise NotImplementedError 21 | 22 | if load_from is not None: 23 | d = torch.load(load_from, map_location='cpu') 24 | new_d = {} 25 | for key, value in d.items(): 26 | if 'pretrained' in key: 27 | new_d[key.replace('pretrained.', '')] = value 28 | self.dinov2.load_state_dict(new_d) 29 | 30 | self.freeze = freeze 31 | 32 | def forward(self, inputs): 33 | B, _, h, w = inputs.shape 34 | 35 | if self.freeze: 36 | with torch.no_grad(): 37 | features = self.dinov2.get_intermediate_layers(inputs, 4) 38 | else: 39 | features = self.dinov2.get_intermediate_layers(inputs, 4) 40 | 41 | outs = [] 42 | for feature in features: 43 | C = feature.shape[-1] 44 | feature = feature.permute(0, 2, 1).reshape(B, C, h // 14, w // 14).contiguous() 45 | outs.append(feature) 46 | 47 | return outs 48 | -------------------------------------------------------------------------------- /DepthAnything/torchhub/README.md: -------------------------------------------------------------------------------- 1 | # Local PyTorch Hub 2 | 3 | This directory is for loading the DINOv2 encoder locally in case of no Internet connection. 
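
A minimal sketch of the local loading described above, mirroring the call in semseg/dinov2.py; it assumes you run from the DepthAnything repository root so the relative path resolves, and uses 'dinov2_vitl14', the ViT-L/14 entrypoint defined by the bundled hubconf.

```python
import torch

# Load the DINOv2 ViT-L/14 backbone from this local torch.hub copy instead of
# fetching facebookresearch/dinov2 from GitHub. pretrained=False skips any weight
# download, so weights are expected to be restored from a local checkpoint later.
encoder = torch.hub.load(
    'torchhub/facebookresearch_dinov2_main',
    'dinov2_vitl14',
    source='local',
    pretrained=False,
)
print(sum(p.numel() for p in encoder.parameters()), 'parameters')
```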
4 | -------------------------------------------------------------------------------- /DepthAnything/torchhub/facebookresearch_dinov2_main/CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to DINOv2 2 | We want to make contributing to this project as easy and transparent as 3 | possible. 4 | 5 | ## Pull Requests 6 | We actively welcome your pull requests. 7 | 8 | 1. Fork the repo and create your branch from `main`. 9 | 2. If you've added code that should be tested, add tests. 10 | 3. If you've changed APIs, update the documentation. 11 | 4. Ensure the test suite passes. 12 | 5. Make sure your code lints. 13 | 6. If you haven't already, complete the Contributor License Agreement ("CLA"). 14 | 15 | ## Contributor License Agreement ("CLA") 16 | In order to accept your pull request, we need you to submit a CLA. You only need 17 | to do this once to work on any of Meta's open source projects. 18 | 19 | Complete your CLA here: 20 | 21 | ## Issues 22 | We use GitHub issues to track public bugs. Please ensure your description is 23 | clear and has sufficient instructions to be able to reproduce the issue. 24 | 25 | Meta has a [bounty program](https://www.facebook.com/whitehat/) for the safe 26 | disclosure of security bugs. In those cases, please go through the process 27 | outlined on that page and do not file a public issue. 28 | 29 | ## License 30 | By contributing to DINOv2, you agree that your contributions will be licensed 31 | under the LICENSE file in the root directory of this source tree. 32 | -------------------------------------------------------------------------------- /DepthAnything/torchhub/facebookresearch_dinov2_main/conda.yaml: -------------------------------------------------------------------------------- 1 | name: dinov2 2 | channels: 3 | - defaults 4 | - pytorch 5 | - nvidia 6 | - xformers 7 | - conda-forge 8 | dependencies: 9 | - python=3.9 10 | - pytorch::pytorch=2.0.0 11 | - pytorch::pytorch-cuda=11.7.0 12 | - pytorch::torchvision=0.15.0 13 | - omegaconf 14 | - torchmetrics=0.10.3 15 | - fvcore 16 | - iopath 17 | - xformers::xformers=0.0.18 18 | - pip 19 | - pip: 20 | - git+https://github.com/facebookincubator/submitit 21 | - --extra-index-url https://pypi.nvidia.com 22 | - cuml-cu11 23 | -------------------------------------------------------------------------------- /DepthAnything/torchhub/facebookresearch_dinov2_main/dinov2/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | __version__ = "0.0.1" 8 | -------------------------------------------------------------------------------- /DepthAnything/torchhub/facebookresearch_dinov2_main/dinov2/configs/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | 7 | import pathlib 8 | 9 | from omegaconf import OmegaConf 10 | 11 | 12 | def load_config(config_name: str): 13 | config_filename = config_name + ".yaml" 14 | return OmegaConf.load(pathlib.Path(__file__).parent.resolve() / config_filename) 15 | 16 | 17 | dinov2_default_config = load_config("ssl_default_config") 18 | 19 | 20 | def load_and_merge_config(config_name: str): 21 | default_config = OmegaConf.create(dinov2_default_config) 22 | loaded_config = load_config(config_name) 23 | return OmegaConf.merge(default_config, loaded_config) 24 | -------------------------------------------------------------------------------- /DepthAnything/torchhub/facebookresearch_dinov2_main/dinov2/configs/eval/vitb14_pretrain.yaml: -------------------------------------------------------------------------------- 1 | student: 2 | arch: vit_base 3 | patch_size: 14 4 | crops: 5 | global_crops_size: 518 # this is to set up the position embeddings properly 6 | local_crops_size: 98 -------------------------------------------------------------------------------- /DepthAnything/torchhub/facebookresearch_dinov2_main/dinov2/configs/eval/vitg14_pretrain.yaml: -------------------------------------------------------------------------------- 1 | student: 2 | arch: vit_giant2 3 | patch_size: 14 4 | ffn_layer: swiglufused 5 | crops: 6 | global_crops_size: 518 # this is to set up the position embeddings properly 7 | local_crops_size: 98 -------------------------------------------------------------------------------- /DepthAnything/torchhub/facebookresearch_dinov2_main/dinov2/configs/eval/vitl14_pretrain.yaml: -------------------------------------------------------------------------------- 1 | student: 2 | arch: vit_large 3 | patch_size: 14 4 | crops: 5 | global_crops_size: 518 # this is to set up the position embeddings properly 6 | local_crops_size: 98 -------------------------------------------------------------------------------- /DepthAnything/torchhub/facebookresearch_dinov2_main/dinov2/configs/eval/vits14_pretrain.yaml: -------------------------------------------------------------------------------- 1 | student: 2 | arch: vit_small 3 | patch_size: 14 4 | crops: 5 | global_crops_size: 518 # this is to set up the position embeddings properly 6 | local_crops_size: 98 -------------------------------------------------------------------------------- /DepthAnything/torchhub/facebookresearch_dinov2_main/dinov2/configs/train/vitg14.yaml: -------------------------------------------------------------------------------- 1 | dino: 2 | head_n_prototypes: 131072 3 | head_bottleneck_dim: 384 4 | ibot: 5 | separate_head: true 6 | head_n_prototypes: 131072 7 | train: 8 | batch_size_per_gpu: 12 9 | dataset_path: ImageNet22k 10 | centering: sinkhorn_knopp 11 | student: 12 | arch: vit_giant2 13 | patch_size: 14 14 | drop_path_rate: 0.4 15 | ffn_layer: swiglufused 16 | block_chunks: 4 17 | teacher: 18 | momentum_teacher: 0.994 19 | optim: 20 | epochs: 500 21 | weight_decay_end: 0.2 22 | base_lr: 2.0e-04 # learning rate for a batch size of 1024 23 | warmup_epochs: 80 24 | layerwise_decay: 1.0 25 | crops: 26 | local_crops_size: 98 -------------------------------------------------------------------------------- /DepthAnything/torchhub/facebookresearch_dinov2_main/dinov2/configs/train/vitl14.yaml: -------------------------------------------------------------------------------- 1 | dino: 2 | head_n_prototypes: 131072 3 | head_bottleneck_dim: 384 4 | ibot: 5 | separate_head: true 6 | head_n_prototypes: 131072 7 | train: 8 | batch_size_per_gpu: 
32 9 | dataset_path: ImageNet22k 10 | centering: sinkhorn_knopp 11 | student: 12 | arch: vit_large 13 | patch_size: 14 14 | drop_path_rate: 0.4 15 | ffn_layer: swiglufused 16 | block_chunks: 4 17 | teacher: 18 | momentum_teacher: 0.994 19 | optim: 20 | epochs: 500 21 | weight_decay_end: 0.2 22 | base_lr: 2.0e-04 # learning rate for a batch size of 1024 23 | warmup_epochs: 80 24 | layerwise_decay: 1.0 25 | crops: 26 | local_crops_size: 98 -------------------------------------------------------------------------------- /DepthAnything/torchhub/facebookresearch_dinov2_main/dinov2/configs/train/vitl16_short.yaml: -------------------------------------------------------------------------------- 1 | # this corresponds to the default config 2 | train: 3 | dataset_path: ImageNet:split=TRAIN 4 | batch_size_per_gpu: 64 5 | student: 6 | block_chunks: 4 7 | -------------------------------------------------------------------------------- /DepthAnything/torchhub/facebookresearch_dinov2_main/dinov2/data/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | from .adapters import DatasetWithEnumeratedTargets 8 | from .loaders import make_data_loader, make_dataset, SamplerType 9 | from .collate import collate_data_and_cast 10 | from .masking import MaskingGenerator 11 | from .augmentations import DataAugmentationDINO 12 | -------------------------------------------------------------------------------- /DepthAnything/torchhub/facebookresearch_dinov2_main/dinov2/data/adapters.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | from typing import Any, Tuple 8 | 9 | from torch.utils.data import Dataset 10 | 11 | 12 | class DatasetWithEnumeratedTargets(Dataset): 13 | def __init__(self, dataset): 14 | self._dataset = dataset 15 | 16 | def get_image_data(self, index: int) -> bytes: 17 | return self._dataset.get_image_data(index) 18 | 19 | def get_target(self, index: int) -> Tuple[Any, int]: 20 | target = self._dataset.get_target(index) 21 | return (index, target) 22 | 23 | def __getitem__(self, index: int) -> Tuple[Any, Tuple[Any, int]]: 24 | image, target = self._dataset[index] 25 | target = index if target is None else target 26 | return image, (index, target) 27 | 28 | def __len__(self) -> int: 29 | return len(self._dataset) 30 | -------------------------------------------------------------------------------- /DepthAnything/torchhub/facebookresearch_dinov2_main/dinov2/data/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | 7 | from .image_net import ImageNet 8 | from .image_net_22k import ImageNet22k 9 | -------------------------------------------------------------------------------- /DepthAnything/torchhub/facebookresearch_dinov2_main/dinov2/data/datasets/decoders.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | from io import BytesIO 8 | from typing import Any 9 | 10 | from PIL import Image 11 | 12 | 13 | class Decoder: 14 | def decode(self) -> Any: 15 | raise NotImplementedError 16 | 17 | 18 | class ImageDataDecoder(Decoder): 19 | def __init__(self, image_data: bytes) -> None: 20 | self._image_data = image_data 21 | 22 | def decode(self) -> Image: 23 | f = BytesIO(self._image_data) 24 | return Image.open(f).convert(mode="RGB") 25 | 26 | 27 | class TargetDecoder(Decoder): 28 | def __init__(self, target: Any): 29 | self._target = target 30 | 31 | def decode(self) -> Any: 32 | return self._target 33 | -------------------------------------------------------------------------------- /DepthAnything/torchhub/facebookresearch_dinov2_main/dinov2/data/datasets/extended.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | from typing import Any, Tuple 8 | 9 | from torchvision.datasets import VisionDataset 10 | 11 | from .decoders import TargetDecoder, ImageDataDecoder 12 | 13 | 14 | class ExtendedVisionDataset(VisionDataset): 15 | def __init__(self, *args, **kwargs) -> None: 16 | super().__init__(*args, **kwargs) # type: ignore 17 | 18 | def get_image_data(self, index: int) -> bytes: 19 | raise NotImplementedError 20 | 21 | def get_target(self, index: int) -> Any: 22 | raise NotImplementedError 23 | 24 | def __getitem__(self, index: int) -> Tuple[Any, Any]: 25 | try: 26 | image_data = self.get_image_data(index) 27 | image = ImageDataDecoder(image_data).decode() 28 | except Exception as e: 29 | raise RuntimeError(f"can not read image for sample {index}") from e 30 | target = self.get_target(index) 31 | target = TargetDecoder(target).decode() 32 | 33 | if self.transforms is not None: 34 | image, target = self.transforms(image, target) 35 | 36 | return image, target 37 | 38 | def __len__(self) -> int: 39 | raise NotImplementedError 40 | -------------------------------------------------------------------------------- /DepthAnything/torchhub/facebookresearch_dinov2_main/dinov2/eval/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | -------------------------------------------------------------------------------- /DepthAnything/torchhub/facebookresearch_dinov2_main/dinov2/layers/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 
3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | from .dino_head import DINOHead 8 | from .mlp import Mlp 9 | from .patch_embed import PatchEmbed 10 | from .swiglu_ffn import SwiGLUFFN, SwiGLUFFNFused 11 | from .block import NestedTensorBlock 12 | from .attention import MemEffAttention 13 | -------------------------------------------------------------------------------- /DepthAnything/torchhub/facebookresearch_dinov2_main/dinov2/layers/drop_path.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | # References: 8 | # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py 9 | # https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/drop.py 10 | 11 | 12 | from torch import nn 13 | 14 | 15 | def drop_path(x, drop_prob: float = 0.0, training: bool = False): 16 | if drop_prob == 0.0 or not training: 17 | return x 18 | keep_prob = 1 - drop_prob 19 | shape = (x.shape[0],) + (1,) * (x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets 20 | random_tensor = x.new_empty(shape).bernoulli_(keep_prob) 21 | if keep_prob > 0.0: 22 | random_tensor.div_(keep_prob) 23 | output = x * random_tensor 24 | return output 25 | 26 | 27 | class DropPath(nn.Module): 28 | """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" 29 | 30 | def __init__(self, drop_prob=None): 31 | super(DropPath, self).__init__() 32 | self.drop_prob = drop_prob 33 | 34 | def forward(self, x): 35 | return drop_path(x, self.drop_prob, self.training) 36 | -------------------------------------------------------------------------------- /DepthAnything/torchhub/facebookresearch_dinov2_main/dinov2/layers/layer_scale.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | # Modified from: https://github.com/huggingface/pytorch-image-models/blob/main/timm/models/vision_transformer.py#L103-L110 8 | 9 | from typing import Union 10 | 11 | import torch 12 | from torch import Tensor 13 | from torch import nn 14 | 15 | 16 | class LayerScale(nn.Module): 17 | def __init__( 18 | self, 19 | dim: int, 20 | init_values: Union[float, Tensor] = 1e-5, 21 | inplace: bool = False, 22 | ) -> None: 23 | super().__init__() 24 | self.inplace = inplace 25 | self.gamma = nn.Parameter(init_values * torch.ones(dim)) 26 | 27 | def forward(self, x: Tensor) -> Tensor: 28 | return x.mul_(self.gamma) if self.inplace else x * self.gamma 29 | -------------------------------------------------------------------------------- /DepthAnything/torchhub/facebookresearch_dinov2_main/dinov2/layers/mlp.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
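A small usage sketch, added for illustration and not taken from the repository, for the two DINOv2 layers dumped above: `LayerScale` multiplies each channel by a learnable per-channel `gamma` (initialized tiny, so a residual branch starts near zero), and `drop_path` zeroes whole samples with probability `drop_prob` during training while rescaling the survivors by `1 / keep_prob` so the expected value is unchanged. The tensor shapes and the `__main__` driver are assumptions; the layer definitions are copied from the files above so the sketch runs standalone, assuming torch is installed.

```python
# Sketch only: exercising LayerScale and drop_path as defined in the dumped files.
import torch
from torch import nn


def drop_path(x, drop_prob: float = 0.0, training: bool = False):
    # Identical logic to dinov2/layers/drop_path.py above.
    if drop_prob == 0.0 or not training:
        return x
    keep_prob = 1 - drop_prob
    shape = (x.shape[0],) + (1,) * (x.ndim - 1)   # one mask value per sample
    random_tensor = x.new_empty(shape).bernoulli_(keep_prob)
    if keep_prob > 0.0:
        random_tensor.div_(keep_prob)             # rescale survivors by 1 / keep_prob
    return x * random_tensor


class LayerScale(nn.Module):
    # Identical logic to dinov2/layers/layer_scale.py above.
    def __init__(self, dim: int, init_values: float = 1e-5, inplace: bool = False):
        super().__init__()
        self.inplace = inplace
        self.gamma = nn.Parameter(init_values * torch.ones(dim))

    def forward(self, x):
        return x.mul_(self.gamma) if self.inplace else x * self.gamma


if __name__ == "__main__":
    tokens = torch.randn(2, 5, 8)                 # (batch, tokens, dim)
    ls = LayerScale(dim=8, init_values=1e-5)
    print(ls(tokens).abs().max())                 # tiny values: the residual branch starts near zero

    x = torch.ones(4, 3)
    print(drop_path(x, drop_prob=0.5, training=True))  # rows are either all 0 or all 2 (= 1 / keep_prob)
```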
6 | 7 | # References: 8 | # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py 9 | # https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/mlp.py 10 | 11 | 12 | from typing import Callable, Optional 13 | 14 | from torch import Tensor, nn 15 | 16 | 17 | class Mlp(nn.Module): 18 | def __init__( 19 | self, 20 | in_features: int, 21 | hidden_features: Optional[int] = None, 22 | out_features: Optional[int] = None, 23 | act_layer: Callable[..., nn.Module] = nn.GELU, 24 | drop: float = 0.0, 25 | bias: bool = True, 26 | ) -> None: 27 | super().__init__() 28 | out_features = out_features or in_features 29 | hidden_features = hidden_features or in_features 30 | self.fc1 = nn.Linear(in_features, hidden_features, bias=bias) 31 | self.act = act_layer() 32 | self.fc2 = nn.Linear(hidden_features, out_features, bias=bias) 33 | self.drop = nn.Dropout(drop) 34 | 35 | def forward(self, x: Tensor) -> Tensor: 36 | x = self.fc1(x) 37 | x = self.act(x) 38 | x = self.drop(x) 39 | x = self.fc2(x) 40 | x = self.drop(x) 41 | return x 42 | -------------------------------------------------------------------------------- /DepthAnything/torchhub/facebookresearch_dinov2_main/dinov2/loss/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | from .dino_clstoken_loss import DINOLoss 8 | from .ibot_patch_loss import iBOTPatchLoss 9 | from .koleo_loss import KoLeoLoss 10 | -------------------------------------------------------------------------------- /DepthAnything/torchhub/facebookresearch_dinov2_main/dinov2/loss/koleo_loss.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | import logging 8 | 9 | import torch 10 | import torch.nn as nn 11 | import torch.nn.functional as F 12 | 13 | # import torch.distributed as dist 14 | 15 | 16 | logger = logging.getLogger("dinov2") 17 | 18 | 19 | class KoLeoLoss(nn.Module): 20 | """Kozachenko-Leonenko entropic loss regularizer from Sablayrolles et al. - 2018 - Spreading vectors for similarity search""" 21 | 22 | def __init__(self): 23 | super().__init__() 24 | self.pdist = nn.PairwiseDistance(2, eps=1e-8) 25 | 26 | def pairwise_NNs_inner(self, x): 27 | """ 28 | Pairwise nearest neighbors for L2-normalized vectors. 29 | Uses Torch rather than Faiss to remain on GPU. 
30 | """ 31 | # parwise dot products (= inverse distance) 32 | dots = torch.mm(x, x.t()) 33 | n = x.shape[0] 34 | dots.view(-1)[:: (n + 1)].fill_(-1) # Trick to fill diagonal with -1 35 | # max inner prod -> min distance 36 | _, I = torch.max(dots, dim=1) # noqa: E741 37 | return I 38 | 39 | def forward(self, student_output, eps=1e-8): 40 | """ 41 | Args: 42 | student_output (BxD): backbone output of student 43 | """ 44 | with torch.cuda.amp.autocast(enabled=False): 45 | student_output = F.normalize(student_output, eps=eps, p=2, dim=-1) 46 | I = self.pairwise_NNs_inner(student_output) # noqa: E741 47 | distances = self.pdist(student_output, student_output[I]) # BxD, BxD -> B 48 | loss = -torch.log(distances + eps).mean() 49 | return loss 50 | -------------------------------------------------------------------------------- /DepthAnything/torchhub/facebookresearch_dinov2_main/dinov2/models/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | import logging 8 | 9 | from . import vision_transformer as vits 10 | 11 | 12 | logger = logging.getLogger("dinov2") 13 | 14 | 15 | def build_model(args, only_teacher=False, img_size=224): 16 | args.arch = args.arch.removesuffix("_memeff") 17 | if "vit" in args.arch: 18 | vit_kwargs = dict( 19 | img_size=img_size, 20 | patch_size=args.patch_size, 21 | init_values=args.layerscale, 22 | ffn_layer=args.ffn_layer, 23 | block_chunks=args.block_chunks, 24 | qkv_bias=args.qkv_bias, 25 | proj_bias=args.proj_bias, 26 | ffn_bias=args.ffn_bias, 27 | ) 28 | teacher = vits.__dict__[args.arch](**vit_kwargs) 29 | if only_teacher: 30 | return teacher, teacher.embed_dim 31 | student = vits.__dict__[args.arch]( 32 | **vit_kwargs, 33 | drop_path_rate=args.drop_path_rate, 34 | drop_path_uniform=args.drop_path_uniform, 35 | ) 36 | embed_dim = student.embed_dim 37 | return student, teacher, embed_dim 38 | 39 | 40 | def build_model_from_cfg(cfg, only_teacher=False): 41 | return build_model(cfg.student, only_teacher=only_teacher, img_size=cfg.crops.global_crops_size) 42 | -------------------------------------------------------------------------------- /DepthAnything/torchhub/facebookresearch_dinov2_main/dinov2/run/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | -------------------------------------------------------------------------------- /DepthAnything/torchhub/facebookresearch_dinov2_main/dinov2/run/eval/knn.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | 7 | import logging 8 | import os 9 | import sys 10 | 11 | from dinov2.eval.knn import get_args_parser as get_knn_args_parser 12 | from dinov2.logging import setup_logging 13 | from dinov2.run.submit import get_args_parser, submit_jobs 14 | 15 | 16 | logger = logging.getLogger("dinov2") 17 | 18 | 19 | class Evaluator: 20 | def __init__(self, args): 21 | self.args = args 22 | 23 | def __call__(self): 24 | from dinov2.eval.knn import main as knn_main 25 | 26 | self._setup_args() 27 | knn_main(self.args) 28 | 29 | def checkpoint(self): 30 | import submitit 31 | 32 | logger.info(f"Requeuing {self.args}") 33 | empty = type(self)(self.args) 34 | return submitit.helpers.DelayedSubmission(empty) 35 | 36 | def _setup_args(self): 37 | import submitit 38 | 39 | job_env = submitit.JobEnvironment() 40 | self.args.output_dir = self.args.output_dir.replace("%j", str(job_env.job_id)) 41 | logger.info(f"Process group: {job_env.num_tasks} tasks, rank: {job_env.global_rank}") 42 | logger.info(f"Args: {self.args}") 43 | 44 | 45 | def main(): 46 | description = "Submitit launcher for DINOv2 k-NN evaluation" 47 | knn_args_parser = get_knn_args_parser(add_help=False) 48 | parents = [knn_args_parser] 49 | args_parser = get_args_parser(description=description, parents=parents) 50 | args = args_parser.parse_args() 51 | 52 | setup_logging() 53 | 54 | assert os.path.exists(args.config_file), "Configuration file does not exist!" 55 | submit_jobs(Evaluator, args, name="dinov2:knn") 56 | return 0 57 | 58 | 59 | if __name__ == "__main__": 60 | sys.exit(main()) 61 | -------------------------------------------------------------------------------- /DepthAnything/torchhub/facebookresearch_dinov2_main/dinov2/run/eval/linear.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | import logging 8 | import os 9 | import sys 10 | 11 | from dinov2.eval.linear import get_args_parser as get_linear_args_parser 12 | from dinov2.logging import setup_logging 13 | from dinov2.run.submit import get_args_parser, submit_jobs 14 | 15 | 16 | logger = logging.getLogger("dinov2") 17 | 18 | 19 | class Evaluator: 20 | def __init__(self, args): 21 | self.args = args 22 | 23 | def __call__(self): 24 | from dinov2.eval.linear import main as linear_main 25 | 26 | self._setup_args() 27 | linear_main(self.args) 28 | 29 | def checkpoint(self): 30 | import submitit 31 | 32 | logger.info(f"Requeuing {self.args}") 33 | empty = type(self)(self.args) 34 | return submitit.helpers.DelayedSubmission(empty) 35 | 36 | def _setup_args(self): 37 | import submitit 38 | 39 | job_env = submitit.JobEnvironment() 40 | self.args.output_dir = self.args.output_dir.replace("%j", str(job_env.job_id)) 41 | logger.info(f"Process group: {job_env.num_tasks} tasks, rank: {job_env.global_rank}") 42 | logger.info(f"Args: {self.args}") 43 | 44 | 45 | def main(): 46 | description = "Submitit launcher for DINOv2 linear evaluation" 47 | linear_args_parser = get_linear_args_parser(add_help=False) 48 | parents = [linear_args_parser] 49 | args_parser = get_args_parser(description=description, parents=parents) 50 | args = args_parser.parse_args() 51 | 52 | setup_logging() 53 | 54 | assert os.path.exists(args.config_file), "Configuration file does not exist!" 
55 | submit_jobs(Evaluator, args, name="dinov2:linear") 56 | return 0 57 | 58 | 59 | if __name__ == "__main__": 60 | sys.exit(main()) 61 | -------------------------------------------------------------------------------- /DepthAnything/torchhub/facebookresearch_dinov2_main/dinov2/run/train/train.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | import logging 8 | import os 9 | import sys 10 | 11 | from dinov2.logging import setup_logging 12 | from dinov2.train import get_args_parser as get_train_args_parser 13 | from dinov2.run.submit import get_args_parser, submit_jobs 14 | 15 | 16 | logger = logging.getLogger("dinov2") 17 | 18 | 19 | class Trainer(object): 20 | def __init__(self, args): 21 | self.args = args 22 | 23 | def __call__(self): 24 | from dinov2.train import main as train_main 25 | 26 | self._setup_args() 27 | train_main(self.args) 28 | 29 | def checkpoint(self): 30 | import submitit 31 | 32 | logger.info(f"Requeuing {self.args}") 33 | empty = type(self)(self.args) 34 | return submitit.helpers.DelayedSubmission(empty) 35 | 36 | def _setup_args(self): 37 | import submitit 38 | 39 | job_env = submitit.JobEnvironment() 40 | self.args.output_dir = self.args.output_dir.replace("%j", str(job_env.job_id)) 41 | logger.info(f"Process group: {job_env.num_tasks} tasks, rank: {job_env.global_rank}") 42 | logger.info(f"Args: {self.args}") 43 | 44 | 45 | def main(): 46 | description = "Submitit launcher for DINOv2 training" 47 | train_args_parser = get_train_args_parser(add_help=False) 48 | parents = [train_args_parser] 49 | args_parser = get_args_parser(description=description, parents=parents) 50 | args = args_parser.parse_args() 51 | 52 | setup_logging() 53 | 54 | assert os.path.exists(args.config_file), "Configuration file does not exist!" 55 | submit_jobs(Trainer, args, name="dinov2:train") 56 | return 0 57 | 58 | 59 | if __name__ == "__main__": 60 | sys.exit(main()) 61 | -------------------------------------------------------------------------------- /DepthAnything/torchhub/facebookresearch_dinov2_main/dinov2/train/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | from .train import get_args_parser, main 8 | from .ssl_meta_arch import SSLMetaArch 9 | -------------------------------------------------------------------------------- /DepthAnything/torchhub/facebookresearch_dinov2_main/dinov2/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | -------------------------------------------------------------------------------- /DepthAnything/torchhub/facebookresearch_dinov2_main/dinov2/utils/dtype.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 
3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | 8 | from typing import Dict, Union 9 | 10 | import numpy as np 11 | import torch 12 | 13 | 14 | TypeSpec = Union[str, np.dtype, torch.dtype] 15 | 16 | 17 | _NUMPY_TO_TORCH_DTYPE: Dict[np.dtype, torch.dtype] = { 18 | np.dtype("bool"): torch.bool, 19 | np.dtype("uint8"): torch.uint8, 20 | np.dtype("int8"): torch.int8, 21 | np.dtype("int16"): torch.int16, 22 | np.dtype("int32"): torch.int32, 23 | np.dtype("int64"): torch.int64, 24 | np.dtype("float16"): torch.float16, 25 | np.dtype("float32"): torch.float32, 26 | np.dtype("float64"): torch.float64, 27 | np.dtype("complex64"): torch.complex64, 28 | np.dtype("complex128"): torch.complex128, 29 | } 30 | 31 | 32 | def as_torch_dtype(dtype: TypeSpec) -> torch.dtype: 33 | if isinstance(dtype, torch.dtype): 34 | return dtype 35 | if isinstance(dtype, str): 36 | dtype = np.dtype(dtype) 37 | assert isinstance(dtype, np.dtype), f"Expected an instance of nunpy dtype, got {type(dtype)}" 38 | return _NUMPY_TO_TORCH_DTYPE[dtype] 39 | -------------------------------------------------------------------------------- /DepthAnything/torchhub/facebookresearch_dinov2_main/pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.black] 2 | line-length = 120 3 | 4 | [tool.pylint.master] 5 | persistent = false 6 | score = false 7 | 8 | [tool.pylint.messages_control] 9 | disable = "all" 10 | enable = [ 11 | "miscellaneous", 12 | "similarities", 13 | ] 14 | 15 | [tool.pylint.similarities] 16 | ignore-comments = true 17 | ignore-docstrings = true 18 | ignore-imports = true 19 | min-similarity-lines = 8 20 | 21 | [tool.pylint.reports] 22 | reports = false 23 | 24 | [tool.pylint.miscellaneous] 25 | notes = [ 26 | "FIXME", 27 | "XXX", 28 | "TODO", 29 | ] 30 | -------------------------------------------------------------------------------- /DepthAnything/torchhub/facebookresearch_dinov2_main/requirements-dev.txt: -------------------------------------------------------------------------------- 1 | black==22.6.0 2 | flake8==5.0.4 3 | pylint==2.15.0 4 | -------------------------------------------------------------------------------- /DepthAnything/torchhub/facebookresearch_dinov2_main/requirements.txt: -------------------------------------------------------------------------------- 1 | --extra-index-url https://download.pytorch.org/whl/cu117 2 | torch==2.0.0 3 | torchvision==0.15.0 4 | omegaconf 5 | torchmetrics==0.10.3 6 | fvcore 7 | iopath 8 | xformers==0.0.18 9 | submitit 10 | --extra-index-url https://pypi.nvidia.com 11 | cuml-cu11 12 | -------------------------------------------------------------------------------- /DepthAnything/torchhub/facebookresearch_dinov2_main/scripts/lint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | if [ -n "$1" ]; then 4 | echo "linting \"$1\"" 5 | fi 6 | 7 | echo "running black" 8 | if [ -n "$1" ]; then 9 | black "$1" 10 | else 11 | black dinov2 12 | fi 13 | 14 | echo "running flake8" 15 | if [ -n "$1" ]; then 16 | flake8 "$1" 17 | else 18 | flake8 19 | fi 20 | 21 | echo "running pylint" 22 | if [ -n "$1" ]; then 23 | pylint "$1" 24 | else 25 | pylint dinov2 26 | fi 27 | 28 | exit 0 29 | -------------------------------------------------------------------------------- /DepthAnything/torchhub/facebookresearch_dinov2_main/setup.cfg: 
-------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 120 3 | ignore = E203,E501,W503 4 | per-file-ignores = 5 | __init__.py:F401 6 | exclude = 7 | venv 8 | -------------------------------------------------------------------------------- /DepthAnything/torchhub/facebookresearch_dinov2_main/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | import itertools 7 | import math 8 | 9 | import torch 10 | import torch.nn as nn 11 | import torch.nn.functional as F 12 | 13 | 14 | _DINOV2_BASE_URL = "https://dl.fbaipublicfiles.com/dinov2" 15 | 16 | 17 | def _make_dinov2_model_name(arch_name: str, patch_size: int, num_register_tokens: int = 0) -> str: 18 | compact_arch_name = arch_name.replace("_", "")[:4] 19 | registers_suffix = f"_reg{num_register_tokens}" if num_register_tokens else "" 20 | return f"dinov2_{compact_arch_name}{patch_size}{registers_suffix}" 21 | 22 | 23 | class CenterPadding(nn.Module): 24 | def __init__(self, multiple): 25 | super().__init__() 26 | self.multiple = multiple 27 | 28 | def _get_pad(self, size): 29 | new_size = math.ceil(size / self.multiple) * self.multiple 30 | pad_size = new_size - size 31 | pad_size_left = pad_size // 2 32 | pad_size_right = pad_size - pad_size_left 33 | return pad_size_left, pad_size_right 34 | 35 | @torch.inference_mode() 36 | def forward(self, x): 37 | pads = list(itertools.chain.from_iterable(self._get_pad(m) for m in x.shape[:1:-1])) 38 | output = F.pad(x, pads) 39 | return output 40 | -------------------------------------------------------------------------------- /DistDepth/README.md: -------------------------------------------------------------------------------- 1 | # Benchmark on paper Toward Practical Monocular Indoor Depth Estimation 2 | 3 | 1. Download InSpaceType eval set. Install [torch and torchvision](https://pytorch.org/get-started/previous-versions/) and packages: matplotlib, tqdm, pandas, opencv-python, tensorboardX 4 | 5 | 2. Download pretrained model from [Official Link](https://drive.google.com/file/d/1kLJBuMOf0xSpYq7DtxnPpBTxMwW0ylGm/view?usp=sharing) and extract under 'ckpts-finetuned'. Specifically, ckpts-finetuned should contain encoder.pth and decoder.pth 6 | 7 | 3. 8 | 9 | ``` 10 | python demo.py 11 | ``` 12 | 13 | The command generates report files for hierarchy (H0-H2). *-all means overall; H0-H2 means the level of hierarchy. H1_xx means scene space type number. See [space_type_def.yml](https://github.com/DepthComputation/InSpaceType_Benchmark/blob/main/space_type_def.yml) for space type number definition. 14 | 15 | -------------------------------------------------------------------------------- /DistDepth/networks/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree.
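Before running `python demo.py` as the DistDepth README above describes, it can help to confirm that 'ckpts-finetuned' really contains the two expected files. The helper below is a hypothetical sketch and not part of DistDepth; the folder and file names come from the README, and it assumes the checkpoints load as ordinary PyTorch objects (a plain `torch.load` sanity check, nothing more).

```python
# Hypothetical sanity check (not part of DistDepth): verify the checkpoint
# folder matches what the README above expects before launching demo.py.
import os
import torch

CKPT_DIR = "ckpts-finetuned"                # folder name from the README
EXPECTED = ("encoder.pth", "decoder.pth")   # files the README says must be present

for name in EXPECTED:
    path = os.path.join(CKPT_DIR, name)
    if not os.path.isfile(path):
        raise FileNotFoundError(f"missing checkpoint: {path}")
    obj = torch.load(path, map_location="cpu")  # assumption: CPU-loadable checkpoint
    keys = list(obj.keys()) if isinstance(obj, dict) else [type(obj).__name__]
    print(f"{name}: ok ({len(keys)} top-level entries), e.g. {keys[:3]}")
```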
6 | 7 | from .resnet_encoder import ResnetEncoder, ResnetEncoderMatching 8 | from .depth_decoder import DepthDecoder 9 | from .pose_decoder import PoseDecoder -------------------------------------------------------------------------------- /GLPDepth/README.md: -------------------------------------------------------------------------------- 1 | # Benchmark on paper Global-Local Path Networks for Monocular Depth Estimation with Vertical CutDepth 2 | 3 | 1. Download InSpaceType eval set. Install [torch and torchvision](https://pytorch.org/get-started/previous-versions/) and packages: matplotlib, pandas, transformers, opencv-python, tqdm 4 | 5 | 2. 6 | 7 | ``` 8 | python demo_glpn.py -i ../InSpaceType 9 | ``` 10 | 11 | The command generates report files for hierarchy (H0-H2). *-all means overall; H0-H2 means the level of hierarchy. H1_xx means scene space type number. See [space_type_def.yml](https://github.com/DepthComputation/InSpaceType_Benchmark/blob/main/space_type_def.yml) for space type number definition. 12 | -------------------------------------------------------------------------------- /IronDepth/README.md: -------------------------------------------------------------------------------- 1 | # Benchmark on paper Iterative Refinement of Single-View Depth using Surface Normal and its Uncertainty 2 | 3 | 1. Download InSpaceType eval set. Install the requirements by `pip install -r requirements.txt` 4 | 5 | 2. Go to this [Official Link](https://drive.google.com/drive/folders/1idIVqOrJOK6kuidBng1K8sth-CyOfcCj?usp=sharing), and 6 | 7 | * Download `*.pt` and place them under `./checkpoints`. Specifically, the 'checkpoints' folder should include four checkpoints: irondepth_* and normal_* 8 | 9 | 10 | 3. 11 | 12 | ``` 13 | python test.py --train_data nyuv2 --test_data custom 14 | ``` 15 | 16 | The command generates report files for hierarchy (H0-H2). *-all means overall; H0-H2 means the level of hierarchy. H1_xx means scene space type number. See [space_type_def.yml](https://github.com/DepthComputation/InSpaceType_Benchmark/blob/main/space_type_def.yml) for space type number definition. -------------------------------------------------------------------------------- /IronDepth/models_normal/NNET.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | from models_normal.submodules import EESNU 6 | 7 | 8 | class NNET(nn.Module): 9 | def __init__(self, args): 10 | super(NNET, self).__init__() 11 | self.min_kappa = 0.01 12 | self.output_dim = 1 13 | self.output_type = 'G' 14 | 15 | if args.NNET_architecture == 'BN': 16 | self.n_net = EESNU(BN=True) 17 | else: 18 | self.n_net = EESNU(BN=False) 19 | 20 | def forward(self, img, **kwargs): 21 | return self.n_net(img, **kwargs) 22 | 23 | -------------------------------------------------------------------------------- /IronDepth/requirements.txt: -------------------------------------------------------------------------------- 1 | torch==1.6.0 2 | torchvision==0.7.0 3 | Pillow 4 | numpy 5 | matplotlib 6 | argparse 7 | tqdm -------------------------------------------------------------------------------- /MIM/README.md: -------------------------------------------------------------------------------- 1 | # Benchmark on paper Revealing the Dark Secrets of Masked Image Modeling (Depth Estimation) 2 | 3 | 1. Download InSpaceType eval set. Install the requirements by `pip install -r requirements.txt` 4 | 5 | 2.
Download pretrained model 'nyudepthv2_swin_large.ckpt' from [Official Link](https://mailustceducn-my.sharepoint.com/:f:/g/personal/aa397601_mail_ustc_edu_cn/EkoYQyhiD6hJu9CGYLOwiF8BRqHgk8kX61NUcyfmdOUV7Q?e=h2uctw) and put it under the folder 'ckpt' 6 | 7 | 3. 8 | 9 | ``` 10 | python test.py --dataset nyudepthv2 --data_path ../data/ --max_depth 10.0 --max_depth_eval 10.0 --backbone swin_large_v2 --depths 2 2 18 2 --num_filters 32 32 32 --deconv_kernels 2 2 2 --window_size 30 30 30 15 --pretrain_window_size 12 12 12 6 --use_shift True True False False --flip_test --shift_window_test --shift_size 2 --do_evaluate --ckpt_dir ckpt/nyudepthv2_swin_large.ckpt 11 | ``` 12 | 13 | The command generates report files for hierarchy (H0-H2). *-all means overall; H0-H2 means the level of hierarchy. H1_xx means scene space type number. See [space_type_def.yml](https://github.com/DepthComputation/InSpaceType_Benchmark/blob/main/space_type_def.yml) for space type number definition. -------------------------------------------------------------------------------- /MIM/configs/test_options.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # The code is from GLPDepth (https://github.com/vinvino02/GLPDepth). 3 | # For non-commercial purpose only (research, evaluation etc). 4 | # ------------------------------------------------------------------------------ 5 | 6 | from configs.base_options import BaseOptions 7 | 8 | class TestOptions(BaseOptions): 9 | def initialize(self): 10 | parser = BaseOptions.initialize(self) 11 | parser.add_argument('--result_dir', type=str, default='./results', 12 | help='save result images into result_dir/exp_name') 13 | parser.add_argument('--ckpt_dir', type=str, 14 | default='./ckpt/best_model_nyu.ckpt', 15 | help='load ckpt path') 16 | 17 | parser.add_argument('--save_eval_pngs', action='store_true', 18 | help='save result image into evaluation form') 19 | parser.add_argument('--save_visualize', action='store_true', 20 | help='save result image into visualized form') 21 | parser.add_argument('--do_evaluate', action='store_true', 22 | help='evaluate with inferenced images') 23 | 24 | return parser 25 | 26 | 27 | -------------------------------------------------------------------------------- /MIM/dataset/imagepath.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # The code is from GLPDepth (https://github.com/vinvino02/GLPDepth). 3 | # For non-commercial purpose only (research, evaluation etc).
4 | # ------------------------------------------------------------------------------ 5 | 6 | import os 7 | import cv2 8 | from torch.utils.data import Dataset 9 | import torchvision.transforms as transforms 10 | 11 | class imagepath(Dataset): 12 | # for test only 13 | def __init__(self, data_path): 14 | super().__init__() 15 | 16 | self.data_path = data_path 17 | self.to_tensor = transforms.ToTensor() 18 | 19 | self.filenames_list = [os.path.join(data_path, i) for i in os.listdir(data_path) 20 | if i.split('.')[-1] in ['jpg', 'png']] 21 | 22 | print("Dataset : Image Path") 23 | print("# of images: %d" % (len(self.filenames_list))) 24 | 25 | def __len__(self): 26 | return len(self.filenames_list) 27 | 28 | def __getitem__(self, idx): 29 | batch = {} 30 | file = self.filenames_list[idx] 31 | filename = file.split('/')[-1] 32 | 33 | image = cv2.imread(file) # [H x W x C] and C: BGR 34 | image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) 35 | 36 | # input size should be multiple of 32 37 | h, w, c = image.shape 38 | new_h, new_w = h // 32 * 32, w // 32 * 32 39 | image = cv2.resize(image, (new_w, new_h)) 40 | image = self.to_tensor(image) 41 | 42 | batch['image'] = image 43 | batch['filename'] = filename 44 | 45 | return batch 46 | -------------------------------------------------------------------------------- /MIM/requirements.txt: -------------------------------------------------------------------------------- 1 | torch>=1.6.0 2 | h5py>=3.6.0 3 | scipy>=1.7.3 4 | opencv-python>=4.5.5 5 | mmcv>=1.4.3 6 | timm>=0.5.4 7 | albumentations>=1.1.0 8 | tensorboardX>=2.4.1 9 | gdown>=4.2.1 -------------------------------------------------------------------------------- /MIM/utils/criterion.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # The code is from GLPDepth (https://github.com/vinvino02/GLPDepth). 3 | # For non-commercial purpose only (research, evaluation etc). 4 | # ------------------------------------------------------------------------------ 5 | 6 | import torch 7 | import torch.nn as nn 8 | 9 | 10 | class SiLogLoss(nn.Module): 11 | def __init__(self, lambd=0.5): 12 | super().__init__() 13 | self.lambd = lambd 14 | 15 | def forward(self, pred, target): 16 | valid_mask = (target > 0).detach() 17 | diff_log = torch.log(target[valid_mask]) - torch.log(pred[valid_mask]) 18 | loss = torch.sqrt(torch.pow(diff_log, 2).mean() - 19 | self.lambd * torch.pow(diff_log.mean(), 2)) 20 | 21 | return loss 22 | 23 | -------------------------------------------------------------------------------- /NeWCRFs/README.md: -------------------------------------------------------------------------------- 1 | # Benchmark on paper NeW CRFs: Neural Window Fully-connected CRFs for Monocular Depth Estimation 2 | 3 | 1. Download InSpaceType eval set. Install [torch and torchvision](https://pytorch.org/get-started/previous-versions/) and packages: matplotlib, tqdm, tensorboardX, timm, mmcv, opencv-python 4 | 5 | 2. Download pretrained model 'model_nyu.ckpt' from [Official Link](https://virutalbuy-public.oss-cn-hangzhou.aliyuncs.com/share/newcrfs/models/model_nyu.ckpt) and put it here 6 | 7 | 3. 8 | 9 | ``` 10 | python newcrfs/test.py --data_path ./ --dataset nyu --filenames_file data_splits/split_files.txt --checkpoint_path model_nyu.ckpt --max_depth 10 11 | ``` 12 | 13 | The command generates report files for hierarchy (H0-H2). *-all means overall; H0-H2 means the level of hierarchy.
H1_xx means scene space type number. See [space_type_def.yml](https://github.com/DepthComputation/InSpaceType_Benchmark/blob/main/space_type_def.yml) for space type number definition. 14 | -------------------------------------------------------------------------------- /NeWCRFs/configs/arguments_eval_kittieigen.txt: -------------------------------------------------------------------------------- 1 | --model_name newcrfs_kittieigen 2 | --encoder large07 3 | --dataset kitti 4 | --input_height 352 5 | --input_width 1216 6 | --max_depth 80 7 | --do_kb_crop 8 | 9 | --data_path_eval datasets/kitti/ 10 | --gt_path_eval datasets/kitti/ 11 | --filenames_file_eval data_splits/eigen_test_files_with_gt.txt 12 | --min_depth_eval 1e-3 13 | --max_depth_eval 80 14 | --garg_crop 15 | 16 | --checkpoint_path model_zoo/model_kittieigen.ckpt -------------------------------------------------------------------------------- /NeWCRFs/configs/arguments_eval_nyu.txt: -------------------------------------------------------------------------------- 1 | --model_name newscrf_nyu 2 | --encoder large07 3 | --dataset nyu 4 | --input_height 480 5 | --input_width 640 6 | --max_depth 10 7 | 8 | --data_path_eval datasets/nyu/official_splits/test/ 9 | --gt_path_eval datasets/nyu/official_splits/test/ 10 | --filenames_file_eval data_splits/nyudepthv2_test_files_with_gt.txt 11 | --min_depth_eval 1e-3 12 | --max_depth_eval 10 13 | --eigen_crop 14 | 15 | --checkpoint_path model_zoo/model_nyu.ckpt -------------------------------------------------------------------------------- /NeWCRFs/configs/arguments_train_kittieigen.txt: -------------------------------------------------------------------------------- 1 | --mode train 2 | --model_name newcrfs_kittieigen 3 | --encoder large07 4 | --pretrain model_zoo/swin_transformer/swin_large_patch4_window7_224_22k.pth 5 | --dataset kitti 6 | --data_path datasets/kitti/ 7 | --gt_path datasets/kitti/ 8 | --filenames_file data_splits/eigen_train_files_with_gt.txt 9 | --batch_size 8 10 | --num_epochs 50 11 | --learning_rate 2e-5 12 | --weight_decay 1e-2 13 | --adam_eps 1e-3 14 | --num_threads 1 15 | --input_height 352 16 | --input_width 1120 17 | --max_depth 80 18 | --do_kb_crop 19 | --do_random_rotate 20 | --degree 1.0 21 | --log_directory ./models/ 22 | --multiprocessing_distributed 23 | --dist_url tcp://127.0.0.1:2345 24 | 25 | --log_freq 100 26 | --do_online_eval 27 | --eval_freq 1000 28 | --data_path_eval datasets/kitti/ 29 | --gt_path_eval datasets/kitti/ 30 | --filenames_file_eval data_splits/eigen_test_files_with_gt.txt 31 | --min_depth_eval 1e-3 32 | --max_depth_eval 80 33 | --garg_crop 34 | -------------------------------------------------------------------------------- /NeWCRFs/configs/arguments_train_nyu.txt: -------------------------------------------------------------------------------- 1 | --mode train 2 | --model_name newcrfs_nyu 3 | --encoder large07 4 | --pretrain model_zoo/swin_transformer/swin_large_patch4_window7_224_22k.pth 5 | --dataset nyu 6 | --data_path datasets/nyu/sync/ 7 | --gt_path datasets/nyu/sync/ 8 | --filenames_file data_splits/nyudepthv2_train_files_with_gt_dense.txt 9 | --batch_size 8 10 | --num_epochs 50 11 | --learning_rate 2e-5 12 | --weight_decay 1e-2 13 | --adam_eps 1e-3 14 | --num_threads 1 15 | --input_height 480 16 | --input_width 640 17 | --max_depth 10 18 | --do_random_rotate 19 | --degree 2.5 20 | --log_directory ./models/ 21 | --multiprocessing_distributed 22 | --dist_url tcp://127.0.0.1:2345 23 | 24 | --log_freq 100 25 | --do_online_eval 26 | 
--eval_freq 1000 27 | --data_path_eval datasets/nyu/official_splits/test/ 28 | --gt_path_eval datasets/nyu/official_splits/test/ 29 | --filenames_file_eval data_splits/nyudepthv2_test_files_with_gt.txt 30 | --min_depth_eval 1e-3 31 | --max_depth_eval 10 32 | --eigen_crop 33 | -------------------------------------------------------------------------------- /NeWCRFs/data_splits/test.txt: -------------------------------------------------------------------------------- 1 | files/0007_L.jpg -------------------------------------------------------------------------------- /NeWCRFs/newcrfs/dataloaders/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DepthComputation/InSpaceType_Benchmark/4ab9760dbdf9153fc2845afd584762509f9fe0e0/NeWCRFs/newcrfs/dataloaders/__init__.py -------------------------------------------------------------------------------- /NeWCRFs/newcrfs/networks/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DepthComputation/InSpaceType_Benchmark/4ab9760dbdf9153fc2845afd584762509f9fe0e0/NeWCRFs/newcrfs/networks/__init__.py -------------------------------------------------------------------------------- /PixelFormer/README.md: -------------------------------------------------------------------------------- 1 | # Benchmark on paper PixelFormer: Attention Attention Everywhere: Monocular Depth Prediction with Skip Attention 2 | 3 | 1. Download InSpaceType eval set. Install [torch and torchvision](https://pytorch.org/get-started/previous-versions/) and packages: matplotlib, tqdm, tensorboardX, timm, mmcv, opencv-python 4 | 5 | 2. Download pretrained model 'nyu.pth' from [Official Link](https://drive.google.com/drive/folders/1Feo67jEbccqa-HojTHG7ljTXOW2yuX-X?usp=share_link) and put it here 6 | 7 | 3. 8 | 9 | ``` 10 | python pixelformer/test.py --data_path ./ --dataset nyu --filenames_file data_splits/split_files.txt --checkpoint_path nyu.pth --max_depth 10 11 | ``` 12 | 13 | The command generates report files for hierarchy (H0-H2). *-all means overall; H0-H2 means the level of hierarchy. H1_xx means scene space type number. See [space_type_def.yml](https://github.com/DepthComputation/InSpaceType_Benchmark/blob/main/space_type_def.yml) for space type number definition.
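The benchmark READMEs above all state that the evaluation commands generate per-hierarchy report files (H0-H2), but the exact columns of those reports are not shown in this dump. The sketch below is only an illustration of the standard monocular-depth error metrics (AbsRel, RMSE, log10, δ < 1.25^k) that indoor evaluations of this kind typically report, with the NYU-style 1e-3 to 10 m valid-depth range taken from the config files above; the function name `depth_metrics` and the random arrays are assumptions for the example.

```python
# Sketch of the standard monocular-depth metrics such evaluation scripts are
# typically built around (min_depth_eval / max_depth_eval as in the configs).
import numpy as np


def depth_metrics(gt: np.ndarray, pred: np.ndarray,
                  min_depth: float = 1e-3, max_depth: float = 10.0) -> dict:
    mask = (gt > min_depth) & (gt < max_depth)           # keep only valid ground truth
    gt, pred = gt[mask], np.clip(pred[mask], min_depth, max_depth)

    thresh = np.maximum(gt / pred, pred / gt)            # ratio used for the delta accuracies
    return {
        "abs_rel": float(np.mean(np.abs(gt - pred) / gt)),
        "sq_rel": float(np.mean(((gt - pred) ** 2) / gt)),
        "rmse": float(np.sqrt(np.mean((gt - pred) ** 2))),
        "rmse_log": float(np.sqrt(np.mean((np.log(gt) - np.log(pred)) ** 2))),
        "log10": float(np.mean(np.abs(np.log10(gt) - np.log10(pred)))),
        "delta1": float(np.mean(thresh < 1.25)),
        "delta2": float(np.mean(thresh < 1.25 ** 2)),
        "delta3": float(np.mean(thresh < 1.25 ** 3)),
    }


if __name__ == "__main__":
    gt = np.random.uniform(0.5, 9.5, size=(480, 640))            # fake ground-truth depth
    pred = gt * np.random.uniform(0.9, 1.1, size=gt.shape)       # fake prediction within +-10%
    print(depth_metrics(gt, pred))
```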
-------------------------------------------------------------------------------- /PixelFormer/configs/arguments_eval_kittieigen.txt: -------------------------------------------------------------------------------- 1 | --model_name pixelformer_kittieigen 2 | --encoder large07 3 | --dataset kitti 4 | --input_height 352 5 | --input_width 1216 6 | --max_depth 80 7 | --do_kb_crop 8 | 9 | --data_path_eval dataset/KITTI 10 | --gt_path_eval dataset/kitti_gt 11 | --filenames_file_eval data_splits/eigen_test_files_with_gt.txt 12 | --min_depth_eval 1e-3 13 | --max_depth_eval 80 14 | --garg_crop 15 | 16 | --checkpoint_path pretrained/kitti.pth -------------------------------------------------------------------------------- /PixelFormer/configs/arguments_eval_nyu.txt: -------------------------------------------------------------------------------- 1 | --model_name pixelformer_nyu 2 | --encoder large07 3 | --dataset nyu 4 | --input_height 480 5 | --input_width 640 6 | --max_depth 10 7 | 8 | --data_path_eval datasets/nyu_depth_v2/official_splits/test/ 9 | --gt_path_eval datasets/nyu_depth_v2/official_splits/test/ 10 | --filenames_file_eval data_splits/nyudepthv2_test_files_with_gt.txt 11 | --min_depth_eval 1e-3 12 | --max_depth_eval 10 13 | --eigen_crop 14 | 15 | --checkpoint_path pretrained/nyu.pth 16 | -------------------------------------------------------------------------------- /PixelFormer/configs/arguments_train_kittieigen.txt: -------------------------------------------------------------------------------- 1 | --mode train 2 | --model_name pixelformer_kittieigen 3 | --encoder large07 4 | --pretrain pretrained/swin_large_patch4_window7_224_22k.pth 5 | --dataset kitti 6 | --data_path dataset/KITTI 7 | --gt_path dataset/kitti_gt 8 | --filenames_file data_splits/eigen_train_files_with_gt.txt 9 | --batch_size 8 10 | --num_epochs 20 11 | --learning_rate 4e-5 12 | --weight_decay 1e-2 13 | --adam_eps 1e-3 14 | --num_threads 1 15 | --input_height 352 16 | --input_width 1120 17 | --max_depth 80 18 | --do_kb_crop 19 | --do_random_rotate 20 | --degree 1.0 21 | --log_directory ./models/ 22 | --multiprocessing_distributed 23 | --dist_url tcp://127.0.0.1:1234 24 | 25 | --log_freq 100 26 | --do_online_eval 27 | --eval_freq 1000 28 | --data_path_eval dataset/KITTI 29 | --gt_path_eval kitti_gt 30 | --filenames_file_eval data_splits/eigen_test_files_with_gt.txt 31 | --min_depth_eval 1e-3 32 | --max_depth_eval 80 33 | --garg_crop 34 | -------------------------------------------------------------------------------- /PixelFormer/configs/arguments_train_nyu.txt: -------------------------------------------------------------------------------- 1 | --mode train 2 | --model_name pixelformer_nyu 3 | --encoder large07 4 | --pretrain pretrained/swin_large_patch4_window7_224_22k.pth 5 | --dataset nyu 6 | --data_path datasets/nyu_depth_v2/sync/ 7 | --gt_path datasets/nyu_depth_v2/sync/ 8 | --filenames_file data_splits/nyudepthv2_train_files_with_gt_dense.txt 9 | --batch_size 8 10 | --num_epochs 20 11 | --learning_rate 4e-5 12 | --weight_decay 1e-2 13 | --adam_eps 1e-3 14 | --num_threads 1 15 | --input_height 480 16 | --input_width 640 17 | --max_depth 10 18 | --do_random_rotate 19 | --degree 2.5 20 | --log_directory ./models/ 21 | --multiprocessing_distributed 22 | --dist_url tcp://127.0.0.1:2349 23 | 24 | --log_freq 100 25 | --do_online_eval 26 | --eval_freq 1000 27 | --data_path_eval datasets/nyu_depth_v2/official_splits/test/ 28 | --gt_path_eval datasets/nyu_depth_v2/official_splits/test/ 29 | --filenames_file_eval 
data_splits/nyudepthv2_test_files_with_gt.txt 30 | --min_depth_eval 1e-3 31 | --max_depth_eval 10 32 | --eigen_crop 33 | -------------------------------------------------------------------------------- /PixelFormer/data_splits/test.txt: -------------------------------------------------------------------------------- 1 | files/0007_L.jpg -------------------------------------------------------------------------------- /PixelFormer/pixelformer/dataloaders/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DepthComputation/InSpaceType_Benchmark/4ab9760dbdf9153fc2845afd584762509f9fe0e0/PixelFormer/pixelformer/dataloaders/__init__.py -------------------------------------------------------------------------------- /PixelFormer/pixelformer/networks/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DepthComputation/InSpaceType_Benchmark/4ab9760dbdf9153fc2845afd584762509f9fe0e0/PixelFormer/pixelformer/networks/__init__.py -------------------------------------------------------------------------------- /Unidepth/README.md: -------------------------------------------------------------------------------- 1 | # Benchmark on paper Unidepth 2 | 3 | 1. Download InSpaceType eval set and put the data under 'InSpaceType' under the root. 4 | 5 | InSpaceType_Benchmark 6 | | - InSpaceType 7 | |- 0001.pfm 8 | |- 0001_L.jpg 9 | .... 10 | | - Method 1 11 | | - Method 2 12 | ...... 13 | 14 | Install [torch and torchvision](https://pytorch.org/get-started/previous-versions/) and packages: matplotlib, tqdm, opencv-python, xFormers 15 | 16 | 2. 17 | 18 | ``` 19 | python demo.py --img-path ../InSpaceType --outdir ./vis_depth 20 | ``` 21 | 22 | The command generates report files for hierarchy (H0-H2). *-all means overall; H0-H2 means the level of hierarchy. H1_xx means scene space type number. See [space_type_def.yml](https://github.com/DepthComputation/InSpaceType_Benchmark/blob/main/space_type_def.yml) for space type number definition.
23 | Colored visualizations in metric depth are saved under --outdir -------------------------------------------------------------------------------- /Unidepth/assets/demo/depth.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DepthComputation/InSpaceType_Benchmark/4ab9760dbdf9153fc2845afd584762509f9fe0e0/Unidepth/assets/demo/depth.png -------------------------------------------------------------------------------- /Unidepth/assets/demo/intrinsics.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DepthComputation/InSpaceType_Benchmark/4ab9760dbdf9153fc2845afd584762509f9fe0e0/Unidepth/assets/demo/intrinsics.npy -------------------------------------------------------------------------------- /Unidepth/assets/demo/output.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DepthComputation/InSpaceType_Benchmark/4ab9760dbdf9153fc2845afd584762509f9fe0e0/Unidepth/assets/demo/output.png -------------------------------------------------------------------------------- /Unidepth/assets/demo/rgb.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DepthComputation/InSpaceType_Benchmark/4ab9760dbdf9153fc2845afd584762509f9fe0e0/Unidepth/assets/demo/rgb.png -------------------------------------------------------------------------------- /Unidepth/assets/docs/nuscenes_surround.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DepthComputation/InSpaceType_Benchmark/4ab9760dbdf9153fc2845afd584762509f9fe0e0/Unidepth/assets/docs/nuscenes_surround.gif -------------------------------------------------------------------------------- /Unidepth/assets/docs/theoffice.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DepthComputation/InSpaceType_Benchmark/4ab9760dbdf9153fc2845afd584762509f9fe0e0/Unidepth/assets/docs/theoffice.gif -------------------------------------------------------------------------------- /Unidepth/assets/docs/unidepth-banner.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DepthComputation/InSpaceType_Benchmark/4ab9760dbdf9153fc2845afd584762509f9fe0e0/Unidepth/assets/docs/unidepth-banner.png -------------------------------------------------------------------------------- /Unidepth/configs/config_v1_cnvnxtl.json: -------------------------------------------------------------------------------- 1 | { 2 | "generic": { 3 | "seed": 13 4 | }, 5 | "training": { 6 | }, 7 | "data": { 8 | "image_shape": [462, 616] 9 | }, 10 | "model": { 11 | "name": "UniDepthV1", 12 | "num_heads": 8, 13 | "expansion": 4, 14 | "pixel_decoder": { 15 | "hidden_dim": 512, 16 | "depths": [3, 2, 1], 17 | "dropout": 0.0 18 | }, 19 | "pixel_encoder": { 20 | "name": "convnext_large", 21 | "pretrained": null 22 | } 23 | } 24 | } -------------------------------------------------------------------------------- /Unidepth/configs/config_v1_vitl14.json: -------------------------------------------------------------------------------- 1 | { 2 | "generic": { 3 | "seed": 13 4 | }, 5 | "training": {}, 6 | "data": { 7 | "image_shape": [462, 616] 8 | }, 9 | "model": { 10 | "name": "UniDepthV1", 11 | "num_heads": 8, 12 | "expansion": 4, 13 | "pixel_decoder": { 14 |
"hidden_dim": 512, 15 | "depths": [3, 2, 1], 16 | "dropout": 0.0 17 | }, 18 | "pixel_encoder": { 19 | "name": "dinov2_vitl14", 20 | "pretrained": null 21 | } 22 | } 23 | } -------------------------------------------------------------------------------- /Unidepth/install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | NAME=${1} 3 | VENV_DIR=${2} 4 | 5 | python -m venv ${VENV_DIR}/${NAME} 6 | 7 | source ${VENV_DIR}/${NAME}/bin/activate 8 | 9 | pip install torch==2.2.0 torchvision==0.17.0 torchaudio==2.2.0 --index-url https://download.pytorch.org/whl/cu118 10 | pip install -e . 11 | pip install xformers==0.0.24 --index-url https://download.pytorch.org/whl/cu118 12 | -------------------------------------------------------------------------------- /Unidepth/pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [tool.pyright] 6 | include = ["unidepth"] 7 | 8 | [project] 9 | name = "unidepth" 10 | version = "0.1" 11 | authors = [{name = "Luigi Piccinelli", email = "lpiccinelli@ethz.ch"}] 12 | description = "UniDepth: Universal Monocular Metric Depth Estimation" 13 | readme = "README.md" 14 | license = { text="Creatives Common BY-NC 4.0 license"} 15 | requires-python = ">=3.10.0" 16 | dynamic = ["dependencies"] 17 | 18 | [tool.setuptools.dynamic] 19 | dependencies = {file = ["requirements.txt"]} 20 | 21 | [tool.setuptools.package-data] 22 | "*" = ["py.typed"] 23 | 24 | [tool.setuptools.packages.find] 25 | include = ["unidepth*"] 26 | -------------------------------------------------------------------------------- /Unidepth/requirements.txt: -------------------------------------------------------------------------------- 1 | appdirs 2 | attrs 3 | black 4 | blosc2 5 | botocore==1.34.54 6 | certifi==2022.12.7 7 | charset-normalizer 8 | click 9 | contourpy 10 | cycler 11 | docker-pycreds 12 | einops==0.7.0 13 | filelock 14 | flake8==7.0.0 15 | flake8-bugbear==24.2.6 16 | flake8-comprehensions==3.14.0 17 | fonttools 18 | fsspec 19 | fvcore==0.1.5.post20221221 20 | gitdb 21 | GitPython 22 | h5py>=3.10.0 23 | huggingface-hub>=0.22.0 24 | idna 25 | imageio 26 | imath 27 | iopath 28 | isort 29 | Jinja2 30 | jmespath 31 | kiwisolver 32 | MarkupSafe 33 | matplotlib 34 | mccabe 35 | mpmath 36 | msgpack 37 | mypy-extensions 38 | ndindex 39 | networkx 40 | ninja 41 | numexpr 42 | numpy 43 | opencv-python 44 | OpenEXR 45 | packaging 46 | pandas 47 | pathspec 48 | pillow==10.2.0 49 | platformdirs 50 | portalocker 51 | protobuf==4.25.3 52 | psutil 53 | py-cpuinfo 54 | pycodestyle 55 | pyflakes 56 | pyparsing 57 | python-dateutil 58 | pytz 59 | PyYAML 60 | requests 61 | safetensors 62 | scipy 63 | sentry-sdk 64 | setproctitle 65 | six 66 | smmap 67 | sympy 68 | tables 69 | tabulate 70 | termcolor 71 | timm 72 | tqdm 73 | triton==2.2.0 74 | typing_extensions 75 | tzdata==2024.1 76 | urllib3==1.26.13 77 | wandb 78 | yacs 79 | torch==2.2.0 80 | torchvision==0.17.0 81 | torchaudio==2.2.0 82 | xformers==0.0.24 -------------------------------------------------------------------------------- /Unidepth/unidepth/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DepthComputation/InSpaceType_Benchmark/4ab9760dbdf9153fc2845afd584762509f9fe0e0/Unidepth/unidepth/__init__.py -------------------------------------------------------------------------------- 
/Unidepth/unidepth/layers/__init__.py: -------------------------------------------------------------------------------- 1 | from .activation import SwiGLU, GEGLU 2 | from .convnext import CvnxtBlock 3 | from .attention import AttentionBlock, AttentionDecoderBlock 4 | from .nystrom_attention import NystromBlock 5 | from .positional_encoding import PositionEmbeddingSine 6 | from .upsample import ConvUpsample, ConvUpsampleShuffle 7 | from .mlp import MLP 8 | 9 | 10 | __all__ = [ 11 | "SwiGLU", 12 | "GEGLU", 13 | "CvnxtBlock", 14 | "AttentionBlock", 15 | "NystromBlock", 16 | "PositionEmbeddingSine", 17 | "ConvUpsample", 18 | "MLP", 19 | "ConvUpsampleShuffle", 20 | "AttentionDecoderBlock", 21 | ] 22 | -------------------------------------------------------------------------------- /Unidepth/unidepth/layers/activation.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | 6 | class SwiGLU(nn.Module): 7 | def forward(self, x: torch.Tensor) -> torch.Tensor: 8 | x, gates = x.chunk(2, dim=-1) 9 | return x * F.silu(gates) 10 | 11 | 12 | class GEGLU(nn.Module): 13 | def forward(self, x: torch.Tensor) -> torch.Tensor: 14 | x, gates = x.chunk(2, dim=-1) 15 | return x * F.gelu(gates) 16 | -------------------------------------------------------------------------------- /Unidepth/unidepth/layers/convnext.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | class CvnxtBlock(nn.Module): 6 | def __init__( 7 | self, 8 | dim, 9 | kernel_size=7, 10 | layer_scale=1.0, 11 | expansion=4, 12 | dilation=1, 13 | ): 14 | super().__init__() 15 | self.dwconv = nn.Conv2d( 16 | dim, 17 | dim, 18 | kernel_size=kernel_size, 19 | padding="same", 20 | groups=dim, 21 | dilation=dilation, 22 | ) # depthwise conv 23 | self.norm = nn.LayerNorm(dim, eps=1e-6) 24 | self.pwconv1 = nn.Linear( 25 | dim, expansion * dim 26 | ) # pointwise/1x1 convs, implemented with linear layers 27 | self.act = nn.GELU() 28 | self.pwconv2 = nn.Linear(expansion * dim, dim) 29 | self.gamma = ( 30 | nn.Parameter(layer_scale * torch.ones((dim))) if layer_scale > 0.0 else 1.0 31 | ) 32 | 33 | def forward(self, x): 34 | input = x 35 | x = self.dwconv(x) 36 | x = x.permute(0, 2, 3, 1) # (N, C, H, W) -> (N, H, W, C) 37 | x = self.norm(x) 38 | x = self.pwconv1(x) 39 | x = self.act(x) 40 | x = self.pwconv2(x) 41 | 42 | x = self.gamma * x 43 | x = input + x.permute(0, 3, 1, 2) # (N, H, W, C) -> (N, C, H, W) 44 | return x 45 | -------------------------------------------------------------------------------- /Unidepth/unidepth/layers/drop_path.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | def drop_path(x: torch.Tensor, drop_prob: float = 0.0, training: bool = False): 6 | if drop_prob == 0.0 or not training: 7 | return x 8 | keep_prob = 1 - drop_prob 9 | shape = (x.shape[0],) + (1,) * ( 10 | x.ndim - 1 11 | ) # work with diff dim tensors, not just 2D ConvNets 12 | random_tensor = x.new_empty(shape).bernoulli_(keep_prob) 13 | if keep_prob > 0.0: 14 | random_tensor.div_(keep_prob) 15 | output = x * random_tensor 16 | return output 17 | 18 | 19 | class DropPath(nn.Module): 20 | def __init__(self, drop_prob=None): 21 | super(DropPath, self).__init__() 22 | self.drop_prob = drop_prob 23 | 24 | def forward(self, x): 25 | return drop_path(x, self.drop_prob, self.training) 26 | 
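A quick shape-check sketch (an illustration, not repository code) for the gated activations in `unidepth/layers/activation.py` above: both `SwiGLU` and `GEGLU` split the last dimension in half and gate one half with the other, so a tensor with 2*d channels comes out with d channels. This halving is the usual reason SwiGLU-style feed-forward blocks scale their hidden width by roughly 2/3. The tensor shapes in the driver are assumptions; the class body is copied from the file above so the sketch runs standalone.

```python
# Sketch: the gated activation above halves the channel dimension.
import torch
import torch.nn.functional as F


class SwiGLU(torch.nn.Module):
    # Same as unidepth/layers/activation.py above.
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x, gates = x.chunk(2, dim=-1)   # split channels into value and gate halves
        return x * F.silu(gates)


if __name__ == "__main__":
    act = SwiGLU()
    x = torch.randn(2, 16, 512)         # e.g. hidden activations with 2*d channels
    y = act(x)
    print(y.shape)                      # torch.Size([2, 16, 256]) -- half the channels
```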
-------------------------------------------------------------------------------- /Unidepth/unidepth/layers/layer_scale.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | class LayerScale(nn.Module): 6 | def __init__( 7 | self, 8 | dim: int, 9 | init_values = 1e-5, 10 | inplace = False, 11 | ) -> None: 12 | super().__init__() 13 | self.inplace = inplace 14 | self.gamma = nn.Parameter(init_values * torch.ones(dim)) 15 | 16 | def forward(self, x: torch.Tensor) -> torch.Tensor: 17 | return x.mul_(self.gamma) if self.inplace else x * self.gamma 18 | -------------------------------------------------------------------------------- /Unidepth/unidepth/layers/mlp.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | from unidepth.utils.misc import default 5 | from .activation import SwiGLU 6 | 7 | 8 | class MLP(nn.Module): 9 | def __init__( 10 | self, 11 | input_dim: int, 12 | expansion: int = 4, 13 | dropout: float = 0.0, 14 | gated: bool = False, 15 | output_dim = None, 16 | ): 17 | super().__init__() 18 | if gated: 19 | expansion = int(expansion * 2 / 3) 20 | hidden_dim = int(input_dim * expansion) 21 | output_dim = default(output_dim, input_dim) 22 | self.norm = nn.LayerNorm(input_dim) 23 | self.proj1 = nn.Linear(input_dim, hidden_dim) 24 | self.proj2 = nn.Linear(hidden_dim, output_dim) 25 | self.act = nn.GELU() if not gated else SwiGLU() 26 | self.dropout = nn.Dropout(dropout) if dropout > 0.0 else nn.Identity() 27 | 28 | def forward(self, x: torch.Tensor) -> torch.Tensor: 29 | x = self.norm(x) 30 | x = self.proj1(x) 31 | x = self.act(x) 32 | x = self.proj2(x) 33 | x = self.dropout(x) 34 | return x 35 | -------------------------------------------------------------------------------- /Unidepth/unidepth/models/__init__.py: -------------------------------------------------------------------------------- 1 | # from .unidepthv1 import UniDepthV1 2 | 3 | # __all__ = [ 4 | # "UniDepthV1", 5 | # ] 6 | -------------------------------------------------------------------------------- /Unidepth/unidepth/models/backbones/__init__.py: -------------------------------------------------------------------------------- 1 | from .convnext2 import ConvNeXtV2 2 | from .convnext import ConvNeXt 3 | from .dinov2 import _make_dinov2_model 4 | 5 | __all__ = [ 6 | "ConvNeXt", 7 | "ConvNeXtV2", 8 | "_make_dinov2_model", 9 | ] 10 | -------------------------------------------------------------------------------- /Unidepth/unidepth/models/backbones/metadinov2/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | from .dino_head import DINOHead 8 | from .mlp import Mlp 9 | from .patch_embed import PatchEmbed 10 | from .swiglu_ffn import SwiGLUFFN, SwiGLUFFNFused 11 | from .block import NestedTensorBlock 12 | from .attention import MemEffAttention 13 | -------------------------------------------------------------------------------- /Unidepth/unidepth/models/backbones/metadinov2/drop_path.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 
3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | # References: 8 | # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py 9 | # https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/drop.py 10 | 11 | 12 | import torch.nn as nn 13 | 14 | 15 | def drop_path(x, drop_prob: float = 0.0, training: bool = False): 16 | if drop_prob == 0.0 or not training: 17 | return x 18 | keep_prob = 1 - drop_prob 19 | shape = (x.shape[0],) + (1,) * ( 20 | x.ndim - 1 21 | ) # work with diff dim tensors, not just 2D ConvNets 22 | random_tensor = x.new_empty(shape).bernoulli_(keep_prob) 23 | if keep_prob > 0.0: 24 | random_tensor.div_(keep_prob) 25 | output = x * random_tensor 26 | return output 27 | 28 | 29 | class DropPath(nn.Module): 30 | """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" 31 | 32 | def __init__(self, drop_prob=None): 33 | super(DropPath, self).__init__() 34 | self.drop_prob = drop_prob 35 | 36 | def forward(self, x): 37 | return drop_path(x, self.drop_prob, self.training) 38 | -------------------------------------------------------------------------------- /Unidepth/unidepth/models/backbones/metadinov2/layer_scale.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | # Modified from: https://github.com/huggingface/pytorch-image-models/blob/main/timm/models/vision_transformer.py#L103-L110 8 | 9 | from typing import Union 10 | 11 | import torch 12 | from torch import Tensor 13 | import torch.nn as nn 14 | 15 | 16 | class LayerScale(nn.Module): 17 | def __init__( 18 | self, 19 | dim: int, 20 | init_values: Union[float, Tensor] = 1e-5, 21 | inplace: bool = False, 22 | ) -> None: 23 | super().__init__() 24 | self.inplace = inplace 25 | self.gamma = nn.Parameter(init_values * torch.ones(dim)) 26 | 27 | def forward(self, x: Tensor) -> Tensor: 28 | return x.mul_(self.gamma) if self.inplace else x * self.gamma 29 | -------------------------------------------------------------------------------- /Unidepth/unidepth/models/backbones/metadinov2/mlp.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | 7 | # References: 8 | # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py 9 | # https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/mlp.py 10 | 11 | 12 | from typing import Callable, Optional 13 | 14 | from torch import Tensor, nn 15 | 16 | 17 | class Mlp(nn.Module): 18 | def __init__( 19 | self, 20 | in_features: int, 21 | hidden_features: Optional[int] = None, 22 | out_features: Optional[int] = None, 23 | act_layer: Callable[..., nn.Module] = nn.GELU, 24 | drop: float = 0.0, 25 | bias: bool = True, 26 | ) -> None: 27 | super().__init__() 28 | out_features = out_features or in_features 29 | hidden_features = hidden_features or in_features 30 | self.fc1 = nn.Linear(in_features, hidden_features, bias=bias) 31 | self.act = act_layer() 32 | self.fc2 = nn.Linear(hidden_features, out_features, bias=bias) 33 | self.drop = nn.Dropout(drop) 34 | 35 | def forward(self, x: Tensor) -> Tensor: 36 | x = self.fc1(x) 37 | x = self.act(x) 38 | x = self.drop(x) 39 | x = self.fc2(x) 40 | x = self.drop(x) 41 | return x 42 | -------------------------------------------------------------------------------- /Unidepth/unidepth/models/unidepthv1/__init__.py: -------------------------------------------------------------------------------- 1 | # from .unidepthv1 import UniDepthV1 2 | 3 | # __all__ = [ 4 | # "UniDepthV1", 5 | # ] 6 | -------------------------------------------------------------------------------- /Unidepth/unidepth/ops/__init__.py: -------------------------------------------------------------------------------- 1 | from .losses import SILog, MSE, SelfCons 2 | from .scheduler import CosineScheduler 3 | 4 | __all__ = [ 5 | "SILog", 6 | "MSE", 7 | "SelfCons", 8 | "CosineScheduler", 9 | ] 10 | -------------------------------------------------------------------------------- /Unidepth/unidepth/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .evaluation_depth import eval_depth, DICT_METRICS 2 | from .visualization import colorize, image_grid, log_train_artifacts 3 | from .misc import format_seconds, remove_padding, get_params, identity 4 | from .distributed import ( 5 | is_main_process, 6 | setup_multi_processes, 7 | setup_slurm, 8 | sync_tensor_across_gpus, 9 | barrier, 10 | get_rank, 11 | get_dist_info, 12 | ) 13 | from .geometric import unproject_points, spherical_zbuffer_to_euclidean 14 | 15 | __all__ = [ 16 | "eval_depth", 17 | "DICT_METRICS", 18 | "colorize", 19 | "image_grid", 20 | "log_train_artifacts", 21 | "format_seconds", 22 | "remove_padding", 23 | "get_params", 24 | "identity", 25 | "is_main_process", 26 | "setup_multi_processes", 27 | "setup_slurm", 28 | "sync_tensor_across_gpus", 29 | "barrier", 30 | "get_rank", 31 | "unproject_points", 32 | "spherical_zbuffer_to_euclidean", 33 | "validate", 34 | "get_dist_info", 35 | ] 36 | -------------------------------------------------------------------------------- /Unidepth/unidepth/utils/constants.py: -------------------------------------------------------------------------------- 1 | """ 2 | Author: Luigi Piccinelli 3 | Licensed under the CC-BY NC 4.0 license (http://creativecommons.org/licenses/by-nc/4.0/) 4 | """ 5 | 6 | import math 7 | import torch 8 | 9 | OPENAI_DATASET_MEAN = (0.48145466, 0.4578275, 0.40821073) 10 | OPENAI_DATASET_STD = (0.26862954, 0.26130258, 0.27577711) 11 | IMAGENET_DATASET_MEAN = (0.485, 0.456, 0.406) 12 | IMAGENET_DATASET_STD = (0.229, 0.224, 0.225) 13 | DEPTH_BINS = torch.cat( 14 | ( 15 | 
torch.logspace(math.log10(0.1), math.log10(180.0), steps=512), 16 | torch.tensor([260.0]), 17 | ), 18 | dim=0, 19 | ) 20 | LOGERR_BINS = torch.linspace(-2, 2, steps=128 + 1) 21 | LINERR_BINS = torch.linspace(-50, 50, steps=256 + 1) 22 | -------------------------------------------------------------------------------- /VPD/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Wenliang Zhao 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /VPD/README.md: -------------------------------------------------------------------------------- 1 | # VPD 2 | 3 | 1. Follow the [VPD](https://github.com/wl-zhao/VPD) Installation section (download the stable diffusion models and install the stable-diffusion package). The VPD repo uses SD v1-5. 4 | 5 | 2. Follow the first step of [VPD depth](https://github.com/wl-zhao/VPD/blob/main/depth/README.md) to install mmcv and the requirements. Then download the [VPD depth pretrained](https://cloud.tsinghua.edu.cn/f/7e4adc76cc9b4200ac79/?dl=1) checkpoint and put it under checkpoints/ 6 | 7 | 3. Download the InSpaceType eval set and put it under the root folder. 8 | 9 | 4. Run the evaluation: 10 | 11 | ``` 12 | cd depth 13 | 14 | bash test.sh ../checkpoints/vpd_depth_480x480.pth 15 | ``` 16 | 17 | The command generates report files for the hierarchy (H0-H2). *-all means overall; H0-H2 means the level of hierarchy; H1_xx means the scene space type number. See [space_type_def.yml](https://github.com/DepthComputation/InSpaceType_Benchmark/blob/main/space_type_def.yml) for the space type number definitions. -------------------------------------------------------------------------------- /VPD/depth/README.md: -------------------------------------------------------------------------------- 1 | # Depth Estimation with VPD 2 | ## Getting Started 3 | 4 | 1. Install the [mmcv-full](https://github.com/open-mmlab/mmcv) library and some required packages. 5 | 6 | ```bash 7 | pip install openmim 8 | mim install mmcv-full 9 | pip install -r requirements.txt 10 | ``` 11 | 12 | 2. Prepare the NYUDepthV2 dataset following [GLPDepth](https://github.com/vinvino02/GLPDepth) and [BTS](https://github.com/cleinc/bts/tree/master). 
13 | 14 | ``` 15 | mkdir nyu_depth_v2 16 | wget http://horatio.cs.nyu.edu/mit/silberman/nyu_depth_v2/nyu_depth_v2_labeled.mat 17 | python extract_official_train_test_set_from_mat.py nyu_depth_v2_labeled.mat splits.mat ./nyu_depth_v2/official_splits/ 18 | ``` 19 | 20 | Download sync.zip provided by the authors of BTS from this [url](https://drive.google.com/file/d/1AysroWpfISmm-yRFGBgFTrLy6FjQwvwP/view) and unzip it in the `./nyu_depth_v2` folder. 21 | 22 | Your dataset directory should be: 23 | 24 | ``` 25 | │nyu_depth_v2/ 26 | ├──official_splits/ 27 | │ ├── test 28 | │ ├── train 29 | ├──sync/ 30 | ``` 31 | 32 | ## Results and Fine-tuned Models 33 | 34 | | | RMSE | d1 | d2 | d3 | REL | log_10 | Fine-tuned Model | 35 | |-------------------|-------|-------|--------|--------|--------|-------|-------| 36 | | **VPD** | 0.254 | 0.964 | 0.995 | 0.999 | 0.069 | 0.030 |[Tsinghua Cloud](https://cloud.tsinghua.edu.cn/f/7e4adc76cc9b4200ac79/?dl=1) | 37 | 38 | We offer the predicted depths in 16-bit format for the NYU-Depth-v2 official test set [here](https://cloud.tsinghua.edu.cn/f/27354f47ba424bb3ad40/?dl=1). 39 | 40 | ## Training 41 | 42 | Run the following instruction to train the VPD-Depth model. We recommend using 8 NVIDIA V100 GPUs to train the model with a total batch size of 24. 43 | 44 | ``` 45 | bash train.sh <LOG_DIR> 46 | ``` 47 | 48 | ## Evaluation 49 | Command format: 50 | ``` 51 | bash test.sh <CKPT_PATH> 52 | ``` -------------------------------------------------------------------------------- /VPD/depth/configs/test_options.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # The code is from GLPDepth (https://github.com/vinvino02/GLPDepth). 3 | # For non-commercial purpose only (research, evaluation etc). 
4 | # ------------------------------------------------------------------------------ 5 | 6 | from configs.base_options import BaseOptions 7 | 8 | class TestOptions(BaseOptions): 9 | def initialize(self): 10 | parser = BaseOptions.initialize(self) 11 | 12 | # experiment configs 13 | parser.add_argument('--ckpt_dir', type=str, 14 | default='./ckpt/best_model_nyu.ckpt', 15 | help='load ckpt path') 16 | parser.add_argument('--result_dir', type=str, default='./results', 17 | help='save result images into result_dir/exp_name') 18 | parser.add_argument('--crop_h', type=int, default=448) 19 | parser.add_argument('--crop_w', type=int, default=576) 20 | 21 | parser.add_argument('--save_eval_pngs', action='store_true', 22 | help='save result image into evaluation form') 23 | parser.add_argument('--save_visualize', action='store_true', 24 | help='save result image into visualized form') 25 | return parser 26 | 27 | 28 | -------------------------------------------------------------------------------- /VPD/depth/dataset/filenames/nyudepthv2/split_files_sml.txt: -------------------------------------------------------------------------------- 1 | /home/choyingw/Documents/ZED/1000_select_split/0001_L.jpg 2 | /home/choyingw/Documents/ZED/1000_select_split/0002_L.jpg 3 | /home/choyingw/Documents/ZED/1000_select_split/0003_L.jpg 4 | /home/choyingw/Documents/ZED/1000_select_split/0004_L.jpg 5 | /home/choyingw/Documents/ZED/1000_select_split/0005_L.jpg 6 | /home/choyingw/Documents/ZED/1000_select_split/0006_L.jpg 7 | /home/choyingw/Documents/ZED/1000_select_split/0007_L.jpg 8 | /home/choyingw/Documents/ZED/1000_select_split/0008_L.jpg 9 | /home/choyingw/Documents/ZED/1000_select_split/0009_L.jpg 10 | /home/choyingw/Documents/ZED/1000_select_split/0010_L.jpg 11 | /home/choyingw/Documents/ZED/1000_select_split/0011_L.jpg 12 | /home/choyingw/Documents/ZED/1000_select_split/0012_L.jpg 13 | /home/choyingw/Documents/ZED/1000_select_split/0013_L.jpg 14 | -------------------------------------------------------------------------------- /VPD/depth/dataset/imagepath.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # The code is from GLPDepth (https://github.com/vinvino02/GLPDepth). 3 | # For non-commercial purpose only (research, evaluation etc). 
4 | # ------------------------------------------------------------------------------ 5 | 6 | import os 7 | import cv2 8 | from torch.utils.data import Dataset 9 | import torchvision.transforms as transforms 10 | 11 | class imagepath(Dataset): 12 | # for test only 13 | def __init__(self, data_path): 14 | super().__init__() 15 | 16 | self.data_path = data_path 17 | self.to_tensor = transforms.ToTensor() 18 | 19 | self.filenames_list = [os.path.join(data_path, i) for i in os.listdir(data_path) 20 | if i.split('.')[-1] in ['jpg', 'png']] 21 | 22 | print("Dataset : Image Path") 23 | print("# of images: %d" % (len(self.filenames_list))) 24 | 25 | def __len__(self): 26 | return len(self.filenames_list) 27 | 28 | def __getitem__(self, idx): 29 | batch = {} 30 | file = self.filenames_list[idx] 31 | filename = file.split('/')[-1] 32 | 33 | image = cv2.imread(file) # [H x W x C] and C: BGR 34 | image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) 35 | 36 | # input size should be multiple of 32 37 | h, w, c = image.shape 38 | new_h, new_w = h // 32 * 32, w // 32 * 32 39 | image = cv2.resize(image, (new_w, new_h)) 40 | image = self.to_tensor(image) 41 | 42 | batch['image'] = image 43 | batch['filename'] = filename 44 | 45 | return batch 46 | -------------------------------------------------------------------------------- /VPD/depth/nyu_class_embeddings.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DepthComputation/InSpaceType_Benchmark/4ab9760dbdf9153fc2845afd584762509f9fe0e0/VPD/depth/nyu_class_embeddings.pth -------------------------------------------------------------------------------- /VPD/depth/nyu_class_list.json: -------------------------------------------------------------------------------- 1 | ["printer_room", 2 | "bathroom", 3 | "living_room", 4 | "study", 5 | "conference_room", 6 | "study_room", 7 | "kitchen", 8 | "home_office", 9 | "bedroom", 10 | "dinette", 11 | "playroom", 12 | "indoor_balcony", 13 | "laundry_room", 14 | "basement", 15 | "excercise_room", 16 | "foyer", 17 | "home_storage", 18 | "cafe", 19 | "furniture_store", 20 | "office_kitchen", 21 | "student_lounge", 22 | "dining_room", 23 | "reception_room", 24 | "computer_lab", 25 | "classroom", 26 | "office", 27 | "bookstore"] -------------------------------------------------------------------------------- /VPD/depth/requirements.txt: -------------------------------------------------------------------------------- 1 | torch>=1.6.0 2 | h5py>=3.6.0 3 | scipy>=1.7.3 4 | opencv-python>=4.5.5 5 | timm>=0.5.4 6 | albumentations>=1.1.0 7 | tensorboardX>=2.4.1 8 | gdown>=4.2.1 -------------------------------------------------------------------------------- /VPD/depth/splits.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DepthComputation/InSpaceType_Benchmark/4ab9760dbdf9153fc2845afd584762509f9fe0e0/VPD/depth/splits.mat -------------------------------------------------------------------------------- /VPD/depth/src/clip/.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | *.py[cod] 3 | *$py.class 4 | *.egg-info 5 | .pytest_cache 6 | .ipynb_checkpoints 7 | 8 | thumbs.db 9 | .DS_Store 10 | .idea 11 | -------------------------------------------------------------------------------- /VPD/depth/src/clip/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 OpenAI 4 | 5 
| Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | 23 | -------------------------------------------------------------------------------- /VPD/depth/src/clip/MANIFEST.in: -------------------------------------------------------------------------------- 1 | include clip/bpe_simple_vocab_16e6.txt.gz 2 | -------------------------------------------------------------------------------- /VPD/depth/src/clip/clip/__init__.py: -------------------------------------------------------------------------------- 1 | from .clip import * 2 | -------------------------------------------------------------------------------- /VPD/depth/src/clip/clip/bpe_simple_vocab_16e6.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DepthComputation/InSpaceType_Benchmark/4ab9760dbdf9153fc2845afd584762509f9fe0e0/VPD/depth/src/clip/clip/bpe_simple_vocab_16e6.txt.gz -------------------------------------------------------------------------------- /VPD/depth/src/clip/data/country211.md: -------------------------------------------------------------------------------- 1 | # The Country211 Dataset 2 | 3 | In the paper, we used an image classification dataset called Country211, to evaluate the model's capability on geolocation. To do so, we filtered the YFCC100m dataset for images whose GPS coordinates correspond to an [ISO-3166 country code](https://en.wikipedia.org/wiki/List_of_ISO_3166_country_codes) and created a balanced dataset by sampling 150 train images, 50 validation images, and 100 test images for each country. 4 | 5 | The following command will download an 11GB archive containing the images and extract it into a subdirectory `country211`: 6 | 7 | ```bash 8 | wget https://openaipublic.azureedge.net/clip/data/country211.tgz 9 | tar zxvf country211.tgz 10 | ``` 11 | 12 | These images are a subset of the YFCC100m dataset. Use of the underlying media files is subject to the Creative Commons licenses chosen by their creators/uploaders. For more information about the YFCC100M dataset, visit [the official website](https://multimediacommons.wordpress.com/yfcc100m-core-dataset/). 
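Since the extracted archive follows the familiar one-folder-per-class layout (assumed here: one directory per split, each containing one sub-folder per ISO-3166 code — this layout is not spelled out above), it can be browsed with torchvision's `ImageFolder`. A small sketch; the root path, split name, and preprocessing are illustrative assumptions:

```python
# Sketch: load the extracted Country211 test split with torchvision.
# Assumes the archive unpacked to ./country211/<split>/<ISO-3166 code>/*.jpg;
# adjust `root` if your layout differs.
from torchvision import datasets, transforms

preprocess = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
])
test_set = datasets.ImageFolder("country211/test", transform=preprocess)
print(len(test_set), "images across", len(test_set.classes), "country classes")
```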
-------------------------------------------------------------------------------- /VPD/depth/src/clip/data/rendered-sst2.md: -------------------------------------------------------------------------------- 1 | # The Rendered SST2 Dataset 2 | 3 | In the paper, we used an image classification dataset called Rendered SST2, to evaluate the model's capability on optical character recognition. To do so, we rendered the sentences in the [Stanford Sentiment Treebank v2](https://nlp.stanford.edu/sentiment/treebank.html) dataset and used those as the input to the CLIP image encoder. 4 | 5 | The following command will download a 131MB archive containing the images and extract it into a subdirectory `rendered-sst2`: 6 | 7 | ```bash 8 | wget https://openaipublic.azureedge.net/clip/data/rendered-sst2.tgz 9 | tar zxvf rendered-sst2.tgz 10 | ``` 11 | 12 | -------------------------------------------------------------------------------- /VPD/depth/src/clip/data/yfcc100m.md: -------------------------------------------------------------------------------- 1 | # The YFCC100M Subset 2 | 3 | In the paper, we performed a dataset ablation using a subset of the YFCC100M dataset and showed that the performance remained largely similar. 4 | 5 | The subset contains 14,829,396 images, about 15% of the full dataset, which have been filtered to only keep those with natural language titles and/or descriptions in English. 6 | 7 | We provide the list of (line number, photo identifier, photo hash) of each image contained in this subset. These correspond to the first three columns in the dataset's metadata TSV file. 8 | 9 | ```bash 10 | wget https://openaipublic.azureedge.net/clip/data/yfcc100m_subset_data.tsv.bz2 11 | bunzip2 yfcc100m_subset_data.tsv.bz2 12 | ``` 13 | 14 | Use of the underlying media files is subject to the Creative Commons licenses chosen by their creators/uploaders. For more information about the YFCC100M dataset, visit [the official website](https://multimediacommons.wordpress.com/yfcc100m-core-dataset/). -------------------------------------------------------------------------------- /VPD/depth/src/clip/hubconf.py: -------------------------------------------------------------------------------- 1 | from clip.clip import tokenize as _tokenize, load as _load, available_models as _available_models 2 | import re 3 | import string 4 | 5 | dependencies = ["torch", "torchvision", "ftfy", "regex", "tqdm"] 6 | 7 | # For compatibility (cannot include special characters in function name) 8 | model_functions = { model: re.sub(f'[{string.punctuation}]', '_', model) for model in _available_models()} 9 | 10 | def _create_hub_entrypoint(model): 11 | def entrypoint(**kwargs): 12 | return _load(model, **kwargs) 13 | 14 | entrypoint.__doc__ = f"""Loads the {model} CLIP model 15 | 16 | Parameters 17 | ---------- 18 | device : Union[str, torch.device] 19 | The device to put the loaded model 20 | 21 | jit : bool 22 | Whether to load the optimized JIT model or more hackable non-JIT model (default). 
23 | 24 | download_root: str 25 | path to download the model files; by default, it uses "~/.cache/clip" 26 | 27 | Returns 28 | ------- 29 | model : torch.nn.Module 30 | The {model} CLIP model 31 | 32 | preprocess : Callable[[PIL.Image], torch.Tensor] 33 | A torchvision transform that converts a PIL image into a tensor that the returned model can take as its input 34 | """ 35 | return entrypoint 36 | 37 | def tokenize(): 38 | return _tokenize 39 | 40 | _entrypoints = {model_functions[model]: _create_hub_entrypoint(model) for model in _available_models()} 41 | 42 | globals().update(_entrypoints) -------------------------------------------------------------------------------- /VPD/depth/src/clip/requirements.txt: -------------------------------------------------------------------------------- 1 | ftfy 2 | regex 3 | tqdm 4 | torch 5 | torchvision 6 | -------------------------------------------------------------------------------- /VPD/depth/src/clip/setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import pkg_resources 4 | from setuptools import setup, find_packages 5 | 6 | setup( 7 | name="clip", 8 | py_modules=["clip"], 9 | version="1.0", 10 | description="", 11 | author="OpenAI", 12 | packages=find_packages(exclude=["tests*"]), 13 | install_requires=[ 14 | str(r) 15 | for r in pkg_resources.parse_requirements( 16 | open(os.path.join(os.path.dirname(__file__), "requirements.txt")) 17 | ) 18 | ], 19 | include_package_data=True, 20 | extras_require={'dev': ['pytest']}, 21 | ) 22 | -------------------------------------------------------------------------------- /VPD/depth/src/clip/tests/test_consistency.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pytest 3 | import torch 4 | from PIL import Image 5 | 6 | import clip 7 | 8 | 9 | @pytest.mark.parametrize('model_name', clip.available_models()) 10 | def test_consistency(model_name): 11 | device = "cpu" 12 | jit_model, transform = clip.load(model_name, device=device, jit=True) 13 | py_model, _ = clip.load(model_name, device=device, jit=False) 14 | 15 | image = transform(Image.open("CLIP.png")).unsqueeze(0).to(device) 16 | text = clip.tokenize(["a diagram", "a dog", "a cat"]).to(device) 17 | 18 | with torch.no_grad(): 19 | logits_per_image, _ = jit_model(image, text) 20 | jit_probs = logits_per_image.softmax(dim=-1).cpu().numpy() 21 | 22 | logits_per_image, _ = py_model(image, text) 23 | py_probs = logits_per_image.softmax(dim=-1).cpu().numpy() 24 | 25 | assert np.allclose(jit_probs, py_probs, atol=0.01, rtol=0.1) 26 | -------------------------------------------------------------------------------- /VPD/depth/test.sh: -------------------------------------------------------------------------------- 1 | PYTHONPATH="$(dirname $0)/..":"$(dirname $0)/../stable-diffusion":$PYTHONPATH \ 2 | python3 -m torch.distributed.launch --nproc_per_node=1 \ 3 | --use_env test.py --dataset nyudepthv2 --data_path ./ \ 4 | --max_depth 10.0 --max_depth_eval 10.0 \ 5 | --num_filters 32 32 32 --deconv_kernels 2 2 2\ 6 | --flip_test --shift_window_test\ 7 | --shift_size 2 --ckpt_dir $1\ 8 | --crop_h 480 --crop_w 480 ${@:2} 9 | 10 | 11 | # -------------------------------------------------------------------------------- /VPD/depth/train.sh: -------------------------------------------------------------------------------- 1 | PYTHONPATH="$(dirname $0)/..":"$(dirname $0)/../stable-diffusion":$PYTHONPATH \ 2 | python3 -m torch.distributed.launch 
--nproc_per_node=8 \ 3 | --use_env train.py --batch_size 3 --dataset nyudepthv2 --data_path ./ \ 4 | --max_depth 10.0 --max_depth_eval 10.0 --weight_decay 0.1 \ 5 | --num_filters 32 32 32 --deconv_kernels 2 2 2\ 6 | --flip_test --shift_window_test \ 7 | --shift_size 2 --save_model --layer_decay 0.9 --drop_path_rate 0.3 --log_dir $1 \ 8 | --crop_h 480 --crop_w 480 --epochs 25 ${@:2} -------------------------------------------------------------------------------- /VPD/depth/utils_depth/criterion.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # The code is from GLPDepth (https://github.com/vinvino02/GLPDepth). 3 | # For non-commercial purpose only (research, evaluation etc). 4 | # ------------------------------------------------------------------------------ 5 | 6 | import torch 7 | import torch.nn as nn 8 | 9 | 10 | class SiLogLoss(nn.Module): 11 | def __init__(self, lambd=0.5): 12 | super().__init__() 13 | self.lambd = lambd 14 | 15 | def forward(self, pred, target): 16 | valid_mask = (target > 0).detach() 17 | diff_log = torch.log(target[valid_mask]) - torch.log(pred[valid_mask]) 18 | loss = torch.sqrt(torch.pow(diff_log, 2).mean() - 19 | self.lambd * torch.pow(diff_log.mean(), 2)) 20 | 21 | return loss 22 | 23 | -------------------------------------------------------------------------------- /VPD/refer/models_refer/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from .model import VPDRefer -------------------------------------------------------------------------------- /VPD/refer/refer/Makefile: -------------------------------------------------------------------------------- 1 | all: 2 | # install pycocotools/mask locally 3 | # copy from https://github.com/pdollar/coco.git 4 | python setup.py build_ext --inplace 5 | rm -rf build 6 | 7 | -------------------------------------------------------------------------------- /VPD/refer/refer/evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'licheng' 2 | 3 | 4 | -------------------------------------------------------------------------------- /VPD/refer/refer/evaluation/bleu/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2015 Xinlei Chen, Hao Fang, Tsung-Yi Lin, and Ramakrishna Vedantam 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in 11 | all copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 | THE SOFTWARE. 20 | -------------------------------------------------------------------------------- /VPD/refer/refer/evaluation/bleu/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tylin' 2 | -------------------------------------------------------------------------------- /VPD/refer/refer/evaluation/bleu/bleu.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # File Name : bleu.py 4 | # 5 | # Description : Wrapper for BLEU scorer. 6 | # 7 | # Creation Date : 06-01-2015 8 | # Last Modified : Thu 19 Mar 2015 09:13:28 PM PDT 9 | # Authors : Hao Fang and Tsung-Yi Lin 10 | 11 | from bleu_scorer import BleuScorer 12 | 13 | 14 | class Bleu: 15 | def __init__(self, n=4): 16 | # default compute Blue score up to 4 17 | self._n = n 18 | self._hypo_for_image = {} 19 | self.ref_for_image = {} 20 | 21 | def compute_score(self, gts, res): 22 | 23 | assert(gts.keys() == res.keys()) 24 | imgIds = gts.keys() 25 | 26 | bleu_scorer = BleuScorer(n=self._n) 27 | for id in imgIds: 28 | hypo = res[id] 29 | ref = gts[id] 30 | 31 | # Sanity check. 32 | assert(type(hypo) is list) 33 | assert(len(hypo) == 1) 34 | assert(type(ref) is list) 35 | assert(len(ref) >= 1) 36 | 37 | bleu_scorer += (hypo[0], ref) 38 | 39 | #score, scores = bleu_scorer.compute_score(option='shortest') 40 | score, scores = bleu_scorer.compute_score(option='closest', verbose=1) 41 | #score, scores = bleu_scorer.compute_score(option='average', verbose=1) 42 | 43 | # return (bleu, bleu_info) 44 | return score, scores 45 | 46 | def method(self): 47 | return "Bleu" 48 | -------------------------------------------------------------------------------- /VPD/refer/refer/evaluation/cider/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tylin' 2 | -------------------------------------------------------------------------------- /VPD/refer/refer/evaluation/cider/cider.py: -------------------------------------------------------------------------------- 1 | # Filename: cider.py 2 | # 3 | # Description: Describes the class to compute the CIDEr (Consensus-Based Image Description Evaluation) Metric 4 | # by Vedantam, Zitnick, and Parikh (http://arxiv.org/abs/1411.5726) 5 | # 6 | # Creation Date: Sun Feb 8 14:16:54 2015 7 | # 8 | # Authors: Ramakrishna Vedantam and Tsung-Yi Lin 9 | 10 | from cider_scorer import CiderScorer 11 | import pdb 12 | 13 | class Cider: 14 | """ 15 | Main Class to compute the CIDEr metric 16 | 17 | """ 18 | def __init__(self, test=None, refs=None, n=4, sigma=6.0): 19 | # set cider to sum over 1 to 4-grams 20 | self._n = n 21 | # set the standard deviation parameter for gaussian penalty 22 | self._sigma = sigma 23 | 24 | def compute_score(self, gts, res): 25 | """ 26 | Main function to compute CIDEr score 27 | :param hypo_for_image (dict) : dictionary with key and value 28 | ref_for_image (dict) : dictionary with key and value 29 | :return: cider (float) : computed CIDEr score for the corpus 30 | """ 31 | 32 | assert(gts.keys() == res.keys()) 33 | imgIds = gts.keys() 34 | 35 | cider_scorer = CiderScorer(n=self._n, sigma=self._sigma) 36 | 37 | for id in imgIds: 38 | hypo = res[id] 39 | 
ref = gts[id] 40 | 41 | # Sanity check. 42 | assert(type(hypo) is list) 43 | assert(len(hypo) == 1) 44 | assert(type(ref) is list) 45 | assert(len(ref) > 0) 46 | 47 | cider_scorer += (hypo[0], ref) 48 | 49 | (score, scores) = cider_scorer.compute_score() 50 | 51 | return score, scores 52 | 53 | def method(self): 54 | return "CIDEr" -------------------------------------------------------------------------------- /VPD/refer/refer/evaluation/meteor/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tylin' 2 | -------------------------------------------------------------------------------- /VPD/refer/refer/evaluation/readme.txt: -------------------------------------------------------------------------------- 1 | This folder contains modified coco-caption evaluation, which is downloaded from https://github.com/tylin/coco-caption.git 2 | and refEvaluation which is to be called by the refer algorithm. 3 | 4 | More specifically, this folder contains: 5 | 1. bleu/ 6 | 2. cider/ 7 | 3. meteor/ 8 | 4. rouge/ 9 | 5. tokenizer/ 10 | 6. __init__.py 11 | 7. refEvaluation.py 12 | -------------------------------------------------------------------------------- /VPD/refer/refer/evaluation/rouge/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'vrama91' 2 | -------------------------------------------------------------------------------- /VPD/refer/refer/evaluation/tokenizer/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'hfang' 2 | -------------------------------------------------------------------------------- /VPD/refer/refer/evaluation/tokenizer/stanford-corenlp-3.4.1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DepthComputation/InSpaceType_Benchmark/4ab9760dbdf9153fc2845afd584762509f9fe0e0/VPD/refer/refer/evaluation/tokenizer/stanford-corenlp-3.4.1.jar -------------------------------------------------------------------------------- /VPD/refer/refer/external/README.md: -------------------------------------------------------------------------------- 1 | The codes inside this folder are copied from pycocotools: https://github.com/pdollar/coco -------------------------------------------------------------------------------- /VPD/refer/refer/external/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tylin' 2 | -------------------------------------------------------------------------------- /VPD/refer/refer/setup.py: -------------------------------------------------------------------------------- 1 | from distutils.core import setup 2 | from Cython.Build import cythonize 3 | from distutils.extension import Extension 4 | import numpy as np 5 | 6 | ext_modules = [ 7 | Extension( 8 | 'external._mask', 9 | sources=['external/maskApi.c', 'external/_mask.pyx'], 10 | include_dirs = [np.get_include(), 'external'], 11 | extra_compile_args=['-Wno-cpp', '-Wno-unused-function', '-std=c99'], 12 | ) 13 | ] 14 | 15 | setup( 16 | name='external', 17 | packages=['external'], 18 | package_dir = {'external': 'external'}, 19 | version='2.0', 20 | ext_modules=cythonize(ext_modules) 21 | ) 22 | -------------------------------------------------------------------------------- /VPD/refer/requirements.txt: -------------------------------------------------------------------------------- 1 | requests 2 | filelock 3 | 
tqdm 4 | timm 5 | ftfy 6 | regex 7 | scipy 8 | scikit-image 9 | pycocotools==2.0.2 10 | opencv-python==4.5.3.56 11 | tokenizers 12 | h5py -------------------------------------------------------------------------------- /VPD/refer/test.sh: -------------------------------------------------------------------------------- 1 | PYTHONPATH="$(dirname $0)/..":"$(dirname $0)/../stable-diffusion":$PYTHONPATH \ 2 | python3 test.py \ 3 | --dataset $1 --split val --resume $2 \ 4 | --workers 4 --ddp_trained_weights --img_size 512 ${@:3} -------------------------------------------------------------------------------- /VPD/refer/train.sh: -------------------------------------------------------------------------------- 1 | logdir=$2 2 | mkdir -p $logdir 3 | 4 | PYTHONPATH="$(dirname $0)/..":"$(dirname $0)/../stable-diffusion":$PYTHONPATH \ 5 | python3 -m torch.distributed.launch --nproc_per_node $3 --master_port 12345 train.py \ 6 | --dataset $1 --model_id $1 \ 7 | --batch-size 4 --lr 0.00005 --wd 1e-2 \ 8 | --epochs 40 --img_size 512 ${@:4} \ 9 | 2>&1 | tee $logdir/log.txt -------------------------------------------------------------------------------- /VPD/segmentation/README.md: -------------------------------------------------------------------------------- 1 | # Semantic Segmentation with VPD 2 | ## Getting Started 3 | 4 | 1. Install the [mmsegmentation](https://github.com/open-mmlab/mmsegmentation) library and some required packages. 5 | 6 | ```bash 7 | pip install openmim 8 | mim install mmcv-full 9 | mim install mmsegmentation 10 | ``` 11 | 12 | 2. Follow the guide in [mmseg](https://github.com/open-mmlab/mmsegmentation/blob/master/docs/dataset_prepare.md) to prepare the ADE20k dataset. 13 | 14 | 15 | ## Results and Fine-tuned Models 16 | 17 | | Model | Config | Head | Crop Size | Lr Schd | mIoU | mIoU (ms+flip) | Fine-tuned Model | 18 | |:---:|:---:|:---:|:---:|:---:| :---:|:---:|:---:| 19 | | ```VPDSeg_SD-1-5``` | [config](configs/fpn_vpd_sd1-5_512x512_gpu8x2.py) | Semantic FPN | 512x512 | 80K | 53.7 | 54.6 | [Tsinghua Cloud](https://cloud.tsinghua.edu.cn/f/78ca31e53c5549779abd/?dl=1) | 20 | 21 | ## Training 22 | ``` 23 | bash dist_train.sh 24 | ``` 25 | We use 8 GPUs by default. 
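As a concrete illustration of the command format (the `dist_train.sh` script further below expects a config path followed by the GPU count, and `dist_test.sh` additionally takes a checkpoint path before the GPU count), a hypothetical launch with the provided config on 8 GPUs would be `bash dist_train.sh configs/fpn_vpd_sd1-5_512x512_gpu8x2.py 8`; the evaluation commands in the next section follow the same pattern, i.e. `bash dist_test.sh <config> <checkpoint> <num_gpus> --eval mIoU`.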
26 | 27 | ## Evaluation 28 | Command format: 29 | ``` 30 | bash dist_test.sh --eval mIoU 31 | ``` 32 | To evaluate a model with multi-scale and flip, run 33 | ``` 34 | bash dist_test.sh --eval mIoU --aug-test 35 | ``` 36 | -------------------------------------------------------------------------------- /VPD/segmentation/class_embeddings.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DepthComputation/InSpaceType_Benchmark/4ab9760dbdf9153fc2845afd584762509f9fe0e0/VPD/segmentation/class_embeddings.pth -------------------------------------------------------------------------------- /VPD/segmentation/configs/_base_/default_runtime.py: -------------------------------------------------------------------------------- 1 | # yapf:disable 2 | log_config = dict( 3 | interval=50, 4 | hooks=[ 5 | dict(type='TextLoggerHook', by_epoch=False), 6 | # dict(type='TensorboardLoggerHook') 7 | ]) 8 | # yapf:enable 9 | dist_params = dict(backend='nccl') 10 | log_level = 'INFO' 11 | load_from = None 12 | resume_from = None 13 | workflow = [('train', 1)] 14 | cudnn_benchmark = True 15 | find_unused_parameters = True 16 | 17 | -------------------------------------------------------------------------------- /VPD/segmentation/configs/_base_/models/fpn_r50.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | norm_cfg = dict(type='SyncBN', requires_grad=True) 3 | model = dict( 4 | type='EncoderDecoder', 5 | pretrained='open-mmlab://resnet50_v1c', 6 | backbone=dict( 7 | type='ResNetV1c', 8 | depth=50, 9 | num_stages=4, 10 | out_indices=(0, 1, 2, 3), 11 | dilations=(1, 1, 1, 1), 12 | strides=(1, 2, 2, 2), 13 | norm_cfg=norm_cfg, 14 | norm_eval=False, 15 | style='pytorch', 16 | contract_dilation=True), 17 | neck=dict( 18 | type='FPN', 19 | in_channels=[256, 512, 1024, 2048], 20 | out_channels=256, 21 | num_outs=4), 22 | decode_head=dict( 23 | type='FPNHead', 24 | in_channels=[256, 256, 256, 256], 25 | in_index=[0, 1, 2, 3], 26 | feature_strides=[4, 8, 16, 32], 27 | channels=256, 28 | dropout_ratio=0.1, 29 | num_classes=19, 30 | norm_cfg=norm_cfg, 31 | align_corners=False, 32 | loss_decode=dict( 33 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), 34 | # model training and testing settings 35 | train_cfg=dict(), 36 | test_cfg=dict(mode='slide', crop_size=(512, 512), stride=(341, 341)) 37 | ) -------------------------------------------------------------------------------- /VPD/segmentation/configs/_base_/schedules/schedule_160k.py: -------------------------------------------------------------------------------- 1 | # optimizer 2 | optimizer = dict(type='SGD', lr=0.01, weight_decay=0.0005) 3 | optimizer_config = dict() 4 | # learning policy 5 | lr_config = dict(policy='poly', power=0.9, min_lr=1e-5, by_epoch=False) 6 | # runtime settings 7 | runner = dict(type='IterBasedRunner', max_iters=160000) 8 | checkpoint_config = dict(by_epoch=False, interval=16000) 9 | evaluation = dict(interval=16000, metric='mIoU') 10 | -------------------------------------------------------------------------------- /VPD/segmentation/configs/_base_/schedules/schedule_80k.py: -------------------------------------------------------------------------------- 1 | # optimizer 2 | optimizer = dict(type='SGD', lr=0.01, weight_decay=0.0005) 3 | optimizer_config = dict() 4 | # learning policy 5 | lr_config = dict(policy='poly', power=0.9, min_lr=1e-6, by_epoch=False) 6 | # runtime settings 7 | runner = 
dict(type='IterBasedRunner', max_iters=80000) 8 | checkpoint_config = dict(by_epoch=False, interval=8000) 9 | evaluation = dict(interval=8000, metric='mIoU') 10 | -------------------------------------------------------------------------------- /VPD/segmentation/configs/fpn_vpd_sd1-5_512x512_gpu8x2.py: -------------------------------------------------------------------------------- 1 | _base_ = [ 2 | '_base_/models/fpn_r50.py', '_base_/datasets/ade20k_vpd.py', 3 | '_base_/default_runtime.py', '_base_/schedules/schedule_80k.py' 4 | ] 5 | 6 | model = dict( 7 | type='VPDSeg', 8 | sd_path='checkpoints/v1-5-pruned-emaonly.ckpt', 9 | neck=dict( 10 | type='FPN', 11 | in_channels=[320, 790, 1430, 1280], 12 | out_channels=256, 13 | num_outs=4), 14 | decode_head=dict( 15 | type='FPNHead', 16 | num_classes=150, 17 | loss_decode=dict( 18 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), 19 | ) 20 | 21 | lr_config = dict(policy='poly', power=1, min_lr=0.0, by_epoch=False, 22 | warmup='linear', 23 | warmup_iters=1500, 24 | warmup_ratio=1e-6) 25 | 26 | 27 | optimizer = dict(type='AdamW', lr=0.00008, weight_decay=0.001, 28 | paramwise_cfg=dict(custom_keys={'unet': dict(lr_mult=0.1), 29 | 'encoder_vq': dict(lr_mult=0.0), 30 | 'text_encoder': dict(lr_mult=0.0), 31 | 'norm': dict(decay_mult=0.)})) 32 | 33 | data = dict(samples_per_gpu=2, workers_per_gpu=8) 34 | -------------------------------------------------------------------------------- /VPD/segmentation/dist_test.sh: -------------------------------------------------------------------------------- 1 | CONFIG=$1 2 | CHECKPOINT=$2 3 | GPUS=$3 4 | NNODES=${NNODES:-1} 5 | NODE_RANK=${NODE_RANK:-0} 6 | PORT=${PORT:-29500} 7 | MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"} 8 | 9 | PYTHONPATH="$(dirname $0)/..":"$(dirname $0)/../stable-diffusion":$PYTHONPATH \ 10 | python3 -m torch.distributed.launch \ 11 | --nnodes=$NNODES \ 12 | --node_rank=$NODE_RANK \ 13 | --master_addr=$MASTER_ADDR \ 14 | --nproc_per_node=$GPUS \ 15 | --master_port=$PORT \ 16 | $(dirname "$0")/test.py \ 17 | $CONFIG \ 18 | $CHECKPOINT \ 19 | --launcher pytorch \ 20 | ${@:4} 21 | -------------------------------------------------------------------------------- /VPD/segmentation/dist_train.sh: -------------------------------------------------------------------------------- 1 | CONFIG=$1 2 | GPUS=$2 3 | NNODES=${NNODES:-1} 4 | NODE_RANK=${NODE_RANK:-0} 5 | PORT=${PORT:-29500} 6 | MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"} 7 | 8 | PYTHONPATH="$(dirname $0)/..":"$(dirname $0)/../stable-diffusion":$PYTHONPATH \ 9 | python3 -m torch.distributed.launch \ 10 | --nnodes=$NNODES \ 11 | --node_rank=$NODE_RANK \ 12 | --master_addr=$MASTER_ADDR \ 13 | --nproc_per_node=$GPUS \ 14 | --master_port=$PORT \ 15 | $(dirname "$0")/train.py \ 16 | $CONFIG \ 17 | --launcher pytorch ${@:3} 18 | -------------------------------------------------------------------------------- /VPD/segmentation/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .vpd_seg import VPDSeg -------------------------------------------------------------------------------- /VPD/stable-diffusion/configs/autoencoder/autoencoder_kl_16x16x16.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | base_learning_rate: 4.5e-6 3 | target: ldm.models.autoencoder.AutoencoderKL 4 | params: 5 | monitor: "val/rec_loss" 6 | embed_dim: 16 7 | lossconfig: 8 | target: ldm.modules.losses.LPIPSWithDiscriminator 9 | params: 10 | disc_start: 50001 11 | 
kl_weight: 0.000001 12 | disc_weight: 0.5 13 | 14 | ddconfig: 15 | double_z: True 16 | z_channels: 16 17 | resolution: 256 18 | in_channels: 3 19 | out_ch: 3 20 | ch: 128 21 | ch_mult: [ 1,1,2,2,4] # num_down = len(ch_mult)-1 22 | num_res_blocks: 2 23 | attn_resolutions: [16] 24 | dropout: 0.0 25 | 26 | 27 | data: 28 | target: main.DataModuleFromConfig 29 | params: 30 | batch_size: 12 31 | wrap: True 32 | train: 33 | target: ldm.data.imagenet.ImageNetSRTrain 34 | params: 35 | size: 256 36 | degradation: pil_nearest 37 | validation: 38 | target: ldm.data.imagenet.ImageNetSRValidation 39 | params: 40 | size: 256 41 | degradation: pil_nearest 42 | 43 | lightning: 44 | callbacks: 45 | image_logger: 46 | target: main.ImageLogger 47 | params: 48 | batch_frequency: 1000 49 | max_images: 8 50 | increase_log_steps: True 51 | 52 | trainer: 53 | benchmark: True 54 | accumulate_grad_batches: 2 55 | -------------------------------------------------------------------------------- /VPD/stable-diffusion/configs/autoencoder/autoencoder_kl_32x32x4.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | base_learning_rate: 4.5e-6 3 | target: ldm.models.autoencoder.AutoencoderKL 4 | params: 5 | monitor: "val/rec_loss" 6 | embed_dim: 4 7 | lossconfig: 8 | target: ldm.modules.losses.LPIPSWithDiscriminator 9 | params: 10 | disc_start: 50001 11 | kl_weight: 0.000001 12 | disc_weight: 0.5 13 | 14 | ddconfig: 15 | double_z: True 16 | z_channels: 4 17 | resolution: 256 18 | in_channels: 3 19 | out_ch: 3 20 | ch: 128 21 | ch_mult: [ 1,2,4,4 ] # num_down = len(ch_mult)-1 22 | num_res_blocks: 2 23 | attn_resolutions: [ ] 24 | dropout: 0.0 25 | 26 | data: 27 | target: main.DataModuleFromConfig 28 | params: 29 | batch_size: 12 30 | wrap: True 31 | train: 32 | target: ldm.data.imagenet.ImageNetSRTrain 33 | params: 34 | size: 256 35 | degradation: pil_nearest 36 | validation: 37 | target: ldm.data.imagenet.ImageNetSRValidation 38 | params: 39 | size: 256 40 | degradation: pil_nearest 41 | 42 | lightning: 43 | callbacks: 44 | image_logger: 45 | target: main.ImageLogger 46 | params: 47 | batch_frequency: 1000 48 | max_images: 8 49 | increase_log_steps: True 50 | 51 | trainer: 52 | benchmark: True 53 | accumulate_grad_batches: 2 54 | -------------------------------------------------------------------------------- /VPD/stable-diffusion/configs/autoencoder/autoencoder_kl_64x64x3.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | base_learning_rate: 4.5e-6 3 | target: ldm.models.autoencoder.AutoencoderKL 4 | params: 5 | monitor: "val/rec_loss" 6 | embed_dim: 3 7 | lossconfig: 8 | target: ldm.modules.losses.LPIPSWithDiscriminator 9 | params: 10 | disc_start: 50001 11 | kl_weight: 0.000001 12 | disc_weight: 0.5 13 | 14 | ddconfig: 15 | double_z: True 16 | z_channels: 3 17 | resolution: 256 18 | in_channels: 3 19 | out_ch: 3 20 | ch: 128 21 | ch_mult: [ 1,2,4 ] # num_down = len(ch_mult)-1 22 | num_res_blocks: 2 23 | attn_resolutions: [ ] 24 | dropout: 0.0 25 | 26 | 27 | data: 28 | target: main.DataModuleFromConfig 29 | params: 30 | batch_size: 12 31 | wrap: True 32 | train: 33 | target: ldm.data.imagenet.ImageNetSRTrain 34 | params: 35 | size: 256 36 | degradation: pil_nearest 37 | validation: 38 | target: ldm.data.imagenet.ImageNetSRValidation 39 | params: 40 | size: 256 41 | degradation: pil_nearest 42 | 43 | lightning: 44 | callbacks: 45 | image_logger: 46 | target: main.ImageLogger 47 | params: 48 | batch_frequency: 1000 49 | 
max_images: 8 50 | increase_log_steps: True 51 | 52 | trainer: 53 | benchmark: True 54 | accumulate_grad_batches: 2 55 | -------------------------------------------------------------------------------- /VPD/stable-diffusion/configs/autoencoder/autoencoder_kl_8x8x64.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | base_learning_rate: 4.5e-6 3 | target: ldm.models.autoencoder.AutoencoderKL 4 | params: 5 | monitor: "val/rec_loss" 6 | embed_dim: 64 7 | lossconfig: 8 | target: ldm.modules.losses.LPIPSWithDiscriminator 9 | params: 10 | disc_start: 50001 11 | kl_weight: 0.000001 12 | disc_weight: 0.5 13 | 14 | ddconfig: 15 | double_z: True 16 | z_channels: 64 17 | resolution: 256 18 | in_channels: 3 19 | out_ch: 3 20 | ch: 128 21 | ch_mult: [ 1,1,2,2,4,4] # num_down = len(ch_mult)-1 22 | num_res_blocks: 2 23 | attn_resolutions: [16,8] 24 | dropout: 0.0 25 | 26 | data: 27 | target: main.DataModuleFromConfig 28 | params: 29 | batch_size: 12 30 | wrap: True 31 | train: 32 | target: ldm.data.imagenet.ImageNetSRTrain 33 | params: 34 | size: 256 35 | degradation: pil_nearest 36 | validation: 37 | target: ldm.data.imagenet.ImageNetSRValidation 38 | params: 39 | size: 256 40 | degradation: pil_nearest 41 | 42 | lightning: 43 | callbacks: 44 | image_logger: 45 | target: main.ImageLogger 46 | params: 47 | batch_frequency: 1000 48 | max_images: 8 49 | increase_log_steps: True 50 | 51 | trainer: 52 | benchmark: True 53 | accumulate_grad_batches: 2 54 | -------------------------------------------------------------------------------- /VPD/stable-diffusion/configs/latent-diffusion/cin256-v2.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | base_learning_rate: 0.0001 3 | target: ldm.models.diffusion.ddpm.LatentDiffusion 4 | params: 5 | linear_start: 0.0015 6 | linear_end: 0.0195 7 | num_timesteps_cond: 1 8 | log_every_t: 200 9 | timesteps: 1000 10 | first_stage_key: image 11 | cond_stage_key: class_label 12 | image_size: 64 13 | channels: 3 14 | cond_stage_trainable: true 15 | conditioning_key: crossattn 16 | monitor: val/loss 17 | use_ema: False 18 | 19 | unet_config: 20 | target: ldm.modules.diffusionmodules.openaimodel.UNetModel 21 | params: 22 | image_size: 64 23 | in_channels: 3 24 | out_channels: 3 25 | model_channels: 192 26 | attention_resolutions: 27 | - 8 28 | - 4 29 | - 2 30 | num_res_blocks: 2 31 | channel_mult: 32 | - 1 33 | - 2 34 | - 3 35 | - 5 36 | num_heads: 1 37 | use_spatial_transformer: true 38 | transformer_depth: 1 39 | context_dim: 512 40 | 41 | first_stage_config: 42 | target: ldm.models.autoencoder.VQModelInterface 43 | params: 44 | embed_dim: 3 45 | n_embed: 8192 46 | ddconfig: 47 | double_z: false 48 | z_channels: 3 49 | resolution: 256 50 | in_channels: 3 51 | out_ch: 3 52 | ch: 128 53 | ch_mult: 54 | - 1 55 | - 2 56 | - 4 57 | num_res_blocks: 2 58 | attn_resolutions: [] 59 | dropout: 0.0 60 | lossconfig: 61 | target: torch.nn.Identity 62 | 63 | cond_stage_config: 64 | target: ldm.modules.encoders.modules.ClassEmbedder 65 | params: 66 | n_classes: 1001 67 | embed_dim: 512 68 | key: class_label 69 | -------------------------------------------------------------------------------- /VPD/stable-diffusion/configs/latent-diffusion/txt2img-1p4B-eval.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | base_learning_rate: 5.0e-05 3 | target: ldm.models.diffusion.ddpm.LatentDiffusion 4 | params: 5 | linear_start: 0.00085 6 | linear_end: 0.012 
7 | num_timesteps_cond: 1 8 | log_every_t: 200 9 | timesteps: 1000 10 | first_stage_key: image 11 | cond_stage_key: caption 12 | image_size: 32 13 | channels: 4 14 | cond_stage_trainable: true 15 | conditioning_key: crossattn 16 | monitor: val/loss_simple_ema 17 | scale_factor: 0.18215 18 | use_ema: False 19 | 20 | unet_config: 21 | target: ldm.modules.diffusionmodules.openaimodel.UNetModel 22 | params: 23 | image_size: 32 24 | in_channels: 4 25 | out_channels: 4 26 | model_channels: 320 27 | attention_resolutions: 28 | - 4 29 | - 2 30 | - 1 31 | num_res_blocks: 2 32 | channel_mult: 33 | - 1 34 | - 2 35 | - 4 36 | - 4 37 | num_heads: 8 38 | use_spatial_transformer: true 39 | transformer_depth: 1 40 | context_dim: 1280 41 | use_checkpoint: true 42 | legacy: False 43 | 44 | first_stage_config: 45 | target: ldm.models.autoencoder.AutoencoderKL 46 | params: 47 | embed_dim: 4 48 | monitor: val/rec_loss 49 | ddconfig: 50 | double_z: true 51 | z_channels: 4 52 | resolution: 256 53 | in_channels: 3 54 | out_ch: 3 55 | ch: 128 56 | ch_mult: 57 | - 1 58 | - 2 59 | - 4 60 | - 4 61 | num_res_blocks: 2 62 | attn_resolutions: [] 63 | dropout: 0.0 64 | lossconfig: 65 | target: torch.nn.Identity 66 | 67 | cond_stage_config: 68 | target: ldm.modules.encoders.modules.BERTEmbedder 69 | params: 70 | n_embed: 1280 71 | n_layer: 32 72 | -------------------------------------------------------------------------------- /VPD/stable-diffusion/configs/retrieval-augmented-diffusion/768x768.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | base_learning_rate: 0.0001 3 | target: ldm.models.diffusion.ddpm.LatentDiffusion 4 | params: 5 | linear_start: 0.0015 6 | linear_end: 0.015 7 | num_timesteps_cond: 1 8 | log_every_t: 200 9 | timesteps: 1000 10 | first_stage_key: jpg 11 | cond_stage_key: nix 12 | image_size: 48 13 | channels: 16 14 | cond_stage_trainable: false 15 | conditioning_key: crossattn 16 | monitor: val/loss_simple_ema 17 | scale_by_std: false 18 | scale_factor: 0.22765929 19 | unet_config: 20 | target: ldm.modules.diffusionmodules.openaimodel.UNetModel 21 | params: 22 | image_size: 48 23 | in_channels: 16 24 | out_channels: 16 25 | model_channels: 448 26 | attention_resolutions: 27 | - 4 28 | - 2 29 | - 1 30 | num_res_blocks: 2 31 | channel_mult: 32 | - 1 33 | - 2 34 | - 3 35 | - 4 36 | use_scale_shift_norm: false 37 | resblock_updown: false 38 | num_head_channels: 32 39 | use_spatial_transformer: true 40 | transformer_depth: 1 41 | context_dim: 768 42 | use_checkpoint: true 43 | first_stage_config: 44 | target: ldm.models.autoencoder.AutoencoderKL 45 | params: 46 | monitor: val/rec_loss 47 | embed_dim: 16 48 | ddconfig: 49 | double_z: true 50 | z_channels: 16 51 | resolution: 256 52 | in_channels: 3 53 | out_ch: 3 54 | ch: 128 55 | ch_mult: 56 | - 1 57 | - 1 58 | - 2 59 | - 2 60 | - 4 61 | num_res_blocks: 2 62 | attn_resolutions: 63 | - 16 64 | dropout: 0.0 65 | lossconfig: 66 | target: torch.nn.Identity 67 | cond_stage_config: 68 | target: torch.nn.Identity -------------------------------------------------------------------------------- /VPD/stable-diffusion/environment.yaml: -------------------------------------------------------------------------------- 1 | name: ldm 2 | channels: 3 | - pytorch 4 | - defaults 5 | dependencies: 6 | - python=3.8.5 7 | - pip=20.3 8 | - cudatoolkit=11.3 9 | - pytorch=1.11.0 10 | - torchvision=0.12.0 11 | - numpy=1.19.2 12 | - pip: 13 | - albumentations==0.4.3 14 | - diffusers 15 | - opencv-python==4.1.2.30 16 | - 
pudb==2019.2 17 | - invisible-watermark 18 | - imageio==2.9.0 19 | - imageio-ffmpeg==0.4.2 20 | - pytorch-lightning==1.4.2 21 | - omegaconf==2.1.1 22 | - test-tube>=0.7.5 23 | - streamlit>=0.73.1 24 | - einops==0.3.0 25 | - torch-fidelity==0.3.0 26 | - transformers==4.19.2 27 | - torchmetrics==0.6.0 28 | - kornia==0.6 29 | - -e git+https://github.com/CompVis/taming-transformers.git@master#egg=taming-transformers 30 | - -e git+https://github.com/openai/CLIP.git@main#egg=clip 31 | - -e . 32 | -------------------------------------------------------------------------------- /VPD/stable-diffusion/ldm/data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DepthComputation/InSpaceType_Benchmark/4ab9760dbdf9153fc2845afd584762509f9fe0e0/VPD/stable-diffusion/ldm/data/__init__.py -------------------------------------------------------------------------------- /VPD/stable-diffusion/ldm/data/base.py: -------------------------------------------------------------------------------- 1 | from abc import abstractmethod 2 | from torch.utils.data import Dataset, ConcatDataset, ChainDataset, IterableDataset 3 | 4 | 5 | class Txt2ImgIterableBaseDataset(IterableDataset): 6 | ''' 7 | Define an interface to make the IterableDatasets for text2img data chainable 8 | ''' 9 | def __init__(self, num_records=0, valid_ids=None, size=256): 10 | super().__init__() 11 | self.num_records = num_records 12 | self.valid_ids = valid_ids 13 | self.sample_ids = valid_ids 14 | self.size = size 15 | 16 | print(f'{self.__class__.__name__} dataset contains {self.__len__()} examples.') 17 | 18 | def __len__(self): 19 | return self.num_records 20 | 21 | @abstractmethod 22 | def __iter__(self): 23 | pass -------------------------------------------------------------------------------- /VPD/stable-diffusion/ldm/models/diffusion/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DepthComputation/InSpaceType_Benchmark/4ab9760dbdf9153fc2845afd584762509f9fe0e0/VPD/stable-diffusion/ldm/models/diffusion/__init__.py -------------------------------------------------------------------------------- /VPD/stable-diffusion/ldm/models/diffusion/dpm_solver/__init__.py: -------------------------------------------------------------------------------- 1 | from .sampler import DPMSolverSampler -------------------------------------------------------------------------------- /VPD/stable-diffusion/ldm/modules/diffusionmodules/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DepthComputation/InSpaceType_Benchmark/4ab9760dbdf9153fc2845afd584762509f9fe0e0/VPD/stable-diffusion/ldm/modules/diffusionmodules/__init__.py -------------------------------------------------------------------------------- /VPD/stable-diffusion/ldm/modules/distributions/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DepthComputation/InSpaceType_Benchmark/4ab9760dbdf9153fc2845afd584762509f9fe0e0/VPD/stable-diffusion/ldm/modules/distributions/__init__.py -------------------------------------------------------------------------------- /VPD/stable-diffusion/ldm/modules/encoders/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/DepthComputation/InSpaceType_Benchmark/4ab9760dbdf9153fc2845afd584762509f9fe0e0/VPD/stable-diffusion/ldm/modules/encoders/__init__.py -------------------------------------------------------------------------------- /VPD/stable-diffusion/ldm/modules/image_degradation/__init__.py: -------------------------------------------------------------------------------- 1 | from ldm.modules.image_degradation.bsrgan import degradation_bsrgan_variant as degradation_fn_bsr 2 | from ldm.modules.image_degradation.bsrgan_light import degradation_bsrgan_variant as degradation_fn_bsr_light 3 | -------------------------------------------------------------------------------- /VPD/stable-diffusion/ldm/modules/image_degradation/utils/test.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DepthComputation/InSpaceType_Benchmark/4ab9760dbdf9153fc2845afd584762509f9fe0e0/VPD/stable-diffusion/ldm/modules/image_degradation/utils/test.png -------------------------------------------------------------------------------- /VPD/stable-diffusion/ldm/modules/losses/__init__.py: -------------------------------------------------------------------------------- 1 | from ldm.modules.losses.contperceptual import LPIPSWithDiscriminator -------------------------------------------------------------------------------- /VPD/stable-diffusion/models/first_stage_models/kl-f16/config.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | base_learning_rate: 4.5e-06 3 | target: ldm.models.autoencoder.AutoencoderKL 4 | params: 5 | monitor: val/rec_loss 6 | embed_dim: 16 7 | lossconfig: 8 | target: ldm.modules.losses.LPIPSWithDiscriminator 9 | params: 10 | disc_start: 50001 11 | kl_weight: 1.0e-06 12 | disc_weight: 0.5 13 | ddconfig: 14 | double_z: true 15 | z_channels: 16 16 | resolution: 256 17 | in_channels: 3 18 | out_ch: 3 19 | ch: 128 20 | ch_mult: 21 | - 1 22 | - 1 23 | - 2 24 | - 2 25 | - 4 26 | num_res_blocks: 2 27 | attn_resolutions: 28 | - 16 29 | dropout: 0.0 30 | data: 31 | target: main.DataModuleFromConfig 32 | params: 33 | batch_size: 6 34 | wrap: true 35 | train: 36 | target: ldm.data.openimages.FullOpenImagesTrain 37 | params: 38 | size: 384 39 | crop_size: 256 40 | validation: 41 | target: ldm.data.openimages.FullOpenImagesValidation 42 | params: 43 | size: 384 44 | crop_size: 256 45 | -------------------------------------------------------------------------------- /VPD/stable-diffusion/models/first_stage_models/kl-f32/config.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | base_learning_rate: 4.5e-06 3 | target: ldm.models.autoencoder.AutoencoderKL 4 | params: 5 | monitor: val/rec_loss 6 | embed_dim: 64 7 | lossconfig: 8 | target: ldm.modules.losses.LPIPSWithDiscriminator 9 | params: 10 | disc_start: 50001 11 | kl_weight: 1.0e-06 12 | disc_weight: 0.5 13 | ddconfig: 14 | double_z: true 15 | z_channels: 64 16 | resolution: 256 17 | in_channels: 3 18 | out_ch: 3 19 | ch: 128 20 | ch_mult: 21 | - 1 22 | - 1 23 | - 2 24 | - 2 25 | - 4 26 | - 4 27 | num_res_blocks: 2 28 | attn_resolutions: 29 | - 16 30 | - 8 31 | dropout: 0.0 32 | data: 33 | target: main.DataModuleFromConfig 34 | params: 35 | batch_size: 6 36 | wrap: true 37 | train: 38 | target: ldm.data.openimages.FullOpenImagesTrain 39 | params: 40 | size: 384 41 | crop_size: 256 42 | validation: 43 | target: ldm.data.openimages.FullOpenImagesValidation 44 | params: 45 | size: 
384 46 | crop_size: 256 47 | -------------------------------------------------------------------------------- /VPD/stable-diffusion/models/first_stage_models/kl-f4/config.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | base_learning_rate: 4.5e-06 3 | target: ldm.models.autoencoder.AutoencoderKL 4 | params: 5 | monitor: val/rec_loss 6 | embed_dim: 3 7 | lossconfig: 8 | target: ldm.modules.losses.LPIPSWithDiscriminator 9 | params: 10 | disc_start: 50001 11 | kl_weight: 1.0e-06 12 | disc_weight: 0.5 13 | ddconfig: 14 | double_z: true 15 | z_channels: 3 16 | resolution: 256 17 | in_channels: 3 18 | out_ch: 3 19 | ch: 128 20 | ch_mult: 21 | - 1 22 | - 2 23 | - 4 24 | num_res_blocks: 2 25 | attn_resolutions: [] 26 | dropout: 0.0 27 | data: 28 | target: main.DataModuleFromConfig 29 | params: 30 | batch_size: 10 31 | wrap: true 32 | train: 33 | target: ldm.data.openimages.FullOpenImagesTrain 34 | params: 35 | size: 384 36 | crop_size: 256 37 | validation: 38 | target: ldm.data.openimages.FullOpenImagesValidation 39 | params: 40 | size: 384 41 | crop_size: 256 42 | -------------------------------------------------------------------------------- /VPD/stable-diffusion/models/first_stage_models/kl-f8/config.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | base_learning_rate: 4.5e-06 3 | target: ldm.models.autoencoder.AutoencoderKL 4 | params: 5 | monitor: val/rec_loss 6 | embed_dim: 4 7 | lossconfig: 8 | target: ldm.modules.losses.LPIPSWithDiscriminator 9 | params: 10 | disc_start: 50001 11 | kl_weight: 1.0e-06 12 | disc_weight: 0.5 13 | ddconfig: 14 | double_z: true 15 | z_channels: 4 16 | resolution: 256 17 | in_channels: 3 18 | out_ch: 3 19 | ch: 128 20 | ch_mult: 21 | - 1 22 | - 2 23 | - 4 24 | - 4 25 | num_res_blocks: 2 26 | attn_resolutions: [] 27 | dropout: 0.0 28 | data: 29 | target: main.DataModuleFromConfig 30 | params: 31 | batch_size: 4 32 | wrap: true 33 | train: 34 | target: ldm.data.openimages.FullOpenImagesTrain 35 | params: 36 | size: 384 37 | crop_size: 256 38 | validation: 39 | target: ldm.data.openimages.FullOpenImagesValidation 40 | params: 41 | size: 384 42 | crop_size: 256 43 | -------------------------------------------------------------------------------- /VPD/stable-diffusion/models/first_stage_models/vq-f16/config.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | base_learning_rate: 4.5e-06 3 | target: ldm.models.autoencoder.VQModel 4 | params: 5 | embed_dim: 8 6 | n_embed: 16384 7 | ddconfig: 8 | double_z: false 9 | z_channels: 8 10 | resolution: 256 11 | in_channels: 3 12 | out_ch: 3 13 | ch: 128 14 | ch_mult: 15 | - 1 16 | - 1 17 | - 2 18 | - 2 19 | - 4 20 | num_res_blocks: 2 21 | attn_resolutions: 22 | - 16 23 | dropout: 0.0 24 | lossconfig: 25 | target: taming.modules.losses.vqperceptual.VQLPIPSWithDiscriminator 26 | params: 27 | disc_conditional: false 28 | disc_in_channels: 3 29 | disc_start: 250001 30 | disc_weight: 0.75 31 | disc_num_layers: 2 32 | codebook_weight: 1.0 33 | 34 | data: 35 | target: main.DataModuleFromConfig 36 | params: 37 | batch_size: 14 38 | num_workers: 20 39 | wrap: true 40 | train: 41 | target: ldm.data.openimages.FullOpenImagesTrain 42 | params: 43 | size: 384 44 | crop_size: 256 45 | validation: 46 | target: ldm.data.openimages.FullOpenImagesValidation 47 | params: 48 | size: 384 49 | crop_size: 256 50 | -------------------------------------------------------------------------------- 
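All of the config files collected here follow the same OmegaConf convention: `target` names a class by its dotted import path and `params` carries the constructor keyword arguments. Below is a minimal, self-contained sketch of how such a YAML is typically turned into a model object; the helper functions are spelled out purely for illustration (the ldm codebase provides an equivalent `instantiate_from_config` utility in `ldm/util.py`), and actually constructing e.g. `AutoencoderKL` assumes the `ldm` package and its dependencies are importable in the current environment.

```
# Minimal sketch of the `target`/`params` pattern used by these YAML configs.
# Assumption: omegaconf is installed and the class named by `target`
# (e.g. ldm.models.autoencoder.AutoencoderKL) is importable.
import importlib

from omegaconf import OmegaConf


def get_obj_from_str(path):
    """Resolve a dotted path such as 'ldm.models.autoencoder.AutoencoderKL' to a class."""
    module_name, cls_name = path.rsplit(".", 1)
    return getattr(importlib.import_module(module_name), cls_name)


def instantiate_from_config(config):
    """Instantiate config['target'] with config['params'] as keyword arguments."""
    return get_obj_from_str(config["target"])(**config.get("params", dict()))


if __name__ == "__main__":
    cfg = OmegaConf.load("models/first_stage_models/kl-f16/config.yaml")
    model = instantiate_from_config(cfg.model)  # builds AutoencoderKL(embed_dim=16, ...)
    print(type(model).__name__)
```

The `data` block is consumed in the same way by `main.DataModuleFromConfig`, and `base_learning_rate` is normally scaled by the batch size (and, when gradient accumulation or multiple GPUs are used, by those factors as well) before being assigned to the model.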
/VPD/stable-diffusion/models/first_stage_models/vq-f4-noattn/config.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | base_learning_rate: 4.5e-06 3 | target: ldm.models.autoencoder.VQModel 4 | params: 5 | embed_dim: 3 6 | n_embed: 8192 7 | monitor: val/rec_loss 8 | 9 | ddconfig: 10 | attn_type: none 11 | double_z: false 12 | z_channels: 3 13 | resolution: 256 14 | in_channels: 3 15 | out_ch: 3 16 | ch: 128 17 | ch_mult: 18 | - 1 19 | - 2 20 | - 4 21 | num_res_blocks: 2 22 | attn_resolutions: [] 23 | dropout: 0.0 24 | lossconfig: 25 | target: taming.modules.losses.vqperceptual.VQLPIPSWithDiscriminator 26 | params: 27 | disc_conditional: false 28 | disc_in_channels: 3 29 | disc_start: 11 30 | disc_weight: 0.75 31 | codebook_weight: 1.0 32 | 33 | data: 34 | target: main.DataModuleFromConfig 35 | params: 36 | batch_size: 8 37 | num_workers: 12 38 | wrap: true 39 | train: 40 | target: ldm.data.openimages.FullOpenImagesTrain 41 | params: 42 | crop_size: 256 43 | validation: 44 | target: ldm.data.openimages.FullOpenImagesValidation 45 | params: 46 | crop_size: 256 47 | -------------------------------------------------------------------------------- /VPD/stable-diffusion/models/first_stage_models/vq-f4/config.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | base_learning_rate: 4.5e-06 3 | target: ldm.models.autoencoder.VQModel 4 | params: 5 | embed_dim: 3 6 | n_embed: 8192 7 | monitor: val/rec_loss 8 | 9 | ddconfig: 10 | double_z: false 11 | z_channels: 3 12 | resolution: 256 13 | in_channels: 3 14 | out_ch: 3 15 | ch: 128 16 | ch_mult: 17 | - 1 18 | - 2 19 | - 4 20 | num_res_blocks: 2 21 | attn_resolutions: [] 22 | dropout: 0.0 23 | lossconfig: 24 | target: taming.modules.losses.vqperceptual.VQLPIPSWithDiscriminator 25 | params: 26 | disc_conditional: false 27 | disc_in_channels: 3 28 | disc_start: 0 29 | disc_weight: 0.75 30 | codebook_weight: 1.0 31 | 32 | data: 33 | target: main.DataModuleFromConfig 34 | params: 35 | batch_size: 8 36 | num_workers: 16 37 | wrap: true 38 | train: 39 | target: ldm.data.openimages.FullOpenImagesTrain 40 | params: 41 | crop_size: 256 42 | validation: 43 | target: ldm.data.openimages.FullOpenImagesValidation 44 | params: 45 | crop_size: 256 46 | -------------------------------------------------------------------------------- /VPD/stable-diffusion/models/first_stage_models/vq-f8-n256/config.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | base_learning_rate: 4.5e-06 3 | target: ldm.models.autoencoder.VQModel 4 | params: 5 | embed_dim: 4 6 | n_embed: 256 7 | monitor: val/rec_loss 8 | ddconfig: 9 | double_z: false 10 | z_channels: 4 11 | resolution: 256 12 | in_channels: 3 13 | out_ch: 3 14 | ch: 128 15 | ch_mult: 16 | - 1 17 | - 2 18 | - 2 19 | - 4 20 | num_res_blocks: 2 21 | attn_resolutions: 22 | - 32 23 | dropout: 0.0 24 | lossconfig: 25 | target: taming.modules.losses.vqperceptual.VQLPIPSWithDiscriminator 26 | params: 27 | disc_conditional: false 28 | disc_in_channels: 3 29 | disc_start: 250001 30 | disc_weight: 0.75 31 | codebook_weight: 1.0 32 | 33 | data: 34 | target: main.DataModuleFromConfig 35 | params: 36 | batch_size: 10 37 | num_workers: 20 38 | wrap: true 39 | train: 40 | target: ldm.data.openimages.FullOpenImagesTrain 41 | params: 42 | size: 384 43 | crop_size: 256 44 | validation: 45 | target: ldm.data.openimages.FullOpenImagesValidation 46 | params: 47 | size: 384 48 | crop_size: 256 49 | 
-------------------------------------------------------------------------------- /VPD/stable-diffusion/models/first_stage_models/vq-f8/config.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | base_learning_rate: 4.5e-06 3 | target: ldm.models.autoencoder.VQModel 4 | params: 5 | embed_dim: 4 6 | n_embed: 16384 7 | monitor: val/rec_loss 8 | ddconfig: 9 | double_z: false 10 | z_channels: 4 11 | resolution: 256 12 | in_channels: 3 13 | out_ch: 3 14 | ch: 128 15 | ch_mult: 16 | - 1 17 | - 2 18 | - 2 19 | - 4 20 | num_res_blocks: 2 21 | attn_resolutions: 22 | - 32 23 | dropout: 0.0 24 | lossconfig: 25 | target: taming.modules.losses.vqperceptual.VQLPIPSWithDiscriminator 26 | params: 27 | disc_conditional: false 28 | disc_in_channels: 3 29 | disc_num_layers: 2 30 | disc_start: 1 31 | disc_weight: 0.6 32 | codebook_weight: 1.0 33 | data: 34 | target: main.DataModuleFromConfig 35 | params: 36 | batch_size: 10 37 | num_workers: 20 38 | wrap: true 39 | train: 40 | target: ldm.data.openimages.FullOpenImagesTrain 41 | params: 42 | size: 384 43 | crop_size: 256 44 | validation: 45 | target: ldm.data.openimages.FullOpenImagesValidation 46 | params: 47 | size: 384 48 | crop_size: 256 49 | -------------------------------------------------------------------------------- /VPD/stable-diffusion/models/ldm/celeba256/config.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | base_learning_rate: 2.0e-06 3 | target: ldm.models.diffusion.ddpm.LatentDiffusion 4 | params: 5 | linear_start: 0.0015 6 | linear_end: 0.0195 7 | num_timesteps_cond: 1 8 | log_every_t: 200 9 | timesteps: 1000 10 | first_stage_key: image 11 | cond_stage_key: class_label 12 | image_size: 64 13 | channels: 3 14 | cond_stage_trainable: false 15 | concat_mode: false 16 | monitor: val/loss 17 | unet_config: 18 | target: ldm.modules.diffusionmodules.openaimodel.UNetModel 19 | params: 20 | image_size: 64 21 | in_channels: 3 22 | out_channels: 3 23 | model_channels: 224 24 | attention_resolutions: 25 | - 8 26 | - 4 27 | - 2 28 | num_res_blocks: 2 29 | channel_mult: 30 | - 1 31 | - 2 32 | - 3 33 | - 4 34 | num_head_channels: 32 35 | first_stage_config: 36 | target: ldm.models.autoencoder.VQModelInterface 37 | params: 38 | embed_dim: 3 39 | n_embed: 8192 40 | ddconfig: 41 | double_z: false 42 | z_channels: 3 43 | resolution: 256 44 | in_channels: 3 45 | out_ch: 3 46 | ch: 128 47 | ch_mult: 48 | - 1 49 | - 2 50 | - 4 51 | num_res_blocks: 2 52 | attn_resolutions: [] 53 | dropout: 0.0 54 | lossconfig: 55 | target: torch.nn.Identity 56 | cond_stage_config: __is_unconditional__ 57 | data: 58 | target: main.DataModuleFromConfig 59 | params: 60 | batch_size: 48 61 | num_workers: 5 62 | wrap: false 63 | train: 64 | target: ldm.data.faceshq.CelebAHQTrain 65 | params: 66 | size: 256 67 | validation: 68 | target: ldm.data.faceshq.CelebAHQValidation 69 | params: 70 | size: 256 71 | -------------------------------------------------------------------------------- /VPD/stable-diffusion/models/ldm/ffhq256/config.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | base_learning_rate: 2.0e-06 3 | target: ldm.models.diffusion.ddpm.LatentDiffusion 4 | params: 5 | linear_start: 0.0015 6 | linear_end: 0.0195 7 | num_timesteps_cond: 1 8 | log_every_t: 200 9 | timesteps: 1000 10 | first_stage_key: image 11 | cond_stage_key: class_label 12 | image_size: 64 13 | channels: 3 14 | cond_stage_trainable: false 15 | concat_mode: 
false 16 | monitor: val/loss 17 | unet_config: 18 | target: ldm.modules.diffusionmodules.openaimodel.UNetModel 19 | params: 20 | image_size: 64 21 | in_channels: 3 22 | out_channels: 3 23 | model_channels: 224 24 | attention_resolutions: 25 | - 8 26 | - 4 27 | - 2 28 | num_res_blocks: 2 29 | channel_mult: 30 | - 1 31 | - 2 32 | - 3 33 | - 4 34 | num_head_channels: 32 35 | first_stage_config: 36 | target: ldm.models.autoencoder.VQModelInterface 37 | params: 38 | embed_dim: 3 39 | n_embed: 8192 40 | ddconfig: 41 | double_z: false 42 | z_channels: 3 43 | resolution: 256 44 | in_channels: 3 45 | out_ch: 3 46 | ch: 128 47 | ch_mult: 48 | - 1 49 | - 2 50 | - 4 51 | num_res_blocks: 2 52 | attn_resolutions: [] 53 | dropout: 0.0 54 | lossconfig: 55 | target: torch.nn.Identity 56 | cond_stage_config: __is_unconditional__ 57 | data: 58 | target: main.DataModuleFromConfig 59 | params: 60 | batch_size: 42 61 | num_workers: 5 62 | wrap: false 63 | train: 64 | target: ldm.data.faceshq.FFHQTrain 65 | params: 66 | size: 256 67 | validation: 68 | target: ldm.data.faceshq.FFHQValidation 69 | params: 70 | size: 256 71 | -------------------------------------------------------------------------------- /VPD/stable-diffusion/models/ldm/inpainting_big/config.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | base_learning_rate: 1.0e-06 3 | target: ldm.models.diffusion.ddpm.LatentDiffusion 4 | params: 5 | linear_start: 0.0015 6 | linear_end: 0.0205 7 | log_every_t: 100 8 | timesteps: 1000 9 | loss_type: l1 10 | first_stage_key: image 11 | cond_stage_key: masked_image 12 | image_size: 64 13 | channels: 3 14 | concat_mode: true 15 | monitor: val/loss 16 | scheduler_config: 17 | target: ldm.lr_scheduler.LambdaWarmUpCosineScheduler 18 | params: 19 | verbosity_interval: 0 20 | warm_up_steps: 1000 21 | max_decay_steps: 50000 22 | lr_start: 0.001 23 | lr_max: 0.1 24 | lr_min: 0.0001 25 | unet_config: 26 | target: ldm.modules.diffusionmodules.openaimodel.UNetModel 27 | params: 28 | image_size: 64 29 | in_channels: 7 30 | out_channels: 3 31 | model_channels: 256 32 | attention_resolutions: 33 | - 8 34 | - 4 35 | - 2 36 | num_res_blocks: 2 37 | channel_mult: 38 | - 1 39 | - 2 40 | - 3 41 | - 4 42 | num_heads: 8 43 | resblock_updown: true 44 | first_stage_config: 45 | target: ldm.models.autoencoder.VQModelInterface 46 | params: 47 | embed_dim: 3 48 | n_embed: 8192 49 | monitor: val/rec_loss 50 | ddconfig: 51 | attn_type: none 52 | double_z: false 53 | z_channels: 3 54 | resolution: 256 55 | in_channels: 3 56 | out_ch: 3 57 | ch: 128 58 | ch_mult: 59 | - 1 60 | - 2 61 | - 4 62 | num_res_blocks: 2 63 | attn_resolutions: [] 64 | dropout: 0.0 65 | lossconfig: 66 | target: ldm.modules.losses.contperceptual.DummyLoss 67 | cond_stage_config: __is_first_stage__ 68 | -------------------------------------------------------------------------------- /VPD/stable-diffusion/models/ldm/lsun_beds256/config.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | base_learning_rate: 2.0e-06 3 | target: ldm.models.diffusion.ddpm.LatentDiffusion 4 | params: 5 | linear_start: 0.0015 6 | linear_end: 0.0195 7 | num_timesteps_cond: 1 8 | log_every_t: 200 9 | timesteps: 1000 10 | first_stage_key: image 11 | cond_stage_key: class_label 12 | image_size: 64 13 | channels: 3 14 | cond_stage_trainable: false 15 | concat_mode: false 16 | monitor: val/loss 17 | unet_config: 18 | target: ldm.modules.diffusionmodules.openaimodel.UNetModel 19 | params: 20 | 
image_size: 64 21 | in_channels: 3 22 | out_channels: 3 23 | model_channels: 224 24 | attention_resolutions: 25 | - 8 26 | - 4 27 | - 2 28 | num_res_blocks: 2 29 | channel_mult: 30 | - 1 31 | - 2 32 | - 3 33 | - 4 34 | num_head_channels: 32 35 | first_stage_config: 36 | target: ldm.models.autoencoder.VQModelInterface 37 | params: 38 | embed_dim: 3 39 | n_embed: 8192 40 | ddconfig: 41 | double_z: false 42 | z_channels: 3 43 | resolution: 256 44 | in_channels: 3 45 | out_ch: 3 46 | ch: 128 47 | ch_mult: 48 | - 1 49 | - 2 50 | - 4 51 | num_res_blocks: 2 52 | attn_resolutions: [] 53 | dropout: 0.0 54 | lossconfig: 55 | target: torch.nn.Identity 56 | cond_stage_config: __is_unconditional__ 57 | data: 58 | target: main.DataModuleFromConfig 59 | params: 60 | batch_size: 48 61 | num_workers: 5 62 | wrap: false 63 | train: 64 | target: ldm.data.lsun.LSUNBedroomsTrain 65 | params: 66 | size: 256 67 | validation: 68 | target: ldm.data.lsun.LSUNBedroomsValidation 69 | params: 70 | size: 256 71 | -------------------------------------------------------------------------------- /VPD/stable-diffusion/models/ldm/semantic_synthesis256/config.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | base_learning_rate: 1.0e-06 3 | target: ldm.models.diffusion.ddpm.LatentDiffusion 4 | params: 5 | linear_start: 0.0015 6 | linear_end: 0.0205 7 | log_every_t: 100 8 | timesteps: 1000 9 | loss_type: l1 10 | first_stage_key: image 11 | cond_stage_key: segmentation 12 | image_size: 64 13 | channels: 3 14 | concat_mode: true 15 | cond_stage_trainable: true 16 | unet_config: 17 | target: ldm.modules.diffusionmodules.openaimodel.UNetModel 18 | params: 19 | image_size: 64 20 | in_channels: 6 21 | out_channels: 3 22 | model_channels: 128 23 | attention_resolutions: 24 | - 32 25 | - 16 26 | - 8 27 | num_res_blocks: 2 28 | channel_mult: 29 | - 1 30 | - 4 31 | - 8 32 | num_heads: 8 33 | first_stage_config: 34 | target: ldm.models.autoencoder.VQModelInterface 35 | params: 36 | embed_dim: 3 37 | n_embed: 8192 38 | ddconfig: 39 | double_z: false 40 | z_channels: 3 41 | resolution: 256 42 | in_channels: 3 43 | out_ch: 3 44 | ch: 128 45 | ch_mult: 46 | - 1 47 | - 2 48 | - 4 49 | num_res_blocks: 2 50 | attn_resolutions: [] 51 | dropout: 0.0 52 | lossconfig: 53 | target: torch.nn.Identity 54 | cond_stage_config: 55 | target: ldm.modules.encoders.modules.SpatialRescaler 56 | params: 57 | n_stages: 2 58 | in_channels: 182 59 | out_channels: 3 60 | -------------------------------------------------------------------------------- /VPD/stable-diffusion/scripts/download_first_stages.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | wget -O models/first_stage_models/kl-f4/model.zip https://ommer-lab.com/files/latent-diffusion/kl-f4.zip 3 | wget -O models/first_stage_models/kl-f8/model.zip https://ommer-lab.com/files/latent-diffusion/kl-f8.zip 4 | wget -O models/first_stage_models/kl-f16/model.zip https://ommer-lab.com/files/latent-diffusion/kl-f16.zip 5 | wget -O models/first_stage_models/kl-f32/model.zip https://ommer-lab.com/files/latent-diffusion/kl-f32.zip 6 | wget -O models/first_stage_models/vq-f4/model.zip https://ommer-lab.com/files/latent-diffusion/vq-f4.zip 7 | wget -O models/first_stage_models/vq-f4-noattn/model.zip https://ommer-lab.com/files/latent-diffusion/vq-f4-noattn.zip 8 | wget -O models/first_stage_models/vq-f8/model.zip https://ommer-lab.com/files/latent-diffusion/vq-f8.zip 9 | wget -O 
models/first_stage_models/vq-f8-n256/model.zip https://ommer-lab.com/files/latent-diffusion/vq-f8-n256.zip 10 | wget -O models/first_stage_models/vq-f16/model.zip https://ommer-lab.com/files/latent-diffusion/vq-f16.zip 11 | 12 | 13 | 14 | cd models/first_stage_models/kl-f4 15 | unzip -o model.zip 16 | 17 | cd ../kl-f8 18 | unzip -o model.zip 19 | 20 | cd ../kl-f16 21 | unzip -o model.zip 22 | 23 | cd ../kl-f32 24 | unzip -o model.zip 25 | 26 | cd ../vq-f4 27 | unzip -o model.zip 28 | 29 | cd ../vq-f4-noattn 30 | unzip -o model.zip 31 | 32 | cd ../vq-f8 33 | unzip -o model.zip 34 | 35 | cd ../vq-f8-n256 36 | unzip -o model.zip 37 | 38 | cd ../vq-f16 39 | unzip -o model.zip 40 | 41 | cd ../.. -------------------------------------------------------------------------------- /VPD/stable-diffusion/scripts/download_models.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | wget -O models/ldm/celeba256/celeba-256.zip https://ommer-lab.com/files/latent-diffusion/celeba.zip 3 | wget -O models/ldm/ffhq256/ffhq-256.zip https://ommer-lab.com/files/latent-diffusion/ffhq.zip 4 | wget -O models/ldm/lsun_churches256/lsun_churches-256.zip https://ommer-lab.com/files/latent-diffusion/lsun_churches.zip 5 | wget -O models/ldm/lsun_beds256/lsun_beds-256.zip https://ommer-lab.com/files/latent-diffusion/lsun_bedrooms.zip 6 | wget -O models/ldm/text2img256/model.zip https://ommer-lab.com/files/latent-diffusion/text2img.zip 7 | wget -O models/ldm/cin256/model.zip https://ommer-lab.com/files/latent-diffusion/cin.zip 8 | wget -O models/ldm/semantic_synthesis512/model.zip https://ommer-lab.com/files/latent-diffusion/semantic_synthesis.zip 9 | wget -O models/ldm/semantic_synthesis256/model.zip https://ommer-lab.com/files/latent-diffusion/semantic_synthesis256.zip 10 | wget -O models/ldm/bsr_sr/model.zip https://ommer-lab.com/files/latent-diffusion/sr_bsr.zip 11 | wget -O models/ldm/layout2img-openimages256/model.zip https://ommer-lab.com/files/latent-diffusion/layout2img_model.zip 12 | wget -O models/ldm/inpainting_big/model.zip https://ommer-lab.com/files/latent-diffusion/inpainting_big.zip 13 | 14 | 15 | 16 | cd models/ldm/celeba256 17 | unzip -o celeba-256.zip 18 | 19 | cd ../ffhq256 20 | unzip -o ffhq-256.zip 21 | 22 | cd ../lsun_churches256 23 | unzip -o lsun_churches-256.zip 24 | 25 | cd ../lsun_beds256 26 | unzip -o lsun_beds-256.zip 27 | 28 | cd ../text2img256 29 | unzip -o model.zip 30 | 31 | cd ../cin256 32 | unzip -o model.zip 33 | 34 | cd ../semantic_synthesis512 35 | unzip -o model.zip 36 | 37 | cd ../semantic_synthesis256 38 | unzip -o model.zip 39 | 40 | cd ../bsr_sr 41 | unzip -o model.zip 42 | 43 | cd ../layout2img-openimages256 44 | unzip -o model.zip 45 | 46 | cd ../inpainting_big 47 | unzip -o model.zip 48 | 49 | cd ../.. 
50 | -------------------------------------------------------------------------------- /VPD/stable-diffusion/scripts/tests/test_watermark.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import fire 3 | from imwatermark import WatermarkDecoder 4 | 5 | 6 | def testit(img_path): 7 | bgr = cv2.imread(img_path) 8 | decoder = WatermarkDecoder('bytes', 136) 9 | watermark = decoder.decode(bgr, 'dwtDct') 10 | try: 11 | dec = watermark.decode('utf-8') 12 | except: 13 | dec = "null" 14 | print(dec) 15 | 16 | 17 | if __name__ == "__main__": 18 | fire.Fire(testit) -------------------------------------------------------------------------------- /VPD/stable-diffusion/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup( 4 | name='latent-diffusion', 5 | version='0.0.1', 6 | description='', 7 | packages=find_packages(), 8 | install_requires=[ 9 | 'torch', 10 | 'numpy', 11 | 'tqdm', 12 | ], 13 | ) -------------------------------------------------------------------------------- /VPD/stable-diffusion/src/taming-transformers/License.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2020 Patrick Esser and Robin Rombach and Björn Ommer 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 14 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 15 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
16 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, 17 | DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 18 | OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE 19 | OR OTHER DEALINGS IN THE SOFTWARE./ 20 | -------------------------------------------------------------------------------- /VPD/stable-diffusion/src/taming-transformers/configs/coco_cond_stage.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | base_learning_rate: 4.5e-06 3 | target: taming.models.vqgan.VQSegmentationModel 4 | params: 5 | embed_dim: 256 6 | n_embed: 1024 7 | image_key: "segmentation" 8 | n_labels: 183 9 | ddconfig: 10 | double_z: false 11 | z_channels: 256 12 | resolution: 256 13 | in_channels: 183 14 | out_ch: 183 15 | ch: 128 16 | ch_mult: 17 | - 1 18 | - 1 19 | - 2 20 | - 2 21 | - 4 22 | num_res_blocks: 2 23 | attn_resolutions: 24 | - 16 25 | dropout: 0.0 26 | 27 | lossconfig: 28 | target: taming.modules.losses.segmentation.BCELossWithQuant 29 | params: 30 | codebook_weight: 1.0 31 | 32 | data: 33 | target: main.DataModuleFromConfig 34 | params: 35 | batch_size: 12 36 | train: 37 | target: taming.data.coco.CocoImagesAndCaptionsTrain 38 | params: 39 | size: 296 40 | crop_size: 256 41 | onehot_segmentation: true 42 | use_stuffthing: true 43 | validation: 44 | target: taming.data.coco.CocoImagesAndCaptionsValidation 45 | params: 46 | size: 256 47 | crop_size: 256 48 | onehot_segmentation: true 49 | use_stuffthing: true 50 | -------------------------------------------------------------------------------- /VPD/stable-diffusion/src/taming-transformers/configs/custom_vqgan.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | base_learning_rate: 4.5e-6 3 | target: taming.models.vqgan.VQModel 4 | params: 5 | embed_dim: 256 6 | n_embed: 1024 7 | ddconfig: 8 | double_z: False 9 | z_channels: 256 10 | resolution: 256 11 | in_channels: 3 12 | out_ch: 3 13 | ch: 128 14 | ch_mult: [ 1,1,2,2,4] # num_down = len(ch_mult)-1 15 | num_res_blocks: 2 16 | attn_resolutions: [16] 17 | dropout: 0.0 18 | 19 | lossconfig: 20 | target: taming.modules.losses.vqperceptual.VQLPIPSWithDiscriminator 21 | params: 22 | disc_conditional: False 23 | disc_in_channels: 3 24 | disc_start: 10000 25 | disc_weight: 0.8 26 | codebook_weight: 1.0 27 | 28 | data: 29 | target: main.DataModuleFromConfig 30 | params: 31 | batch_size: 5 32 | num_workers: 8 33 | train: 34 | target: taming.data.custom.CustomTrain 35 | params: 36 | training_images_list_file: some/training.txt 37 | size: 256 38 | validation: 39 | target: taming.data.custom.CustomTest 40 | params: 41 | test_images_list_file: some/test.txt 42 | size: 256 43 | 44 | -------------------------------------------------------------------------------- /VPD/stable-diffusion/src/taming-transformers/configs/faceshq_transformer.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | base_learning_rate: 4.5e-06 3 | target: taming.models.cond_transformer.Net2NetTransformer 4 | params: 5 | cond_stage_key: coord 6 | transformer_config: 7 | target: taming.modules.transformer.mingpt.GPT 8 | params: 9 | vocab_size: 1024 10 | block_size: 512 11 | n_layer: 24 12 | n_head: 16 13 | n_embd: 1024 14 | first_stage_config: 15 | target: taming.models.vqgan.VQModel 16 | params: 17 | ckpt_path: logs/2020-11-09T13-33-36_faceshq_vqgan/checkpoints/last.ckpt 18 | embed_dim: 256 19 | n_embed: 1024 20 | ddconfig: 
21 | double_z: false 22 | z_channels: 256 23 | resolution: 256 24 | in_channels: 3 25 | out_ch: 3 26 | ch: 128 27 | ch_mult: 28 | - 1 29 | - 1 30 | - 2 31 | - 2 32 | - 4 33 | num_res_blocks: 2 34 | attn_resolutions: 35 | - 16 36 | dropout: 0.0 37 | lossconfig: 38 | target: taming.modules.losses.DummyLoss 39 | cond_stage_config: 40 | target: taming.modules.misc.coord.CoordStage 41 | params: 42 | n_embed: 1024 43 | down_factor: 16 44 | 45 | data: 46 | target: main.DataModuleFromConfig 47 | params: 48 | batch_size: 2 49 | num_workers: 8 50 | train: 51 | target: taming.data.faceshq.FacesHQTrain 52 | params: 53 | size: 256 54 | crop_size: 256 55 | coord: True 56 | validation: 57 | target: taming.data.faceshq.FacesHQValidation 58 | params: 59 | size: 256 60 | crop_size: 256 61 | coord: True 62 | -------------------------------------------------------------------------------- /VPD/stable-diffusion/src/taming-transformers/configs/faceshq_vqgan.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | base_learning_rate: 4.5e-6 3 | target: taming.models.vqgan.VQModel 4 | params: 5 | embed_dim: 256 6 | n_embed: 1024 7 | ddconfig: 8 | double_z: False 9 | z_channels: 256 10 | resolution: 256 11 | in_channels: 3 12 | out_ch: 3 13 | ch: 128 14 | ch_mult: [ 1,1,2,2,4] # num_down = len(ch_mult)-1 15 | num_res_blocks: 2 16 | attn_resolutions: [16] 17 | dropout: 0.0 18 | 19 | lossconfig: 20 | target: taming.modules.losses.vqperceptual.VQLPIPSWithDiscriminator 21 | params: 22 | disc_conditional: False 23 | disc_in_channels: 3 24 | disc_start: 30001 25 | disc_weight: 0.8 26 | codebook_weight: 1.0 27 | 28 | data: 29 | target: main.DataModuleFromConfig 30 | params: 31 | batch_size: 3 32 | num_workers: 8 33 | train: 34 | target: taming.data.faceshq.FacesHQTrain 35 | params: 36 | size: 256 37 | crop_size: 256 38 | validation: 39 | target: taming.data.faceshq.FacesHQValidation 40 | params: 41 | size: 256 42 | crop_size: 256 43 | -------------------------------------------------------------------------------- /VPD/stable-diffusion/src/taming-transformers/configs/imagenet_vqgan.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | base_learning_rate: 4.5e-6 3 | target: taming.models.vqgan.VQModel 4 | params: 5 | embed_dim: 256 6 | n_embed: 1024 7 | ddconfig: 8 | double_z: False 9 | z_channels: 256 10 | resolution: 256 11 | in_channels: 3 12 | out_ch: 3 13 | ch: 128 14 | ch_mult: [ 1,1,2,2,4] # num_down = len(ch_mult)-1 15 | num_res_blocks: 2 16 | attn_resolutions: [16] 17 | dropout: 0.0 18 | 19 | lossconfig: 20 | target: taming.modules.losses.vqperceptual.VQLPIPSWithDiscriminator 21 | params: 22 | disc_conditional: False 23 | disc_in_channels: 3 24 | disc_start: 250001 25 | disc_weight: 0.8 26 | codebook_weight: 1.0 27 | 28 | data: 29 | target: main.DataModuleFromConfig 30 | params: 31 | batch_size: 12 32 | num_workers: 24 33 | train: 34 | target: taming.data.imagenet.ImageNetTrain 35 | params: 36 | config: 37 | size: 256 38 | validation: 39 | target: taming.data.imagenet.ImageNetValidation 40 | params: 41 | config: 42 | size: 256 43 | -------------------------------------------------------------------------------- /VPD/stable-diffusion/src/taming-transformers/configs/imagenetdepth_vqgan.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | base_learning_rate: 4.5e-6 3 | target: taming.models.vqgan.VQModel 4 | params: 5 | embed_dim: 256 6 | n_embed: 1024 7 | image_key: depth 8 | 
ddconfig: 9 | double_z: False 10 | z_channels: 256 11 | resolution: 256 12 | in_channels: 1 13 | out_ch: 1 14 | ch: 128 15 | ch_mult: [ 1,1,2,2,4] # num_down = len(ch_mult)-1 16 | num_res_blocks: 2 17 | attn_resolutions: [16] 18 | dropout: 0.0 19 | 20 | lossconfig: 21 | target: taming.modules.losses.vqperceptual.VQLPIPSWithDiscriminator 22 | params: 23 | disc_conditional: False 24 | disc_in_channels: 1 25 | disc_start: 50001 26 | disc_weight: 0.75 27 | codebook_weight: 1.0 28 | 29 | data: 30 | target: main.DataModuleFromConfig 31 | params: 32 | batch_size: 3 33 | num_workers: 8 34 | train: 35 | target: taming.data.imagenet.ImageNetTrainWithDepth 36 | params: 37 | size: 256 38 | validation: 39 | target: taming.data.imagenet.ImageNetValidationWithDepth 40 | params: 41 | size: 256 42 | -------------------------------------------------------------------------------- /VPD/stable-diffusion/src/taming-transformers/configs/sflckr_cond_stage.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | base_learning_rate: 4.5e-06 3 | target: taming.models.vqgan.VQSegmentationModel 4 | params: 5 | embed_dim: 256 6 | n_embed: 1024 7 | image_key: "segmentation" 8 | n_labels: 182 9 | ddconfig: 10 | double_z: false 11 | z_channels: 256 12 | resolution: 256 13 | in_channels: 182 14 | out_ch: 182 15 | ch: 128 16 | ch_mult: 17 | - 1 18 | - 1 19 | - 2 20 | - 2 21 | - 4 22 | num_res_blocks: 2 23 | attn_resolutions: 24 | - 16 25 | dropout: 0.0 26 | 27 | lossconfig: 28 | target: taming.modules.losses.segmentation.BCELossWithQuant 29 | params: 30 | codebook_weight: 1.0 31 | 32 | data: 33 | target: cutlit.DataModuleFromConfig 34 | params: 35 | batch_size: 12 36 | train: 37 | target: taming.data.sflckr.Examples # adjust 38 | params: 39 | size: 256 40 | validation: 41 | target: taming.data.sflckr.Examples # adjust 42 | params: 43 | size: 256 44 | -------------------------------------------------------------------------------- /VPD/stable-diffusion/src/taming-transformers/environment.yaml: -------------------------------------------------------------------------------- 1 | name: taming 2 | channels: 3 | - pytorch 4 | - defaults 5 | dependencies: 6 | - python=3.8.5 7 | - pip=20.3 8 | - cudatoolkit=10.2 9 | - pytorch=1.7.0 10 | - torchvision=0.8.1 11 | - numpy=1.19.2 12 | - pip: 13 | - albumentations==0.4.3 14 | - opencv-python==4.1.2.30 15 | - pudb==2019.2 16 | - imageio==2.9.0 17 | - imageio-ffmpeg==0.4.2 18 | - pytorch-lightning==1.0.8 19 | - omegaconf==2.0.0 20 | - test-tube>=0.7.5 21 | - streamlit>=0.73.1 22 | - einops==0.3.0 23 | - more-itertools>=8.0.0 24 | - transformers==4.3.1 25 | - -e . 
26 | -------------------------------------------------------------------------------- /VPD/stable-diffusion/src/taming-transformers/scripts/extract_submodel.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import sys 3 | 4 | if __name__ == "__main__": 5 | inpath = sys.argv[1] 6 | outpath = sys.argv[2] 7 | submodel = "cond_stage_model" 8 | if len(sys.argv) > 3: 9 | submodel = sys.argv[3] 10 | 11 | print("Extracting {} from {} to {}.".format(submodel, inpath, outpath)) 12 | 13 | sd = torch.load(inpath, map_location="cpu") 14 | new_sd = {"state_dict": dict((k.split(".", 1)[-1],v) 15 | for k,v in sd["state_dict"].items() 16 | if k.startswith("cond_stage_model"))} 17 | torch.save(new_sd, outpath) 18 | -------------------------------------------------------------------------------- /VPD/stable-diffusion/src/taming-transformers/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup( 4 | name='taming-transformers', 5 | version='0.0.1', 6 | description='Taming Transformers for High-Resolution Image Synthesis', 7 | packages=find_packages(), 8 | install_requires=[ 9 | 'torch', 10 | 'numpy', 11 | 'tqdm', 12 | ], 13 | ) 14 | -------------------------------------------------------------------------------- /VPD/stable-diffusion/src/taming-transformers/taming/data/custom.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import albumentations 4 | from torch.utils.data import Dataset 5 | 6 | from taming.data.base import ImagePaths, NumpyPaths, ConcatDatasetWithIndex 7 | 8 | 9 | class CustomBase(Dataset): 10 | def __init__(self, *args, **kwargs): 11 | super().__init__() 12 | self.data = None 13 | 14 | def __len__(self): 15 | return len(self.data) 16 | 17 | def __getitem__(self, i): 18 | example = self.data[i] 19 | return example 20 | 21 | 22 | 23 | class CustomTrain(CustomBase): 24 | def __init__(self, size, training_images_list_file): 25 | super().__init__() 26 | with open(training_images_list_file, "r") as f: 27 | paths = f.read().splitlines() 28 | self.data = ImagePaths(paths=paths, size=size, random_crop=False) 29 | 30 | 31 | class CustomTest(CustomBase): 32 | def __init__(self, size, test_images_list_file): 33 | super().__init__() 34 | with open(test_images_list_file, "r") as f: 35 | paths = f.read().splitlines() 36 | self.data = ImagePaths(paths=paths, size=size, random_crop=False) 37 | 38 | 39 | -------------------------------------------------------------------------------- /VPD/stable-diffusion/src/taming-transformers/taming/data/helper_types.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, Tuple, Optional, NamedTuple, Union 2 | from PIL.Image import Image as pil_image 3 | from torch import Tensor 4 | 5 | try: 6 | from typing import Literal 7 | except ImportError: 8 | from typing_extensions import Literal 9 | 10 | Image = Union[Tensor, pil_image] 11 | BoundingBox = Tuple[float, float, float, float] # x0, y0, w, h 12 | CropMethodType = Literal['none', 'random', 'center', 'random-2d'] 13 | SplitType = Literal['train', 'validation', 'test'] 14 | 15 | 16 | class ImageDescription(NamedTuple): 17 | id: int 18 | file_name: str 19 | original_size: Tuple[int, int] # w, h 20 | url: Optional[str] = None 21 | license: Optional[int] = None 22 | coco_url: Optional[str] = None 23 | date_captured: Optional[str] = None 24 | 
flickr_url: Optional[str] = None 25 | flickr_id: Optional[str] = None 26 | coco_id: Optional[str] = None 27 | 28 | 29 | class Category(NamedTuple): 30 | id: str 31 | super_category: Optional[str] 32 | name: str 33 | 34 | 35 | class Annotation(NamedTuple): 36 | area: float 37 | image_id: str 38 | bbox: BoundingBox 39 | category_no: int 40 | category_id: str 41 | id: Optional[int] = None 42 | source: Optional[str] = None 43 | confidence: Optional[float] = None 44 | is_group_of: Optional[bool] = None 45 | is_truncated: Optional[bool] = None 46 | is_occluded: Optional[bool] = None 47 | is_depiction: Optional[bool] = None 48 | is_inside: Optional[bool] = None 49 | segmentation: Optional[Dict] = None 50 | -------------------------------------------------------------------------------- /VPD/stable-diffusion/src/taming-transformers/taming/lr_scheduler.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class LambdaWarmUpCosineScheduler: 5 | """ 6 | note: use with a base_lr of 1.0 7 | """ 8 | def __init__(self, warm_up_steps, lr_min, lr_max, lr_start, max_decay_steps, verbosity_interval=0): 9 | self.lr_warm_up_steps = warm_up_steps 10 | self.lr_start = lr_start 11 | self.lr_min = lr_min 12 | self.lr_max = lr_max 13 | self.lr_max_decay_steps = max_decay_steps 14 | self.last_lr = 0. 15 | self.verbosity_interval = verbosity_interval 16 | 17 | def schedule(self, n): 18 | if self.verbosity_interval > 0: 19 | if n % self.verbosity_interval == 0: print(f"current step: {n}, recent lr-multiplier: {self.last_lr}") 20 | if n < self.lr_warm_up_steps: 21 | lr = (self.lr_max - self.lr_start) / self.lr_warm_up_steps * n + self.lr_start 22 | self.last_lr = lr 23 | return lr 24 | else: 25 | t = (n - self.lr_warm_up_steps) / (self.lr_max_decay_steps - self.lr_warm_up_steps) 26 | t = min(t, 1.0) 27 | lr = self.lr_min + 0.5 * (self.lr_max - self.lr_min) * ( 28 | 1 + np.cos(t * np.pi)) 29 | self.last_lr = lr 30 | return lr 31 | 32 | def __call__(self, n): 33 | return self.schedule(n) 34 | 35 | -------------------------------------------------------------------------------- /VPD/stable-diffusion/src/taming-transformers/taming/models/dummy_cond_stage.py: -------------------------------------------------------------------------------- 1 | from torch import Tensor 2 | 3 | 4 | class DummyCondStage: 5 | def __init__(self, conditional_key): 6 | self.conditional_key = conditional_key 7 | self.train = None 8 | 9 | def eval(self): 10 | return self 11 | 12 | @staticmethod 13 | def encode(c: Tensor): 14 | return c, None, (None, None, c) 15 | 16 | @staticmethod 17 | def decode(c: Tensor): 18 | return c 19 | 20 | @staticmethod 21 | def to_rgb(c: Tensor): 22 | return c 23 | -------------------------------------------------------------------------------- /VPD/stable-diffusion/src/taming-transformers/taming/modules/losses/__init__.py: -------------------------------------------------------------------------------- 1 | from taming.modules.losses.vqperceptual import DummyLoss 2 | 3 | -------------------------------------------------------------------------------- /VPD/stable-diffusion/src/taming-transformers/taming/modules/losses/segmentation.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torch.nn.functional as F 3 | 4 | 5 | class BCELoss(nn.Module): 6 | def forward(self, prediction, target): 7 | loss = F.binary_cross_entropy_with_logits(prediction,target) 8 | return loss, {} 9 | 10 | 11 | class 
BCELossWithQuant(nn.Module): 12 | def __init__(self, codebook_weight=1.): 13 | super().__init__() 14 | self.codebook_weight = codebook_weight 15 | 16 | def forward(self, qloss, target, prediction, split): 17 | bce_loss = F.binary_cross_entropy_with_logits(prediction,target) 18 | loss = bce_loss + self.codebook_weight*qloss 19 | return loss, {"{}/total_loss".format(split): loss.clone().detach().mean(), 20 | "{}/bce_loss".format(split): bce_loss.detach().mean(), 21 | "{}/quant_loss".format(split): qloss.detach().mean() 22 | } 23 | -------------------------------------------------------------------------------- /VPD/stable-diffusion/src/taming-transformers/taming/modules/misc/coord.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | class CoordStage(object): 4 | def __init__(self, n_embed, down_factor): 5 | self.n_embed = n_embed 6 | self.down_factor = down_factor 7 | 8 | def eval(self): 9 | return self 10 | 11 | def encode(self, c): 12 | """fake vqmodel interface""" 13 | assert 0.0 <= c.min() and c.max() <= 1.0 14 | b,ch,h,w = c.shape 15 | assert ch == 1 16 | 17 | c = torch.nn.functional.interpolate(c, scale_factor=1/self.down_factor, 18 | mode="area") 19 | c = c.clamp(0.0, 1.0) 20 | c = self.n_embed*c 21 | c_quant = c.round() 22 | c_ind = c_quant.to(dtype=torch.long) 23 | 24 | info = None, None, c_ind 25 | return c_quant, None, info 26 | 27 | def decode(self, c): 28 | c = c/self.n_embed 29 | c = torch.nn.functional.interpolate(c, scale_factor=self.down_factor, 30 | mode="nearest") 31 | return c 32 | -------------------------------------------------------------------------------- /VPD/vpd/__init__.py: -------------------------------------------------------------------------------- 1 | from .models import UNetWrapper, TextAdapter -------------------------------------------------------------------------------- /ZoeDepth/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Intelligent Systems Lab Org 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /ZoeDepth/README.md: -------------------------------------------------------------------------------- 1 | # Benchmark on the paper ZoeDepth 2 | 3 | 1. Download the InSpaceType eval set.
Install [torch and torchvision](https://pytorch.org/get-started/previous-versions/) and the packages: matplotlib, tqdm, tensorboardX, timm, mmcv, opencv-python 4 | 5 | 2. Run the evaluation: 6 | 7 | ``` 8 | python demo.py -i split_files.txt -o outputs/ 9 | ``` 10 | 11 | The command generates report files for the hierarchy (H0-H2). *-all means the overall result, H0-H2 means the level of hierarchy, and H1_xx means the scene space type number. See [space_type_def.yml](https://github.com/DepthComputation/InSpaceType_Benchmark/blob/main/space_type_def.yml) for the space type number definition. 12 | -------------------------------------------------------------------------------- /ZoeDepth/assets/zoedepth-teaser.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DepthComputation/InSpaceType_Benchmark/4ab9760dbdf9153fc2845afd584762509f9fe0e0/ZoeDepth/assets/zoedepth-teaser.png -------------------------------------------------------------------------------- /ZoeDepth/environment.yml: -------------------------------------------------------------------------------- 1 | name: zoe 2 | channels: 3 | - pytorch 4 | - nvidia 5 | - conda-forge 6 | dependencies: 7 | - cuda=11.7.1 8 | - h5py=3.7.0 9 | - hdf5=1.12.2 10 | - matplotlib=3.6.2 11 | - matplotlib-base=3.6.2 12 | - numpy=1.24.1 13 | - opencv=4.6.0 14 | - pip=22.3.1 15 | - python=3.9.7 16 | - pytorch=1.13.1 17 | - pytorch-cuda=11.7 18 | - pytorch-mutex=1.0 19 | - scipy=1.10.0 20 | - torchaudio=0.13.1 21 | - torchvision=0.14.1 22 | - pip: 23 | - huggingface-hub==0.11.1 24 | - timm==0.6.12 25 | - tqdm==4.64.1 26 | - wandb==0.13.9 27 | -------------------------------------------------------------------------------- /ZoeDepth/sanity_hub.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | # Copyright (c) 2022 Intelligent Systems Lab Org 4 | 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE.
22 | 23 | # File author: Shariq Farooq Bhat 24 | 25 | import torch 26 | import numpy as np 27 | from torchvision.transforms import ToTensor 28 | from PIL import Image 29 | from zoedepth.utils.misc import get_image_from_url, colorize 30 | 31 | from zoedepth.models.builder import build_model 32 | from zoedepth.utils.config import get_config 33 | from pprint import pprint 34 | 35 | 36 | 37 | # Trigger reload of MiDaS 38 | torch.hub.help("intel-isl/MiDaS", "DPT_BEiT_L_384", force_reload=True) 39 | 40 | 41 | model = torch.hub.load(".", "ZoeD_K", source="local", pretrained=True) 42 | model = torch.hub.load(".", "ZoeD_NK", source="local", pretrained=True) 43 | model = torch.hub.load(".", "ZoeD_N", source="local", pretrained=True) 44 | -------------------------------------------------------------------------------- /ZoeDepth/ui/ui_requirements.txt: -------------------------------------------------------------------------------- 1 | gradio 2 | trimesh==3.9.42 -------------------------------------------------------------------------------- /ZoeDepth/zoedepth/data/__init__.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | # Copyright (c) 2022 Intelligent Systems Lab Org 4 | 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | # File author: Shariq Farooq Bhat 24 | 25 | -------------------------------------------------------------------------------- /ZoeDepth/zoedepth/models/__init__.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | # Copyright (c) 2022 Intelligent Systems Lab Org 4 | 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | # File author: Shariq Farooq Bhat 24 | 25 | -------------------------------------------------------------------------------- /ZoeDepth/zoedepth/models/base_models/__init__.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | # Copyright (c) 2022 Intelligent Systems Lab Org 4 | 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | # File author: Shariq Farooq Bhat 24 | 25 | -------------------------------------------------------------------------------- /ZoeDepth/zoedepth/models/zoedepth/__init__.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | # Copyright (c) 2022 Intelligent Systems Lab Org 4 | 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 
22 | 23 | # File author: Shariq Farooq Bhat 24 | 25 | from .zoedepth_v1 import ZoeDepth 26 | 27 | all_versions = { 28 | "v1": ZoeDepth, 29 | } 30 | 31 | get_version = lambda v : all_versions[v] -------------------------------------------------------------------------------- /ZoeDepth/zoedepth/models/zoedepth/config_zoedepth.json: -------------------------------------------------------------------------------- 1 | { 2 | "model": { 3 | "name": "ZoeDepth", 4 | "version_name": "v1", 5 | "n_bins": 64, 6 | "bin_embedding_dim": 128, 7 | "bin_centers_type": "softplus", 8 | "n_attractors":[16, 8, 4, 1], 9 | "attractor_alpha": 1000, 10 | "attractor_gamma": 2, 11 | "attractor_kind" : "mean", 12 | "attractor_type" : "inv", 13 | "midas_model_type" : "DPT_BEiT_L_384", 14 | "min_temp": 0.0212, 15 | "max_temp": 50.0, 16 | "output_distribution": "logbinomial", 17 | "memory_efficient": true, 18 | "inverse_midas": false, 19 | "img_size": [384, 512] 20 | }, 21 | 22 | "train": { 23 | "train_midas": true, 24 | "use_pretrained_midas": true, 25 | "trainer": "zoedepth", 26 | "epochs": 5, 27 | "bs": 16, 28 | "optim_kwargs": {"lr": 0.000161, "wd": 0.01}, 29 | "sched_kwargs": {"div_factor": 1, "final_div_factor": 10000, "pct_start": 0.7, "three_phase":false, "cycle_momentum": true}, 30 | "same_lr": false, 31 | "w_si": 1, 32 | "w_domain": 0.2, 33 | "w_reg": 0, 34 | "w_grad": 0, 35 | "avoid_boundary": false, 36 | "random_crop": false, 37 | "input_width": 640, 38 | "input_height": 480, 39 | "midas_lr_factor": 1, 40 | "encoder_lr_factor":10, 41 | "pos_enc_lr_factor":10, 42 | "freeze_midas_bn": true 43 | 44 | }, 45 | 46 | "infer":{ 47 | "train_midas": false, 48 | "use_pretrained_midas": false, 49 | "pretrained_resource" : "url::https://github.com/isl-org/ZoeDepth/releases/download/v1.0/ZoeD_M12_N.pt", 50 | "force_keep_ar": true 51 | }, 52 | 53 | "eval":{ 54 | "train_midas": false, 55 | "use_pretrained_midas": false, 56 | "pretrained_resource" : "url::https://github.com/isl-org/ZoeDepth/releases/download/v1.0/ZoeD_M12_N.pt" 57 | } 58 | } -------------------------------------------------------------------------------- /ZoeDepth/zoedepth/models/zoedepth/config_zoedepth_kitti.json: -------------------------------------------------------------------------------- 1 | { 2 | "model": { 3 | "bin_centers_type": "normed", 4 | "img_size": [384, 768] 5 | }, 6 | 7 | "train": { 8 | }, 9 | 10 | "infer":{ 11 | "train_midas": false, 12 | "use_pretrained_midas": false, 13 | "pretrained_resource" : "url::https://github.com/isl-org/ZoeDepth/releases/download/v1.0/ZoeD_M12_K.pt", 14 | "force_keep_ar": true 15 | }, 16 | 17 | "eval":{ 18 | "train_midas": false, 19 | "use_pretrained_midas": false, 20 | "pretrained_resource" : "url::https://github.com/isl-org/ZoeDepth/releases/download/v1.0/ZoeD_M12_K.pt" 21 | } 22 | } -------------------------------------------------------------------------------- /ZoeDepth/zoedepth/models/zoedepth_nk/__init__.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | # Copyright (c) 2022 Intelligent Systems Lab Org 4 | 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the 
following conditions: 11 | 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | # File author: Shariq Farooq Bhat 24 | 25 | from .zoedepth_nk_v1 import ZoeDepthNK 26 | 27 | all_versions = { 28 | "v1": ZoeDepthNK, 29 | } 30 | 31 | get_version = lambda v : all_versions[v] -------------------------------------------------------------------------------- /ZoeDepth/zoedepth/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | # Copyright (c) 2022 Intelligent Systems Lab Org 4 | 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | # File author: Shariq Farooq Bhat 24 | 25 | -------------------------------------------------------------------------------- /ZoeDepth/zoedepth/utils/arg_utils.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | def infer_type(x): # hacky way to infer type from string args 4 | if not isinstance(x, str): 5 | return x 6 | 7 | try: 8 | x = int(x) 9 | return x 10 | except ValueError: 11 | pass 12 | 13 | try: 14 | x = float(x) 15 | return x 16 | except ValueError: 17 | pass 18 | 19 | return x 20 | 21 | 22 | def parse_unknown(unknown_args): 23 | clean = [] 24 | for a in unknown_args: 25 | if "=" in a: 26 | k, v = a.split("=", 1) # split on the first '=' only, so values may themselves contain '=' 27 | clean.extend([k, v]) 28 | else: 29 | clean.append(a) 30 | 31 | keys = clean[::2] 32 | values = clean[1::2] 33 | return {k.replace("--", ""): infer_type(v) for k, v in zip(keys, values)} 34 | -------------------------------------------------------------------------------- /bts/README.md: -------------------------------------------------------------------------------- 1 | # Benchmark on From Big to Small: Multi-Scale Local Planar Guidance for Monocular Depth Estimation 2 | 3 | 1. Download the InSpaceType eval set. 
Install [torch and torchvision](https://pytorch.org/get-started/previous-versions/) and the packages: matplotlib, tqdm, pandas, opencv-python, tensorboardX 4 | 5 | 2. Download the pretrained model 'bts_nyu_v2_pytorch_densenet161.zip' from the [Official Link](https://cogaplex-bts.s3.ap-northeast-2.amazonaws.com/bts_nyu_v2_pytorch_densenet161.zip) and extract it under 'models' 6 | 7 | 3. Run the evaluation: 8 | 9 | ``` 10 | cd pytorch 11 | python bts_test.py --dataset nyu --filenames_file ../train_test_inputs/split_files.txt --checkpoint_path models/bts_nyu_v2_pytorch_densenet161/model --max_depth 10 --encoder densenet161_bts --model_name bts_nyu_v2_pytorch_densenet161 12 | ``` 13 | 14 | The command generates report files for each level of the hierarchy (H0-H2). *-all means the overall result, H0-H2 indicate the hierarchy level, and H1_xx refers to the scene space type number. See [space_type_def.yml](https://github.com/DepthComputation/InSpaceType_Benchmark/blob/main/space_type_def.yml) for the space type number definition (a short parsing sketch follows the YAML dump below). 15 | 16 | -------------------------------------------------------------------------------- /bts/pytorch/run_bts_live_3d.sh: -------------------------------------------------------------------------------- 1 | python3 bts_live_3d.py --model_name bts_nyu_v2_pytorch_densenet161 --encoder densenet161_bts --checkpoint_path ./models/bts_nyu_v2_pytorch_densenet161/model --max_depth 10 --input_height 480 --input_width 640 2 | -------------------------------------------------------------------------------- /bts/utils/download_from_gdrive.py: -------------------------------------------------------------------------------- 1 | # Source: https://stackoverflow.com/a/39225039 2 | 3 | import requests 4 | 5 | 6 | def download_file_from_google_drive(id, destination): 7 | def get_confirm_token(response): 8 | for key, value in response.cookies.items(): 9 | if key.startswith('download_warning'): 10 | return value 11 | 12 | return None 13 | 14 | def save_response_content(response, destination): 15 | CHUNK_SIZE = 32768 16 | 17 | with open(destination, "wb") as f: 18 | for chunk in response.iter_content(CHUNK_SIZE): 19 | if chunk: # filter out keep-alive new chunks 20 | f.write(chunk) 21 | 22 | URL = "https://docs.google.com/uc?export=download" 23 | 24 | session = requests.Session() 25 | 26 | response = session.get(URL, params = { 'id' : id }, stream = True) 27 | token = get_confirm_token(response) 28 | 29 | if token: 30 | params = { 'id' : id, 'confirm' : token } 31 | response = session.get(URL, params = params, stream = True) 32 | 33 | save_response_content(response, destination) 34 | 35 | 36 | if __name__ == "__main__": 37 | import sys 38 | if len(sys.argv) != 3: 39 | print("Usage: python download_from_gdrive.py drive_file_id destination_file_path") 40 | else: 41 | # TAKE ID FROM SHAREABLE LINK 42 | file_id = sys.argv[1] 43 | # DESTINATION FILE ON YOUR DISK 44 | destination = sys.argv[2] 45 | download_file_from_google_drive(file_id, destination) 46 | -------------------------------------------------------------------------------- /bts/utils/splits.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DepthComputation/InSpaceType_Benchmark/4ab9760dbdf9153fc2845afd584762509f9fe0e0/bts/utils/splits.mat -------------------------------------------------------------------------------- /pics/dataset-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DepthComputation/InSpaceType_Benchmark/4ab9760dbdf9153fc2845afd584762509f9fe0e0/pics/dataset-1.png 
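Aside: the `parse_unknown` helper dumped above in `ZoeDepth/zoedepth/utils/arg_utils.py` is easiest to understand from a concrete call. The sketch below is illustrative only: the declared argparse flag and the command-line values are made up for the example, and the import assumes the script is run from the ZoeDepth directory so that the `zoedepth` package is importable.

```python
# Illustrative use of parse_unknown() from zoedepth/utils/arg_utils.py:
# unknown CLI flags are turned into a {key: value} dict of config overrides,
# with int/float values inferred from the strings by infer_type().
import argparse

from zoedepth.utils.arg_utils import parse_unknown

parser = argparse.ArgumentParser()
parser.add_argument("-m", "--model", default="zoedepth")

# Hypothetical command line: only --model is a declared flag; everything else
# is collected as "unknown" and handed to parse_unknown().
args, unknown = parser.parse_known_args(
    ["--model", "zoedepth_nk", "--bs=8", "--lr", "0.000161", "--dataset", "nyu"]
)
overrides = parse_unknown(unknown)
print(overrides)  # {'bs': 8, 'lr': 0.000161, 'dataset': 'nyu'}
```

Both `--key=value` and `--key value` forms are accepted, which is why the helper first flattens the arguments into alternating key/value entries before pairing them up.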
-------------------------------------------------------------------------------- /pics/dataset-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DepthComputation/InSpaceType_Benchmark/4ab9760dbdf9153fc2845afd584762509f9fe0e0/pics/dataset-2.png -------------------------------------------------------------------------------- /pics/fitting.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DepthComputation/InSpaceType_Benchmark/4ab9760dbdf9153fc2845afd584762509f9fe0e0/pics/fitting.png -------------------------------------------------------------------------------- /pics/group.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DepthComputation/InSpaceType_Benchmark/4ab9760dbdf9153fc2845afd584762509f9fe0e0/pics/group.png -------------------------------------------------------------------------------- /pics/heirarchy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DepthComputation/InSpaceType_Benchmark/4ab9760dbdf9153fc2845afd584762509f9fe0e0/pics/heirarchy.png -------------------------------------------------------------------------------- /pics/mitigation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DepthComputation/InSpaceType_Benchmark/4ab9760dbdf9153fc2845afd584762509f9fe0e0/pics/mitigation.png -------------------------------------------------------------------------------- /pics/overall.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DepthComputation/InSpaceType_Benchmark/4ab9760dbdf9153fc2845afd584762509f9fe0e0/pics/overall.png -------------------------------------------------------------------------------- /pics/type.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DepthComputation/InSpaceType_Benchmark/4ab9760dbdf9153fc2845afd584762509f9fe0e0/pics/type.png -------------------------------------------------------------------------------- /space_type_def.yml: -------------------------------------------------------------------------------- 1 | H0: 2 | 1: Household Space 3 | 2: Workspace 4 | 3: Campus 5 | 4: Functional Space 6 | 7 | H1: 8 | 1: Private 9 | 2: Office 10 | 3: Hallway 11 | 4: Lounge 12 | 5: Meeting 13 | 6: Large 14 | 7: Classroom 15 | 8: Library 16 | 9: Kitchen 17 | 10: Playroom 18 | 11: Living 19 | 12: Bathroom 20 | 21 | H2: 22 | 1: Hotel Room 23 | 2: Western-style Bedroom 24 | 3: Eastern-style Bedroom 25 | 4: Storage Room 26 | 5: Dressing Room 27 | 6: Entrance 28 | 7: Private Chamber 29 | 8: Lab Space 30 | 9: Mail Room 31 | 10: Eastern-style Workspace 32 | 11: Narrow Hallway 33 | 12: Wider Hallway 34 | 13: Lounge 35 | 14: Meeting Room 36 | 15: Auditorium 37 | 16: Reception Desk 38 | 17: Banquet Room 39 | 18: Auditorium Entryway 40 | 19: Classroom 41 | 20: Study Space 42 | 21: Bookshelf 43 | 22: Asian-style Kitchen 44 | 23: Playroom 45 | 24: Asian-style Living Room 46 | 25: American-Style Living Room 47 | 26: Bathroom --------------------------------------------------------------------------------
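To close the loop with the benchmark READMEs above, which label their per-type report columns with codes such as H1_xx, here is a minimal sketch of resolving those codes against `space_type_def.yml`. It assumes PyYAML is installed and the working directory is the repository root; the helper name `type_name` is purely illustrative.

```python
# Minimal sketch: resolve hierarchy codes such as "H1_9" to the space type
# names defined in space_type_def.yml. Assumes PyYAML is installed and the
# script is run from the repository root.
import yaml

with open("space_type_def.yml") as f:
    space_types = yaml.safe_load(f)  # {"H0": {1: "Household Space", ...}, ...}


def type_name(code):
    """Map a code like 'H1_9' to its space type name, e.g. 'Kitchen'."""
    level, idx = code.split("_")
    return space_types[level][int(idx)]


print(type_name("H0_2"))   # Workspace
print(type_name("H1_9"))   # Kitchen
print(type_name("H2_26"))  # Bathroom
```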