├── .gitignore ├── Adabins ├── README.md ├── dataloader.py ├── demo.py ├── evaluate.py ├── infer.py ├── loss.py ├── model_io.py ├── models │ ├── __init__.py │ ├── layers.py │ ├── miniViT.py │ └── unet_adaptive_bins.py ├── train.py └── utils.py ├── DPT ├── README.md ├── dpt │ ├── __init__.py │ ├── base_model.py │ ├── blocks.py │ ├── midas_net.py │ ├── models.py │ ├── transforms.py │ └── vit.py ├── requirements.txt ├── run_monodepth.py └── util │ ├── __init__.py │ ├── io.py │ ├── misc.py │ └── pallete.py ├── Decompose ├── README.md ├── dataloader.py ├── demo.py ├── eval.py ├── evaluation.py ├── models │ ├── ViT.py │ ├── models.py │ └── upsample.py ├── split_files.txt └── utils.py ├── DepthAnything ├── LICENSE ├── README.md ├── app.py ├── controlnet │ ├── README.md │ └── config.json ├── depth_anything │ ├── blocks.py │ ├── dpt.py │ └── util │ │ └── transform.py ├── metric_depth │ ├── checkpoints │ │ └── .placeholder │ ├── demo.py │ ├── environment.yml │ ├── evaluate.py │ ├── train_mix.py │ ├── train_mono.py │ ├── train_test_inputs │ │ ├── kitti_eigen_test_files_with_gt.txt │ │ ├── kitti_eigen_train_files_with_gt.txt │ │ ├── nyudepthv2_test_files_with_gt.txt │ │ └── nyudepthv2_train_files_with_gt.txt │ └── zoedepth │ │ ├── data │ │ ├── __init__.py │ │ ├── data_mono.py │ │ ├── ddad.py │ │ ├── diml_indoor_test.py │ │ ├── diml_outdoor_test.py │ │ ├── diode.py │ │ ├── hypersim.py │ │ ├── ibims.py │ │ ├── preprocess.py │ │ ├── sun_rgbd_loader.py │ │ ├── transforms.py │ │ ├── vkitti.py │ │ └── vkitti2.py │ │ ├── models │ │ ├── __init__.py │ │ ├── base_models │ │ │ ├── __init__.py │ │ │ ├── depth_anything.py │ │ │ ├── dpt_dinov2 │ │ │ │ ├── blocks.py │ │ │ │ └── dpt.py │ │ │ └── midas.py │ │ ├── builder.py │ │ ├── depth_model.py │ │ ├── layers │ │ │ ├── attractor.py │ │ │ ├── dist_layers.py │ │ │ ├── localbins_layers.py │ │ │ └── patch_transformer.py │ │ ├── model_io.py │ │ ├── zoedepth │ │ │ ├── __init__.py │ │ │ ├── config_zoedepth.json │ │ │ ├── config_zoedepth_kitti.json │ │ │ └── zoedepth_v1.py │ │ └── zoedepth_nk │ │ │ ├── __init__.py │ │ │ ├── config_zoedepth_nk.json │ │ │ └── zoedepth_nk_v1.py │ │ ├── trainers │ │ ├── base_trainer.py │ │ ├── builder.py │ │ ├── loss.py │ │ ├── zoedepth_nk_trainer.py │ │ └── zoedepth_trainer.py │ │ └── utils │ │ ├── __init__.py │ │ ├── arg_utils.py │ │ ├── config.py │ │ ├── easydict │ │ └── __init__.py │ │ ├── geometry.py │ │ └── misc.py ├── requirements.txt ├── run.py ├── run_video.py ├── semseg │ ├── README.md │ ├── config │ │ └── depth_anything │ │ │ ├── depth_anything_large_mask2former_16xb1_160k_ade20k_896x896.py │ │ │ ├── depth_anything_large_mask2former_16xb1_80k_cityscapes_896x896.py │ │ │ └── depth_anything_large_mask2former_16xb1_80k_cityscapes_896x896_ms.py │ └── dinov2.py └── torchhub │ ├── README.md │ └── facebookresearch_dinov2_main │ ├── CODE_OF_CONDUCT.md │ ├── CONTRIBUTING.md │ ├── LICENSE │ ├── MODEL_CARD.md │ ├── README.md │ ├── conda.yaml │ ├── dinov2 │ ├── __init__.py │ ├── configs │ │ ├── __init__.py │ │ ├── eval │ │ │ ├── vitb14_pretrain.yaml │ │ │ ├── vitg14_pretrain.yaml │ │ │ ├── vitl14_pretrain.yaml │ │ │ └── vits14_pretrain.yaml │ │ ├── ssl_default_config.yaml │ │ └── train │ │ │ ├── vitg14.yaml │ │ │ ├── vitl14.yaml │ │ │ └── vitl16_short.yaml │ ├── data │ │ ├── __init__.py │ │ ├── adapters.py │ │ ├── augmentations.py │ │ ├── collate.py │ │ ├── datasets │ │ │ ├── __init__.py │ │ │ ├── decoders.py │ │ │ ├── extended.py │ │ │ ├── image_net.py │ │ │ └── image_net_22k.py │ │ ├── loaders.py │ │ ├── masking.py │ │ ├── samplers.py │ │ └── 
transforms.py │ ├── distributed │ │ └── __init__.py │ ├── eval │ │ ├── __init__.py │ │ ├── knn.py │ │ ├── linear.py │ │ ├── log_regression.py │ │ ├── metrics.py │ │ ├── setup.py │ │ └── utils.py │ ├── fsdp │ │ └── __init__.py │ ├── layers │ │ ├── __init__.py │ │ ├── attention.py │ │ ├── block.py │ │ ├── dino_head.py │ │ ├── drop_path.py │ │ ├── layer_scale.py │ │ ├── mlp.py │ │ ├── patch_embed.py │ │ └── swiglu_ffn.py │ ├── logging │ │ ├── __init__.py │ │ └── helpers.py │ ├── loss │ │ ├── __init__.py │ │ ├── dino_clstoken_loss.py │ │ ├── ibot_patch_loss.py │ │ └── koleo_loss.py │ ├── models │ │ ├── __init__.py │ │ └── vision_transformer.py │ ├── run │ │ ├── __init__.py │ │ ├── eval │ │ │ ├── knn.py │ │ │ ├── linear.py │ │ │ └── log_regression.py │ │ ├── submit.py │ │ └── train │ │ │ └── train.py │ ├── train │ │ ├── __init__.py │ │ ├── ssl_meta_arch.py │ │ └── train.py │ └── utils │ │ ├── __init__.py │ │ ├── cluster.py │ │ ├── config.py │ │ ├── dtype.py │ │ ├── param_groups.py │ │ └── utils.py │ ├── hubconf.py │ ├── pyproject.toml │ ├── requirements-dev.txt │ ├── requirements.txt │ ├── scripts │ └── lint.sh │ ├── setup.cfg │ ├── setup.py │ ├── utils.py │ └── vision_transformer.py ├── DistDepth ├── README.md ├── demo.py ├── layers.py ├── networks │ ├── __init__.py │ ├── depth_decoder.py │ ├── pose_decoder.py │ └── resnet_encoder.py ├── options.py ├── split_files.txt └── utils.py ├── GLPDepth ├── README.md └── demo_glpn.py ├── InSpaceType_meta.csv ├── IronDepth ├── README.md ├── data │ ├── dataloader_custom.py │ └── dataloader_custom_rev.py ├── models │ ├── IronDepth.py │ └── submodules │ │ ├── DNET.py │ │ ├── D_submodules.py │ │ └── Dr_submodules.py ├── models_normal │ ├── NNET.py │ └── submodules.py ├── requirements.txt ├── test.py └── utils │ └── utils.py ├── MIM ├── README.md ├── configs │ ├── base_options.py │ ├── test_options.py │ └── train_options.py ├── dataset │ ├── base_dataset.py │ ├── filenames │ │ ├── eigen_benchmark │ │ │ ├── test_list.txt │ │ │ └── train_list.txt │ │ └── nyudepthv2 │ │ │ ├── split_files.txt │ │ │ ├── test_list.txt │ │ │ └── train_list.txt │ ├── imagepath.py │ ├── kitti.py │ └── nyudepthv2.py ├── models │ ├── checkpoint.py │ ├── model.py │ ├── optimizer.py │ └── swin_transformer_v2.py ├── requirements.txt ├── test.py └── utils │ ├── criterion.py │ ├── extract_official_train_test_set_from_mat.py │ ├── logging.py │ └── metrics.py ├── NeWCRFs ├── README.md ├── configs │ ├── arguments_eval_kittieigen.txt │ ├── arguments_eval_nyu.txt │ ├── arguments_train_kittieigen.txt │ └── arguments_train_nyu.txt ├── data_splits │ ├── eigen_test_files_with_gt.txt │ ├── eigen_train_files_with_gt.txt │ ├── kitti_depth_prediction_train.txt │ ├── kitti_official_test.txt │ ├── kitti_official_valid.txt │ ├── nyudepthv2_test_files_with_gt.txt │ ├── nyudepthv2_train_files_with_gt_dense.txt │ ├── split_files.txt │ └── test.txt └── newcrfs │ ├── dataloaders │ ├── __init__.py │ ├── dataloader.py │ └── dataloader_kittipred.py │ ├── demo.py │ ├── eval.py │ ├── networks │ ├── NewCRFDepth.py │ ├── __init__.py │ ├── newcrf_layers.py │ ├── newcrf_utils.py │ ├── swin_transformer.py │ └── uper_crf_head.py │ ├── test.py │ ├── train.py │ └── utils.py ├── PixelFormer ├── README.md ├── configs │ ├── arguments_eval_kittieigen.txt │ ├── arguments_eval_nyu.txt │ ├── arguments_train_kittieigen.txt │ └── arguments_train_nyu.txt ├── data_splits │ ├── eigen_test_files_with_gt.txt │ ├── eigen_train_files_with_gt.txt │ ├── kitti_depth_prediction_train.txt │ ├── kitti_official_test.txt │ ├── 
kitti_official_valid.txt │ ├── nyudepthv2_test_files_with_gt.txt │ ├── nyudepthv2_train_files_with_gt_dense.txt │ ├── split_files.txt │ └── test.txt └── pixelformer │ ├── dataloaders │ ├── __init__.py │ ├── dataloader.py │ └── dataloader_kittipred.py │ ├── demo.py │ ├── eval.py │ ├── load.py │ ├── networks │ ├── PQI.py │ ├── PixelFormer.py │ ├── SAM.py │ ├── __init__.py │ ├── swin_transformer.py │ └── utils.py │ ├── test.py │ ├── train.py │ └── utils.py ├── README.md ├── Unidepth ├── LICENSE ├── README.md ├── assets │ ├── demo │ │ ├── depth.png │ │ ├── intrinsics.npy │ │ ├── output.png │ │ └── rgb.png │ └── docs │ │ ├── nuscenes_surround.gif │ │ ├── theoffice.gif │ │ └── unidepth-banner.png ├── configs │ ├── config_v1_cnvnxtl.json │ └── config_v1_vitl14.json ├── demo.py ├── hubconf.py ├── install.sh ├── pyproject.toml ├── requirements.txt └── unidepth │ ├── __init__.py │ ├── layers │ ├── __init__.py │ ├── activation.py │ ├── attention.py │ ├── convnext.py │ ├── drop_path.py │ ├── layer_scale.py │ ├── mlp.py │ ├── nystrom_attention.py │ ├── positional_encoding.py │ └── upsample.py │ ├── models │ ├── __init__.py │ ├── backbones │ │ ├── __init__.py │ │ ├── convnext.py │ │ ├── convnext2.py │ │ ├── dinov2.py │ │ └── metadinov2 │ │ │ ├── __init__.py │ │ │ ├── attention.py │ │ │ ├── block.py │ │ │ ├── dino_head.py │ │ │ ├── drop_path.py │ │ │ ├── layer_scale.py │ │ │ ├── mlp.py │ │ │ ├── patch_embed.py │ │ │ └── swiglu_ffn.py │ ├── encoder.py │ └── unidepthv1 │ │ ├── __init__.py │ │ ├── decoder.py │ │ └── unidepthv1.py │ ├── ops │ ├── __init__.py │ ├── losses.py │ └── scheduler.py │ └── utils │ ├── __init__.py │ ├── constants.py │ ├── distributed.py │ ├── ema_torch.py │ ├── evaluation_depth.py │ ├── geometric.py │ ├── misc.py │ ├── positional_embedding.py │ ├── sht.py │ └── visualization.py ├── VPD ├── LICENSE ├── README.md ├── depth │ ├── README.md │ ├── configs │ │ ├── base_options.py │ │ ├── test_options.py │ │ └── train_options.py │ ├── dataset │ │ ├── base_dataset.py │ │ ├── filenames │ │ │ └── nyudepthv2 │ │ │ │ ├── split_files.txt │ │ │ │ └── split_files_sml.txt │ │ ├── imagepath.py │ │ └── nyudepthv2.py │ ├── dump_nyu_text_embeddings.py │ ├── extract_official_train_test_set_from_mat.py │ ├── models_depth │ │ ├── checkpoint.py │ │ ├── model.py │ │ └── optimizer.py │ ├── nyu_class_embeddings.pth │ ├── nyu_class_list.json │ ├── requirements.txt │ ├── splits.mat │ ├── src │ │ └── clip │ │ │ ├── .gitignore │ │ │ ├── LICENSE │ │ │ ├── MANIFEST.in │ │ │ ├── README.md │ │ │ ├── clip │ │ │ ├── __init__.py │ │ │ ├── bpe_simple_vocab_16e6.txt.gz │ │ │ ├── clip.py │ │ │ ├── model.py │ │ │ └── simple_tokenizer.py │ │ │ ├── data │ │ │ ├── country211.md │ │ │ ├── prompts.md │ │ │ ├── rendered-sst2.md │ │ │ └── yfcc100m.md │ │ │ ├── hubconf.py │ │ │ ├── model-card.md │ │ │ ├── notebooks │ │ │ ├── Interacting_with_CLIP.ipynb │ │ │ └── Prompt_Engineering_for_ImageNet.ipynb │ │ │ ├── requirements.txt │ │ │ ├── setup.py │ │ │ └── tests │ │ │ └── test_consistency.py │ ├── test.py │ ├── test.sh │ ├── train.py │ ├── train.sh │ ├── utils.py │ ├── utils_depth │ │ ├── criterion.py │ │ ├── logging.py │ │ └── metrics.py │ └── v1-inference.yaml ├── refer │ ├── README.md │ ├── args.py │ ├── data │ │ └── dataset_refer_clip.py │ ├── models_refer │ │ ├── __init__.py │ │ └── model.py │ ├── refer │ │ ├── LICENSE │ │ ├── Makefile │ │ ├── README.md │ │ ├── evaluation │ │ │ ├── __init__.py │ │ │ ├── bleu │ │ │ │ ├── LICENSE │ │ │ │ ├── __init__.py │ │ │ │ ├── bleu.py │ │ │ │ └── bleu_scorer.py │ │ │ ├── cider │ │ │ │ ├── 
__init__.py │ │ │ │ ├── cider.py │ │ │ │ └── cider_scorer.py │ │ │ ├── meteor │ │ │ │ ├── __init__.py │ │ │ │ └── meteor.py │ │ │ ├── readme.txt │ │ │ ├── refEvaluation.py │ │ │ ├── rouge │ │ │ │ ├── __init__.py │ │ │ │ └── rouge.py │ │ │ └── tokenizer │ │ │ │ ├── __init__.py │ │ │ │ ├── ptbtokenizer.py │ │ │ │ └── stanford-corenlp-3.4.1.jar │ │ ├── external │ │ │ ├── README.md │ │ │ ├── __init__.py │ │ │ ├── _mask.pyx │ │ │ ├── mask.py │ │ │ ├── maskApi.c │ │ │ └── maskApi.h │ │ ├── pyEvalDemo.ipynb │ │ ├── pyReferDemo.ipynb │ │ ├── refer.py │ │ ├── setup.py │ │ └── test │ │ │ ├── sample_expressions_testA.json │ │ │ └── sample_expressions_testB.json │ ├── requirements.txt │ ├── test.py │ ├── test.sh │ ├── train.py │ ├── train.sh │ ├── transforms.py │ ├── utils.py │ └── v1-inference.yaml ├── segmentation │ ├── README.md │ ├── class_embeddings.pth │ ├── configs │ │ ├── _base_ │ │ │ ├── datasets │ │ │ │ └── ade20k_vpd.py │ │ │ ├── default_runtime.py │ │ │ ├── models │ │ │ │ └── fpn_r50.py │ │ │ └── schedules │ │ │ │ ├── schedule_160k.py │ │ │ │ └── schedule_80k.py │ │ └── fpn_vpd_sd1-5_512x512_gpu8x2.py │ ├── dist_test.sh │ ├── dist_train.sh │ ├── models │ │ ├── __init__.py │ │ └── vpd_seg.py │ ├── test.py │ └── train.py ├── stable-diffusion │ ├── LICENSE │ ├── README.md │ ├── Stable_Diffusion_v1_Model_Card.md │ ├── configs │ │ ├── autoencoder │ │ │ ├── autoencoder_kl_16x16x16.yaml │ │ │ ├── autoencoder_kl_32x32x4.yaml │ │ │ ├── autoencoder_kl_64x64x3.yaml │ │ │ └── autoencoder_kl_8x8x64.yaml │ │ ├── latent-diffusion │ │ │ ├── celebahq-ldm-vq-4.yaml │ │ │ ├── cin-ldm-vq-f8.yaml │ │ │ ├── cin256-v2.yaml │ │ │ ├── ffhq-ldm-vq-4.yaml │ │ │ ├── lsun_bedrooms-ldm-vq-4.yaml │ │ │ ├── lsun_churches-ldm-kl-8.yaml │ │ │ └── txt2img-1p4B-eval.yaml │ │ ├── retrieval-augmented-diffusion │ │ │ └── 768x768.yaml │ │ └── stable-diffusion │ │ │ └── v1-inference.yaml │ ├── environment.yaml │ ├── ldm │ │ ├── data │ │ │ ├── __init__.py │ │ │ ├── base.py │ │ │ ├── imagenet.py │ │ │ └── lsun.py │ │ ├── lr_scheduler.py │ │ ├── models │ │ │ ├── autoencoder.py │ │ │ └── diffusion │ │ │ │ ├── __init__.py │ │ │ │ ├── classifier.py │ │ │ │ ├── ddim.py │ │ │ │ ├── ddpm.py │ │ │ │ ├── dpm_solver │ │ │ │ ├── __init__.py │ │ │ │ ├── dpm_solver.py │ │ │ │ └── sampler.py │ │ │ │ └── plms.py │ │ ├── modules │ │ │ ├── attention.py │ │ │ ├── diffusionmodules │ │ │ │ ├── __init__.py │ │ │ │ ├── model.py │ │ │ │ ├── openaimodel.py │ │ │ │ └── util.py │ │ │ ├── distributions │ │ │ │ ├── __init__.py │ │ │ │ └── distributions.py │ │ │ ├── ema.py │ │ │ ├── encoders │ │ │ │ ├── __init__.py │ │ │ │ └── modules.py │ │ │ ├── image_degradation │ │ │ │ ├── __init__.py │ │ │ │ ├── bsrgan.py │ │ │ │ ├── bsrgan_light.py │ │ │ │ ├── utils │ │ │ │ │ └── test.png │ │ │ │ └── utils_image.py │ │ │ ├── losses │ │ │ │ ├── __init__.py │ │ │ │ ├── contperceptual.py │ │ │ │ └── vqperceptual.py │ │ │ └── x_transformer.py │ │ └── util.py │ ├── main.py │ ├── models │ │ ├── first_stage_models │ │ │ ├── kl-f16 │ │ │ │ └── config.yaml │ │ │ ├── kl-f32 │ │ │ │ └── config.yaml │ │ │ ├── kl-f4 │ │ │ │ └── config.yaml │ │ │ ├── kl-f8 │ │ │ │ └── config.yaml │ │ │ ├── vq-f16 │ │ │ │ └── config.yaml │ │ │ ├── vq-f4-noattn │ │ │ │ └── config.yaml │ │ │ ├── vq-f4 │ │ │ │ └── config.yaml │ │ │ ├── vq-f8-n256 │ │ │ │ └── config.yaml │ │ │ └── vq-f8 │ │ │ │ └── config.yaml │ │ └── ldm │ │ │ ├── bsr_sr │ │ │ └── config.yaml │ │ │ ├── celeba256 │ │ │ └── config.yaml │ │ │ ├── cin256 │ │ │ └── config.yaml │ │ │ ├── ffhq256 │ │ │ └── config.yaml │ │ │ ├── inpainting_big │ 
│ │ └── config.yaml │ │ │ ├── layout2img-openimages256 │ │ │ └── config.yaml │ │ │ ├── lsun_beds256 │ │ │ └── config.yaml │ │ │ ├── lsun_churches256 │ │ │ └── config.yaml │ │ │ ├── semantic_synthesis256 │ │ │ └── config.yaml │ │ │ ├── semantic_synthesis512 │ │ │ └── config.yaml │ │ │ └── text2img256 │ │ │ └── config.yaml │ ├── notebook_helpers.py │ ├── scripts │ │ ├── download_first_stages.sh │ │ ├── download_models.sh │ │ ├── img2img.py │ │ ├── inpaint.py │ │ ├── knn2img.py │ │ ├── sample_diffusion.py │ │ ├── tests │ │ │ └── test_watermark.py │ │ ├── train_searcher.py │ │ └── txt2img.py │ ├── setup.py │ └── src │ │ └── taming-transformers │ │ ├── License.txt │ │ ├── README.md │ │ ├── configs │ │ ├── coco_cond_stage.yaml │ │ ├── coco_scene_images_transformer.yaml │ │ ├── custom_vqgan.yaml │ │ ├── drin_transformer.yaml │ │ ├── faceshq_transformer.yaml │ │ ├── faceshq_vqgan.yaml │ │ ├── imagenet_vqgan.yaml │ │ ├── imagenetdepth_vqgan.yaml │ │ ├── open_images_scene_images_transformer.yaml │ │ └── sflckr_cond_stage.yaml │ │ ├── environment.yaml │ │ ├── main.py │ │ ├── scripts │ │ ├── extract_depth.py │ │ ├── extract_segmentation.py │ │ ├── extract_submodel.py │ │ ├── make_samples.py │ │ ├── make_scene_samples.py │ │ ├── sample_conditional.py │ │ └── sample_fast.py │ │ ├── setup.py │ │ └── taming │ │ ├── data │ │ ├── ade20k.py │ │ ├── annotated_objects_coco.py │ │ ├── annotated_objects_dataset.py │ │ ├── annotated_objects_open_images.py │ │ ├── base.py │ │ ├── coco.py │ │ ├── conditional_builder │ │ │ ├── objects_bbox.py │ │ │ ├── objects_center_points.py │ │ │ └── utils.py │ │ ├── custom.py │ │ ├── faceshq.py │ │ ├── helper_types.py │ │ ├── image_transforms.py │ │ ├── imagenet.py │ │ ├── open_images_helper.py │ │ ├── sflckr.py │ │ └── utils.py │ │ ├── lr_scheduler.py │ │ ├── models │ │ ├── cond_transformer.py │ │ ├── dummy_cond_stage.py │ │ └── vqgan.py │ │ ├── modules │ │ ├── diffusionmodules │ │ │ └── model.py │ │ ├── discriminator │ │ │ └── model.py │ │ ├── losses │ │ │ ├── __init__.py │ │ │ ├── lpips.py │ │ │ ├── segmentation.py │ │ │ └── vqperceptual.py │ │ ├── misc │ │ │ └── coord.py │ │ ├── transformer │ │ │ ├── mingpt.py │ │ │ └── permuter.py │ │ ├── util.py │ │ └── vqvae │ │ │ └── quantize.py │ │ └── util.py └── vpd │ ├── __init__.py │ └── models.py ├── ZoeDepth ├── LICENSE ├── README.md ├── assets │ └── zoedepth-teaser.png ├── demo.py ├── demo_local.py ├── environment.yml ├── evaluate.py ├── hubconf.py ├── notebooks │ └── ZoeDepth_quickstart.ipynb ├── sanity.py ├── sanity_hub.py ├── split_files.txt ├── train_mix.py ├── train_mono.py ├── train_test_inputs │ ├── kitti_eigen_test_files_with_gt.txt │ ├── kitti_eigen_train_files_with_gt.txt │ ├── nyudepthv2_test_files_with_gt.txt │ └── nyudepthv2_train_files_with_gt.txt ├── ui │ ├── app.py │ ├── gradio_depth_pred.py │ ├── gradio_im_to_3d.py │ ├── gradio_pano_to_3d.py │ └── ui_requirements.txt └── zoedepth │ ├── data │ ├── __init__.py │ ├── data_mono.py │ ├── ddad.py │ ├── diml_indoor_test.py │ ├── diml_outdoor_test.py │ ├── diode.py │ ├── hypersim.py │ ├── ibims.py │ ├── preprocess.py │ ├── sun_rgbd_loader.py │ ├── transforms.py │ ├── vkitti.py │ └── vkitti2.py │ ├── models │ ├── __init__.py │ ├── base_models │ │ ├── __init__.py │ │ └── midas.py │ ├── builder.py │ ├── depth_model.py │ ├── layers │ │ ├── attractor.py │ │ ├── dist_layers.py │ │ ├── localbins_layers.py │ │ └── patch_transformer.py │ ├── model_io.py │ ├── zoedepth │ │ ├── __init__.py │ │ ├── config_zoedepth.json │ │ ├── config_zoedepth_kitti.json │ │ └── zoedepth_v1.py │ └── 
zoedepth_nk │ │ ├── __init__.py │ │ ├── config_zoedepth_nk.json │ │ └── zoedepth_nk_v1.py │ ├── trainers │ ├── base_trainer.py │ ├── builder.py │ ├── loss.py │ ├── zoedepth_nk_trainer.py │ └── zoedepth_trainer.py │ └── utils │ ├── __init__.py │ ├── arg_utils.py │ ├── config.py │ ├── easydict │ └── __init__.py │ ├── geometry.py │ └── misc.py ├── bts ├── README.md ├── pytorch │ ├── bts.py │ ├── bts_dataloader.py │ ├── bts_eval.py │ ├── bts_live_3d.py │ ├── bts_main.py │ ├── bts_test.py │ ├── distributed_sampler_no_evenly_divisible.py │ ├── run_bts_eval_schedule.py │ └── run_bts_live_3d.sh ├── train_test_inputs │ ├── eigen_test_files_with_gt.txt │ ├── eigen_train_files_with_gt.txt │ ├── nyudepthv2_test_files_with_gt.txt │ ├── nyudepthv2_train_files_with_gt.txt │ └── split_files.txt └── utils │ ├── download_from_gdrive.py │ ├── eval_with_pngs.py │ ├── extract_official_train_test_set_from_mat.py │ ├── kitti_archives_to_download.txt │ ├── nyudepthv2_archives_to_download.txt │ ├── splits.mat │ ├── sync_project_frames_multi_threads.m │ └── train_scenes.txt ├── pics ├── dataset-1.png ├── dataset-2.png ├── fitting.png ├── group.png ├── heirarchy.png ├── mitigation.png ├── overall.png └── type.png └── space_type_def.yml
/Adabins/README.md:
--------------------------------------------------------------------------------
1 | # Benchmark on paper AdaBins: Depth Estimation using Adaptive Bins
2 | 
3 | 1. Download the InSpaceType eval set. Install [torch and torchvision](https://pytorch.org/get-started/previous-versions/) and the packages matplotlib, tqdm, pandas, opencv-python, and tensorboardX.
4 | 
5 | 2. Download the pretrained model 'AdaBins_nyu.pt' from the [Official Link](https://drive.google.com/drive/folders/1nYyaQXOBjNdUJDsmJpcRpu6oE55aQoLA?usp=sharing) and put it under the folder 'pretrained'.
6 | 
7 | 3. Run:
8 | 
9 | ```
10 | python demo.py -i ../InSpaceType
11 | ```
12 | 
13 | The command generates report files for each hierarchy level (H0-H2). *-all means overall, H0-H2 means the hierarchy level, and H1_xx means the scene space type number. See [space_type_def.yml](https://github.com/DepthComputation/InSpaceType_Benchmark/blob/main/space_type_def.yml) for the space type number definitions.
14 | 
--------------------------------------------------------------------------------
/Adabins/models/__init__.py:
--------------------------------------------------------------------------------
1 | from .unet_adaptive_bins import UnetAdaptiveBins
2 | 
--------------------------------------------------------------------------------
/Adabins/models/layers.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | 
4 | 
5 | class PatchTransformerEncoder(nn.Module):
6 |     def __init__(self, in_channels, patch_size=10, embedding_dim=128, num_heads=4):
7 |         super(PatchTransformerEncoder, self).__init__()
8 |         encoder_layers = nn.TransformerEncoderLayer(embedding_dim, num_heads, dim_feedforward=1024)
9 |         self.transformer_encoder = nn.TransformerEncoder(encoder_layers, num_layers=4)  # takes shape S,N,E
10 | 
11 |         self.embedding_convPxP = nn.Conv2d(in_channels, embedding_dim,
12 |                                            kernel_size=patch_size, stride=patch_size, padding=0)
13 | 
14 |         self.positional_encodings = nn.Parameter(torch.rand(500, embedding_dim), requires_grad=True)
15 | 
16 |     def forward(self, x):
17 |         embeddings = self.embedding_convPxP(x).flatten(2)  # .shape = n,c,s = n, embedding_dim, s
18 |         # embeddings = nn.functional.pad(embeddings, (1,0))  # extra special token at start ?
19 |         embeddings = embeddings + self.positional_encodings[:embeddings.shape[2], :].T.unsqueeze(0)
20 | 
21 |         # change to S,N,E format required by transformer
22 |         embeddings = embeddings.permute(2, 0, 1)
23 |         x = self.transformer_encoder(embeddings)  # .shape = S, N, E
24 |         return x
25 | 
26 | 
27 | class PixelWiseDotProduct(nn.Module):
28 |     def __init__(self):
29 |         super(PixelWiseDotProduct, self).__init__()
30 | 
31 |     def forward(self, x, K):
32 |         n, c, h, w = x.size()
33 |         _, cout, ck = K.size()
34 |         assert c == ck, "Number of channels in x and Embedding dimension (at dim 2) of K matrix must match"
35 |         y = torch.matmul(x.view(n, c, h * w).permute(0, 2, 1), K.permute(0, 2, 1))  # .shape = n, hw, cout
36 |         return y.permute(0, 2, 1).view(n, cout, h, w)
37 | 
--------------------------------------------------------------------------------
/DPT/README.md:
--------------------------------------------------------------------------------
1 | # Benchmark on paper Vision Transformers for Dense Prediction
2 | 
3 | 1. Download the InSpaceType eval set. Install the requirements with `pip install -r requirements.txt`.
4 | 
5 | 2. Download the pretrained model 'dpt_hybrid_nyu-2ce69ec7.pt' from the [Official Link](https://github.com/intel-isl/DPT/releases/download/1_0/dpt_hybrid_nyu-2ce69ec7.pt) and put it under the folder 'weights'.
6 | 
7 | 3. Run:
8 | 
9 | ```
10 | python run_monodepth.py -t dpt_hybrid_nyu -i ../InSpaceType
11 | ```
12 | 
13 | The command generates report files for each hierarchy level (H0-H2). *-all means overall, H0-H2 means the hierarchy level, and H1_xx means the scene space type number. See [space_type_def.yml](https://github.com/DepthComputation/InSpaceType_Benchmark/blob/main/space_type_def.yml) for the space type number definitions.
14 | 
--------------------------------------------------------------------------------
/DPT/dpt/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DepthComputation/InSpaceType_Benchmark/4ab9760dbdf9153fc2845afd584762509f9fe0e0/DPT/dpt/__init__.py
--------------------------------------------------------------------------------
/DPT/dpt/base_model.py:
--------------------------------------------------------------------------------
1 | import torch
2 | 
3 | 
4 | class BaseModel(torch.nn.Module):
5 |     def load(self, path):
6 |         """Load model from file.
7 | 
8 |         Args:
9 |             path (str): file path
10 |         """
11 |         parameters = torch.load(path, map_location=torch.device("cpu"))
12 | 
13 |         if "optimizer" in parameters:
14 |             parameters = parameters["model"]
15 | 
16 |         self.load_state_dict(parameters)
17 | 
--------------------------------------------------------------------------------
/DPT/requirements.txt:
--------------------------------------------------------------------------------
1 | torch==1.8.1
2 | torchvision==0.9.1
3 | opencv-python==4.5.2.54
4 | timm==0.4.5
5 | 
--------------------------------------------------------------------------------
/DPT/util/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DepthComputation/InSpaceType_Benchmark/4ab9760dbdf9153fc2845afd584762509f9fe0e0/DPT/util/__init__.py
--------------------------------------------------------------------------------
/Decompose/README.md:
--------------------------------------------------------------------------------
1 | # Benchmark on paper Depth Map Decomposition for Monocular Depth Estimation
2 | 
3 | 1. Download the InSpaceType eval set. Install [torch and torchvision](https://pytorch.org/get-started/previous-versions/) and the packages matplotlib, pandas, opencv-python, tqdm, and efficientnet_pytorch.
4 | 
5 | 
6 | 2. Download the pretrained model '51k_HRWSI.pth' from the [Official Link](https://drive.google.com/drive/folders/1zsgT_5AO89WxzlFI53gwjomisb_Gkcox?usp=sharing) and put it in this folder.
7 | 
8 | 3. Run:
9 | 
10 | ```
11 | python demo.py --ckpt 51k_HRWSI.pth --filenames_file split_files.txt
12 | ```
13 | 
14 | The command generates report files for each hierarchy level (H0-H2). *-all means overall, H0-H2 means the hierarchy level, and H1_xx means the scene space type number. See [space_type_def.yml](https://github.com/DepthComputation/InSpaceType_Benchmark/blob/main/space_type_def.yml) for the space type number definitions.
15 | 
--------------------------------------------------------------------------------
/DepthAnything/README.md:
--------------------------------------------------------------------------------
1 | # Benchmark on paper Depth-Anything
2 | 
3 | 1. Download the InSpaceType eval set. Install [torch and torchvision](https://pytorch.org/get-started/previous-versions/) and the packages matplotlib, tqdm, and opencv-python.
4 | 
5 | 2. Download the Depth-Anything NYUv2 finetuned model (depth_anything_metric_depth_indoor.pt) from the [official link](https://huggingface.co/spaces/LiheYoung/Depth-Anything/tree/main/checkpoints_metric_depth) and place it under 'metric_depth/checkpoints'.
6 | 
7 | 3. Run:
8 | 
9 | ```
10 | cd metric_depth
11 | 
12 | python demo.py --img-path ../../InSpaceType --outdir ./vis_depth
13 | ```
14 | 
15 | The command generates report files for each hierarchy level (H0-H2). *-all means overall, H0-H2 means the hierarchy level, and H1_xx means the scene space type number. See [space_type_def.yml](https://github.com/DepthComputation/InSpaceType_Benchmark/blob/main/space_type_def.yml) for the space type number definitions.
16 | Colored visualizations of metric depth are saved under --outdir.
17 | 
--------------------------------------------------------------------------------
/DepthAnything/controlnet/README.md:
--------------------------------------------------------------------------------
1 | ## Depth-Conditioned ControlNet based on Depth Anything
2 | 
3 | We use [Diffusers](https://github.com/huggingface/diffusers/tree/main) to re-train a better depth-conditioned ControlNet based on our Depth Anything.
4 | 
5 | Please download our [config file](./config.json) and [pre-trained weights](https://huggingface.co/spaces/LiheYoung/Depth-Anything/tree/main/checkpoints_controlnet), then follow the [instructions](https://github.com/huggingface/diffusers/tree/main/examples/controlnet) in Diffusers for inference.
6 | 
7 | ## Depth-to-Image Synthesis
8 | 
9 | ![demo2](../assets/controlnet_demo1.png)
10 | ![demo1](../assets/controlnet_demo2.png)
11 | 
12 | 
13 | ## Video Editing
14 | 
15 | Please refer to our [project page](https://depth-anything.github.io/). We use [MagicEdit](https://github.com/magic-research/magic-edit) to show demos of video editing based on depth information.
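
A minimal sketch of the Diffusers inference loop that the instructions above refer to; the local checkpoint folder, the base Stable Diffusion model id, and the depth-map path are assumptions for illustration, not files shipped with this repository.

```python
import torch
from diffusers import ControlNetModel, StableDiffusionControlNetPipeline, UniPCMultistepScheduler
from diffusers.utils import load_image

# Folder containing the downloaded config.json and ControlNet weights (assumed path).
controlnet = ControlNetModel.from_pretrained("./checkpoints_controlnet", torch_dtype=torch.float16)

# Plug the depth-conditioned ControlNet into a Stable Diffusion v1.5 pipeline.
pipe = StableDiffusionControlNetPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", controlnet=controlnet, torch_dtype=torch.float16
).to("cuda")
pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)

# Conditioning image: a depth map predicted by Depth Anything (assumed path).
depth_map = load_image("depth_condition.png")
image = pipe("a cozy living room, photorealistic", image=depth_map, num_inference_steps=30).images[0]
image.save("depth2img.png")
```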
16 | -------------------------------------------------------------------------------- /DepthAnything/controlnet/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "_class_name": "ControlNetModel", 3 | "_diffusers_version": "0.26.0.dev0", 4 | "act_fn": "silu", 5 | "addition_embed_type": null, 6 | "addition_embed_type_num_heads": 64, 7 | "addition_time_embed_dim": null, 8 | "attention_head_dim": 8, 9 | "block_out_channels": [ 10 | 320, 11 | 640, 12 | 1280, 13 | 1280 14 | ], 15 | "class_embed_type": null, 16 | "conditioning_channels": 3, 17 | "conditioning_embedding_out_channels": [ 18 | 16, 19 | 32, 20 | 96, 21 | 256 22 | ], 23 | "controlnet_conditioning_channel_order": "rgb", 24 | "cross_attention_dim": 768, 25 | "down_block_types": [ 26 | "CrossAttnDownBlock2D", 27 | "CrossAttnDownBlock2D", 28 | "CrossAttnDownBlock2D", 29 | "DownBlock2D" 30 | ], 31 | "downsample_padding": 1, 32 | "encoder_hid_dim": null, 33 | "encoder_hid_dim_type": null, 34 | "flip_sin_to_cos": true, 35 | "freq_shift": 0, 36 | "global_pool_conditions": false, 37 | "in_channels": 4, 38 | "layers_per_block": 2, 39 | "mid_block_scale_factor": 1, 40 | "mid_block_type": "UNetMidBlock2DCrossAttn", 41 | "norm_eps": 1e-05, 42 | "norm_num_groups": 32, 43 | "num_attention_heads": null, 44 | "num_class_embeds": null, 45 | "only_cross_attention": false, 46 | "projection_class_embeddings_input_dim": null, 47 | "resnet_time_scale_shift": "default", 48 | "transformer_layers_per_block": 1, 49 | "upcast_attention": false, 50 | "use_linear_projection": false 51 | } 52 | -------------------------------------------------------------------------------- /DepthAnything/metric_depth/checkpoints/.placeholder: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DepthComputation/InSpaceType_Benchmark/4ab9760dbdf9153fc2845afd584762509f9fe0e0/DepthAnything/metric_depth/checkpoints/.placeholder -------------------------------------------------------------------------------- /DepthAnything/metric_depth/environment.yml: -------------------------------------------------------------------------------- 1 | name: zoe 2 | channels: 3 | - pytorch 4 | - nvidia 5 | - conda-forge 6 | dependencies: 7 | - cuda=11.7.1 8 | - h5py=3.7.0 9 | - hdf5=1.12.2 10 | - matplotlib=3.6.2 11 | - matplotlib-base=3.6.2 12 | - numpy=1.24.1 13 | - opencv=4.6.0 14 | - pip=22.3.1 15 | - python=3.9.7 16 | - pytorch=1.13.1 17 | - pytorch-cuda=11.7 18 | - pytorch-mutex=1.0 19 | - scipy=1.10.0 20 | - torchaudio=0.13.1 21 | - torchvision=0.14.1 22 | - pip: 23 | - huggingface-hub==0.11.1 24 | - timm==0.6.12 25 | - tqdm==4.64.1 26 | - wandb==0.13.9 27 | -------------------------------------------------------------------------------- /DepthAnything/metric_depth/zoedepth/data/__init__.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | # Copyright (c) 2022 Intelligent Systems Lab Org 4 | 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | 12 | # The above copyright notice and this permission notice shall 
be included in all 13 | # copies or substantial portions of the Software. 14 | 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | # File author: Shariq Farooq Bhat 24 | 25 | -------------------------------------------------------------------------------- /DepthAnything/metric_depth/zoedepth/models/__init__.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | # Copyright (c) 2022 Intelligent Systems Lab Org 4 | 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | # File author: Shariq Farooq Bhat 24 | 25 | -------------------------------------------------------------------------------- /DepthAnything/metric_depth/zoedepth/models/base_models/__init__.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | # Copyright (c) 2022 Intelligent Systems Lab Org 4 | 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | # File author: Shariq Farooq Bhat 24 | 25 | -------------------------------------------------------------------------------- /DepthAnything/metric_depth/zoedepth/models/zoedepth/__init__.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | # Copyright (c) 2022 Intelligent Systems Lab Org 4 | 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 
22 | 23 | # File author: Shariq Farooq Bhat 24 | 25 | from .zoedepth_v1 import ZoeDepth 26 | 27 | all_versions = { 28 | "v1": ZoeDepth, 29 | } 30 | 31 | get_version = lambda v : all_versions[v] -------------------------------------------------------------------------------- /DepthAnything/metric_depth/zoedepth/models/zoedepth/config_zoedepth_kitti.json: -------------------------------------------------------------------------------- 1 | { 2 | "model": { 3 | "bin_centers_type": "normed", 4 | "img_size": [384, 768] 5 | }, 6 | 7 | "train": { 8 | }, 9 | 10 | "infer":{ 11 | "train_midas": false, 12 | "use_pretrained_midas": false, 13 | "pretrained_resource" : "url::https://github.com/isl-org/ZoeDepth/releases/download/v1.0/ZoeD_M12_K.pt", 14 | "force_keep_ar": true 15 | }, 16 | 17 | "eval":{ 18 | "train_midas": false, 19 | "use_pretrained_midas": false, 20 | "pretrained_resource" : "url::https://github.com/isl-org/ZoeDepth/releases/download/v1.0/ZoeD_M12_K.pt" 21 | } 22 | } -------------------------------------------------------------------------------- /DepthAnything/metric_depth/zoedepth/models/zoedepth_nk/__init__.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | # Copyright (c) 2022 Intelligent Systems Lab Org 4 | 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | # File author: Shariq Farooq Bhat 24 | 25 | from .zoedepth_nk_v1 import ZoeDepthNK 26 | 27 | all_versions = { 28 | "v1": ZoeDepthNK, 29 | } 30 | 31 | get_version = lambda v : all_versions[v] -------------------------------------------------------------------------------- /DepthAnything/metric_depth/zoedepth/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | # Copyright (c) 2022 Intelligent Systems Lab Org 4 | 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 
14 | 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | # File author: Shariq Farooq Bhat 24 | 25 | -------------------------------------------------------------------------------- /DepthAnything/metric_depth/zoedepth/utils/arg_utils.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | def infer_type(x): # hacky way to infer type from string args 4 | if not isinstance(x, str): 5 | return x 6 | 7 | try: 8 | x = int(x) 9 | return x 10 | except ValueError: 11 | pass 12 | 13 | try: 14 | x = float(x) 15 | return x 16 | except ValueError: 17 | pass 18 | 19 | return x 20 | 21 | 22 | def parse_unknown(unknown_args): 23 | clean = [] 24 | for a in unknown_args: 25 | if "=" in a: 26 | k, v = a.split("=") 27 | clean.extend([k, v]) 28 | else: 29 | clean.append(a) 30 | 31 | keys = clean[::2] 32 | values = clean[1::2] 33 | return {k.replace("--", ""): infer_type(v) for k, v in zip(keys, values)} 34 | -------------------------------------------------------------------------------- /DepthAnything/requirements.txt: -------------------------------------------------------------------------------- 1 | gradio_imageslider 2 | gradio==4.14.0 3 | torch 4 | torchvision 5 | opencv-python 6 | huggingface_hub -------------------------------------------------------------------------------- /DepthAnything/semseg/dinov2.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from mmengine.model import BaseModule 3 | from torch import nn 4 | 5 | from mmseg.registry import MODELS 6 | 7 | 8 | 9 | @MODELS.register_module() 10 | class DINOv2(nn.Module): 11 | """Use DINOv2 pre-trained models 12 | """ 13 | 14 | def __init__(self, version='large', freeze=False, load_from=None): 15 | super().__init__() 16 | 17 | if version == 'large': 18 | self.dinov2 = torch.hub.load('torchhub/facebookresearch_dinov2_main', 'dinov2_vit14', source='local', pretrained=False) 19 | else: 20 | raise NotImplementedError 21 | 22 | if load_from is not None: 23 | d = torch.load(load_from, map_location='cpu') 24 | new_d = {} 25 | for key, value in d.items(): 26 | if 'pretrained' in key: 27 | new_d[key.replace('pretrained.', '')] = value 28 | self.dinov2.load_state_dict(new_d) 29 | 30 | self.freeze = freeze 31 | 32 | def forward(self, inputs): 33 | B, _, h, w = inputs.shape 34 | 35 | if self.freeze: 36 | with torch.no_grad(): 37 | features = self.dinov2.get_intermediate_layers(inputs, 4) 38 | else: 39 | features = self.dinov2.get_intermediate_layers(inputs, 4) 40 | 41 | outs = [] 42 | for feature in features: 43 | C = feature.shape[-1] 44 | feature = feature.permute(0, 2, 1).reshape(B, C, h // 14, w // 14).contiguous() 45 | outs.append(feature) 46 | 47 | return outs 48 | -------------------------------------------------------------------------------- /DepthAnything/torchhub/README.md: -------------------------------------------------------------------------------- 1 | # Local PyTorch Hub 2 | 3 | This directory is for loading the DINOv2 encoder locally in case of no Internet connection. 
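
A minimal sketch of the local loading described above, mirroring the call in semseg/dinov2.py; it assumes you run from the DepthAnything repository root so the relative path resolves, and uses 'dinov2_vitl14', the ViT-L/14 entrypoint defined by the bundled hubconf.

```python
import torch

# Load the DINOv2 ViT-L/14 backbone from this local torch.hub copy instead of
# fetching facebookresearch/dinov2 from GitHub. pretrained=False skips any weight
# download, so weights are expected to be restored from a local checkpoint later.
encoder = torch.hub.load(
    'torchhub/facebookresearch_dinov2_main',
    'dinov2_vitl14',
    source='local',
    pretrained=False,
)
print(sum(p.numel() for p in encoder.parameters()), 'parameters')
```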
4 | -------------------------------------------------------------------------------- /DepthAnything/torchhub/facebookresearch_dinov2_main/CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to DINOv2 2 | We want to make contributing to this project as easy and transparent as 3 | possible. 4 | 5 | ## Pull Requests 6 | We actively welcome your pull requests. 7 | 8 | 1. Fork the repo and create your branch from `main`. 9 | 2. If you've added code that should be tested, add tests. 10 | 3. If you've changed APIs, update the documentation. 11 | 4. Ensure the test suite passes. 12 | 5. Make sure your code lints. 13 | 6. If you haven't already, complete the Contributor License Agreement ("CLA"). 14 | 15 | ## Contributor License Agreement ("CLA") 16 | In order to accept your pull request, we need you to submit a CLA. You only need 17 | to do this once to work on any of Meta's open source projects. 18 | 19 | Complete your CLA here: 20 | 21 | ## Issues 22 | We use GitHub issues to track public bugs. Please ensure your description is 23 | clear and has sufficient instructions to be able to reproduce the issue. 24 | 25 | Meta has a [bounty program](https://www.facebook.com/whitehat/) for the safe 26 | disclosure of security bugs. In those cases, please go through the process 27 | outlined on that page and do not file a public issue. 28 | 29 | ## License 30 | By contributing to DINOv2, you agree that your contributions will be licensed 31 | under the LICENSE file in the root directory of this source tree. 32 | -------------------------------------------------------------------------------- /DepthAnything/torchhub/facebookresearch_dinov2_main/conda.yaml: -------------------------------------------------------------------------------- 1 | name: dinov2 2 | channels: 3 | - defaults 4 | - pytorch 5 | - nvidia 6 | - xformers 7 | - conda-forge 8 | dependencies: 9 | - python=3.9 10 | - pytorch::pytorch=2.0.0 11 | - pytorch::pytorch-cuda=11.7.0 12 | - pytorch::torchvision=0.15.0 13 | - omegaconf 14 | - torchmetrics=0.10.3 15 | - fvcore 16 | - iopath 17 | - xformers::xformers=0.0.18 18 | - pip 19 | - pip: 20 | - git+https://github.com/facebookincubator/submitit 21 | - --extra-index-url https://pypi.nvidia.com 22 | - cuml-cu11 23 | -------------------------------------------------------------------------------- /DepthAnything/torchhub/facebookresearch_dinov2_main/dinov2/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | __version__ = "0.0.1" 8 | -------------------------------------------------------------------------------- /DepthAnything/torchhub/facebookresearch_dinov2_main/dinov2/configs/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | 7 | import pathlib 8 | 9 | from omegaconf import OmegaConf 10 | 11 | 12 | def load_config(config_name: str): 13 | config_filename = config_name + ".yaml" 14 | return OmegaConf.load(pathlib.Path(__file__).parent.resolve() / config_filename) 15 | 16 | 17 | dinov2_default_config = load_config("ssl_default_config") 18 | 19 | 20 | def load_and_merge_config(config_name: str): 21 | default_config = OmegaConf.create(dinov2_default_config) 22 | loaded_config = load_config(config_name) 23 | return OmegaConf.merge(default_config, loaded_config) 24 | -------------------------------------------------------------------------------- /DepthAnything/torchhub/facebookresearch_dinov2_main/dinov2/configs/eval/vitb14_pretrain.yaml: -------------------------------------------------------------------------------- 1 | student: 2 | arch: vit_base 3 | patch_size: 14 4 | crops: 5 | global_crops_size: 518 # this is to set up the position embeddings properly 6 | local_crops_size: 98 -------------------------------------------------------------------------------- /DepthAnything/torchhub/facebookresearch_dinov2_main/dinov2/configs/eval/vitg14_pretrain.yaml: -------------------------------------------------------------------------------- 1 | student: 2 | arch: vit_giant2 3 | patch_size: 14 4 | ffn_layer: swiglufused 5 | crops: 6 | global_crops_size: 518 # this is to set up the position embeddings properly 7 | local_crops_size: 98 -------------------------------------------------------------------------------- /DepthAnything/torchhub/facebookresearch_dinov2_main/dinov2/configs/eval/vitl14_pretrain.yaml: -------------------------------------------------------------------------------- 1 | student: 2 | arch: vit_large 3 | patch_size: 14 4 | crops: 5 | global_crops_size: 518 # this is to set up the position embeddings properly 6 | local_crops_size: 98 -------------------------------------------------------------------------------- /DepthAnything/torchhub/facebookresearch_dinov2_main/dinov2/configs/eval/vits14_pretrain.yaml: -------------------------------------------------------------------------------- 1 | student: 2 | arch: vit_small 3 | patch_size: 14 4 | crops: 5 | global_crops_size: 518 # this is to set up the position embeddings properly 6 | local_crops_size: 98 -------------------------------------------------------------------------------- /DepthAnything/torchhub/facebookresearch_dinov2_main/dinov2/configs/train/vitg14.yaml: -------------------------------------------------------------------------------- 1 | dino: 2 | head_n_prototypes: 131072 3 | head_bottleneck_dim: 384 4 | ibot: 5 | separate_head: true 6 | head_n_prototypes: 131072 7 | train: 8 | batch_size_per_gpu: 12 9 | dataset_path: ImageNet22k 10 | centering: sinkhorn_knopp 11 | student: 12 | arch: vit_giant2 13 | patch_size: 14 14 | drop_path_rate: 0.4 15 | ffn_layer: swiglufused 16 | block_chunks: 4 17 | teacher: 18 | momentum_teacher: 0.994 19 | optim: 20 | epochs: 500 21 | weight_decay_end: 0.2 22 | base_lr: 2.0e-04 # learning rate for a batch size of 1024 23 | warmup_epochs: 80 24 | layerwise_decay: 1.0 25 | crops: 26 | local_crops_size: 98 -------------------------------------------------------------------------------- /DepthAnything/torchhub/facebookresearch_dinov2_main/dinov2/configs/train/vitl14.yaml: -------------------------------------------------------------------------------- 1 | dino: 2 | head_n_prototypes: 131072 3 | head_bottleneck_dim: 384 4 | ibot: 5 | separate_head: true 6 | head_n_prototypes: 131072 7 | train: 8 | batch_size_per_gpu: 
32 9 | dataset_path: ImageNet22k 10 | centering: sinkhorn_knopp 11 | student: 12 | arch: vit_large 13 | patch_size: 14 14 | drop_path_rate: 0.4 15 | ffn_layer: swiglufused 16 | block_chunks: 4 17 | teacher: 18 | momentum_teacher: 0.994 19 | optim: 20 | epochs: 500 21 | weight_decay_end: 0.2 22 | base_lr: 2.0e-04 # learning rate for a batch size of 1024 23 | warmup_epochs: 80 24 | layerwise_decay: 1.0 25 | crops: 26 | local_crops_size: 98 -------------------------------------------------------------------------------- /DepthAnything/torchhub/facebookresearch_dinov2_main/dinov2/configs/train/vitl16_short.yaml: -------------------------------------------------------------------------------- 1 | # this corresponds to the default config 2 | train: 3 | dataset_path: ImageNet:split=TRAIN 4 | batch_size_per_gpu: 64 5 | student: 6 | block_chunks: 4 7 | -------------------------------------------------------------------------------- /DepthAnything/torchhub/facebookresearch_dinov2_main/dinov2/data/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | from .adapters import DatasetWithEnumeratedTargets 8 | from .loaders import make_data_loader, make_dataset, SamplerType 9 | from .collate import collate_data_and_cast 10 | from .masking import MaskingGenerator 11 | from .augmentations import DataAugmentationDINO 12 | -------------------------------------------------------------------------------- /DepthAnything/torchhub/facebookresearch_dinov2_main/dinov2/data/adapters.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | from typing import Any, Tuple 8 | 9 | from torch.utils.data import Dataset 10 | 11 | 12 | class DatasetWithEnumeratedTargets(Dataset): 13 | def __init__(self, dataset): 14 | self._dataset = dataset 15 | 16 | def get_image_data(self, index: int) -> bytes: 17 | return self._dataset.get_image_data(index) 18 | 19 | def get_target(self, index: int) -> Tuple[Any, int]: 20 | target = self._dataset.get_target(index) 21 | return (index, target) 22 | 23 | def __getitem__(self, index: int) -> Tuple[Any, Tuple[Any, int]]: 24 | image, target = self._dataset[index] 25 | target = index if target is None else target 26 | return image, (index, target) 27 | 28 | def __len__(self) -> int: 29 | return len(self._dataset) 30 | -------------------------------------------------------------------------------- /DepthAnything/torchhub/facebookresearch_dinov2_main/dinov2/data/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | 7 | from .image_net import ImageNet 8 | from .image_net_22k import ImageNet22k 9 | -------------------------------------------------------------------------------- /DepthAnything/torchhub/facebookresearch_dinov2_main/dinov2/data/datasets/decoders.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | from io import BytesIO 8 | from typing import Any 9 | 10 | from PIL import Image 11 | 12 | 13 | class Decoder: 14 | def decode(self) -> Any: 15 | raise NotImplementedError 16 | 17 | 18 | class ImageDataDecoder(Decoder): 19 | def __init__(self, image_data: bytes) -> None: 20 | self._image_data = image_data 21 | 22 | def decode(self) -> Image: 23 | f = BytesIO(self._image_data) 24 | return Image.open(f).convert(mode="RGB") 25 | 26 | 27 | class TargetDecoder(Decoder): 28 | def __init__(self, target: Any): 29 | self._target = target 30 | 31 | def decode(self) -> Any: 32 | return self._target 33 | -------------------------------------------------------------------------------- /DepthAnything/torchhub/facebookresearch_dinov2_main/dinov2/data/datasets/extended.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | from typing import Any, Tuple 8 | 9 | from torchvision.datasets import VisionDataset 10 | 11 | from .decoders import TargetDecoder, ImageDataDecoder 12 | 13 | 14 | class ExtendedVisionDataset(VisionDataset): 15 | def __init__(self, *args, **kwargs) -> None: 16 | super().__init__(*args, **kwargs) # type: ignore 17 | 18 | def get_image_data(self, index: int) -> bytes: 19 | raise NotImplementedError 20 | 21 | def get_target(self, index: int) -> Any: 22 | raise NotImplementedError 23 | 24 | def __getitem__(self, index: int) -> Tuple[Any, Any]: 25 | try: 26 | image_data = self.get_image_data(index) 27 | image = ImageDataDecoder(image_data).decode() 28 | except Exception as e: 29 | raise RuntimeError(f"can not read image for sample {index}") from e 30 | target = self.get_target(index) 31 | target = TargetDecoder(target).decode() 32 | 33 | if self.transforms is not None: 34 | image, target = self.transforms(image, target) 35 | 36 | return image, target 37 | 38 | def __len__(self) -> int: 39 | raise NotImplementedError 40 | -------------------------------------------------------------------------------- /DepthAnything/torchhub/facebookresearch_dinov2_main/dinov2/eval/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | -------------------------------------------------------------------------------- /DepthAnything/torchhub/facebookresearch_dinov2_main/dinov2/layers/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 
3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | from .dino_head import DINOHead 8 | from .mlp import Mlp 9 | from .patch_embed import PatchEmbed 10 | from .swiglu_ffn import SwiGLUFFN, SwiGLUFFNFused 11 | from .block import NestedTensorBlock 12 | from .attention import MemEffAttention 13 | -------------------------------------------------------------------------------- /DepthAnything/torchhub/facebookresearch_dinov2_main/dinov2/layers/drop_path.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | # References: 8 | # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py 9 | # https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/drop.py 10 | 11 | 12 | from torch import nn 13 | 14 | 15 | def drop_path(x, drop_prob: float = 0.0, training: bool = False): 16 | if drop_prob == 0.0 or not training: 17 | return x 18 | keep_prob = 1 - drop_prob 19 | shape = (x.shape[0],) + (1,) * (x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets 20 | random_tensor = x.new_empty(shape).bernoulli_(keep_prob) 21 | if keep_prob > 0.0: 22 | random_tensor.div_(keep_prob) 23 | output = x * random_tensor 24 | return output 25 | 26 | 27 | class DropPath(nn.Module): 28 | """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" 29 | 30 | def __init__(self, drop_prob=None): 31 | super(DropPath, self).__init__() 32 | self.drop_prob = drop_prob 33 | 34 | def forward(self, x): 35 | return drop_path(x, self.drop_prob, self.training) 36 | -------------------------------------------------------------------------------- /DepthAnything/torchhub/facebookresearch_dinov2_main/dinov2/layers/layer_scale.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | # Modified from: https://github.com/huggingface/pytorch-image-models/blob/main/timm/models/vision_transformer.py#L103-L110 8 | 9 | from typing import Union 10 | 11 | import torch 12 | from torch import Tensor 13 | from torch import nn 14 | 15 | 16 | class LayerScale(nn.Module): 17 | def __init__( 18 | self, 19 | dim: int, 20 | init_values: Union[float, Tensor] = 1e-5, 21 | inplace: bool = False, 22 | ) -> None: 23 | super().__init__() 24 | self.inplace = inplace 25 | self.gamma = nn.Parameter(init_values * torch.ones(dim)) 26 | 27 | def forward(self, x: Tensor) -> Tensor: 28 | return x.mul_(self.gamma) if self.inplace else x * self.gamma 29 | -------------------------------------------------------------------------------- /DepthAnything/torchhub/facebookresearch_dinov2_main/dinov2/layers/mlp.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
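A small usage sketch, added for illustration and not taken from the repository, for the two DINOv2 layers dumped above: `LayerScale` multiplies each channel by a learnable per-channel `gamma` (initialized tiny, so a residual branch starts near zero), and `drop_path` zeroes whole samples with probability `drop_prob` during training while rescaling the survivors by `1 / keep_prob` so the expected value is unchanged. The tensor shapes and the `__main__` driver are assumptions; the layer definitions are copied from the files above so the sketch runs standalone, assuming torch is installed.

```python
# Sketch only: exercising LayerScale and drop_path as defined in the dumped files.
import torch
from torch import nn


def drop_path(x, drop_prob: float = 0.0, training: bool = False):
    # Identical logic to dinov2/layers/drop_path.py above.
    if drop_prob == 0.0 or not training:
        return x
    keep_prob = 1 - drop_prob
    shape = (x.shape[0],) + (1,) * (x.ndim - 1)   # one mask value per sample
    random_tensor = x.new_empty(shape).bernoulli_(keep_prob)
    if keep_prob > 0.0:
        random_tensor.div_(keep_prob)             # rescale survivors by 1 / keep_prob
    return x * random_tensor


class LayerScale(nn.Module):
    # Identical logic to dinov2/layers/layer_scale.py above.
    def __init__(self, dim: int, init_values: float = 1e-5, inplace: bool = False):
        super().__init__()
        self.inplace = inplace
        self.gamma = nn.Parameter(init_values * torch.ones(dim))

    def forward(self, x):
        return x.mul_(self.gamma) if self.inplace else x * self.gamma


if __name__ == "__main__":
    tokens = torch.randn(2, 5, 8)                 # (batch, tokens, dim)
    ls = LayerScale(dim=8, init_values=1e-5)
    print(ls(tokens).abs().max())                 # tiny values: the residual branch starts near zero

    x = torch.ones(4, 3)
    print(drop_path(x, drop_prob=0.5, training=True))  # rows are either all 0 or all 2 (= 1 / keep_prob)
```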
6 | 7 | # References: 8 | # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py 9 | # https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/mlp.py 10 | 11 | 12 | from typing import Callable, Optional 13 | 14 | from torch import Tensor, nn 15 | 16 | 17 | class Mlp(nn.Module): 18 | def __init__( 19 | self, 20 | in_features: int, 21 | hidden_features: Optional[int] = None, 22 | out_features: Optional[int] = None, 23 | act_layer: Callable[..., nn.Module] = nn.GELU, 24 | drop: float = 0.0, 25 | bias: bool = True, 26 | ) -> None: 27 | super().__init__() 28 | out_features = out_features or in_features 29 | hidden_features = hidden_features or in_features 30 | self.fc1 = nn.Linear(in_features, hidden_features, bias=bias) 31 | self.act = act_layer() 32 | self.fc2 = nn.Linear(hidden_features, out_features, bias=bias) 33 | self.drop = nn.Dropout(drop) 34 | 35 | def forward(self, x: Tensor) -> Tensor: 36 | x = self.fc1(x) 37 | x = self.act(x) 38 | x = self.drop(x) 39 | x = self.fc2(x) 40 | x = self.drop(x) 41 | return x 42 | -------------------------------------------------------------------------------- /DepthAnything/torchhub/facebookresearch_dinov2_main/dinov2/loss/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | from .dino_clstoken_loss import DINOLoss 8 | from .ibot_patch_loss import iBOTPatchLoss 9 | from .koleo_loss import KoLeoLoss 10 | -------------------------------------------------------------------------------- /DepthAnything/torchhub/facebookresearch_dinov2_main/dinov2/loss/koleo_loss.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | import logging 8 | 9 | import torch 10 | import torch.nn as nn 11 | import torch.nn.functional as F 12 | 13 | # import torch.distributed as dist 14 | 15 | 16 | logger = logging.getLogger("dinov2") 17 | 18 | 19 | class KoLeoLoss(nn.Module): 20 | """Kozachenko-Leonenko entropic loss regularizer from Sablayrolles et al. - 2018 - Spreading vectors for similarity search""" 21 | 22 | def __init__(self): 23 | super().__init__() 24 | self.pdist = nn.PairwiseDistance(2, eps=1e-8) 25 | 26 | def pairwise_NNs_inner(self, x): 27 | """ 28 | Pairwise nearest neighbors for L2-normalized vectors. 29 | Uses Torch rather than Faiss to remain on GPU. 
30 | """ 31 | # parwise dot products (= inverse distance) 32 | dots = torch.mm(x, x.t()) 33 | n = x.shape[0] 34 | dots.view(-1)[:: (n + 1)].fill_(-1) # Trick to fill diagonal with -1 35 | # max inner prod -> min distance 36 | _, I = torch.max(dots, dim=1) # noqa: E741 37 | return I 38 | 39 | def forward(self, student_output, eps=1e-8): 40 | """ 41 | Args: 42 | student_output (BxD): backbone output of student 43 | """ 44 | with torch.cuda.amp.autocast(enabled=False): 45 | student_output = F.normalize(student_output, eps=eps, p=2, dim=-1) 46 | I = self.pairwise_NNs_inner(student_output) # noqa: E741 47 | distances = self.pdist(student_output, student_output[I]) # BxD, BxD -> B 48 | loss = -torch.log(distances + eps).mean() 49 | return loss 50 | -------------------------------------------------------------------------------- /DepthAnything/torchhub/facebookresearch_dinov2_main/dinov2/models/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | import logging 8 | 9 | from . import vision_transformer as vits 10 | 11 | 12 | logger = logging.getLogger("dinov2") 13 | 14 | 15 | def build_model(args, only_teacher=False, img_size=224): 16 | args.arch = args.arch.removesuffix("_memeff") 17 | if "vit" in args.arch: 18 | vit_kwargs = dict( 19 | img_size=img_size, 20 | patch_size=args.patch_size, 21 | init_values=args.layerscale, 22 | ffn_layer=args.ffn_layer, 23 | block_chunks=args.block_chunks, 24 | qkv_bias=args.qkv_bias, 25 | proj_bias=args.proj_bias, 26 | ffn_bias=args.ffn_bias, 27 | ) 28 | teacher = vits.__dict__[args.arch](**vit_kwargs) 29 | if only_teacher: 30 | return teacher, teacher.embed_dim 31 | student = vits.__dict__[args.arch]( 32 | **vit_kwargs, 33 | drop_path_rate=args.drop_path_rate, 34 | drop_path_uniform=args.drop_path_uniform, 35 | ) 36 | embed_dim = student.embed_dim 37 | return student, teacher, embed_dim 38 | 39 | 40 | def build_model_from_cfg(cfg, only_teacher=False): 41 | return build_model(cfg.student, only_teacher=only_teacher, img_size=cfg.crops.global_crops_size) 42 | -------------------------------------------------------------------------------- /DepthAnything/torchhub/facebookresearch_dinov2_main/dinov2/run/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | -------------------------------------------------------------------------------- /DepthAnything/torchhub/facebookresearch_dinov2_main/dinov2/run/eval/knn.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | 7 | import logging 8 | import os 9 | import sys 10 | 11 | from dinov2.eval.knn import get_args_parser as get_knn_args_parser 12 | from dinov2.logging import setup_logging 13 | from dinov2.run.submit import get_args_parser, submit_jobs 14 | 15 | 16 | logger = logging.getLogger("dinov2") 17 | 18 | 19 | class Evaluator: 20 | def __init__(self, args): 21 | self.args = args 22 | 23 | def __call__(self): 24 | from dinov2.eval.knn import main as knn_main 25 | 26 | self._setup_args() 27 | knn_main(self.args) 28 | 29 | def checkpoint(self): 30 | import submitit 31 | 32 | logger.info(f"Requeuing {self.args}") 33 | empty = type(self)(self.args) 34 | return submitit.helpers.DelayedSubmission(empty) 35 | 36 | def _setup_args(self): 37 | import submitit 38 | 39 | job_env = submitit.JobEnvironment() 40 | self.args.output_dir = self.args.output_dir.replace("%j", str(job_env.job_id)) 41 | logger.info(f"Process group: {job_env.num_tasks} tasks, rank: {job_env.global_rank}") 42 | logger.info(f"Args: {self.args}") 43 | 44 | 45 | def main(): 46 | description = "Submitit launcher for DINOv2 k-NN evaluation" 47 | knn_args_parser = get_knn_args_parser(add_help=False) 48 | parents = [knn_args_parser] 49 | args_parser = get_args_parser(description=description, parents=parents) 50 | args = args_parser.parse_args() 51 | 52 | setup_logging() 53 | 54 | assert os.path.exists(args.config_file), "Configuration file does not exist!" 55 | submit_jobs(Evaluator, args, name="dinov2:knn") 56 | return 0 57 | 58 | 59 | if __name__ == "__main__": 60 | sys.exit(main()) 61 | -------------------------------------------------------------------------------- /DepthAnything/torchhub/facebookresearch_dinov2_main/dinov2/run/eval/linear.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | import logging 8 | import os 9 | import sys 10 | 11 | from dinov2.eval.linear import get_args_parser as get_linear_args_parser 12 | from dinov2.logging import setup_logging 13 | from dinov2.run.submit import get_args_parser, submit_jobs 14 | 15 | 16 | logger = logging.getLogger("dinov2") 17 | 18 | 19 | class Evaluator: 20 | def __init__(self, args): 21 | self.args = args 22 | 23 | def __call__(self): 24 | from dinov2.eval.linear import main as linear_main 25 | 26 | self._setup_args() 27 | linear_main(self.args) 28 | 29 | def checkpoint(self): 30 | import submitit 31 | 32 | logger.info(f"Requeuing {self.args}") 33 | empty = type(self)(self.args) 34 | return submitit.helpers.DelayedSubmission(empty) 35 | 36 | def _setup_args(self): 37 | import submitit 38 | 39 | job_env = submitit.JobEnvironment() 40 | self.args.output_dir = self.args.output_dir.replace("%j", str(job_env.job_id)) 41 | logger.info(f"Process group: {job_env.num_tasks} tasks, rank: {job_env.global_rank}") 42 | logger.info(f"Args: {self.args}") 43 | 44 | 45 | def main(): 46 | description = "Submitit launcher for DINOv2 linear evaluation" 47 | linear_args_parser = get_linear_args_parser(add_help=False) 48 | parents = [linear_args_parser] 49 | args_parser = get_args_parser(description=description, parents=parents) 50 | args = args_parser.parse_args() 51 | 52 | setup_logging() 53 | 54 | assert os.path.exists(args.config_file), "Configuration file does not exist!" 
55 | submit_jobs(Evaluator, args, name="dinov2:linear") 56 | return 0 57 | 58 | 59 | if __name__ == "__main__": 60 | sys.exit(main()) 61 | -------------------------------------------------------------------------------- /DepthAnything/torchhub/facebookresearch_dinov2_main/dinov2/run/train/train.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | import logging 8 | import os 9 | import sys 10 | 11 | from dinov2.logging import setup_logging 12 | from dinov2.train import get_args_parser as get_train_args_parser 13 | from dinov2.run.submit import get_args_parser, submit_jobs 14 | 15 | 16 | logger = logging.getLogger("dinov2") 17 | 18 | 19 | class Trainer(object): 20 | def __init__(self, args): 21 | self.args = args 22 | 23 | def __call__(self): 24 | from dinov2.train import main as train_main 25 | 26 | self._setup_args() 27 | train_main(self.args) 28 | 29 | def checkpoint(self): 30 | import submitit 31 | 32 | logger.info(f"Requeuing {self.args}") 33 | empty = type(self)(self.args) 34 | return submitit.helpers.DelayedSubmission(empty) 35 | 36 | def _setup_args(self): 37 | import submitit 38 | 39 | job_env = submitit.JobEnvironment() 40 | self.args.output_dir = self.args.output_dir.replace("%j", str(job_env.job_id)) 41 | logger.info(f"Process group: {job_env.num_tasks} tasks, rank: {job_env.global_rank}") 42 | logger.info(f"Args: {self.args}") 43 | 44 | 45 | def main(): 46 | description = "Submitit launcher for DINOv2 training" 47 | train_args_parser = get_train_args_parser(add_help=False) 48 | parents = [train_args_parser] 49 | args_parser = get_args_parser(description=description, parents=parents) 50 | args = args_parser.parse_args() 51 | 52 | setup_logging() 53 | 54 | assert os.path.exists(args.config_file), "Configuration file does not exist!" 55 | submit_jobs(Trainer, args, name="dinov2:train") 56 | return 0 57 | 58 | 59 | if __name__ == "__main__": 60 | sys.exit(main()) 61 | -------------------------------------------------------------------------------- /DepthAnything/torchhub/facebookresearch_dinov2_main/dinov2/train/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | from .train import get_args_parser, main 8 | from .ssl_meta_arch import SSLMetaArch 9 | -------------------------------------------------------------------------------- /DepthAnything/torchhub/facebookresearch_dinov2_main/dinov2/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | -------------------------------------------------------------------------------- /DepthAnything/torchhub/facebookresearch_dinov2_main/dinov2/utils/dtype.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 
3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | 8 | from typing import Dict, Union 9 | 10 | import numpy as np 11 | import torch 12 | 13 | 14 | TypeSpec = Union[str, np.dtype, torch.dtype] 15 | 16 | 17 | _NUMPY_TO_TORCH_DTYPE: Dict[np.dtype, torch.dtype] = { 18 | np.dtype("bool"): torch.bool, 19 | np.dtype("uint8"): torch.uint8, 20 | np.dtype("int8"): torch.int8, 21 | np.dtype("int16"): torch.int16, 22 | np.dtype("int32"): torch.int32, 23 | np.dtype("int64"): torch.int64, 24 | np.dtype("float16"): torch.float16, 25 | np.dtype("float32"): torch.float32, 26 | np.dtype("float64"): torch.float64, 27 | np.dtype("complex64"): torch.complex64, 28 | np.dtype("complex128"): torch.complex128, 29 | } 30 | 31 | 32 | def as_torch_dtype(dtype: TypeSpec) -> torch.dtype: 33 | if isinstance(dtype, torch.dtype): 34 | return dtype 35 | if isinstance(dtype, str): 36 | dtype = np.dtype(dtype) 37 | assert isinstance(dtype, np.dtype), f"Expected an instance of nunpy dtype, got {type(dtype)}" 38 | return _NUMPY_TO_TORCH_DTYPE[dtype] 39 | -------------------------------------------------------------------------------- /DepthAnything/torchhub/facebookresearch_dinov2_main/pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.black] 2 | line-length = 120 3 | 4 | [tool.pylint.master] 5 | persistent = false 6 | score = false 7 | 8 | [tool.pylint.messages_control] 9 | disable = "all" 10 | enable = [ 11 | "miscellaneous", 12 | "similarities", 13 | ] 14 | 15 | [tool.pylint.similarities] 16 | ignore-comments = true 17 | ignore-docstrings = true 18 | ignore-imports = true 19 | min-similarity-lines = 8 20 | 21 | [tool.pylint.reports] 22 | reports = false 23 | 24 | [tool.pylint.miscellaneous] 25 | notes = [ 26 | "FIXME", 27 | "XXX", 28 | "TODO", 29 | ] 30 | -------------------------------------------------------------------------------- /DepthAnything/torchhub/facebookresearch_dinov2_main/requirements-dev.txt: -------------------------------------------------------------------------------- 1 | black==22.6.0 2 | flake8==5.0.4 3 | pylint==2.15.0 4 | -------------------------------------------------------------------------------- /DepthAnything/torchhub/facebookresearch_dinov2_main/requirements.txt: -------------------------------------------------------------------------------- 1 | --extra-index-url https://download.pytorch.org/whl/cu117 2 | torch==2.0.0 3 | torchvision==0.15.0 4 | omegaconf 5 | torchmetrics==0.10.3 6 | fvcore 7 | iopath 8 | xformers==0.0.18 9 | submitit 10 | --extra-index-url https://pypi.nvidia.com 11 | cuml-cu11 12 | -------------------------------------------------------------------------------- /DepthAnything/torchhub/facebookresearch_dinov2_main/scripts/lint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | if [ -n "$1" ]; then 4 | echo "linting \"$1\"" 5 | fi 6 | 7 | echo "running black" 8 | if [ -n "$1" ]; then 9 | black "$1" 10 | else 11 | black dinov2 12 | fi 13 | 14 | echo "running flake8" 15 | if [ -n "$1" ]; then 16 | flake8 "$1" 17 | else 18 | flake8 19 | fi 20 | 21 | echo "running pylint" 22 | if [ -n "$1" ]; then 23 | pylint "$1" 24 | else 25 | pylint dinov2 26 | fi 27 | 28 | exit 0 29 | -------------------------------------------------------------------------------- /DepthAnything/torchhub/facebookresearch_dinov2_main/setup.cfg: 
-------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 120 3 | ignore = E203,E501,W503 4 | per-file-ignores = 5 | __init__.py:F401 6 | exclude = 7 | venv 8 | -------------------------------------------------------------------------------- /DepthAnything/torchhub/facebookresearch_dinov2_main/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | import itertools 7 | import math 8 | 9 | import torch 10 | import torch.nn as nn 11 | import torch.nn.functional as F 12 | 13 | 14 | _DINOV2_BASE_URL = "https://dl.fbaipublicfiles.com/dinov2" 15 | 16 | 17 | def _make_dinov2_model_name(arch_name: str, patch_size: int, num_register_tokens: int = 0) -> str: 18 | compact_arch_name = arch_name.replace("_", "")[:4] 19 | registers_suffix = f"_reg{num_register_tokens}" if num_register_tokens else "" 20 | return f"dinov2_{compact_arch_name}{patch_size}{registers_suffix}" 21 | 22 | 23 | class CenterPadding(nn.Module): 24 | def __init__(self, multiple): 25 | super().__init__() 26 | self.multiple = multiple 27 | 28 | def _get_pad(self, size): 29 | new_size = math.ceil(size / self.multiple) * self.multiple 30 | pad_size = new_size - size 31 | pad_size_left = pad_size // 2 32 | pad_size_right = pad_size - pad_size_left 33 | return pad_size_left, pad_size_right 34 | 35 | @torch.inference_mode() 36 | def forward(self, x): 37 | pads = list(itertools.chain.from_iterable(self._get_pad(m) for m in x.shape[:1:-1])) 38 | output = F.pad(x, pads) 39 | return output 40 | -------------------------------------------------------------------------------- /DistDepth/README.md: -------------------------------------------------------------------------------- 1 | # Benchmark on paper Toward Practical Monocular Indoor Depth Estimation 2 | 3 | 1. Download InSpaceType eval set. Install [torch and torchvision](https://pytorch.org/get-started/previous-versions/) and packages: matplotlib, tqdm, pandas, opencv-python, tensorboardX 4 | 5 | 2. Download pretrained model from [Official Link](https://drive.google.com/file/d/1kLJBuMOf0xSpYq7DtxnPpBTxMwW0ylGm/view?usp=sharing) and extract under 'ckpts-finetuned'. Specifically, ckpts-finetuned should contain encoder.pth and decoder.pth 6 | 7 | 3. 8 | 9 | ``` 10 | python demo.py 11 | ``` 12 | 13 | The command generates report files for hierarchy (H0-H2). *-all means overall; H0-H2 means the level of hierarchy. H1_xx means scene space type number. See [space_type_def.yml](https://github.com/DepthComputation/InSpaceType_Benchmark/blob/main/space_type_def.yml) for space type number definition. 14 | 15 | -------------------------------------------------------------------------------- /DistDepth/networks/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree.
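Before running `python demo.py` as the DistDepth README above describes, it can help to confirm that 'ckpts-finetuned' really contains the two expected files. The helper below is a hypothetical sketch and not part of DistDepth; the folder and file names come from the README, and it assumes the checkpoints load as ordinary PyTorch objects (a plain `torch.load` sanity check, nothing more).

```python
# Hypothetical sanity check (not part of DistDepth): verify the checkpoint
# folder matches what the README above expects before launching demo.py.
import os
import torch

CKPT_DIR = "ckpts-finetuned"                # folder name from the README
EXPECTED = ("encoder.pth", "decoder.pth")   # files the README says must be present

for name in EXPECTED:
    path = os.path.join(CKPT_DIR, name)
    if not os.path.isfile(path):
        raise FileNotFoundError(f"missing checkpoint: {path}")
    obj = torch.load(path, map_location="cpu")  # assumption: CPU-loadable checkpoint
    keys = list(obj.keys()) if isinstance(obj, dict) else [type(obj).__name__]
    print(f"{name}: ok ({len(keys)} top-level entries), e.g. {keys[:3]}")
```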
6 | 7 | from .resnet_encoder import ResnetEncoder, ResnetEncoderMatching 8 | from .depth_decoder import DepthDecoder 9 | from .pose_decoder import PoseDecoder -------------------------------------------------------------------------------- /GLPDepth/README.md: -------------------------------------------------------------------------------- 1 | # Benchmark on paper Global-Local Path Networks for Monocular Depth Estimation with Vertical CutDepth 2 | 3 | 1. Download InSpaceType eval set. Install [torch and torchvision](https://pytorch.org/get-started/previous-versions/) and packages: matplotlib, pandas, transformers, opencv-python, tqdm 4 | 5 | 2. 6 | 7 | ``` 8 | python demo_glpn.py -i ../InSpaceType 9 | ``` 10 | 11 | The command generates report files for hierarchy (H0-H2). *-all means overall; H0-H2 means the level of hierarchy. H1_xx means scene space type number. See [space_type_def.yml](https://github.com/DepthComputation/InSpaceType_Benchmark/blob/main/space_type_def.yml) for space type number definition. 12 | -------------------------------------------------------------------------------- /IronDepth/README.md: -------------------------------------------------------------------------------- 1 | # Benchmark on paper Iterative Refinement of Single-View Depth using Surface Normal and its Uncertainty 2 | 3 | 1. Download InSpaceType eval set. Install the requirements by `pip install -r requirements.txt` 4 | 5 | 2. Go to this [Official Link](https://drive.google.com/drive/folders/1idIVqOrJOK6kuidBng1K8sth-CyOfcCj?usp=sharing), and 6 | 7 | * Download `*.pt` and place them under `./checkpoints`. Specifically, the 'checkpoints' folder should include four checkpoints: irondepth_* and normal_* 8 | 9 | 10 | 3. 11 | 12 | ``` 13 | python test.py --train_data nyuv2 --test_data custom 14 | ``` 15 | 16 | The command generates report files for hierarchy (H0-H2). *-all means overall; H0-H2 means the level of hierarchy. H1_xx means scene space type number. See [space_type_def.yml](https://github.com/DepthComputation/InSpaceType_Benchmark/blob/main/space_type_def.yml) for space type number definition. -------------------------------------------------------------------------------- /IronDepth/models_normal/NNET.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | from models_normal.submodules import EESNU 6 | 7 | 8 | class NNET(nn.Module): 9 | def __init__(self, args): 10 | super(NNET, self).__init__() 11 | self.min_kappa = 0.01 12 | self.output_dim = 1 13 | self.output_type = 'G' 14 | 15 | if args.NNET_architecture == 'BN': 16 | self.n_net = EESNU(BN=True) 17 | else: 18 | self.n_net = EESNU(BN=False) 19 | 20 | def forward(self, img, **kwargs): 21 | return self.n_net(img, **kwargs) 22 | 23 | -------------------------------------------------------------------------------- /IronDepth/requirements.txt: -------------------------------------------------------------------------------- 1 | torch==1.6.0 2 | torchvision==0.7.0 3 | Pillow 4 | numpy 5 | matplotlib 6 | argparse 7 | tqdm -------------------------------------------------------------------------------- /MIM/README.md: -------------------------------------------------------------------------------- 1 | # Benchmark on paper Revealing the Dark Secrets of Masked Image Modeling (Depth Estimation) 2 | 3 | 1. Download InSpaceType eval set. Install the requirements by `pip install -r requirements.txt` 4 | 5 | 2.
Download pretrained model 'nyudepthv2_swin_large.ckpt' from [Official Link](https://mailustceducn-my.sharepoint.com/:f:/g/personal/aa397601_mail_ustc_edu_cn/EkoYQyhiD6hJu9CGYLOwiF8BRqHgk8kX61NUcyfmdOUV7Q?e=h2uctw) and put it under the folder 'ckpt' 6 | 7 | 3. 8 | 9 | ``` 10 | python test.py --dataset nyudepthv2 --data_path ../data/ --max_depth 10.0 --max_depth_eval 10.0 --backbone swin_large_v2 --depths 2 2 18 2 --num_filters 32 32 32 --deconv_kernels 2 2 2 --window_size 30 30 30 15 --pretrain_window_size 12 12 12 6 --use_shift True True False False --flip_test --shift_window_test --shift_size 2 --do_evaluate --ckpt_dir ckpt/nyudepthv2_swin_large.ckpt 11 | ``` 12 | 13 | The command generates report files for hierarchy (H0-H2). *-all means overall; H0-H2 means the level of hierarchy. H1_xx means scene space type number. See [space_type_def.yml](https://github.com/DepthComputation/InSpaceType_Benchmark/blob/main/space_type_def.yml) for space type number definition. -------------------------------------------------------------------------------- /MIM/configs/test_options.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # The code is from GLPDepth (https://github.com/vinvino02/GLPDepth). 3 | # For non-commercial purpose only (research, evaluation etc). 4 | # ------------------------------------------------------------------------------ 5 | 6 | from configs.base_options import BaseOptions 7 | 8 | class TestOptions(BaseOptions): 9 | def initialize(self): 10 | parser = BaseOptions.initialize(self) 11 | parser.add_argument('--result_dir', type=str, default='./results', 12 | help='save result images into result_dir/exp_name') 13 | parser.add_argument('--ckpt_dir', type=str, 14 | default='./ckpt/best_model_nyu.ckpt', 15 | help='load ckpt path') 16 | 17 | parser.add_argument('--save_eval_pngs', action='store_true', 18 | help='save result image into evaluation form') 19 | parser.add_argument('--save_visualize', action='store_true', 20 | help='save result image into visualized form') 21 | parser.add_argument('--do_evaluate', action='store_true', 22 | help='evaluate with inferenced images') 23 | 24 | return parser 25 | 26 | 27 | -------------------------------------------------------------------------------- /MIM/dataset/imagepath.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # The code is from GLPDepth (https://github.com/vinvino02/GLPDepth). 3 | # For non-commercial purpose only (research, evaluation etc).
4 | # ------------------------------------------------------------------------------ 5 | 6 | import os 7 | import cv2 8 | from torch.utils.data import Dataset 9 | import torchvision.transforms as transforms 10 | 11 | class imagepath(Dataset): 12 | # for test only 13 | def __init__(self, data_path): 14 | super().__init__() 15 | 16 | self.data_path = data_path 17 | self.to_tensor = transforms.ToTensor() 18 | 19 | self.filenames_list = [os.path.join(data_path, i) for i in os.listdir(data_path) 20 | if i.split('.')[-1] in ['jpg', 'png']] 21 | 22 | print("Dataset : Image Path") 23 | print("# of images: %d" % (len(self.filenames_list))) 24 | 25 | def __len__(self): 26 | return len(self.filenames_list) 27 | 28 | def __getitem__(self, idx): 29 | batch = {} 30 | file = self.filenames_list[idx] 31 | filename = file.split('/')[-1] 32 | 33 | image = cv2.imread(file) # [H x W x C] and C: BGR 34 | image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) 35 | 36 | # input size should be multiple of 32 37 | h, w, c = image.shape 38 | new_h, new_w = h // 32 * 32, w // 32 * 32 39 | image = cv2.resize(image, (new_w, new_h)) 40 | image = self.to_tensor(image) 41 | 42 | batch['image'] = image 43 | batch['filename'] = filename 44 | 45 | return batch 46 | -------------------------------------------------------------------------------- /MIM/requirements.txt: -------------------------------------------------------------------------------- 1 | torch>=1.6.0 2 | h5py>=3.6.0 3 | scipy>=1.7.3 4 | opencv-python>=4.5.5 5 | mmcv>=1.4.3 6 | timm>=0.5.4 7 | albumentations>=1.1.0 8 | tensorboardX>=2.4.1 9 | gdown>=4.2.1 -------------------------------------------------------------------------------- /MIM/utils/criterion.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # The code is from GLPDepth (https://github.com/vinvino02/GLPDepth). 3 | # For non-commercial purpose only (research, evaluation etc). 4 | # ------------------------------------------------------------------------------ 5 | 6 | import torch 7 | import torch.nn as nn 8 | 9 | 10 | class SiLogLoss(nn.Module): 11 | def __init__(self, lambd=0.5): 12 | super().__init__() 13 | self.lambd = lambd 14 | 15 | def forward(self, pred, target): 16 | valid_mask = (target > 0).detach() 17 | diff_log = torch.log(target[valid_mask]) - torch.log(pred[valid_mask]) 18 | loss = torch.sqrt(torch.pow(diff_log, 2).mean() - 19 | self.lambd * torch.pow(diff_log.mean(), 2)) 20 | 21 | return loss 22 | 23 | -------------------------------------------------------------------------------- /NeWCRFs/README.md: -------------------------------------------------------------------------------- 1 | # Benchmark on paper NeW CRFs: Neural Window Fully-connected CRFs for Monocular Depth Estimation 2 | 3 | 1. Download InSpaceType eval set. Install [torch and torchvision](https://pytorch.org/get-started/previous-versions/) and packages: matplotlib, tqdm, tensorboardX, timm, mmcv, opencv-python 4 | 5 | 2. Download pretrained model 'model_nyu.ckpt' from [Official Link](https://virutalbuy-public.oss-cn-hangzhou.aliyuncs.com/share/newcrfs/models/model_nyu.ckpt) and put it here 6 | 7 | 3. 8 | 9 | ``` 10 | python newcrfs/test.py --data_path ./ --dataset nyu --filenames_file data_splits/split_files.txt --checkpoint_path model_nyu.ckpt --max_depth 10 11 | ``` 12 | 13 | The command generates report files for hierarchy (H0-H2). *-all means overall; H0-H2 means the level of hierarchy.
H1_xx means scene space type number. See [space_type_def.yml](https://github.com/DepthComputation/InSpaceType_Benchmark/blob/main/space_type_def.yml) for space type number definition. 14 | -------------------------------------------------------------------------------- /NeWCRFs/configs/arguments_eval_kittieigen.txt: -------------------------------------------------------------------------------- 1 | --model_name newcrfs_kittieigen 2 | --encoder large07 3 | --dataset kitti 4 | --input_height 352 5 | --input_width 1216 6 | --max_depth 80 7 | --do_kb_crop 8 | 9 | --data_path_eval datasets/kitti/ 10 | --gt_path_eval datasets/kitti/ 11 | --filenames_file_eval data_splits/eigen_test_files_with_gt.txt 12 | --min_depth_eval 1e-3 13 | --max_depth_eval 80 14 | --garg_crop 15 | 16 | --checkpoint_path model_zoo/model_kittieigen.ckpt -------------------------------------------------------------------------------- /NeWCRFs/configs/arguments_eval_nyu.txt: -------------------------------------------------------------------------------- 1 | --model_name newscrf_nyu 2 | --encoder large07 3 | --dataset nyu 4 | --input_height 480 5 | --input_width 640 6 | --max_depth 10 7 | 8 | --data_path_eval datasets/nyu/official_splits/test/ 9 | --gt_path_eval datasets/nyu/official_splits/test/ 10 | --filenames_file_eval data_splits/nyudepthv2_test_files_with_gt.txt 11 | --min_depth_eval 1e-3 12 | --max_depth_eval 10 13 | --eigen_crop 14 | 15 | --checkpoint_path model_zoo/model_nyu.ckpt -------------------------------------------------------------------------------- /NeWCRFs/configs/arguments_train_kittieigen.txt: -------------------------------------------------------------------------------- 1 | --mode train 2 | --model_name newcrfs_kittieigen 3 | --encoder large07 4 | --pretrain model_zoo/swin_transformer/swin_large_patch4_window7_224_22k.pth 5 | --dataset kitti 6 | --data_path datasets/kitti/ 7 | --gt_path datasets/kitti/ 8 | --filenames_file data_splits/eigen_train_files_with_gt.txt 9 | --batch_size 8 10 | --num_epochs 50 11 | --learning_rate 2e-5 12 | --weight_decay 1e-2 13 | --adam_eps 1e-3 14 | --num_threads 1 15 | --input_height 352 16 | --input_width 1120 17 | --max_depth 80 18 | --do_kb_crop 19 | --do_random_rotate 20 | --degree 1.0 21 | --log_directory ./models/ 22 | --multiprocessing_distributed 23 | --dist_url tcp://127.0.0.1:2345 24 | 25 | --log_freq 100 26 | --do_online_eval 27 | --eval_freq 1000 28 | --data_path_eval datasets/kitti/ 29 | --gt_path_eval datasets/kitti/ 30 | --filenames_file_eval data_splits/eigen_test_files_with_gt.txt 31 | --min_depth_eval 1e-3 32 | --max_depth_eval 80 33 | --garg_crop 34 | -------------------------------------------------------------------------------- /NeWCRFs/configs/arguments_train_nyu.txt: -------------------------------------------------------------------------------- 1 | --mode train 2 | --model_name newcrfs_nyu 3 | --encoder large07 4 | --pretrain model_zoo/swin_transformer/swin_large_patch4_window7_224_22k.pth 5 | --dataset nyu 6 | --data_path datasets/nyu/sync/ 7 | --gt_path datasets/nyu/sync/ 8 | --filenames_file data_splits/nyudepthv2_train_files_with_gt_dense.txt 9 | --batch_size 8 10 | --num_epochs 50 11 | --learning_rate 2e-5 12 | --weight_decay 1e-2 13 | --adam_eps 1e-3 14 | --num_threads 1 15 | --input_height 480 16 | --input_width 640 17 | --max_depth 10 18 | --do_random_rotate 19 | --degree 2.5 20 | --log_directory ./models/ 21 | --multiprocessing_distributed 22 | --dist_url tcp://127.0.0.1:2345 23 | 24 | --log_freq 100 25 | --do_online_eval 26 | 
--eval_freq 1000 27 | --data_path_eval datasets/nyu/official_splits/test/ 28 | --gt_path_eval datasets/nyu/official_splits/test/ 29 | --filenames_file_eval data_splits/nyudepthv2_test_files_with_gt.txt 30 | --min_depth_eval 1e-3 31 | --max_depth_eval 10 32 | --eigen_crop 33 | -------------------------------------------------------------------------------- /NeWCRFs/data_splits/test.txt: -------------------------------------------------------------------------------- 1 | files/0007_L.jpg -------------------------------------------------------------------------------- /NeWCRFs/newcrfs/dataloaders/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DepthComputation/InSpaceType_Benchmark/4ab9760dbdf9153fc2845afd584762509f9fe0e0/NeWCRFs/newcrfs/dataloaders/__init__.py -------------------------------------------------------------------------------- /NeWCRFs/newcrfs/networks/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DepthComputation/InSpaceType_Benchmark/4ab9760dbdf9153fc2845afd584762509f9fe0e0/NeWCRFs/newcrfs/networks/__init__.py -------------------------------------------------------------------------------- /PixelFormer/README.md: -------------------------------------------------------------------------------- 1 | # Benchmark on paper PixelFormer: Attention Attention Everywhere: Monocular Depth Prediction with Skip Attention 2 | 3 | 1. Download InSpaceType eval set. Install [torch and torchvision](https://pytorch.org/get-started/previous-versions/) and packages: matplotlib, tqdm, tensorboardX, timm, mmcv, opencv-python 4 | 5 | 2. Download pretrained model 'nyu.pth' from [Official Link](https://drive.google.com/drive/folders/1Feo67jEbccqa-HojTHG7ljTXOW2yuX-X?usp=share_link) and put it here 6 | 7 | 3. 8 | 9 | ``` 10 | python pixelformer/test.py --data_path ./ --dataset nyu --filenames_file data_splits/split_files.txt --checkpoint_path nyu.pth --max_depth 10 11 | ``` 12 | 13 | The command generates report files for hierarchy (H0-H2). *-all means overall; H0-H2 means the level of hierarchy. H1_xx means scene space type number. See [space_type_def.yml](https://github.com/DepthComputation/InSpaceType_Benchmark/blob/main/space_type_def.yml) for space type number definition.
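The benchmark READMEs above all state that the evaluation commands generate per-hierarchy report files (H0-H2), but the exact columns of those reports are not shown in this dump. The sketch below is only an illustration of the standard monocular-depth error metrics (AbsRel, RMSE, log10, δ < 1.25^k) that indoor evaluations of this kind typically report, with the NYU-style 1e-3 to 10 m valid-depth range taken from the config files above; the function name `depth_metrics` and the random arrays are assumptions for the example.

```python
# Sketch of the standard monocular-depth metrics such evaluation scripts are
# typically built around (min_depth_eval / max_depth_eval as in the configs).
import numpy as np


def depth_metrics(gt: np.ndarray, pred: np.ndarray,
                  min_depth: float = 1e-3, max_depth: float = 10.0) -> dict:
    mask = (gt > min_depth) & (gt < max_depth)           # keep only valid ground truth
    gt, pred = gt[mask], np.clip(pred[mask], min_depth, max_depth)

    thresh = np.maximum(gt / pred, pred / gt)            # ratio used for the delta accuracies
    return {
        "abs_rel": float(np.mean(np.abs(gt - pred) / gt)),
        "sq_rel": float(np.mean(((gt - pred) ** 2) / gt)),
        "rmse": float(np.sqrt(np.mean((gt - pred) ** 2))),
        "rmse_log": float(np.sqrt(np.mean((np.log(gt) - np.log(pred)) ** 2))),
        "log10": float(np.mean(np.abs(np.log10(gt) - np.log10(pred)))),
        "delta1": float(np.mean(thresh < 1.25)),
        "delta2": float(np.mean(thresh < 1.25 ** 2)),
        "delta3": float(np.mean(thresh < 1.25 ** 3)),
    }


if __name__ == "__main__":
    gt = np.random.uniform(0.5, 9.5, size=(480, 640))            # fake ground-truth depth
    pred = gt * np.random.uniform(0.9, 1.1, size=gt.shape)       # fake prediction within +-10%
    print(depth_metrics(gt, pred))
```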
-------------------------------------------------------------------------------- /PixelFormer/configs/arguments_eval_kittieigen.txt: -------------------------------------------------------------------------------- 1 | --model_name pixelformer_kittieigen 2 | --encoder large07 3 | --dataset kitti 4 | --input_height 352 5 | --input_width 1216 6 | --max_depth 80 7 | --do_kb_crop 8 | 9 | --data_path_eval dataset/KITTI 10 | --gt_path_eval dataset/kitti_gt 11 | --filenames_file_eval data_splits/eigen_test_files_with_gt.txt 12 | --min_depth_eval 1e-3 13 | --max_depth_eval 80 14 | --garg_crop 15 | 16 | --checkpoint_path pretrained/kitti.pth -------------------------------------------------------------------------------- /PixelFormer/configs/arguments_eval_nyu.txt: -------------------------------------------------------------------------------- 1 | --model_name pixelformer_nyu 2 | --encoder large07 3 | --dataset nyu 4 | --input_height 480 5 | --input_width 640 6 | --max_depth 10 7 | 8 | --data_path_eval datasets/nyu_depth_v2/official_splits/test/ 9 | --gt_path_eval datasets/nyu_depth_v2/official_splits/test/ 10 | --filenames_file_eval data_splits/nyudepthv2_test_files_with_gt.txt 11 | --min_depth_eval 1e-3 12 | --max_depth_eval 10 13 | --eigen_crop 14 | 15 | --checkpoint_path pretrained/nyu.pth 16 | -------------------------------------------------------------------------------- /PixelFormer/configs/arguments_train_kittieigen.txt: -------------------------------------------------------------------------------- 1 | --mode train 2 | --model_name pixelformer_kittieigen 3 | --encoder large07 4 | --pretrain pretrained/swin_large_patch4_window7_224_22k.pth 5 | --dataset kitti 6 | --data_path dataset/KITTI 7 | --gt_path dataset/kitti_gt 8 | --filenames_file data_splits/eigen_train_files_with_gt.txt 9 | --batch_size 8 10 | --num_epochs 20 11 | --learning_rate 4e-5 12 | --weight_decay 1e-2 13 | --adam_eps 1e-3 14 | --num_threads 1 15 | --input_height 352 16 | --input_width 1120 17 | --max_depth 80 18 | --do_kb_crop 19 | --do_random_rotate 20 | --degree 1.0 21 | --log_directory ./models/ 22 | --multiprocessing_distributed 23 | --dist_url tcp://127.0.0.1:1234 24 | 25 | --log_freq 100 26 | --do_online_eval 27 | --eval_freq 1000 28 | --data_path_eval dataset/KITTI 29 | --gt_path_eval kitti_gt 30 | --filenames_file_eval data_splits/eigen_test_files_with_gt.txt 31 | --min_depth_eval 1e-3 32 | --max_depth_eval 80 33 | --garg_crop 34 | -------------------------------------------------------------------------------- /PixelFormer/configs/arguments_train_nyu.txt: -------------------------------------------------------------------------------- 1 | --mode train 2 | --model_name pixelformer_nyu 3 | --encoder large07 4 | --pretrain pretrained/swin_large_patch4_window7_224_22k.pth 5 | --dataset nyu 6 | --data_path datasets/nyu_depth_v2/sync/ 7 | --gt_path datasets/nyu_depth_v2/sync/ 8 | --filenames_file data_splits/nyudepthv2_train_files_with_gt_dense.txt 9 | --batch_size 8 10 | --num_epochs 20 11 | --learning_rate 4e-5 12 | --weight_decay 1e-2 13 | --adam_eps 1e-3 14 | --num_threads 1 15 | --input_height 480 16 | --input_width 640 17 | --max_depth 10 18 | --do_random_rotate 19 | --degree 2.5 20 | --log_directory ./models/ 21 | --multiprocessing_distributed 22 | --dist_url tcp://127.0.0.1:2349 23 | 24 | --log_freq 100 25 | --do_online_eval 26 | --eval_freq 1000 27 | --data_path_eval datasets/nyu_depth_v2/official_splits/test/ 28 | --gt_path_eval datasets/nyu_depth_v2/official_splits/test/ 29 | --filenames_file_eval 
data_splits/nyudepthv2_test_files_with_gt.txt 30 | --min_depth_eval 1e-3 31 | --max_depth_eval 10 32 | --eigen_crop 33 | -------------------------------------------------------------------------------- /PixelFormer/data_splits/test.txt: -------------------------------------------------------------------------------- 1 | files/0007_L.jpg -------------------------------------------------------------------------------- /PixelFormer/pixelformer/dataloaders/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DepthComputation/InSpaceType_Benchmark/4ab9760dbdf9153fc2845afd584762509f9fe0e0/PixelFormer/pixelformer/dataloaders/__init__.py -------------------------------------------------------------------------------- /PixelFormer/pixelformer/networks/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DepthComputation/InSpaceType_Benchmark/4ab9760dbdf9153fc2845afd584762509f9fe0e0/PixelFormer/pixelformer/networks/__init__.py -------------------------------------------------------------------------------- /Unidepth/README.md: -------------------------------------------------------------------------------- 1 | # Benchmark on paper Unidepth 2 | 3 | 1. Download InSpaceType eval set and put the data under 'InSpaceType' under the root. 4 | 5 | InSpaceType_Benchmark 6 | | - InSpaceType 7 | |- 0001.pfm 8 | |- 0001_L.jpg 9 | .... 10 | | - Method 1 11 | | - Method 2 12 | ...... 13 | 14 | Install [torch and torchvision](https://pytorch.org/get-started/previous-versions/) and packages: matplotlib, tqdm, opencv-python, xFormers 15 | 16 | 2. 17 | 18 | ``` 19 | python demo.py --img-path ../InSpaceType --outdir ./vis_depth 20 | ``` 21 | 22 | The command generates report files for hierarchy (H0-H2). *-all means overall; H0-H2 means the level of hierarchy. H1_xx means scene space type number. See [space_type_def.yml](https://github.com/DepthComputation/InSpaceType_Benchmark/blob/main/space_type_def.yml) for space type number definition.
23 | Colored visualizations in metric depth are saved under --outdir -------------------------------------------------------------------------------- /Unidepth/assets/demo/depth.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DepthComputation/InSpaceType_Benchmark/4ab9760dbdf9153fc2845afd584762509f9fe0e0/Unidepth/assets/demo/depth.png -------------------------------------------------------------------------------- /Unidepth/assets/demo/intrinsics.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DepthComputation/InSpaceType_Benchmark/4ab9760dbdf9153fc2845afd584762509f9fe0e0/Unidepth/assets/demo/intrinsics.npy -------------------------------------------------------------------------------- /Unidepth/assets/demo/output.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DepthComputation/InSpaceType_Benchmark/4ab9760dbdf9153fc2845afd584762509f9fe0e0/Unidepth/assets/demo/output.png -------------------------------------------------------------------------------- /Unidepth/assets/demo/rgb.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DepthComputation/InSpaceType_Benchmark/4ab9760dbdf9153fc2845afd584762509f9fe0e0/Unidepth/assets/demo/rgb.png -------------------------------------------------------------------------------- /Unidepth/assets/docs/nuscenes_surround.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DepthComputation/InSpaceType_Benchmark/4ab9760dbdf9153fc2845afd584762509f9fe0e0/Unidepth/assets/docs/nuscenes_surround.gif -------------------------------------------------------------------------------- /Unidepth/assets/docs/theoffice.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DepthComputation/InSpaceType_Benchmark/4ab9760dbdf9153fc2845afd584762509f9fe0e0/Unidepth/assets/docs/theoffice.gif -------------------------------------------------------------------------------- /Unidepth/assets/docs/unidepth-banner.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DepthComputation/InSpaceType_Benchmark/4ab9760dbdf9153fc2845afd584762509f9fe0e0/Unidepth/assets/docs/unidepth-banner.png -------------------------------------------------------------------------------- /Unidepth/configs/config_v1_cnvnxtl.json: -------------------------------------------------------------------------------- 1 | { 2 | "generic": { 3 | "seed": 13 4 | }, 5 | "training": { 6 | }, 7 | "data": { 8 | "image_shape": [462, 616] 9 | }, 10 | "model": { 11 | "name": "UniDepthV1", 12 | "num_heads": 8, 13 | "expansion": 4, 14 | "pixel_decoder": { 15 | "hidden_dim": 512, 16 | "depths": [3, 2, 1], 17 | "dropout": 0.0 18 | }, 19 | "pixel_encoder": { 20 | "name": "convnext_large", 21 | "pretrained": null 22 | } 23 | } 24 | } -------------------------------------------------------------------------------- /Unidepth/configs/config_v1_vitl14.json: -------------------------------------------------------------------------------- 1 | { 2 | "generic": { 3 | "seed": 13 4 | }, 5 | "training": {}, 6 | "data": { 7 | "image_shape": [462, 616] 8 | }, 9 | "model": { 10 | "name": "UniDepthV1", 11 | "num_heads": 8, 12 | "expansion": 4, 13 | "pixel_decoder": { 14 |
"hidden_dim": 512, 15 | "depths": [3, 2, 1], 16 | "dropout": 0.0 17 | }, 18 | "pixel_encoder": { 19 | "name": "dinov2_vitl14", 20 | "pretrained": null 21 | } 22 | } 23 | } -------------------------------------------------------------------------------- /Unidepth/install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | NAME=${1} 3 | VENV_DIR=${2} 4 | 5 | python -m venv ${VENV_DIR}/${NAME} 6 | 7 | source ${VENV_DIR}/${NAME}/bin/activate 8 | 9 | pip install torch==2.2.0 torchvision==0.17.0 torchaudio==2.2.0 --index-url https://download.pytorch.org/whl/cu118 10 | pip install -e . 11 | pip install xformers==0.0.24 --index-url https://download.pytorch.org/whl/cu118 12 | -------------------------------------------------------------------------------- /Unidepth/pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [tool.pyright] 6 | include = ["unidepth"] 7 | 8 | [project] 9 | name = "unidepth" 10 | version = "0.1" 11 | authors = [{name = "Luigi Piccinelli", email = "lpiccinelli@ethz.ch"}] 12 | description = "UniDepth: Universal Monocular Metric Depth Estimation" 13 | readme = "README.md" 14 | license = { text="Creatives Common BY-NC 4.0 license"} 15 | requires-python = ">=3.10.0" 16 | dynamic = ["dependencies"] 17 | 18 | [tool.setuptools.dynamic] 19 | dependencies = {file = ["requirements.txt"]} 20 | 21 | [tool.setuptools.package-data] 22 | "*" = ["py.typed"] 23 | 24 | [tool.setuptools.packages.find] 25 | include = ["unidepth*"] 26 | -------------------------------------------------------------------------------- /Unidepth/requirements.txt: -------------------------------------------------------------------------------- 1 | appdirs 2 | attrs 3 | black 4 | blosc2 5 | botocore==1.34.54 6 | certifi==2022.12.7 7 | charset-normalizer 8 | click 9 | contourpy 10 | cycler 11 | docker-pycreds 12 | einops==0.7.0 13 | filelock 14 | flake8==7.0.0 15 | flake8-bugbear==24.2.6 16 | flake8-comprehensions==3.14.0 17 | fonttools 18 | fsspec 19 | fvcore==0.1.5.post20221221 20 | gitdb 21 | GitPython 22 | h5py>=3.10.0 23 | huggingface-hub>=0.22.0 24 | idna 25 | imageio 26 | imath 27 | iopath 28 | isort 29 | Jinja2 30 | jmespath 31 | kiwisolver 32 | MarkupSafe 33 | matplotlib 34 | mccabe 35 | mpmath 36 | msgpack 37 | mypy-extensions 38 | ndindex 39 | networkx 40 | ninja 41 | numexpr 42 | numpy 43 | opencv-python 44 | OpenEXR 45 | packaging 46 | pandas 47 | pathspec 48 | pillow==10.2.0 49 | platformdirs 50 | portalocker 51 | protobuf==4.25.3 52 | psutil 53 | py-cpuinfo 54 | pycodestyle 55 | pyflakes 56 | pyparsing 57 | python-dateutil 58 | pytz 59 | PyYAML 60 | requests 61 | safetensors 62 | scipy 63 | sentry-sdk 64 | setproctitle 65 | six 66 | smmap 67 | sympy 68 | tables 69 | tabulate 70 | termcolor 71 | timm 72 | tqdm 73 | triton==2.2.0 74 | typing_extensions 75 | tzdata==2024.1 76 | urllib3==1.26.13 77 | wandb 78 | yacs 79 | torch==2.2.0 80 | torchvision==0.17.0 81 | torchaudio==2.2.0 82 | xformers==0.0.24 -------------------------------------------------------------------------------- /Unidepth/unidepth/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DepthComputation/InSpaceType_Benchmark/4ab9760dbdf9153fc2845afd584762509f9fe0e0/Unidepth/unidepth/__init__.py -------------------------------------------------------------------------------- 
/Unidepth/unidepth/layers/__init__.py: -------------------------------------------------------------------------------- 1 | from .activation import SwiGLU, GEGLU 2 | from .convnext import CvnxtBlock 3 | from .attention import AttentionBlock, AttentionDecoderBlock 4 | from .nystrom_attention import NystromBlock 5 | from .positional_encoding import PositionEmbeddingSine 6 | from .upsample import ConvUpsample, ConvUpsampleShuffle 7 | from .mlp import MLP 8 | 9 | 10 | __all__ = [ 11 | "SwiGLU", 12 | "GEGLU", 13 | "CvnxtBlock", 14 | "AttentionBlock", 15 | "NystromBlock", 16 | "PositionEmbeddingSine", 17 | "ConvUpsample", 18 | "MLP", 19 | "ConvUpsampleShuffle", 20 | "AttentionDecoderBlock", 21 | ] 22 | -------------------------------------------------------------------------------- /Unidepth/unidepth/layers/activation.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | 6 | class SwiGLU(nn.Module): 7 | def forward(self, x: torch.Tensor) -> torch.Tensor: 8 | x, gates = x.chunk(2, dim=-1) 9 | return x * F.silu(gates) 10 | 11 | 12 | class GEGLU(nn.Module): 13 | def forward(self, x: torch.Tensor) -> torch.Tensor: 14 | x, gates = x.chunk(2, dim=-1) 15 | return x * F.gelu(gates) 16 | -------------------------------------------------------------------------------- /Unidepth/unidepth/layers/convnext.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | class CvnxtBlock(nn.Module): 6 | def __init__( 7 | self, 8 | dim, 9 | kernel_size=7, 10 | layer_scale=1.0, 11 | expansion=4, 12 | dilation=1, 13 | ): 14 | super().__init__() 15 | self.dwconv = nn.Conv2d( 16 | dim, 17 | dim, 18 | kernel_size=kernel_size, 19 | padding="same", 20 | groups=dim, 21 | dilation=dilation, 22 | ) # depthwise conv 23 | self.norm = nn.LayerNorm(dim, eps=1e-6) 24 | self.pwconv1 = nn.Linear( 25 | dim, expansion * dim 26 | ) # pointwise/1x1 convs, implemented with linear layers 27 | self.act = nn.GELU() 28 | self.pwconv2 = nn.Linear(expansion * dim, dim) 29 | self.gamma = ( 30 | nn.Parameter(layer_scale * torch.ones((dim))) if layer_scale > 0.0 else 1.0 31 | ) 32 | 33 | def forward(self, x): 34 | input = x 35 | x = self.dwconv(x) 36 | x = x.permute(0, 2, 3, 1) # (N, C, H, W) -> (N, H, W, C) 37 | x = self.norm(x) 38 | x = self.pwconv1(x) 39 | x = self.act(x) 40 | x = self.pwconv2(x) 41 | 42 | x = self.gamma * x 43 | x = input + x.permute(0, 3, 1, 2) # (N, H, W, C) -> (N, C, H, W) 44 | return x 45 | -------------------------------------------------------------------------------- /Unidepth/unidepth/layers/drop_path.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | def drop_path(x: torch.Tensor, drop_prob: float = 0.0, training: bool = False): 6 | if drop_prob == 0.0 or not training: 7 | return x 8 | keep_prob = 1 - drop_prob 9 | shape = (x.shape[0],) + (1,) * ( 10 | x.ndim - 1 11 | ) # work with diff dim tensors, not just 2D ConvNets 12 | random_tensor = x.new_empty(shape).bernoulli_(keep_prob) 13 | if keep_prob > 0.0: 14 | random_tensor.div_(keep_prob) 15 | output = x * random_tensor 16 | return output 17 | 18 | 19 | class DropPath(nn.Module): 20 | def __init__(self, drop_prob=None): 21 | super(DropPath, self).__init__() 22 | self.drop_prob = drop_prob 23 | 24 | def forward(self, x): 25 | return drop_path(x, self.drop_prob, self.training) 26 | 
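A quick shape-check sketch (an illustration, not repository code) for the gated activations in `unidepth/layers/activation.py` above: both `SwiGLU` and `GEGLU` split the last dimension in half and gate one half with the other, so a tensor with 2*d channels comes out with d channels. This halving is the usual reason SwiGLU-style feed-forward blocks scale their hidden width by roughly 2/3. The tensor shapes in the driver are assumptions; the class body is copied from the file above so the sketch runs standalone.

```python
# Sketch: the gated activation above halves the channel dimension.
import torch
import torch.nn.functional as F


class SwiGLU(torch.nn.Module):
    # Same as unidepth/layers/activation.py above.
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x, gates = x.chunk(2, dim=-1)   # split channels into value and gate halves
        return x * F.silu(gates)


if __name__ == "__main__":
    act = SwiGLU()
    x = torch.randn(2, 16, 512)         # e.g. hidden activations with 2*d channels
    y = act(x)
    print(y.shape)                      # torch.Size([2, 16, 256]) -- half the channels
```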
-------------------------------------------------------------------------------- /Unidepth/unidepth/layers/layer_scale.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | class LayerScale(nn.Module): 6 | def __init__( 7 | self, 8 | dim: int, 9 | init_values = 1e-5, 10 | inplace = False, 11 | ) -> None: 12 | super().__init__() 13 | self.inplace = inplace 14 | self.gamma = nn.Parameter(init_values * torch.ones(dim)) 15 | 16 | def forward(self, x: torch.Tensor) -> torch.Tensor: 17 | return x.mul_(self.gamma) if self.inplace else x * self.gamma 18 | -------------------------------------------------------------------------------- /Unidepth/unidepth/layers/mlp.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | from unidepth.utils.misc import default 5 | from .activation import SwiGLU 6 | 7 | 8 | class MLP(nn.Module): 9 | def __init__( 10 | self, 11 | input_dim: int, 12 | expansion: int = 4, 13 | dropout: float = 0.0, 14 | gated: bool = False, 15 | output_dim = None, 16 | ): 17 | super().__init__() 18 | if gated: 19 | expansion = int(expansion * 2 / 3) 20 | hidden_dim = int(input_dim * expansion) 21 | output_dim = default(output_dim, input_dim) 22 | self.norm = nn.LayerNorm(input_dim) 23 | self.proj1 = nn.Linear(input_dim, hidden_dim) 24 | self.proj2 = nn.Linear(hidden_dim, output_dim) 25 | self.act = nn.GELU() if not gated else SwiGLU() 26 | self.dropout = nn.Dropout(dropout) if dropout > 0.0 else nn.Identity() 27 | 28 | def forward(self, x: torch.Tensor) -> torch.Tensor: 29 | x = self.norm(x) 30 | x = self.proj1(x) 31 | x = self.act(x) 32 | x = self.proj2(x) 33 | x = self.dropout(x) 34 | return x 35 | -------------------------------------------------------------------------------- /Unidepth/unidepth/models/__init__.py: -------------------------------------------------------------------------------- 1 | # from .unidepthv1 import UniDepthV1 2 | 3 | # __all__ = [ 4 | # "UniDepthV1", 5 | # ] 6 | -------------------------------------------------------------------------------- /Unidepth/unidepth/models/backbones/__init__.py: -------------------------------------------------------------------------------- 1 | from .convnext2 import ConvNeXtV2 2 | from .convnext import ConvNeXt 3 | from .dinov2 import _make_dinov2_model 4 | 5 | __all__ = [ 6 | "ConvNeXt", 7 | "ConvNeXtV2", 8 | "_make_dinov2_model", 9 | ] 10 | -------------------------------------------------------------------------------- /Unidepth/unidepth/models/backbones/metadinov2/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | from .dino_head import DINOHead 8 | from .mlp import Mlp 9 | from .patch_embed import PatchEmbed 10 | from .swiglu_ffn import SwiGLUFFN, SwiGLUFFNFused 11 | from .block import NestedTensorBlock 12 | from .attention import MemEffAttention 13 | -------------------------------------------------------------------------------- /Unidepth/unidepth/models/backbones/metadinov2/drop_path.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 
3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | # References: 8 | # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py 9 | # https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/drop.py 10 | 11 | 12 | import torch.nn as nn 13 | 14 | 15 | def drop_path(x, drop_prob: float = 0.0, training: bool = False): 16 | if drop_prob == 0.0 or not training: 17 | return x 18 | keep_prob = 1 - drop_prob 19 | shape = (x.shape[0],) + (1,) * ( 20 | x.ndim - 1 21 | ) # work with diff dim tensors, not just 2D ConvNets 22 | random_tensor = x.new_empty(shape).bernoulli_(keep_prob) 23 | if keep_prob > 0.0: 24 | random_tensor.div_(keep_prob) 25 | output = x * random_tensor 26 | return output 27 | 28 | 29 | class DropPath(nn.Module): 30 | """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" 31 | 32 | def __init__(self, drop_prob=None): 33 | super(DropPath, self).__init__() 34 | self.drop_prob = drop_prob 35 | 36 | def forward(self, x): 37 | return drop_path(x, self.drop_prob, self.training) 38 | -------------------------------------------------------------------------------- /Unidepth/unidepth/models/backbones/metadinov2/layer_scale.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | # Modified from: https://github.com/huggingface/pytorch-image-models/blob/main/timm/models/vision_transformer.py#L103-L110 8 | 9 | from typing import Union 10 | 11 | import torch 12 | from torch import Tensor 13 | import torch.nn as nn 14 | 15 | 16 | class LayerScale(nn.Module): 17 | def __init__( 18 | self, 19 | dim: int, 20 | init_values: Union[float, Tensor] = 1e-5, 21 | inplace: bool = False, 22 | ) -> None: 23 | super().__init__() 24 | self.inplace = inplace 25 | self.gamma = nn.Parameter(init_values * torch.ones(dim)) 26 | 27 | def forward(self, x: Tensor) -> Tensor: 28 | return x.mul_(self.gamma) if self.inplace else x * self.gamma 29 | -------------------------------------------------------------------------------- /Unidepth/unidepth/models/backbones/metadinov2/mlp.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | 7 | # References: 8 | # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py 9 | # https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/mlp.py 10 | 11 | 12 | from typing import Callable, Optional 13 | 14 | from torch import Tensor, nn 15 | 16 | 17 | class Mlp(nn.Module): 18 | def __init__( 19 | self, 20 | in_features: int, 21 | hidden_features: Optional[int] = None, 22 | out_features: Optional[int] = None, 23 | act_layer: Callable[..., nn.Module] = nn.GELU, 24 | drop: float = 0.0, 25 | bias: bool = True, 26 | ) -> None: 27 | super().__init__() 28 | out_features = out_features or in_features 29 | hidden_features = hidden_features or in_features 30 | self.fc1 = nn.Linear(in_features, hidden_features, bias=bias) 31 | self.act = act_layer() 32 | self.fc2 = nn.Linear(hidden_features, out_features, bias=bias) 33 | self.drop = nn.Dropout(drop) 34 | 35 | def forward(self, x: Tensor) -> Tensor: 36 | x = self.fc1(x) 37 | x = self.act(x) 38 | x = self.drop(x) 39 | x = self.fc2(x) 40 | x = self.drop(x) 41 | return x 42 | -------------------------------------------------------------------------------- /Unidepth/unidepth/models/unidepthv1/__init__.py: -------------------------------------------------------------------------------- 1 | # from .unidepthv1 import UniDepthV1 2 | 3 | # __all__ = [ 4 | # "UniDepthV1", 5 | # ] 6 | -------------------------------------------------------------------------------- /Unidepth/unidepth/ops/__init__.py: -------------------------------------------------------------------------------- 1 | from .losses import SILog, MSE, SelfCons 2 | from .scheduler import CosineScheduler 3 | 4 | __all__ = [ 5 | "SILog", 6 | "MSE", 7 | "SelfCons", 8 | "CosineScheduler", 9 | ] 10 | -------------------------------------------------------------------------------- /Unidepth/unidepth/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .evaluation_depth import eval_depth, DICT_METRICS 2 | from .visualization import colorize, image_grid, log_train_artifacts 3 | from .misc import format_seconds, remove_padding, get_params, identity 4 | from .distributed import ( 5 | is_main_process, 6 | setup_multi_processes, 7 | setup_slurm, 8 | sync_tensor_across_gpus, 9 | barrier, 10 | get_rank, 11 | get_dist_info, 12 | ) 13 | from .geometric import unproject_points, spherical_zbuffer_to_euclidean 14 | 15 | __all__ = [ 16 | "eval_depth", 17 | "DICT_METRICS", 18 | "colorize", 19 | "image_grid", 20 | "log_train_artifacts", 21 | "format_seconds", 22 | "remove_padding", 23 | "get_params", 24 | "identity", 25 | "is_main_process", 26 | "setup_multi_processes", 27 | "setup_slurm", 28 | "sync_tensor_across_gpus", 29 | "barrier", 30 | "get_rank", 31 | "unproject_points", 32 | "spherical_zbuffer_to_euclidean", 33 | "validate", 34 | "get_dist_info", 35 | ] 36 | -------------------------------------------------------------------------------- /Unidepth/unidepth/utils/constants.py: -------------------------------------------------------------------------------- 1 | """ 2 | Author: Luigi Piccinelli 3 | Licensed under the CC-BY NC 4.0 license (http://creativecommons.org/licenses/by-nc/4.0/) 4 | """ 5 | 6 | import math 7 | import torch 8 | 9 | OPENAI_DATASET_MEAN = (0.48145466, 0.4578275, 0.40821073) 10 | OPENAI_DATASET_STD = (0.26862954, 0.26130258, 0.27577711) 11 | IMAGENET_DATASET_MEAN = (0.485, 0.456, 0.406) 12 | IMAGENET_DATASET_STD = (0.229, 0.224, 0.225) 13 | DEPTH_BINS = torch.cat( 14 | ( 15 | 
torch.logspace(math.log10(0.1), math.log10(180.0), steps=512), 16 | torch.tensor([260.0]), 17 | ), 18 | dim=0, 19 | ) 20 | LOGERR_BINS = torch.linspace(-2, 2, steps=128 + 1) 21 | LINERR_BINS = torch.linspace(-50, 50, steps=256 + 1) 22 | -------------------------------------------------------------------------------- /VPD/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Wenliang Zhao 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /VPD/README.md: -------------------------------------------------------------------------------- 1 | # VPD 2 | 3 | 1. Follow the [VPD](https://github.com/wl-zhao/VPD) Installation section (download the stable diffusion models and install the stable-diffusion package). The VPD repo uses SD v1-5. 4 | 5 | 2. Follow the first step of [VPD depth](https://github.com/wl-zhao/VPD/blob/main/depth/README.md) to install mmcv and the requirements. Then download the [VPD depth pretrained](https://cloud.tsinghua.edu.cn/f/7e4adc76cc9b4200ac79/?dl=1) checkpoint and put it under checkpoints/ 6 | 7 | 3. Download the InSpaceType eval set and put it under the root folder. 8 | 9 | 4. Run the evaluation: 10 | 11 | ``` 12 | cd depth 13 | 14 | bash test.sh ../checkpoints/vpd_depth_480x480.pth 15 | ``` 16 | 17 | The command generates report files for the hierarchy (H0-H2). *-all means overall; H0-H2 means the level of hierarchy; H1_xx means the scene space type number. See [space_type_def.yml](https://github.com/DepthComputation/InSpaceType_Benchmark/blob/main/space_type_def.yml) for the space type number definitions. -------------------------------------------------------------------------------- /VPD/depth/README.md: -------------------------------------------------------------------------------- 1 | # Depth Estimation with VPD 2 | ## Getting Started 3 | 4 | 1. Install the [mmcv-full](https://github.com/open-mmlab/mmcv) library and some required packages. 5 | 6 | ```bash 7 | pip install openmim 8 | mim install mmcv-full 9 | pip install -r requirements.txt 10 | ``` 11 | 12 | 2. Prepare the NYUDepthV2 dataset following [GLPDepth](https://github.com/vinvino02/GLPDepth) and [BTS](https://github.com/cleinc/bts/tree/master). 
13 | 14 | ``` 15 | mkdir nyu_depth_v2 16 | wget http://horatio.cs.nyu.edu/mit/silberman/nyu_depth_v2/nyu_depth_v2_labeled.mat 17 | python extract_official_train_test_set_from_mat.py nyu_depth_v2_labeled.mat splits.mat ./nyu_depth_v2/official_splits/ 18 | ``` 19 | 20 | Download sync.zip provided by the authors of BTS from this [url](https://drive.google.com/file/d/1AysroWpfISmm-yRFGBgFTrLy6FjQwvwP/view) and unzip it in the `./nyu_depth_v2` folder. 21 | 22 | Your dataset directory should be: 23 | 24 | ``` 25 | │nyu_depth_v2/ 26 | ├──official_splits/ 27 | │ ├── test 28 | │ ├── train 29 | ├──sync/ 30 | ``` 31 | 32 | ## Results and Fine-tuned Models 33 | 34 | | | RMSE | d1 | d2 | d3 | REL | log_10 | Fine-tuned Model | 35 | |-------------------|-------|-------|--------|--------|--------|-------|-------| 36 | | **VPD** | 0.254 | 0.964 | 0.995 | 0.999 | 0.069 | 0.030 |[Tsinghua Cloud](https://cloud.tsinghua.edu.cn/f/7e4adc76cc9b4200ac79/?dl=1) | 37 | 38 | We offer the predicted depths in 16-bit format for the NYU-Depth-v2 official test set [here](https://cloud.tsinghua.edu.cn/f/27354f47ba424bb3ad40/?dl=1). 39 | 40 | ## Training 41 | 42 | Run the following instruction to train the VPD-Depth model. We recommend using 8 NVIDIA V100 GPUs to train the model with a total batch size of 24. 43 | 44 | ``` 45 | bash train.sh <LOG_DIR> 46 | ``` 47 | 48 | ## Evaluation 49 | Command format: 50 | ``` 51 | bash test.sh <CKPT_PATH> 52 | ``` -------------------------------------------------------------------------------- /VPD/depth/configs/test_options.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # The code is from GLPDepth (https://github.com/vinvino02/GLPDepth). 3 | # For non-commercial purpose only (research, evaluation etc). 
4 | # ------------------------------------------------------------------------------ 5 | 6 | from configs.base_options import BaseOptions 7 | 8 | class TestOptions(BaseOptions): 9 | def initialize(self): 10 | parser = BaseOptions.initialize(self) 11 | 12 | # experiment configs 13 | parser.add_argument('--ckpt_dir', type=str, 14 | default='./ckpt/best_model_nyu.ckpt', 15 | help='load ckpt path') 16 | parser.add_argument('--result_dir', type=str, default='./results', 17 | help='save result images into result_dir/exp_name') 18 | parser.add_argument('--crop_h', type=int, default=448) 19 | parser.add_argument('--crop_w', type=int, default=576) 20 | 21 | parser.add_argument('--save_eval_pngs', action='store_true', 22 | help='save result image into evaluation form') 23 | parser.add_argument('--save_visualize', action='store_true', 24 | help='save result image into visualized form') 25 | return parser 26 | 27 | 28 | -------------------------------------------------------------------------------- /VPD/depth/dataset/filenames/nyudepthv2/split_files_sml.txt: -------------------------------------------------------------------------------- 1 | /home/choyingw/Documents/ZED/1000_select_split/0001_L.jpg 2 | /home/choyingw/Documents/ZED/1000_select_split/0002_L.jpg 3 | /home/choyingw/Documents/ZED/1000_select_split/0003_L.jpg 4 | /home/choyingw/Documents/ZED/1000_select_split/0004_L.jpg 5 | /home/choyingw/Documents/ZED/1000_select_split/0005_L.jpg 6 | /home/choyingw/Documents/ZED/1000_select_split/0006_L.jpg 7 | /home/choyingw/Documents/ZED/1000_select_split/0007_L.jpg 8 | /home/choyingw/Documents/ZED/1000_select_split/0008_L.jpg 9 | /home/choyingw/Documents/ZED/1000_select_split/0009_L.jpg 10 | /home/choyingw/Documents/ZED/1000_select_split/0010_L.jpg 11 | /home/choyingw/Documents/ZED/1000_select_split/0011_L.jpg 12 | /home/choyingw/Documents/ZED/1000_select_split/0012_L.jpg 13 | /home/choyingw/Documents/ZED/1000_select_split/0013_L.jpg 14 | -------------------------------------------------------------------------------- /VPD/depth/dataset/imagepath.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # The code is from GLPDepth (https://github.com/vinvino02/GLPDepth). 3 | # For non-commercial purpose only (research, evaluation etc). 
4 | # ------------------------------------------------------------------------------ 5 | 6 | import os 7 | import cv2 8 | from torch.utils.data import Dataset 9 | import torchvision.transforms as transforms 10 | 11 | class imagepath(Dataset): 12 | # for test only 13 | def __init__(self, data_path): 14 | super().__init__() 15 | 16 | self.data_path = data_path 17 | self.to_tensor = transforms.ToTensor() 18 | 19 | self.filenames_list = [os.path.join(data_path, i) for i in os.listdir(data_path) 20 | if i.split('.')[-1] in ['jpg', 'png']] 21 | 22 | print("Dataset : Image Path") 23 | print("# of images: %d" % (len(self.filenames_list))) 24 | 25 | def __len__(self): 26 | return len(self.filenames_list) 27 | 28 | def __getitem__(self, idx): 29 | batch = {} 30 | file = self.filenames_list[idx] 31 | filename = file.split('/')[-1] 32 | 33 | image = cv2.imread(file) # [H x W x C] and C: BGR 34 | image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) 35 | 36 | # input size should be multiple of 32 37 | h, w, c = image.shape 38 | new_h, new_w = h // 32 * 32, w // 32 * 32 39 | image = cv2.resize(image, (new_w, new_h)) 40 | image = self.to_tensor(image) 41 | 42 | batch['image'] = image 43 | batch['filename'] = filename 44 | 45 | return batch 46 | -------------------------------------------------------------------------------- /VPD/depth/nyu_class_embeddings.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DepthComputation/InSpaceType_Benchmark/4ab9760dbdf9153fc2845afd584762509f9fe0e0/VPD/depth/nyu_class_embeddings.pth -------------------------------------------------------------------------------- /VPD/depth/nyu_class_list.json: -------------------------------------------------------------------------------- 1 | ["printer_room", 2 | "bathroom", 3 | "living_room", 4 | "study", 5 | "conference_room", 6 | "study_room", 7 | "kitchen", 8 | "home_office", 9 | "bedroom", 10 | "dinette", 11 | "playroom", 12 | "indoor_balcony", 13 | "laundry_room", 14 | "basement", 15 | "excercise_room", 16 | "foyer", 17 | "home_storage", 18 | "cafe", 19 | "furniture_store", 20 | "office_kitchen", 21 | "student_lounge", 22 | "dining_room", 23 | "reception_room", 24 | "computer_lab", 25 | "classroom", 26 | "office", 27 | "bookstore"] -------------------------------------------------------------------------------- /VPD/depth/requirements.txt: -------------------------------------------------------------------------------- 1 | torch>=1.6.0 2 | h5py>=3.6.0 3 | scipy>=1.7.3 4 | opencv-python>=4.5.5 5 | timm>=0.5.4 6 | albumentations>=1.1.0 7 | tensorboardX>=2.4.1 8 | gdown>=4.2.1 -------------------------------------------------------------------------------- /VPD/depth/splits.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DepthComputation/InSpaceType_Benchmark/4ab9760dbdf9153fc2845afd584762509f9fe0e0/VPD/depth/splits.mat -------------------------------------------------------------------------------- /VPD/depth/src/clip/.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | *.py[cod] 3 | *$py.class 4 | *.egg-info 5 | .pytest_cache 6 | .ipynb_checkpoints 7 | 8 | thumbs.db 9 | .DS_Store 10 | .idea 11 | -------------------------------------------------------------------------------- /VPD/depth/src/clip/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 OpenAI 4 | 5 
| Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | 23 | -------------------------------------------------------------------------------- /VPD/depth/src/clip/MANIFEST.in: -------------------------------------------------------------------------------- 1 | include clip/bpe_simple_vocab_16e6.txt.gz 2 | -------------------------------------------------------------------------------- /VPD/depth/src/clip/clip/__init__.py: -------------------------------------------------------------------------------- 1 | from .clip import * 2 | -------------------------------------------------------------------------------- /VPD/depth/src/clip/clip/bpe_simple_vocab_16e6.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DepthComputation/InSpaceType_Benchmark/4ab9760dbdf9153fc2845afd584762509f9fe0e0/VPD/depth/src/clip/clip/bpe_simple_vocab_16e6.txt.gz -------------------------------------------------------------------------------- /VPD/depth/src/clip/data/country211.md: -------------------------------------------------------------------------------- 1 | # The Country211 Dataset 2 | 3 | In the paper, we used an image classification dataset called Country211, to evaluate the model's capability on geolocation. To do so, we filtered the YFCC100m dataset for images whose GPS coordinates correspond to an [ISO-3166 country code](https://en.wikipedia.org/wiki/List_of_ISO_3166_country_codes) and created a balanced dataset by sampling 150 train images, 50 validation images, and 100 test images for each country. 4 | 5 | The following command will download an 11GB archive containing the images and extract it into a subdirectory `country211`: 6 | 7 | ```bash 8 | wget https://openaipublic.azureedge.net/clip/data/country211.tgz 9 | tar zxvf country211.tgz 10 | ``` 11 | 12 | These images are a subset of the YFCC100m dataset. Use of the underlying media files is subject to the Creative Commons licenses chosen by their creators/uploaders. For more information about the YFCC100M dataset, visit [the official website](https://multimediacommons.wordpress.com/yfcc100m-core-dataset/). 
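Since the extracted archive follows the familiar one-folder-per-class layout (assumed here: one directory per split, each containing one sub-folder per ISO-3166 code — this layout is not spelled out above), it can be browsed with torchvision's `ImageFolder`. A small sketch; the root path, split name, and preprocessing are illustrative assumptions:

```python
# Sketch: load the extracted Country211 test split with torchvision.
# Assumes the archive unpacked to ./country211/<split>/<ISO-3166 code>/*.jpg;
# adjust `root` if your layout differs.
from torchvision import datasets, transforms

preprocess = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
])
test_set = datasets.ImageFolder("country211/test", transform=preprocess)
print(len(test_set), "images across", len(test_set.classes), "country classes")
```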
-------------------------------------------------------------------------------- /VPD/depth/src/clip/data/rendered-sst2.md: -------------------------------------------------------------------------------- 1 | # The Rendered SST2 Dataset 2 | 3 | In the paper, we used an image classification dataset called Rendered SST2, to evaluate the model's capability on optical character recognition. To do so, we rendered the sentences in the [Stanford Sentiment Treebank v2](https://nlp.stanford.edu/sentiment/treebank.html) dataset and used those as the input to the CLIP image encoder. 4 | 5 | The following command will download a 131MB archive containing the images and extract it into a subdirectory `rendered-sst2`: 6 | 7 | ```bash 8 | wget https://openaipublic.azureedge.net/clip/data/rendered-sst2.tgz 9 | tar zxvf rendered-sst2.tgz 10 | ``` 11 | 12 | -------------------------------------------------------------------------------- /VPD/depth/src/clip/data/yfcc100m.md: -------------------------------------------------------------------------------- 1 | # The YFCC100M Subset 2 | 3 | In the paper, we performed a dataset ablation using a subset of the YFCC100M dataset and showed that the performance remained largely similar. 4 | 5 | The subset contains 14,829,396 images, about 15% of the full dataset, which have been filtered to only keep those with natural language titles and/or descriptions in English. 6 | 7 | We provide the list of (line number, photo identifier, photo hash) of each image contained in this subset. These correspond to the first three columns in the dataset's metadata TSV file. 8 | 9 | ```bash 10 | wget https://openaipublic.azureedge.net/clip/data/yfcc100m_subset_data.tsv.bz2 11 | bunzip2 yfcc100m_subset_data.tsv.bz2 12 | ``` 13 | 14 | Use of the underlying media files is subject to the Creative Commons licenses chosen by their creators/uploaders. For more information about the YFCC100M dataset, visit [the official website](https://multimediacommons.wordpress.com/yfcc100m-core-dataset/). -------------------------------------------------------------------------------- /VPD/depth/src/clip/hubconf.py: -------------------------------------------------------------------------------- 1 | from clip.clip import tokenize as _tokenize, load as _load, available_models as _available_models 2 | import re 3 | import string 4 | 5 | dependencies = ["torch", "torchvision", "ftfy", "regex", "tqdm"] 6 | 7 | # For compatibility (cannot include special characters in function name) 8 | model_functions = { model: re.sub(f'[{string.punctuation}]', '_', model) for model in _available_models()} 9 | 10 | def _create_hub_entrypoint(model): 11 | def entrypoint(**kwargs): 12 | return _load(model, **kwargs) 13 | 14 | entrypoint.__doc__ = f"""Loads the {model} CLIP model 15 | 16 | Parameters 17 | ---------- 18 | device : Union[str, torch.device] 19 | The device to put the loaded model 20 | 21 | jit : bool 22 | Whether to load the optimized JIT model or more hackable non-JIT model (default). 
23 | 24 | download_root: str 25 | path to download the model files; by default, it uses "~/.cache/clip" 26 | 27 | Returns 28 | ------- 29 | model : torch.nn.Module 30 | The {model} CLIP model 31 | 32 | preprocess : Callable[[PIL.Image], torch.Tensor] 33 | A torchvision transform that converts a PIL image into a tensor that the returned model can take as its input 34 | """ 35 | return entrypoint 36 | 37 | def tokenize(): 38 | return _tokenize 39 | 40 | _entrypoints = {model_functions[model]: _create_hub_entrypoint(model) for model in _available_models()} 41 | 42 | globals().update(_entrypoints) -------------------------------------------------------------------------------- /VPD/depth/src/clip/requirements.txt: -------------------------------------------------------------------------------- 1 | ftfy 2 | regex 3 | tqdm 4 | torch 5 | torchvision 6 | -------------------------------------------------------------------------------- /VPD/depth/src/clip/setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import pkg_resources 4 | from setuptools import setup, find_packages 5 | 6 | setup( 7 | name="clip", 8 | py_modules=["clip"], 9 | version="1.0", 10 | description="", 11 | author="OpenAI", 12 | packages=find_packages(exclude=["tests*"]), 13 | install_requires=[ 14 | str(r) 15 | for r in pkg_resources.parse_requirements( 16 | open(os.path.join(os.path.dirname(__file__), "requirements.txt")) 17 | ) 18 | ], 19 | include_package_data=True, 20 | extras_require={'dev': ['pytest']}, 21 | ) 22 | -------------------------------------------------------------------------------- /VPD/depth/src/clip/tests/test_consistency.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pytest 3 | import torch 4 | from PIL import Image 5 | 6 | import clip 7 | 8 | 9 | @pytest.mark.parametrize('model_name', clip.available_models()) 10 | def test_consistency(model_name): 11 | device = "cpu" 12 | jit_model, transform = clip.load(model_name, device=device, jit=True) 13 | py_model, _ = clip.load(model_name, device=device, jit=False) 14 | 15 | image = transform(Image.open("CLIP.png")).unsqueeze(0).to(device) 16 | text = clip.tokenize(["a diagram", "a dog", "a cat"]).to(device) 17 | 18 | with torch.no_grad(): 19 | logits_per_image, _ = jit_model(image, text) 20 | jit_probs = logits_per_image.softmax(dim=-1).cpu().numpy() 21 | 22 | logits_per_image, _ = py_model(image, text) 23 | py_probs = logits_per_image.softmax(dim=-1).cpu().numpy() 24 | 25 | assert np.allclose(jit_probs, py_probs, atol=0.01, rtol=0.1) 26 | -------------------------------------------------------------------------------- /VPD/depth/test.sh: -------------------------------------------------------------------------------- 1 | PYTHONPATH="$(dirname $0)/..":"$(dirname $0)/../stable-diffusion":$PYTHONPATH \ 2 | python3 -m torch.distributed.launch --nproc_per_node=1 \ 3 | --use_env test.py --dataset nyudepthv2 --data_path ./ \ 4 | --max_depth 10.0 --max_depth_eval 10.0 \ 5 | --num_filters 32 32 32 --deconv_kernels 2 2 2\ 6 | --flip_test --shift_window_test\ 7 | --shift_size 2 --ckpt_dir $1\ 8 | --crop_h 480 --crop_w 480 ${@:2} 9 | 10 | 11 | # -------------------------------------------------------------------------------- /VPD/depth/train.sh: -------------------------------------------------------------------------------- 1 | PYTHONPATH="$(dirname $0)/..":"$(dirname $0)/../stable-diffusion":$PYTHONPATH \ 2 | python3 -m torch.distributed.launch 
--nproc_per_node=8 \ 3 | --use_env train.py --batch_size 3 --dataset nyudepthv2 --data_path ./ \ 4 | --max_depth 10.0 --max_depth_eval 10.0 --weight_decay 0.1 \ 5 | --num_filters 32 32 32 --deconv_kernels 2 2 2\ 6 | --flip_test --shift_window_test \ 7 | --shift_size 2 --save_model --layer_decay 0.9 --drop_path_rate 0.3 --log_dir $1 \ 8 | --crop_h 480 --crop_w 480 --epochs 25 ${@:2} -------------------------------------------------------------------------------- /VPD/depth/utils_depth/criterion.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # The code is from GLPDepth (https://github.com/vinvino02/GLPDepth). 3 | # For non-commercial purpose only (research, evaluation etc). 4 | # ------------------------------------------------------------------------------ 5 | 6 | import torch 7 | import torch.nn as nn 8 | 9 | 10 | class SiLogLoss(nn.Module): 11 | def __init__(self, lambd=0.5): 12 | super().__init__() 13 | self.lambd = lambd 14 | 15 | def forward(self, pred, target): 16 | valid_mask = (target > 0).detach() 17 | diff_log = torch.log(target[valid_mask]) - torch.log(pred[valid_mask]) 18 | loss = torch.sqrt(torch.pow(diff_log, 2).mean() - 19 | self.lambd * torch.pow(diff_log.mean(), 2)) 20 | 21 | return loss 22 | 23 | -------------------------------------------------------------------------------- /VPD/refer/models_refer/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from .model import VPDRefer -------------------------------------------------------------------------------- /VPD/refer/refer/Makefile: -------------------------------------------------------------------------------- 1 | all: 2 | # install pycocotools/mask locally 3 | # copy from https://github.com/pdollar/coco.git 4 | python setup.py build_ext --inplace 5 | rm -rf build 6 | 7 | -------------------------------------------------------------------------------- /VPD/refer/refer/evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'licheng' 2 | 3 | 4 | -------------------------------------------------------------------------------- /VPD/refer/refer/evaluation/bleu/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2015 Xinlei Chen, Hao Fang, Tsung-Yi Lin, and Ramakrishna Vedantam 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in 11 | all copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 | THE SOFTWARE. 20 | -------------------------------------------------------------------------------- /VPD/refer/refer/evaluation/bleu/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tylin' 2 | -------------------------------------------------------------------------------- /VPD/refer/refer/evaluation/bleu/bleu.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # File Name : bleu.py 4 | # 5 | # Description : Wrapper for BLEU scorer. 6 | # 7 | # Creation Date : 06-01-2015 8 | # Last Modified : Thu 19 Mar 2015 09:13:28 PM PDT 9 | # Authors : Hao Fang and Tsung-Yi Lin 10 | 11 | from bleu_scorer import BleuScorer 12 | 13 | 14 | class Bleu: 15 | def __init__(self, n=4): 16 | # default compute Blue score up to 4 17 | self._n = n 18 | self._hypo_for_image = {} 19 | self.ref_for_image = {} 20 | 21 | def compute_score(self, gts, res): 22 | 23 | assert(gts.keys() == res.keys()) 24 | imgIds = gts.keys() 25 | 26 | bleu_scorer = BleuScorer(n=self._n) 27 | for id in imgIds: 28 | hypo = res[id] 29 | ref = gts[id] 30 | 31 | # Sanity check. 32 | assert(type(hypo) is list) 33 | assert(len(hypo) == 1) 34 | assert(type(ref) is list) 35 | assert(len(ref) >= 1) 36 | 37 | bleu_scorer += (hypo[0], ref) 38 | 39 | #score, scores = bleu_scorer.compute_score(option='shortest') 40 | score, scores = bleu_scorer.compute_score(option='closest', verbose=1) 41 | #score, scores = bleu_scorer.compute_score(option='average', verbose=1) 42 | 43 | # return (bleu, bleu_info) 44 | return score, scores 45 | 46 | def method(self): 47 | return "Bleu" 48 | -------------------------------------------------------------------------------- /VPD/refer/refer/evaluation/cider/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tylin' 2 | -------------------------------------------------------------------------------- /VPD/refer/refer/evaluation/cider/cider.py: -------------------------------------------------------------------------------- 1 | # Filename: cider.py 2 | # 3 | # Description: Describes the class to compute the CIDEr (Consensus-Based Image Description Evaluation) Metric 4 | # by Vedantam, Zitnick, and Parikh (http://arxiv.org/abs/1411.5726) 5 | # 6 | # Creation Date: Sun Feb 8 14:16:54 2015 7 | # 8 | # Authors: Ramakrishna Vedantam and Tsung-Yi Lin 9 | 10 | from cider_scorer import CiderScorer 11 | import pdb 12 | 13 | class Cider: 14 | """ 15 | Main Class to compute the CIDEr metric 16 | 17 | """ 18 | def __init__(self, test=None, refs=None, n=4, sigma=6.0): 19 | # set cider to sum over 1 to 4-grams 20 | self._n = n 21 | # set the standard deviation parameter for gaussian penalty 22 | self._sigma = sigma 23 | 24 | def compute_score(self, gts, res): 25 | """ 26 | Main function to compute CIDEr score 27 | :param hypo_for_image (dict) : dictionary with key and value 28 | ref_for_image (dict) : dictionary with key and value 29 | :return: cider (float) : computed CIDEr score for the corpus 30 | """ 31 | 32 | assert(gts.keys() == res.keys()) 33 | imgIds = gts.keys() 34 | 35 | cider_scorer = CiderScorer(n=self._n, sigma=self._sigma) 36 | 37 | for id in imgIds: 38 | hypo = res[id] 39 | 
ref = gts[id] 40 | 41 | # Sanity check. 42 | assert(type(hypo) is list) 43 | assert(len(hypo) == 1) 44 | assert(type(ref) is list) 45 | assert(len(ref) > 0) 46 | 47 | cider_scorer += (hypo[0], ref) 48 | 49 | (score, scores) = cider_scorer.compute_score() 50 | 51 | return score, scores 52 | 53 | def method(self): 54 | return "CIDEr" -------------------------------------------------------------------------------- /VPD/refer/refer/evaluation/meteor/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tylin' 2 | -------------------------------------------------------------------------------- /VPD/refer/refer/evaluation/readme.txt: -------------------------------------------------------------------------------- 1 | This folder contains modified coco-caption evaluation, which is downloaded from https://github.com/tylin/coco-caption.git 2 | and refEvaluation which is to be called by the refer algorithm. 3 | 4 | More specifically, this folder contains: 5 | 1. bleu/ 6 | 2. cider/ 7 | 3. meteor/ 8 | 4. rouge/ 9 | 5. tokenizer/ 10 | 6. __init__.py 11 | 7. refEvaluation.py 12 | -------------------------------------------------------------------------------- /VPD/refer/refer/evaluation/rouge/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'vrama91' 2 | -------------------------------------------------------------------------------- /VPD/refer/refer/evaluation/tokenizer/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'hfang' 2 | -------------------------------------------------------------------------------- /VPD/refer/refer/evaluation/tokenizer/stanford-corenlp-3.4.1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DepthComputation/InSpaceType_Benchmark/4ab9760dbdf9153fc2845afd584762509f9fe0e0/VPD/refer/refer/evaluation/tokenizer/stanford-corenlp-3.4.1.jar -------------------------------------------------------------------------------- /VPD/refer/refer/external/README.md: -------------------------------------------------------------------------------- 1 | The codes inside this folder are copied from pycocotools: https://github.com/pdollar/coco -------------------------------------------------------------------------------- /VPD/refer/refer/external/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tylin' 2 | -------------------------------------------------------------------------------- /VPD/refer/refer/setup.py: -------------------------------------------------------------------------------- 1 | from distutils.core import setup 2 | from Cython.Build import cythonize 3 | from distutils.extension import Extension 4 | import numpy as np 5 | 6 | ext_modules = [ 7 | Extension( 8 | 'external._mask', 9 | sources=['external/maskApi.c', 'external/_mask.pyx'], 10 | include_dirs = [np.get_include(), 'external'], 11 | extra_compile_args=['-Wno-cpp', '-Wno-unused-function', '-std=c99'], 12 | ) 13 | ] 14 | 15 | setup( 16 | name='external', 17 | packages=['external'], 18 | package_dir = {'external': 'external'}, 19 | version='2.0', 20 | ext_modules=cythonize(ext_modules) 21 | ) 22 | -------------------------------------------------------------------------------- /VPD/refer/requirements.txt: -------------------------------------------------------------------------------- 1 | requests 2 | filelock 3 | 
tqdm 4 | timm 5 | ftfy 6 | regex 7 | scipy 8 | scikit-image 9 | pycocotools==2.0.2 10 | opencv-python==4.5.3.56 11 | tokenizers 12 | h5py -------------------------------------------------------------------------------- /VPD/refer/test.sh: -------------------------------------------------------------------------------- 1 | PYTHONPATH="$(dirname $0)/..":"$(dirname $0)/../stable-diffusion":$PYTHONPATH \ 2 | python3 test.py \ 3 | --dataset $1 --split val --resume $2 \ 4 | --workers 4 --ddp_trained_weights --img_size 512 ${@:3} -------------------------------------------------------------------------------- /VPD/refer/train.sh: -------------------------------------------------------------------------------- 1 | logdir=$2 2 | mkdir -p $logdir 3 | 4 | PYTHONPATH="$(dirname $0)/..":"$(dirname $0)/../stable-diffusion":$PYTHONPATH \ 5 | python3 -m torch.distributed.launch --nproc_per_node $3 --master_port 12345 train.py \ 6 | --dataset $1 --model_id $1 \ 7 | --batch-size 4 --lr 0.00005 --wd 1e-2 \ 8 | --epochs 40 --img_size 512 ${@:4} \ 9 | 2>&1 | tee $logdir/log.txt -------------------------------------------------------------------------------- /VPD/segmentation/README.md: -------------------------------------------------------------------------------- 1 | # Semantic Segmentation with VPD 2 | ## Getting Started 3 | 4 | 1. Install the [mmsegmentation](https://github.com/open-mmlab/mmsegmentation) library and some required packages. 5 | 6 | ```bash 7 | pip install openmim 8 | mim install mmcv-full 9 | mim install mmsegmentation 10 | ``` 11 | 12 | 2. Follow the guide in [mmseg](https://github.com/open-mmlab/mmsegmentation/blob/master/docs/dataset_prepare.md) to prepare the ADE20k dataset. 13 | 14 | 15 | ## Results and Fine-tuned Models 16 | 17 | | Model | Config | Head | Crop Size | Lr Schd | mIoU | mIoU (ms+flip) | Fine-tuned Model | 18 | |:---:|:---:|:---:|:---:|:---:| :---:|:---:|:---:| 19 | | ```VPDSeg_SD-1-5``` | [config](configs/fpn_vpd_sd1-5_512x512_gpu8x2.py) | Semantic FPN | 512x512 | 80K | 53.7 | 54.6 | [Tsinghua Cloud](https://cloud.tsinghua.edu.cn/f/78ca31e53c5549779abd/?dl=1) | 20 | 21 | ## Training 22 | ``` 23 | bash dist_train.sh 24 | ``` 25 | We use 8 GPUs by default. 
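As a concrete illustration of the command format (the `dist_train.sh` script further below expects a config path followed by the GPU count, and `dist_test.sh` additionally takes a checkpoint path before the GPU count), a hypothetical launch with the provided config on 8 GPUs would be `bash dist_train.sh configs/fpn_vpd_sd1-5_512x512_gpu8x2.py 8`; the evaluation commands in the next section follow the same pattern, i.e. `bash dist_test.sh <config> <checkpoint> <num_gpus> --eval mIoU`.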
26 | 27 | ## Evaluation 28 | Command format: 29 | ``` 30 | bash dist_test.sh --eval mIoU 31 | ``` 32 | To evaluate a model with multi-scale and flip, run 33 | ``` 34 | bash dist_test.sh --eval mIoU --aug-test 35 | ``` 36 | -------------------------------------------------------------------------------- /VPD/segmentation/class_embeddings.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DepthComputation/InSpaceType_Benchmark/4ab9760dbdf9153fc2845afd584762509f9fe0e0/VPD/segmentation/class_embeddings.pth -------------------------------------------------------------------------------- /VPD/segmentation/configs/_base_/default_runtime.py: -------------------------------------------------------------------------------- 1 | # yapf:disable 2 | log_config = dict( 3 | interval=50, 4 | hooks=[ 5 | dict(type='TextLoggerHook', by_epoch=False), 6 | # dict(type='TensorboardLoggerHook') 7 | ]) 8 | # yapf:enable 9 | dist_params = dict(backend='nccl') 10 | log_level = 'INFO' 11 | load_from = None 12 | resume_from = None 13 | workflow = [('train', 1)] 14 | cudnn_benchmark = True 15 | find_unused_parameters = True 16 | 17 | -------------------------------------------------------------------------------- /VPD/segmentation/configs/_base_/models/fpn_r50.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | norm_cfg = dict(type='SyncBN', requires_grad=True) 3 | model = dict( 4 | type='EncoderDecoder', 5 | pretrained='open-mmlab://resnet50_v1c', 6 | backbone=dict( 7 | type='ResNetV1c', 8 | depth=50, 9 | num_stages=4, 10 | out_indices=(0, 1, 2, 3), 11 | dilations=(1, 1, 1, 1), 12 | strides=(1, 2, 2, 2), 13 | norm_cfg=norm_cfg, 14 | norm_eval=False, 15 | style='pytorch', 16 | contract_dilation=True), 17 | neck=dict( 18 | type='FPN', 19 | in_channels=[256, 512, 1024, 2048], 20 | out_channels=256, 21 | num_outs=4), 22 | decode_head=dict( 23 | type='FPNHead', 24 | in_channels=[256, 256, 256, 256], 25 | in_index=[0, 1, 2, 3], 26 | feature_strides=[4, 8, 16, 32], 27 | channels=256, 28 | dropout_ratio=0.1, 29 | num_classes=19, 30 | norm_cfg=norm_cfg, 31 | align_corners=False, 32 | loss_decode=dict( 33 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), 34 | # model training and testing settings 35 | train_cfg=dict(), 36 | test_cfg=dict(mode='slide', crop_size=(512, 512), stride=(341, 341)) 37 | ) -------------------------------------------------------------------------------- /VPD/segmentation/configs/_base_/schedules/schedule_160k.py: -------------------------------------------------------------------------------- 1 | # optimizer 2 | optimizer = dict(type='SGD', lr=0.01, weight_decay=0.0005) 3 | optimizer_config = dict() 4 | # learning policy 5 | lr_config = dict(policy='poly', power=0.9, min_lr=1e-5, by_epoch=False) 6 | # runtime settings 7 | runner = dict(type='IterBasedRunner', max_iters=160000) 8 | checkpoint_config = dict(by_epoch=False, interval=16000) 9 | evaluation = dict(interval=16000, metric='mIoU') 10 | -------------------------------------------------------------------------------- /VPD/segmentation/configs/_base_/schedules/schedule_80k.py: -------------------------------------------------------------------------------- 1 | # optimizer 2 | optimizer = dict(type='SGD', lr=0.01, weight_decay=0.0005) 3 | optimizer_config = dict() 4 | # learning policy 5 | lr_config = dict(policy='poly', power=0.9, min_lr=1e-6, by_epoch=False) 6 | # runtime settings 7 | runner = 
dict(type='IterBasedRunner', max_iters=80000) 8 | checkpoint_config = dict(by_epoch=False, interval=8000) 9 | evaluation = dict(interval=8000, metric='mIoU') 10 | -------------------------------------------------------------------------------- /VPD/segmentation/configs/fpn_vpd_sd1-5_512x512_gpu8x2.py: -------------------------------------------------------------------------------- 1 | _base_ = [ 2 | '_base_/models/fpn_r50.py', '_base_/datasets/ade20k_vpd.py', 3 | '_base_/default_runtime.py', '_base_/schedules/schedule_80k.py' 4 | ] 5 | 6 | model = dict( 7 | type='VPDSeg', 8 | sd_path='checkpoints/v1-5-pruned-emaonly.ckpt', 9 | neck=dict( 10 | type='FPN', 11 | in_channels=[320, 790, 1430, 1280], 12 | out_channels=256, 13 | num_outs=4), 14 | decode_head=dict( 15 | type='FPNHead', 16 | num_classes=150, 17 | loss_decode=dict( 18 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), 19 | ) 20 | 21 | lr_config = dict(policy='poly', power=1, min_lr=0.0, by_epoch=False, 22 | warmup='linear', 23 | warmup_iters=1500, 24 | warmup_ratio=1e-6) 25 | 26 | 27 | optimizer = dict(type='AdamW', lr=0.00008, weight_decay=0.001, 28 | paramwise_cfg=dict(custom_keys={'unet': dict(lr_mult=0.1), 29 | 'encoder_vq': dict(lr_mult=0.0), 30 | 'text_encoder': dict(lr_mult=0.0), 31 | 'norm': dict(decay_mult=0.)})) 32 | 33 | data = dict(samples_per_gpu=2, workers_per_gpu=8) 34 | -------------------------------------------------------------------------------- /VPD/segmentation/dist_test.sh: -------------------------------------------------------------------------------- 1 | CONFIG=$1 2 | CHECKPOINT=$2 3 | GPUS=$3 4 | NNODES=${NNODES:-1} 5 | NODE_RANK=${NODE_RANK:-0} 6 | PORT=${PORT:-29500} 7 | MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"} 8 | 9 | PYTHONPATH="$(dirname $0)/..":"$(dirname $0)/../stable-diffusion":$PYTHONPATH \ 10 | python3 -m torch.distributed.launch \ 11 | --nnodes=$NNODES \ 12 | --node_rank=$NODE_RANK \ 13 | --master_addr=$MASTER_ADDR \ 14 | --nproc_per_node=$GPUS \ 15 | --master_port=$PORT \ 16 | $(dirname "$0")/test.py \ 17 | $CONFIG \ 18 | $CHECKPOINT \ 19 | --launcher pytorch \ 20 | ${@:4} 21 | -------------------------------------------------------------------------------- /VPD/segmentation/dist_train.sh: -------------------------------------------------------------------------------- 1 | CONFIG=$1 2 | GPUS=$2 3 | NNODES=${NNODES:-1} 4 | NODE_RANK=${NODE_RANK:-0} 5 | PORT=${PORT:-29500} 6 | MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"} 7 | 8 | PYTHONPATH="$(dirname $0)/..":"$(dirname $0)/../stable-diffusion":$PYTHONPATH \ 9 | python3 -m torch.distributed.launch \ 10 | --nnodes=$NNODES \ 11 | --node_rank=$NODE_RANK \ 12 | --master_addr=$MASTER_ADDR \ 13 | --nproc_per_node=$GPUS \ 14 | --master_port=$PORT \ 15 | $(dirname "$0")/train.py \ 16 | $CONFIG \ 17 | --launcher pytorch ${@:3} 18 | -------------------------------------------------------------------------------- /VPD/segmentation/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .vpd_seg import VPDSeg -------------------------------------------------------------------------------- /VPD/stable-diffusion/configs/autoencoder/autoencoder_kl_16x16x16.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | base_learning_rate: 4.5e-6 3 | target: ldm.models.autoencoder.AutoencoderKL 4 | params: 5 | monitor: "val/rec_loss" 6 | embed_dim: 16 7 | lossconfig: 8 | target: ldm.modules.losses.LPIPSWithDiscriminator 9 | params: 10 | disc_start: 50001 11 | 
kl_weight: 0.000001 12 | disc_weight: 0.5 13 | 14 | ddconfig: 15 | double_z: True 16 | z_channels: 16 17 | resolution: 256 18 | in_channels: 3 19 | out_ch: 3 20 | ch: 128 21 | ch_mult: [ 1,1,2,2,4] # num_down = len(ch_mult)-1 22 | num_res_blocks: 2 23 | attn_resolutions: [16] 24 | dropout: 0.0 25 | 26 | 27 | data: 28 | target: main.DataModuleFromConfig 29 | params: 30 | batch_size: 12 31 | wrap: True 32 | train: 33 | target: ldm.data.imagenet.ImageNetSRTrain 34 | params: 35 | size: 256 36 | degradation: pil_nearest 37 | validation: 38 | target: ldm.data.imagenet.ImageNetSRValidation 39 | params: 40 | size: 256 41 | degradation: pil_nearest 42 | 43 | lightning: 44 | callbacks: 45 | image_logger: 46 | target: main.ImageLogger 47 | params: 48 | batch_frequency: 1000 49 | max_images: 8 50 | increase_log_steps: True 51 | 52 | trainer: 53 | benchmark: True 54 | accumulate_grad_batches: 2 55 | -------------------------------------------------------------------------------- /VPD/stable-diffusion/configs/autoencoder/autoencoder_kl_32x32x4.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | base_learning_rate: 4.5e-6 3 | target: ldm.models.autoencoder.AutoencoderKL 4 | params: 5 | monitor: "val/rec_loss" 6 | embed_dim: 4 7 | lossconfig: 8 | target: ldm.modules.losses.LPIPSWithDiscriminator 9 | params: 10 | disc_start: 50001 11 | kl_weight: 0.000001 12 | disc_weight: 0.5 13 | 14 | ddconfig: 15 | double_z: True 16 | z_channels: 4 17 | resolution: 256 18 | in_channels: 3 19 | out_ch: 3 20 | ch: 128 21 | ch_mult: [ 1,2,4,4 ] # num_down = len(ch_mult)-1 22 | num_res_blocks: 2 23 | attn_resolutions: [ ] 24 | dropout: 0.0 25 | 26 | data: 27 | target: main.DataModuleFromConfig 28 | params: 29 | batch_size: 12 30 | wrap: True 31 | train: 32 | target: ldm.data.imagenet.ImageNetSRTrain 33 | params: 34 | size: 256 35 | degradation: pil_nearest 36 | validation: 37 | target: ldm.data.imagenet.ImageNetSRValidation 38 | params: 39 | size: 256 40 | degradation: pil_nearest 41 | 42 | lightning: 43 | callbacks: 44 | image_logger: 45 | target: main.ImageLogger 46 | params: 47 | batch_frequency: 1000 48 | max_images: 8 49 | increase_log_steps: True 50 | 51 | trainer: 52 | benchmark: True 53 | accumulate_grad_batches: 2 54 | -------------------------------------------------------------------------------- /VPD/stable-diffusion/configs/autoencoder/autoencoder_kl_64x64x3.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | base_learning_rate: 4.5e-6 3 | target: ldm.models.autoencoder.AutoencoderKL 4 | params: 5 | monitor: "val/rec_loss" 6 | embed_dim: 3 7 | lossconfig: 8 | target: ldm.modules.losses.LPIPSWithDiscriminator 9 | params: 10 | disc_start: 50001 11 | kl_weight: 0.000001 12 | disc_weight: 0.5 13 | 14 | ddconfig: 15 | double_z: True 16 | z_channels: 3 17 | resolution: 256 18 | in_channels: 3 19 | out_ch: 3 20 | ch: 128 21 | ch_mult: [ 1,2,4 ] # num_down = len(ch_mult)-1 22 | num_res_blocks: 2 23 | attn_resolutions: [ ] 24 | dropout: 0.0 25 | 26 | 27 | data: 28 | target: main.DataModuleFromConfig 29 | params: 30 | batch_size: 12 31 | wrap: True 32 | train: 33 | target: ldm.data.imagenet.ImageNetSRTrain 34 | params: 35 | size: 256 36 | degradation: pil_nearest 37 | validation: 38 | target: ldm.data.imagenet.ImageNetSRValidation 39 | params: 40 | size: 256 41 | degradation: pil_nearest 42 | 43 | lightning: 44 | callbacks: 45 | image_logger: 46 | target: main.ImageLogger 47 | params: 48 | batch_frequency: 1000 49 | 
max_images: 8 50 | increase_log_steps: True 51 | 52 | trainer: 53 | benchmark: True 54 | accumulate_grad_batches: 2 55 | -------------------------------------------------------------------------------- /VPD/stable-diffusion/configs/autoencoder/autoencoder_kl_8x8x64.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | base_learning_rate: 4.5e-6 3 | target: ldm.models.autoencoder.AutoencoderKL 4 | params: 5 | monitor: "val/rec_loss" 6 | embed_dim: 64 7 | lossconfig: 8 | target: ldm.modules.losses.LPIPSWithDiscriminator 9 | params: 10 | disc_start: 50001 11 | kl_weight: 0.000001 12 | disc_weight: 0.5 13 | 14 | ddconfig: 15 | double_z: True 16 | z_channels: 64 17 | resolution: 256 18 | in_channels: 3 19 | out_ch: 3 20 | ch: 128 21 | ch_mult: [ 1,1,2,2,4,4] # num_down = len(ch_mult)-1 22 | num_res_blocks: 2 23 | attn_resolutions: [16,8] 24 | dropout: 0.0 25 | 26 | data: 27 | target: main.DataModuleFromConfig 28 | params: 29 | batch_size: 12 30 | wrap: True 31 | train: 32 | target: ldm.data.imagenet.ImageNetSRTrain 33 | params: 34 | size: 256 35 | degradation: pil_nearest 36 | validation: 37 | target: ldm.data.imagenet.ImageNetSRValidation 38 | params: 39 | size: 256 40 | degradation: pil_nearest 41 | 42 | lightning: 43 | callbacks: 44 | image_logger: 45 | target: main.ImageLogger 46 | params: 47 | batch_frequency: 1000 48 | max_images: 8 49 | increase_log_steps: True 50 | 51 | trainer: 52 | benchmark: True 53 | accumulate_grad_batches: 2 54 | -------------------------------------------------------------------------------- /VPD/stable-diffusion/configs/latent-diffusion/cin256-v2.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | base_learning_rate: 0.0001 3 | target: ldm.models.diffusion.ddpm.LatentDiffusion 4 | params: 5 | linear_start: 0.0015 6 | linear_end: 0.0195 7 | num_timesteps_cond: 1 8 | log_every_t: 200 9 | timesteps: 1000 10 | first_stage_key: image 11 | cond_stage_key: class_label 12 | image_size: 64 13 | channels: 3 14 | cond_stage_trainable: true 15 | conditioning_key: crossattn 16 | monitor: val/loss 17 | use_ema: False 18 | 19 | unet_config: 20 | target: ldm.modules.diffusionmodules.openaimodel.UNetModel 21 | params: 22 | image_size: 64 23 | in_channels: 3 24 | out_channels: 3 25 | model_channels: 192 26 | attention_resolutions: 27 | - 8 28 | - 4 29 | - 2 30 | num_res_blocks: 2 31 | channel_mult: 32 | - 1 33 | - 2 34 | - 3 35 | - 5 36 | num_heads: 1 37 | use_spatial_transformer: true 38 | transformer_depth: 1 39 | context_dim: 512 40 | 41 | first_stage_config: 42 | target: ldm.models.autoencoder.VQModelInterface 43 | params: 44 | embed_dim: 3 45 | n_embed: 8192 46 | ddconfig: 47 | double_z: false 48 | z_channels: 3 49 | resolution: 256 50 | in_channels: 3 51 | out_ch: 3 52 | ch: 128 53 | ch_mult: 54 | - 1 55 | - 2 56 | - 4 57 | num_res_blocks: 2 58 | attn_resolutions: [] 59 | dropout: 0.0 60 | lossconfig: 61 | target: torch.nn.Identity 62 | 63 | cond_stage_config: 64 | target: ldm.modules.encoders.modules.ClassEmbedder 65 | params: 66 | n_classes: 1001 67 | embed_dim: 512 68 | key: class_label 69 | -------------------------------------------------------------------------------- /VPD/stable-diffusion/configs/latent-diffusion/txt2img-1p4B-eval.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | base_learning_rate: 5.0e-05 3 | target: ldm.models.diffusion.ddpm.LatentDiffusion 4 | params: 5 | linear_start: 0.00085 6 | linear_end: 0.012 
7 | num_timesteps_cond: 1 8 | log_every_t: 200 9 | timesteps: 1000 10 | first_stage_key: image 11 | cond_stage_key: caption 12 | image_size: 32 13 | channels: 4 14 | cond_stage_trainable: true 15 | conditioning_key: crossattn 16 | monitor: val/loss_simple_ema 17 | scale_factor: 0.18215 18 | use_ema: False 19 | 20 | unet_config: 21 | target: ldm.modules.diffusionmodules.openaimodel.UNetModel 22 | params: 23 | image_size: 32 24 | in_channels: 4 25 | out_channels: 4 26 | model_channels: 320 27 | attention_resolutions: 28 | - 4 29 | - 2 30 | - 1 31 | num_res_blocks: 2 32 | channel_mult: 33 | - 1 34 | - 2 35 | - 4 36 | - 4 37 | num_heads: 8 38 | use_spatial_transformer: true 39 | transformer_depth: 1 40 | context_dim: 1280 41 | use_checkpoint: true 42 | legacy: False 43 | 44 | first_stage_config: 45 | target: ldm.models.autoencoder.AutoencoderKL 46 | params: 47 | embed_dim: 4 48 | monitor: val/rec_loss 49 | ddconfig: 50 | double_z: true 51 | z_channels: 4 52 | resolution: 256 53 | in_channels: 3 54 | out_ch: 3 55 | ch: 128 56 | ch_mult: 57 | - 1 58 | - 2 59 | - 4 60 | - 4 61 | num_res_blocks: 2 62 | attn_resolutions: [] 63 | dropout: 0.0 64 | lossconfig: 65 | target: torch.nn.Identity 66 | 67 | cond_stage_config: 68 | target: ldm.modules.encoders.modules.BERTEmbedder 69 | params: 70 | n_embed: 1280 71 | n_layer: 32 72 | -------------------------------------------------------------------------------- /VPD/stable-diffusion/configs/retrieval-augmented-diffusion/768x768.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | base_learning_rate: 0.0001 3 | target: ldm.models.diffusion.ddpm.LatentDiffusion 4 | params: 5 | linear_start: 0.0015 6 | linear_end: 0.015 7 | num_timesteps_cond: 1 8 | log_every_t: 200 9 | timesteps: 1000 10 | first_stage_key: jpg 11 | cond_stage_key: nix 12 | image_size: 48 13 | channels: 16 14 | cond_stage_trainable: false 15 | conditioning_key: crossattn 16 | monitor: val/loss_simple_ema 17 | scale_by_std: false 18 | scale_factor: 0.22765929 19 | unet_config: 20 | target: ldm.modules.diffusionmodules.openaimodel.UNetModel 21 | params: 22 | image_size: 48 23 | in_channels: 16 24 | out_channels: 16 25 | model_channels: 448 26 | attention_resolutions: 27 | - 4 28 | - 2 29 | - 1 30 | num_res_blocks: 2 31 | channel_mult: 32 | - 1 33 | - 2 34 | - 3 35 | - 4 36 | use_scale_shift_norm: false 37 | resblock_updown: false 38 | num_head_channels: 32 39 | use_spatial_transformer: true 40 | transformer_depth: 1 41 | context_dim: 768 42 | use_checkpoint: true 43 | first_stage_config: 44 | target: ldm.models.autoencoder.AutoencoderKL 45 | params: 46 | monitor: val/rec_loss 47 | embed_dim: 16 48 | ddconfig: 49 | double_z: true 50 | z_channels: 16 51 | resolution: 256 52 | in_channels: 3 53 | out_ch: 3 54 | ch: 128 55 | ch_mult: 56 | - 1 57 | - 1 58 | - 2 59 | - 2 60 | - 4 61 | num_res_blocks: 2 62 | attn_resolutions: 63 | - 16 64 | dropout: 0.0 65 | lossconfig: 66 | target: torch.nn.Identity 67 | cond_stage_config: 68 | target: torch.nn.Identity -------------------------------------------------------------------------------- /VPD/stable-diffusion/environment.yaml: -------------------------------------------------------------------------------- 1 | name: ldm 2 | channels: 3 | - pytorch 4 | - defaults 5 | dependencies: 6 | - python=3.8.5 7 | - pip=20.3 8 | - cudatoolkit=11.3 9 | - pytorch=1.11.0 10 | - torchvision=0.12.0 11 | - numpy=1.19.2 12 | - pip: 13 | - albumentations==0.4.3 14 | - diffusers 15 | - opencv-python==4.1.2.30 16 | - 
pudb==2019.2 17 | - invisible-watermark 18 | - imageio==2.9.0 19 | - imageio-ffmpeg==0.4.2 20 | - pytorch-lightning==1.4.2 21 | - omegaconf==2.1.1 22 | - test-tube>=0.7.5 23 | - streamlit>=0.73.1 24 | - einops==0.3.0 25 | - torch-fidelity==0.3.0 26 | - transformers==4.19.2 27 | - torchmetrics==0.6.0 28 | - kornia==0.6 29 | - -e git+https://github.com/CompVis/taming-transformers.git@master#egg=taming-transformers 30 | - -e git+https://github.com/openai/CLIP.git@main#egg=clip 31 | - -e . 32 | -------------------------------------------------------------------------------- /VPD/stable-diffusion/ldm/data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DepthComputation/InSpaceType_Benchmark/4ab9760dbdf9153fc2845afd584762509f9fe0e0/VPD/stable-diffusion/ldm/data/__init__.py -------------------------------------------------------------------------------- /VPD/stable-diffusion/ldm/data/base.py: -------------------------------------------------------------------------------- 1 | from abc import abstractmethod 2 | from torch.utils.data import Dataset, ConcatDataset, ChainDataset, IterableDataset 3 | 4 | 5 | class Txt2ImgIterableBaseDataset(IterableDataset): 6 | ''' 7 | Define an interface to make the IterableDatasets for text2img data chainable 8 | ''' 9 | def __init__(self, num_records=0, valid_ids=None, size=256): 10 | super().__init__() 11 | self.num_records = num_records 12 | self.valid_ids = valid_ids 13 | self.sample_ids = valid_ids 14 | self.size = size 15 | 16 | print(f'{self.__class__.__name__} dataset contains {self.__len__()} examples.') 17 | 18 | def __len__(self): 19 | return self.num_records 20 | 21 | @abstractmethod 22 | def __iter__(self): 23 | pass -------------------------------------------------------------------------------- /VPD/stable-diffusion/ldm/models/diffusion/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DepthComputation/InSpaceType_Benchmark/4ab9760dbdf9153fc2845afd584762509f9fe0e0/VPD/stable-diffusion/ldm/models/diffusion/__init__.py -------------------------------------------------------------------------------- /VPD/stable-diffusion/ldm/models/diffusion/dpm_solver/__init__.py: -------------------------------------------------------------------------------- 1 | from .sampler import DPMSolverSampler -------------------------------------------------------------------------------- /VPD/stable-diffusion/ldm/modules/diffusionmodules/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DepthComputation/InSpaceType_Benchmark/4ab9760dbdf9153fc2845afd584762509f9fe0e0/VPD/stable-diffusion/ldm/modules/diffusionmodules/__init__.py -------------------------------------------------------------------------------- /VPD/stable-diffusion/ldm/modules/distributions/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DepthComputation/InSpaceType_Benchmark/4ab9760dbdf9153fc2845afd584762509f9fe0e0/VPD/stable-diffusion/ldm/modules/distributions/__init__.py -------------------------------------------------------------------------------- /VPD/stable-diffusion/ldm/modules/encoders/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/DepthComputation/InSpaceType_Benchmark/4ab9760dbdf9153fc2845afd584762509f9fe0e0/VPD/stable-diffusion/ldm/modules/encoders/__init__.py -------------------------------------------------------------------------------- /VPD/stable-diffusion/ldm/modules/image_degradation/__init__.py: -------------------------------------------------------------------------------- 1 | from ldm.modules.image_degradation.bsrgan import degradation_bsrgan_variant as degradation_fn_bsr 2 | from ldm.modules.image_degradation.bsrgan_light import degradation_bsrgan_variant as degradation_fn_bsr_light 3 | -------------------------------------------------------------------------------- /VPD/stable-diffusion/ldm/modules/image_degradation/utils/test.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DepthComputation/InSpaceType_Benchmark/4ab9760dbdf9153fc2845afd584762509f9fe0e0/VPD/stable-diffusion/ldm/modules/image_degradation/utils/test.png -------------------------------------------------------------------------------- /VPD/stable-diffusion/ldm/modules/losses/__init__.py: -------------------------------------------------------------------------------- 1 | from ldm.modules.losses.contperceptual import LPIPSWithDiscriminator -------------------------------------------------------------------------------- /VPD/stable-diffusion/models/first_stage_models/kl-f16/config.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | base_learning_rate: 4.5e-06 3 | target: ldm.models.autoencoder.AutoencoderKL 4 | params: 5 | monitor: val/rec_loss 6 | embed_dim: 16 7 | lossconfig: 8 | target: ldm.modules.losses.LPIPSWithDiscriminator 9 | params: 10 | disc_start: 50001 11 | kl_weight: 1.0e-06 12 | disc_weight: 0.5 13 | ddconfig: 14 | double_z: true 15 | z_channels: 16 16 | resolution: 256 17 | in_channels: 3 18 | out_ch: 3 19 | ch: 128 20 | ch_mult: 21 | - 1 22 | - 1 23 | - 2 24 | - 2 25 | - 4 26 | num_res_blocks: 2 27 | attn_resolutions: 28 | - 16 29 | dropout: 0.0 30 | data: 31 | target: main.DataModuleFromConfig 32 | params: 33 | batch_size: 6 34 | wrap: true 35 | train: 36 | target: ldm.data.openimages.FullOpenImagesTrain 37 | params: 38 | size: 384 39 | crop_size: 256 40 | validation: 41 | target: ldm.data.openimages.FullOpenImagesValidation 42 | params: 43 | size: 384 44 | crop_size: 256 45 | -------------------------------------------------------------------------------- /VPD/stable-diffusion/models/first_stage_models/kl-f32/config.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | base_learning_rate: 4.5e-06 3 | target: ldm.models.autoencoder.AutoencoderKL 4 | params: 5 | monitor: val/rec_loss 6 | embed_dim: 64 7 | lossconfig: 8 | target: ldm.modules.losses.LPIPSWithDiscriminator 9 | params: 10 | disc_start: 50001 11 | kl_weight: 1.0e-06 12 | disc_weight: 0.5 13 | ddconfig: 14 | double_z: true 15 | z_channels: 64 16 | resolution: 256 17 | in_channels: 3 18 | out_ch: 3 19 | ch: 128 20 | ch_mult: 21 | - 1 22 | - 1 23 | - 2 24 | - 2 25 | - 4 26 | - 4 27 | num_res_blocks: 2 28 | attn_resolutions: 29 | - 16 30 | - 8 31 | dropout: 0.0 32 | data: 33 | target: main.DataModuleFromConfig 34 | params: 35 | batch_size: 6 36 | wrap: true 37 | train: 38 | target: ldm.data.openimages.FullOpenImagesTrain 39 | params: 40 | size: 384 41 | crop_size: 256 42 | validation: 43 | target: ldm.data.openimages.FullOpenImagesValidation 44 | params: 45 | size: 
384 46 | crop_size: 256 47 | -------------------------------------------------------------------------------- /VPD/stable-diffusion/models/first_stage_models/kl-f4/config.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | base_learning_rate: 4.5e-06 3 | target: ldm.models.autoencoder.AutoencoderKL 4 | params: 5 | monitor: val/rec_loss 6 | embed_dim: 3 7 | lossconfig: 8 | target: ldm.modules.losses.LPIPSWithDiscriminator 9 | params: 10 | disc_start: 50001 11 | kl_weight: 1.0e-06 12 | disc_weight: 0.5 13 | ddconfig: 14 | double_z: true 15 | z_channels: 3 16 | resolution: 256 17 | in_channels: 3 18 | out_ch: 3 19 | ch: 128 20 | ch_mult: 21 | - 1 22 | - 2 23 | - 4 24 | num_res_blocks: 2 25 | attn_resolutions: [] 26 | dropout: 0.0 27 | data: 28 | target: main.DataModuleFromConfig 29 | params: 30 | batch_size: 10 31 | wrap: true 32 | train: 33 | target: ldm.data.openimages.FullOpenImagesTrain 34 | params: 35 | size: 384 36 | crop_size: 256 37 | validation: 38 | target: ldm.data.openimages.FullOpenImagesValidation 39 | params: 40 | size: 384 41 | crop_size: 256 42 | -------------------------------------------------------------------------------- /VPD/stable-diffusion/models/first_stage_models/kl-f8/config.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | base_learning_rate: 4.5e-06 3 | target: ldm.models.autoencoder.AutoencoderKL 4 | params: 5 | monitor: val/rec_loss 6 | embed_dim: 4 7 | lossconfig: 8 | target: ldm.modules.losses.LPIPSWithDiscriminator 9 | params: 10 | disc_start: 50001 11 | kl_weight: 1.0e-06 12 | disc_weight: 0.5 13 | ddconfig: 14 | double_z: true 15 | z_channels: 4 16 | resolution: 256 17 | in_channels: 3 18 | out_ch: 3 19 | ch: 128 20 | ch_mult: 21 | - 1 22 | - 2 23 | - 4 24 | - 4 25 | num_res_blocks: 2 26 | attn_resolutions: [] 27 | dropout: 0.0 28 | data: 29 | target: main.DataModuleFromConfig 30 | params: 31 | batch_size: 4 32 | wrap: true 33 | train: 34 | target: ldm.data.openimages.FullOpenImagesTrain 35 | params: 36 | size: 384 37 | crop_size: 256 38 | validation: 39 | target: ldm.data.openimages.FullOpenImagesValidation 40 | params: 41 | size: 384 42 | crop_size: 256 43 | -------------------------------------------------------------------------------- /VPD/stable-diffusion/models/first_stage_models/vq-f16/config.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | base_learning_rate: 4.5e-06 3 | target: ldm.models.autoencoder.VQModel 4 | params: 5 | embed_dim: 8 6 | n_embed: 16384 7 | ddconfig: 8 | double_z: false 9 | z_channels: 8 10 | resolution: 256 11 | in_channels: 3 12 | out_ch: 3 13 | ch: 128 14 | ch_mult: 15 | - 1 16 | - 1 17 | - 2 18 | - 2 19 | - 4 20 | num_res_blocks: 2 21 | attn_resolutions: 22 | - 16 23 | dropout: 0.0 24 | lossconfig: 25 | target: taming.modules.losses.vqperceptual.VQLPIPSWithDiscriminator 26 | params: 27 | disc_conditional: false 28 | disc_in_channels: 3 29 | disc_start: 250001 30 | disc_weight: 0.75 31 | disc_num_layers: 2 32 | codebook_weight: 1.0 33 | 34 | data: 35 | target: main.DataModuleFromConfig 36 | params: 37 | batch_size: 14 38 | num_workers: 20 39 | wrap: true 40 | train: 41 | target: ldm.data.openimages.FullOpenImagesTrain 42 | params: 43 | size: 384 44 | crop_size: 256 45 | validation: 46 | target: ldm.data.openimages.FullOpenImagesValidation 47 | params: 48 | size: 384 49 | crop_size: 256 50 | -------------------------------------------------------------------------------- 
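All of the config files collected here follow the same OmegaConf convention: `target` names a class by its dotted import path and `params` carries the constructor keyword arguments. Below is a minimal, self-contained sketch of how such a YAML is typically turned into a model object; the helper functions are spelled out purely for illustration (the ldm codebase provides an equivalent `instantiate_from_config` utility in `ldm/util.py`), and actually constructing e.g. `AutoencoderKL` assumes the `ldm` package and its dependencies are importable in the current environment.

```
# Minimal sketch of the `target`/`params` pattern used by these YAML configs.
# Assumption: omegaconf is installed and the class named by `target`
# (e.g. ldm.models.autoencoder.AutoencoderKL) is importable.
import importlib

from omegaconf import OmegaConf


def get_obj_from_str(path):
    """Resolve a dotted path such as 'ldm.models.autoencoder.AutoencoderKL' to a class."""
    module_name, cls_name = path.rsplit(".", 1)
    return getattr(importlib.import_module(module_name), cls_name)


def instantiate_from_config(config):
    """Instantiate config['target'] with config['params'] as keyword arguments."""
    return get_obj_from_str(config["target"])(**config.get("params", dict()))


if __name__ == "__main__":
    cfg = OmegaConf.load("models/first_stage_models/kl-f16/config.yaml")
    model = instantiate_from_config(cfg.model)  # builds AutoencoderKL(embed_dim=16, ...)
    print(type(model).__name__)
```

The `data` block is consumed in the same way by `main.DataModuleFromConfig`, and `base_learning_rate` is normally scaled by the batch size (and, when gradient accumulation or multiple GPUs are used, by those factors as well) before being assigned to the model.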
/VPD/stable-diffusion/models/first_stage_models/vq-f4-noattn/config.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | base_learning_rate: 4.5e-06 3 | target: ldm.models.autoencoder.VQModel 4 | params: 5 | embed_dim: 3 6 | n_embed: 8192 7 | monitor: val/rec_loss 8 | 9 | ddconfig: 10 | attn_type: none 11 | double_z: false 12 | z_channels: 3 13 | resolution: 256 14 | in_channels: 3 15 | out_ch: 3 16 | ch: 128 17 | ch_mult: 18 | - 1 19 | - 2 20 | - 4 21 | num_res_blocks: 2 22 | attn_resolutions: [] 23 | dropout: 0.0 24 | lossconfig: 25 | target: taming.modules.losses.vqperceptual.VQLPIPSWithDiscriminator 26 | params: 27 | disc_conditional: false 28 | disc_in_channels: 3 29 | disc_start: 11 30 | disc_weight: 0.75 31 | codebook_weight: 1.0 32 | 33 | data: 34 | target: main.DataModuleFromConfig 35 | params: 36 | batch_size: 8 37 | num_workers: 12 38 | wrap: true 39 | train: 40 | target: ldm.data.openimages.FullOpenImagesTrain 41 | params: 42 | crop_size: 256 43 | validation: 44 | target: ldm.data.openimages.FullOpenImagesValidation 45 | params: 46 | crop_size: 256 47 | -------------------------------------------------------------------------------- /VPD/stable-diffusion/models/first_stage_models/vq-f4/config.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | base_learning_rate: 4.5e-06 3 | target: ldm.models.autoencoder.VQModel 4 | params: 5 | embed_dim: 3 6 | n_embed: 8192 7 | monitor: val/rec_loss 8 | 9 | ddconfig: 10 | double_z: false 11 | z_channels: 3 12 | resolution: 256 13 | in_channels: 3 14 | out_ch: 3 15 | ch: 128 16 | ch_mult: 17 | - 1 18 | - 2 19 | - 4 20 | num_res_blocks: 2 21 | attn_resolutions: [] 22 | dropout: 0.0 23 | lossconfig: 24 | target: taming.modules.losses.vqperceptual.VQLPIPSWithDiscriminator 25 | params: 26 | disc_conditional: false 27 | disc_in_channels: 3 28 | disc_start: 0 29 | disc_weight: 0.75 30 | codebook_weight: 1.0 31 | 32 | data: 33 | target: main.DataModuleFromConfig 34 | params: 35 | batch_size: 8 36 | num_workers: 16 37 | wrap: true 38 | train: 39 | target: ldm.data.openimages.FullOpenImagesTrain 40 | params: 41 | crop_size: 256 42 | validation: 43 | target: ldm.data.openimages.FullOpenImagesValidation 44 | params: 45 | crop_size: 256 46 | -------------------------------------------------------------------------------- /VPD/stable-diffusion/models/first_stage_models/vq-f8-n256/config.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | base_learning_rate: 4.5e-06 3 | target: ldm.models.autoencoder.VQModel 4 | params: 5 | embed_dim: 4 6 | n_embed: 256 7 | monitor: val/rec_loss 8 | ddconfig: 9 | double_z: false 10 | z_channels: 4 11 | resolution: 256 12 | in_channels: 3 13 | out_ch: 3 14 | ch: 128 15 | ch_mult: 16 | - 1 17 | - 2 18 | - 2 19 | - 4 20 | num_res_blocks: 2 21 | attn_resolutions: 22 | - 32 23 | dropout: 0.0 24 | lossconfig: 25 | target: taming.modules.losses.vqperceptual.VQLPIPSWithDiscriminator 26 | params: 27 | disc_conditional: false 28 | disc_in_channels: 3 29 | disc_start: 250001 30 | disc_weight: 0.75 31 | codebook_weight: 1.0 32 | 33 | data: 34 | target: main.DataModuleFromConfig 35 | params: 36 | batch_size: 10 37 | num_workers: 20 38 | wrap: true 39 | train: 40 | target: ldm.data.openimages.FullOpenImagesTrain 41 | params: 42 | size: 384 43 | crop_size: 256 44 | validation: 45 | target: ldm.data.openimages.FullOpenImagesValidation 46 | params: 47 | size: 384 48 | crop_size: 256 49 | 
-------------------------------------------------------------------------------- /VPD/stable-diffusion/models/first_stage_models/vq-f8/config.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | base_learning_rate: 4.5e-06 3 | target: ldm.models.autoencoder.VQModel 4 | params: 5 | embed_dim: 4 6 | n_embed: 16384 7 | monitor: val/rec_loss 8 | ddconfig: 9 | double_z: false 10 | z_channels: 4 11 | resolution: 256 12 | in_channels: 3 13 | out_ch: 3 14 | ch: 128 15 | ch_mult: 16 | - 1 17 | - 2 18 | - 2 19 | - 4 20 | num_res_blocks: 2 21 | attn_resolutions: 22 | - 32 23 | dropout: 0.0 24 | lossconfig: 25 | target: taming.modules.losses.vqperceptual.VQLPIPSWithDiscriminator 26 | params: 27 | disc_conditional: false 28 | disc_in_channels: 3 29 | disc_num_layers: 2 30 | disc_start: 1 31 | disc_weight: 0.6 32 | codebook_weight: 1.0 33 | data: 34 | target: main.DataModuleFromConfig 35 | params: 36 | batch_size: 10 37 | num_workers: 20 38 | wrap: true 39 | train: 40 | target: ldm.data.openimages.FullOpenImagesTrain 41 | params: 42 | size: 384 43 | crop_size: 256 44 | validation: 45 | target: ldm.data.openimages.FullOpenImagesValidation 46 | params: 47 | size: 384 48 | crop_size: 256 49 | -------------------------------------------------------------------------------- /VPD/stable-diffusion/models/ldm/celeba256/config.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | base_learning_rate: 2.0e-06 3 | target: ldm.models.diffusion.ddpm.LatentDiffusion 4 | params: 5 | linear_start: 0.0015 6 | linear_end: 0.0195 7 | num_timesteps_cond: 1 8 | log_every_t: 200 9 | timesteps: 1000 10 | first_stage_key: image 11 | cond_stage_key: class_label 12 | image_size: 64 13 | channels: 3 14 | cond_stage_trainable: false 15 | concat_mode: false 16 | monitor: val/loss 17 | unet_config: 18 | target: ldm.modules.diffusionmodules.openaimodel.UNetModel 19 | params: 20 | image_size: 64 21 | in_channels: 3 22 | out_channels: 3 23 | model_channels: 224 24 | attention_resolutions: 25 | - 8 26 | - 4 27 | - 2 28 | num_res_blocks: 2 29 | channel_mult: 30 | - 1 31 | - 2 32 | - 3 33 | - 4 34 | num_head_channels: 32 35 | first_stage_config: 36 | target: ldm.models.autoencoder.VQModelInterface 37 | params: 38 | embed_dim: 3 39 | n_embed: 8192 40 | ddconfig: 41 | double_z: false 42 | z_channels: 3 43 | resolution: 256 44 | in_channels: 3 45 | out_ch: 3 46 | ch: 128 47 | ch_mult: 48 | - 1 49 | - 2 50 | - 4 51 | num_res_blocks: 2 52 | attn_resolutions: [] 53 | dropout: 0.0 54 | lossconfig: 55 | target: torch.nn.Identity 56 | cond_stage_config: __is_unconditional__ 57 | data: 58 | target: main.DataModuleFromConfig 59 | params: 60 | batch_size: 48 61 | num_workers: 5 62 | wrap: false 63 | train: 64 | target: ldm.data.faceshq.CelebAHQTrain 65 | params: 66 | size: 256 67 | validation: 68 | target: ldm.data.faceshq.CelebAHQValidation 69 | params: 70 | size: 256 71 | -------------------------------------------------------------------------------- /VPD/stable-diffusion/models/ldm/ffhq256/config.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | base_learning_rate: 2.0e-06 3 | target: ldm.models.diffusion.ddpm.LatentDiffusion 4 | params: 5 | linear_start: 0.0015 6 | linear_end: 0.0195 7 | num_timesteps_cond: 1 8 | log_every_t: 200 9 | timesteps: 1000 10 | first_stage_key: image 11 | cond_stage_key: class_label 12 | image_size: 64 13 | channels: 3 14 | cond_stage_trainable: false 15 | concat_mode: 
false 16 | monitor: val/loss 17 | unet_config: 18 | target: ldm.modules.diffusionmodules.openaimodel.UNetModel 19 | params: 20 | image_size: 64 21 | in_channels: 3 22 | out_channels: 3 23 | model_channels: 224 24 | attention_resolutions: 25 | - 8 26 | - 4 27 | - 2 28 | num_res_blocks: 2 29 | channel_mult: 30 | - 1 31 | - 2 32 | - 3 33 | - 4 34 | num_head_channels: 32 35 | first_stage_config: 36 | target: ldm.models.autoencoder.VQModelInterface 37 | params: 38 | embed_dim: 3 39 | n_embed: 8192 40 | ddconfig: 41 | double_z: false 42 | z_channels: 3 43 | resolution: 256 44 | in_channels: 3 45 | out_ch: 3 46 | ch: 128 47 | ch_mult: 48 | - 1 49 | - 2 50 | - 4 51 | num_res_blocks: 2 52 | attn_resolutions: [] 53 | dropout: 0.0 54 | lossconfig: 55 | target: torch.nn.Identity 56 | cond_stage_config: __is_unconditional__ 57 | data: 58 | target: main.DataModuleFromConfig 59 | params: 60 | batch_size: 42 61 | num_workers: 5 62 | wrap: false 63 | train: 64 | target: ldm.data.faceshq.FFHQTrain 65 | params: 66 | size: 256 67 | validation: 68 | target: ldm.data.faceshq.FFHQValidation 69 | params: 70 | size: 256 71 | -------------------------------------------------------------------------------- /VPD/stable-diffusion/models/ldm/inpainting_big/config.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | base_learning_rate: 1.0e-06 3 | target: ldm.models.diffusion.ddpm.LatentDiffusion 4 | params: 5 | linear_start: 0.0015 6 | linear_end: 0.0205 7 | log_every_t: 100 8 | timesteps: 1000 9 | loss_type: l1 10 | first_stage_key: image 11 | cond_stage_key: masked_image 12 | image_size: 64 13 | channels: 3 14 | concat_mode: true 15 | monitor: val/loss 16 | scheduler_config: 17 | target: ldm.lr_scheduler.LambdaWarmUpCosineScheduler 18 | params: 19 | verbosity_interval: 0 20 | warm_up_steps: 1000 21 | max_decay_steps: 50000 22 | lr_start: 0.001 23 | lr_max: 0.1 24 | lr_min: 0.0001 25 | unet_config: 26 | target: ldm.modules.diffusionmodules.openaimodel.UNetModel 27 | params: 28 | image_size: 64 29 | in_channels: 7 30 | out_channels: 3 31 | model_channels: 256 32 | attention_resolutions: 33 | - 8 34 | - 4 35 | - 2 36 | num_res_blocks: 2 37 | channel_mult: 38 | - 1 39 | - 2 40 | - 3 41 | - 4 42 | num_heads: 8 43 | resblock_updown: true 44 | first_stage_config: 45 | target: ldm.models.autoencoder.VQModelInterface 46 | params: 47 | embed_dim: 3 48 | n_embed: 8192 49 | monitor: val/rec_loss 50 | ddconfig: 51 | attn_type: none 52 | double_z: false 53 | z_channels: 3 54 | resolution: 256 55 | in_channels: 3 56 | out_ch: 3 57 | ch: 128 58 | ch_mult: 59 | - 1 60 | - 2 61 | - 4 62 | num_res_blocks: 2 63 | attn_resolutions: [] 64 | dropout: 0.0 65 | lossconfig: 66 | target: ldm.modules.losses.contperceptual.DummyLoss 67 | cond_stage_config: __is_first_stage__ 68 | -------------------------------------------------------------------------------- /VPD/stable-diffusion/models/ldm/lsun_beds256/config.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | base_learning_rate: 2.0e-06 3 | target: ldm.models.diffusion.ddpm.LatentDiffusion 4 | params: 5 | linear_start: 0.0015 6 | linear_end: 0.0195 7 | num_timesteps_cond: 1 8 | log_every_t: 200 9 | timesteps: 1000 10 | first_stage_key: image 11 | cond_stage_key: class_label 12 | image_size: 64 13 | channels: 3 14 | cond_stage_trainable: false 15 | concat_mode: false 16 | monitor: val/loss 17 | unet_config: 18 | target: ldm.modules.diffusionmodules.openaimodel.UNetModel 19 | params: 20 | 
image_size: 64 21 | in_channels: 3 22 | out_channels: 3 23 | model_channels: 224 24 | attention_resolutions: 25 | - 8 26 | - 4 27 | - 2 28 | num_res_blocks: 2 29 | channel_mult: 30 | - 1 31 | - 2 32 | - 3 33 | - 4 34 | num_head_channels: 32 35 | first_stage_config: 36 | target: ldm.models.autoencoder.VQModelInterface 37 | params: 38 | embed_dim: 3 39 | n_embed: 8192 40 | ddconfig: 41 | double_z: false 42 | z_channels: 3 43 | resolution: 256 44 | in_channels: 3 45 | out_ch: 3 46 | ch: 128 47 | ch_mult: 48 | - 1 49 | - 2 50 | - 4 51 | num_res_blocks: 2 52 | attn_resolutions: [] 53 | dropout: 0.0 54 | lossconfig: 55 | target: torch.nn.Identity 56 | cond_stage_config: __is_unconditional__ 57 | data: 58 | target: main.DataModuleFromConfig 59 | params: 60 | batch_size: 48 61 | num_workers: 5 62 | wrap: false 63 | train: 64 | target: ldm.data.lsun.LSUNBedroomsTrain 65 | params: 66 | size: 256 67 | validation: 68 | target: ldm.data.lsun.LSUNBedroomsValidation 69 | params: 70 | size: 256 71 | -------------------------------------------------------------------------------- /VPD/stable-diffusion/models/ldm/semantic_synthesis256/config.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | base_learning_rate: 1.0e-06 3 | target: ldm.models.diffusion.ddpm.LatentDiffusion 4 | params: 5 | linear_start: 0.0015 6 | linear_end: 0.0205 7 | log_every_t: 100 8 | timesteps: 1000 9 | loss_type: l1 10 | first_stage_key: image 11 | cond_stage_key: segmentation 12 | image_size: 64 13 | channels: 3 14 | concat_mode: true 15 | cond_stage_trainable: true 16 | unet_config: 17 | target: ldm.modules.diffusionmodules.openaimodel.UNetModel 18 | params: 19 | image_size: 64 20 | in_channels: 6 21 | out_channels: 3 22 | model_channels: 128 23 | attention_resolutions: 24 | - 32 25 | - 16 26 | - 8 27 | num_res_blocks: 2 28 | channel_mult: 29 | - 1 30 | - 4 31 | - 8 32 | num_heads: 8 33 | first_stage_config: 34 | target: ldm.models.autoencoder.VQModelInterface 35 | params: 36 | embed_dim: 3 37 | n_embed: 8192 38 | ddconfig: 39 | double_z: false 40 | z_channels: 3 41 | resolution: 256 42 | in_channels: 3 43 | out_ch: 3 44 | ch: 128 45 | ch_mult: 46 | - 1 47 | - 2 48 | - 4 49 | num_res_blocks: 2 50 | attn_resolutions: [] 51 | dropout: 0.0 52 | lossconfig: 53 | target: torch.nn.Identity 54 | cond_stage_config: 55 | target: ldm.modules.encoders.modules.SpatialRescaler 56 | params: 57 | n_stages: 2 58 | in_channels: 182 59 | out_channels: 3 60 | -------------------------------------------------------------------------------- /VPD/stable-diffusion/scripts/download_first_stages.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | wget -O models/first_stage_models/kl-f4/model.zip https://ommer-lab.com/files/latent-diffusion/kl-f4.zip 3 | wget -O models/first_stage_models/kl-f8/model.zip https://ommer-lab.com/files/latent-diffusion/kl-f8.zip 4 | wget -O models/first_stage_models/kl-f16/model.zip https://ommer-lab.com/files/latent-diffusion/kl-f16.zip 5 | wget -O models/first_stage_models/kl-f32/model.zip https://ommer-lab.com/files/latent-diffusion/kl-f32.zip 6 | wget -O models/first_stage_models/vq-f4/model.zip https://ommer-lab.com/files/latent-diffusion/vq-f4.zip 7 | wget -O models/first_stage_models/vq-f4-noattn/model.zip https://ommer-lab.com/files/latent-diffusion/vq-f4-noattn.zip 8 | wget -O models/first_stage_models/vq-f8/model.zip https://ommer-lab.com/files/latent-diffusion/vq-f8.zip 9 | wget -O 
models/first_stage_models/vq-f8-n256/model.zip https://ommer-lab.com/files/latent-diffusion/vq-f8-n256.zip 10 | wget -O models/first_stage_models/vq-f16/model.zip https://ommer-lab.com/files/latent-diffusion/vq-f16.zip 11 | 12 | 13 | 14 | cd models/first_stage_models/kl-f4 15 | unzip -o model.zip 16 | 17 | cd ../kl-f8 18 | unzip -o model.zip 19 | 20 | cd ../kl-f16 21 | unzip -o model.zip 22 | 23 | cd ../kl-f32 24 | unzip -o model.zip 25 | 26 | cd ../vq-f4 27 | unzip -o model.zip 28 | 29 | cd ../vq-f4-noattn 30 | unzip -o model.zip 31 | 32 | cd ../vq-f8 33 | unzip -o model.zip 34 | 35 | cd ../vq-f8-n256 36 | unzip -o model.zip 37 | 38 | cd ../vq-f16 39 | unzip -o model.zip 40 | 41 | cd ../.. -------------------------------------------------------------------------------- /VPD/stable-diffusion/scripts/download_models.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | wget -O models/ldm/celeba256/celeba-256.zip https://ommer-lab.com/files/latent-diffusion/celeba.zip 3 | wget -O models/ldm/ffhq256/ffhq-256.zip https://ommer-lab.com/files/latent-diffusion/ffhq.zip 4 | wget -O models/ldm/lsun_churches256/lsun_churches-256.zip https://ommer-lab.com/files/latent-diffusion/lsun_churches.zip 5 | wget -O models/ldm/lsun_beds256/lsun_beds-256.zip https://ommer-lab.com/files/latent-diffusion/lsun_bedrooms.zip 6 | wget -O models/ldm/text2img256/model.zip https://ommer-lab.com/files/latent-diffusion/text2img.zip 7 | wget -O models/ldm/cin256/model.zip https://ommer-lab.com/files/latent-diffusion/cin.zip 8 | wget -O models/ldm/semantic_synthesis512/model.zip https://ommer-lab.com/files/latent-diffusion/semantic_synthesis.zip 9 | wget -O models/ldm/semantic_synthesis256/model.zip https://ommer-lab.com/files/latent-diffusion/semantic_synthesis256.zip 10 | wget -O models/ldm/bsr_sr/model.zip https://ommer-lab.com/files/latent-diffusion/sr_bsr.zip 11 | wget -O models/ldm/layout2img-openimages256/model.zip https://ommer-lab.com/files/latent-diffusion/layout2img_model.zip 12 | wget -O models/ldm/inpainting_big/model.zip https://ommer-lab.com/files/latent-diffusion/inpainting_big.zip 13 | 14 | 15 | 16 | cd models/ldm/celeba256 17 | unzip -o celeba-256.zip 18 | 19 | cd ../ffhq256 20 | unzip -o ffhq-256.zip 21 | 22 | cd ../lsun_churches256 23 | unzip -o lsun_churches-256.zip 24 | 25 | cd ../lsun_beds256 26 | unzip -o lsun_beds-256.zip 27 | 28 | cd ../text2img256 29 | unzip -o model.zip 30 | 31 | cd ../cin256 32 | unzip -o model.zip 33 | 34 | cd ../semantic_synthesis512 35 | unzip -o model.zip 36 | 37 | cd ../semantic_synthesis256 38 | unzip -o model.zip 39 | 40 | cd ../bsr_sr 41 | unzip -o model.zip 42 | 43 | cd ../layout2img-openimages256 44 | unzip -o model.zip 45 | 46 | cd ../inpainting_big 47 | unzip -o model.zip 48 | 49 | cd ../.. 
50 | -------------------------------------------------------------------------------- /VPD/stable-diffusion/scripts/tests/test_watermark.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import fire 3 | from imwatermark import WatermarkDecoder 4 | 5 | 6 | def testit(img_path): 7 | bgr = cv2.imread(img_path) 8 | decoder = WatermarkDecoder('bytes', 136) 9 | watermark = decoder.decode(bgr, 'dwtDct') 10 | try: 11 | dec = watermark.decode('utf-8') 12 | except: 13 | dec = "null" 14 | print(dec) 15 | 16 | 17 | if __name__ == "__main__": 18 | fire.Fire(testit) -------------------------------------------------------------------------------- /VPD/stable-diffusion/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup( 4 | name='latent-diffusion', 5 | version='0.0.1', 6 | description='', 7 | packages=find_packages(), 8 | install_requires=[ 9 | 'torch', 10 | 'numpy', 11 | 'tqdm', 12 | ], 13 | ) -------------------------------------------------------------------------------- /VPD/stable-diffusion/src/taming-transformers/License.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2020 Patrick Esser and Robin Rombach and Björn Ommer 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 14 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 15 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
16 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, 17 | DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 18 | OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE 19 | OR OTHER DEALINGS IN THE SOFTWARE./ 20 | -------------------------------------------------------------------------------- /VPD/stable-diffusion/src/taming-transformers/configs/coco_cond_stage.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | base_learning_rate: 4.5e-06 3 | target: taming.models.vqgan.VQSegmentationModel 4 | params: 5 | embed_dim: 256 6 | n_embed: 1024 7 | image_key: "segmentation" 8 | n_labels: 183 9 | ddconfig: 10 | double_z: false 11 | z_channels: 256 12 | resolution: 256 13 | in_channels: 183 14 | out_ch: 183 15 | ch: 128 16 | ch_mult: 17 | - 1 18 | - 1 19 | - 2 20 | - 2 21 | - 4 22 | num_res_blocks: 2 23 | attn_resolutions: 24 | - 16 25 | dropout: 0.0 26 | 27 | lossconfig: 28 | target: taming.modules.losses.segmentation.BCELossWithQuant 29 | params: 30 | codebook_weight: 1.0 31 | 32 | data: 33 | target: main.DataModuleFromConfig 34 | params: 35 | batch_size: 12 36 | train: 37 | target: taming.data.coco.CocoImagesAndCaptionsTrain 38 | params: 39 | size: 296 40 | crop_size: 256 41 | onehot_segmentation: true 42 | use_stuffthing: true 43 | validation: 44 | target: taming.data.coco.CocoImagesAndCaptionsValidation 45 | params: 46 | size: 256 47 | crop_size: 256 48 | onehot_segmentation: true 49 | use_stuffthing: true 50 | -------------------------------------------------------------------------------- /VPD/stable-diffusion/src/taming-transformers/configs/custom_vqgan.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | base_learning_rate: 4.5e-6 3 | target: taming.models.vqgan.VQModel 4 | params: 5 | embed_dim: 256 6 | n_embed: 1024 7 | ddconfig: 8 | double_z: False 9 | z_channels: 256 10 | resolution: 256 11 | in_channels: 3 12 | out_ch: 3 13 | ch: 128 14 | ch_mult: [ 1,1,2,2,4] # num_down = len(ch_mult)-1 15 | num_res_blocks: 2 16 | attn_resolutions: [16] 17 | dropout: 0.0 18 | 19 | lossconfig: 20 | target: taming.modules.losses.vqperceptual.VQLPIPSWithDiscriminator 21 | params: 22 | disc_conditional: False 23 | disc_in_channels: 3 24 | disc_start: 10000 25 | disc_weight: 0.8 26 | codebook_weight: 1.0 27 | 28 | data: 29 | target: main.DataModuleFromConfig 30 | params: 31 | batch_size: 5 32 | num_workers: 8 33 | train: 34 | target: taming.data.custom.CustomTrain 35 | params: 36 | training_images_list_file: some/training.txt 37 | size: 256 38 | validation: 39 | target: taming.data.custom.CustomTest 40 | params: 41 | test_images_list_file: some/test.txt 42 | size: 256 43 | 44 | -------------------------------------------------------------------------------- /VPD/stable-diffusion/src/taming-transformers/configs/faceshq_transformer.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | base_learning_rate: 4.5e-06 3 | target: taming.models.cond_transformer.Net2NetTransformer 4 | params: 5 | cond_stage_key: coord 6 | transformer_config: 7 | target: taming.modules.transformer.mingpt.GPT 8 | params: 9 | vocab_size: 1024 10 | block_size: 512 11 | n_layer: 24 12 | n_head: 16 13 | n_embd: 1024 14 | first_stage_config: 15 | target: taming.models.vqgan.VQModel 16 | params: 17 | ckpt_path: logs/2020-11-09T13-33-36_faceshq_vqgan/checkpoints/last.ckpt 18 | embed_dim: 256 19 | n_embed: 1024 20 | ddconfig: 
21 | double_z: false 22 | z_channels: 256 23 | resolution: 256 24 | in_channels: 3 25 | out_ch: 3 26 | ch: 128 27 | ch_mult: 28 | - 1 29 | - 1 30 | - 2 31 | - 2 32 | - 4 33 | num_res_blocks: 2 34 | attn_resolutions: 35 | - 16 36 | dropout: 0.0 37 | lossconfig: 38 | target: taming.modules.losses.DummyLoss 39 | cond_stage_config: 40 | target: taming.modules.misc.coord.CoordStage 41 | params: 42 | n_embed: 1024 43 | down_factor: 16 44 | 45 | data: 46 | target: main.DataModuleFromConfig 47 | params: 48 | batch_size: 2 49 | num_workers: 8 50 | train: 51 | target: taming.data.faceshq.FacesHQTrain 52 | params: 53 | size: 256 54 | crop_size: 256 55 | coord: True 56 | validation: 57 | target: taming.data.faceshq.FacesHQValidation 58 | params: 59 | size: 256 60 | crop_size: 256 61 | coord: True 62 | -------------------------------------------------------------------------------- /VPD/stable-diffusion/src/taming-transformers/configs/faceshq_vqgan.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | base_learning_rate: 4.5e-6 3 | target: taming.models.vqgan.VQModel 4 | params: 5 | embed_dim: 256 6 | n_embed: 1024 7 | ddconfig: 8 | double_z: False 9 | z_channels: 256 10 | resolution: 256 11 | in_channels: 3 12 | out_ch: 3 13 | ch: 128 14 | ch_mult: [ 1,1,2,2,4] # num_down = len(ch_mult)-1 15 | num_res_blocks: 2 16 | attn_resolutions: [16] 17 | dropout: 0.0 18 | 19 | lossconfig: 20 | target: taming.modules.losses.vqperceptual.VQLPIPSWithDiscriminator 21 | params: 22 | disc_conditional: False 23 | disc_in_channels: 3 24 | disc_start: 30001 25 | disc_weight: 0.8 26 | codebook_weight: 1.0 27 | 28 | data: 29 | target: main.DataModuleFromConfig 30 | params: 31 | batch_size: 3 32 | num_workers: 8 33 | train: 34 | target: taming.data.faceshq.FacesHQTrain 35 | params: 36 | size: 256 37 | crop_size: 256 38 | validation: 39 | target: taming.data.faceshq.FacesHQValidation 40 | params: 41 | size: 256 42 | crop_size: 256 43 | -------------------------------------------------------------------------------- /VPD/stable-diffusion/src/taming-transformers/configs/imagenet_vqgan.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | base_learning_rate: 4.5e-6 3 | target: taming.models.vqgan.VQModel 4 | params: 5 | embed_dim: 256 6 | n_embed: 1024 7 | ddconfig: 8 | double_z: False 9 | z_channels: 256 10 | resolution: 256 11 | in_channels: 3 12 | out_ch: 3 13 | ch: 128 14 | ch_mult: [ 1,1,2,2,4] # num_down = len(ch_mult)-1 15 | num_res_blocks: 2 16 | attn_resolutions: [16] 17 | dropout: 0.0 18 | 19 | lossconfig: 20 | target: taming.modules.losses.vqperceptual.VQLPIPSWithDiscriminator 21 | params: 22 | disc_conditional: False 23 | disc_in_channels: 3 24 | disc_start: 250001 25 | disc_weight: 0.8 26 | codebook_weight: 1.0 27 | 28 | data: 29 | target: main.DataModuleFromConfig 30 | params: 31 | batch_size: 12 32 | num_workers: 24 33 | train: 34 | target: taming.data.imagenet.ImageNetTrain 35 | params: 36 | config: 37 | size: 256 38 | validation: 39 | target: taming.data.imagenet.ImageNetValidation 40 | params: 41 | config: 42 | size: 256 43 | -------------------------------------------------------------------------------- /VPD/stable-diffusion/src/taming-transformers/configs/imagenetdepth_vqgan.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | base_learning_rate: 4.5e-6 3 | target: taming.models.vqgan.VQModel 4 | params: 5 | embed_dim: 256 6 | n_embed: 1024 7 | image_key: depth 8 | 
ddconfig: 9 | double_z: False 10 | z_channels: 256 11 | resolution: 256 12 | in_channels: 1 13 | out_ch: 1 14 | ch: 128 15 | ch_mult: [ 1,1,2,2,4] # num_down = len(ch_mult)-1 16 | num_res_blocks: 2 17 | attn_resolutions: [16] 18 | dropout: 0.0 19 | 20 | lossconfig: 21 | target: taming.modules.losses.vqperceptual.VQLPIPSWithDiscriminator 22 | params: 23 | disc_conditional: False 24 | disc_in_channels: 1 25 | disc_start: 50001 26 | disc_weight: 0.75 27 | codebook_weight: 1.0 28 | 29 | data: 30 | target: main.DataModuleFromConfig 31 | params: 32 | batch_size: 3 33 | num_workers: 8 34 | train: 35 | target: taming.data.imagenet.ImageNetTrainWithDepth 36 | params: 37 | size: 256 38 | validation: 39 | target: taming.data.imagenet.ImageNetValidationWithDepth 40 | params: 41 | size: 256 42 | -------------------------------------------------------------------------------- /VPD/stable-diffusion/src/taming-transformers/configs/sflckr_cond_stage.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | base_learning_rate: 4.5e-06 3 | target: taming.models.vqgan.VQSegmentationModel 4 | params: 5 | embed_dim: 256 6 | n_embed: 1024 7 | image_key: "segmentation" 8 | n_labels: 182 9 | ddconfig: 10 | double_z: false 11 | z_channels: 256 12 | resolution: 256 13 | in_channels: 182 14 | out_ch: 182 15 | ch: 128 16 | ch_mult: 17 | - 1 18 | - 1 19 | - 2 20 | - 2 21 | - 4 22 | num_res_blocks: 2 23 | attn_resolutions: 24 | - 16 25 | dropout: 0.0 26 | 27 | lossconfig: 28 | target: taming.modules.losses.segmentation.BCELossWithQuant 29 | params: 30 | codebook_weight: 1.0 31 | 32 | data: 33 | target: cutlit.DataModuleFromConfig 34 | params: 35 | batch_size: 12 36 | train: 37 | target: taming.data.sflckr.Examples # adjust 38 | params: 39 | size: 256 40 | validation: 41 | target: taming.data.sflckr.Examples # adjust 42 | params: 43 | size: 256 44 | -------------------------------------------------------------------------------- /VPD/stable-diffusion/src/taming-transformers/environment.yaml: -------------------------------------------------------------------------------- 1 | name: taming 2 | channels: 3 | - pytorch 4 | - defaults 5 | dependencies: 6 | - python=3.8.5 7 | - pip=20.3 8 | - cudatoolkit=10.2 9 | - pytorch=1.7.0 10 | - torchvision=0.8.1 11 | - numpy=1.19.2 12 | - pip: 13 | - albumentations==0.4.3 14 | - opencv-python==4.1.2.30 15 | - pudb==2019.2 16 | - imageio==2.9.0 17 | - imageio-ffmpeg==0.4.2 18 | - pytorch-lightning==1.0.8 19 | - omegaconf==2.0.0 20 | - test-tube>=0.7.5 21 | - streamlit>=0.73.1 22 | - einops==0.3.0 23 | - more-itertools>=8.0.0 24 | - transformers==4.3.1 25 | - -e . 
26 | -------------------------------------------------------------------------------- /VPD/stable-diffusion/src/taming-transformers/scripts/extract_submodel.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import sys 3 | 4 | if __name__ == "__main__": 5 | inpath = sys.argv[1] 6 | outpath = sys.argv[2] 7 | submodel = "cond_stage_model" 8 | if len(sys.argv) > 3: 9 | submodel = sys.argv[3] 10 | 11 | print("Extracting {} from {} to {}.".format(submodel, inpath, outpath)) 12 | 13 | sd = torch.load(inpath, map_location="cpu") 14 | new_sd = {"state_dict": dict((k.split(".", 1)[-1],v) 15 | for k,v in sd["state_dict"].items() 16 | if k.startswith("cond_stage_model"))} 17 | torch.save(new_sd, outpath) 18 | -------------------------------------------------------------------------------- /VPD/stable-diffusion/src/taming-transformers/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup( 4 | name='taming-transformers', 5 | version='0.0.1', 6 | description='Taming Transformers for High-Resolution Image Synthesis', 7 | packages=find_packages(), 8 | install_requires=[ 9 | 'torch', 10 | 'numpy', 11 | 'tqdm', 12 | ], 13 | ) 14 | -------------------------------------------------------------------------------- /VPD/stable-diffusion/src/taming-transformers/taming/data/custom.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import albumentations 4 | from torch.utils.data import Dataset 5 | 6 | from taming.data.base import ImagePaths, NumpyPaths, ConcatDatasetWithIndex 7 | 8 | 9 | class CustomBase(Dataset): 10 | def __init__(self, *args, **kwargs): 11 | super().__init__() 12 | self.data = None 13 | 14 | def __len__(self): 15 | return len(self.data) 16 | 17 | def __getitem__(self, i): 18 | example = self.data[i] 19 | return example 20 | 21 | 22 | 23 | class CustomTrain(CustomBase): 24 | def __init__(self, size, training_images_list_file): 25 | super().__init__() 26 | with open(training_images_list_file, "r") as f: 27 | paths = f.read().splitlines() 28 | self.data = ImagePaths(paths=paths, size=size, random_crop=False) 29 | 30 | 31 | class CustomTest(CustomBase): 32 | def __init__(self, size, test_images_list_file): 33 | super().__init__() 34 | with open(test_images_list_file, "r") as f: 35 | paths = f.read().splitlines() 36 | self.data = ImagePaths(paths=paths, size=size, random_crop=False) 37 | 38 | 39 | -------------------------------------------------------------------------------- /VPD/stable-diffusion/src/taming-transformers/taming/data/helper_types.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, Tuple, Optional, NamedTuple, Union 2 | from PIL.Image import Image as pil_image 3 | from torch import Tensor 4 | 5 | try: 6 | from typing import Literal 7 | except ImportError: 8 | from typing_extensions import Literal 9 | 10 | Image = Union[Tensor, pil_image] 11 | BoundingBox = Tuple[float, float, float, float] # x0, y0, w, h 12 | CropMethodType = Literal['none', 'random', 'center', 'random-2d'] 13 | SplitType = Literal['train', 'validation', 'test'] 14 | 15 | 16 | class ImageDescription(NamedTuple): 17 | id: int 18 | file_name: str 19 | original_size: Tuple[int, int] # w, h 20 | url: Optional[str] = None 21 | license: Optional[int] = None 22 | coco_url: Optional[str] = None 23 | date_captured: Optional[str] = None 24 | 
flickr_url: Optional[str] = None 25 | flickr_id: Optional[str] = None 26 | coco_id: Optional[str] = None 27 | 28 | 29 | class Category(NamedTuple): 30 | id: str 31 | super_category: Optional[str] 32 | name: str 33 | 34 | 35 | class Annotation(NamedTuple): 36 | area: float 37 | image_id: str 38 | bbox: BoundingBox 39 | category_no: int 40 | category_id: str 41 | id: Optional[int] = None 42 | source: Optional[str] = None 43 | confidence: Optional[float] = None 44 | is_group_of: Optional[bool] = None 45 | is_truncated: Optional[bool] = None 46 | is_occluded: Optional[bool] = None 47 | is_depiction: Optional[bool] = None 48 | is_inside: Optional[bool] = None 49 | segmentation: Optional[Dict] = None 50 | -------------------------------------------------------------------------------- /VPD/stable-diffusion/src/taming-transformers/taming/lr_scheduler.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class LambdaWarmUpCosineScheduler: 5 | """ 6 | note: use with a base_lr of 1.0 7 | """ 8 | def __init__(self, warm_up_steps, lr_min, lr_max, lr_start, max_decay_steps, verbosity_interval=0): 9 | self.lr_warm_up_steps = warm_up_steps 10 | self.lr_start = lr_start 11 | self.lr_min = lr_min 12 | self.lr_max = lr_max 13 | self.lr_max_decay_steps = max_decay_steps 14 | self.last_lr = 0. 15 | self.verbosity_interval = verbosity_interval 16 | 17 | def schedule(self, n): 18 | if self.verbosity_interval > 0: 19 | if n % self.verbosity_interval == 0: print(f"current step: {n}, recent lr-multiplier: {self.last_lr}") 20 | if n < self.lr_warm_up_steps: 21 | lr = (self.lr_max - self.lr_start) / self.lr_warm_up_steps * n + self.lr_start 22 | self.last_lr = lr 23 | return lr 24 | else: 25 | t = (n - self.lr_warm_up_steps) / (self.lr_max_decay_steps - self.lr_warm_up_steps) 26 | t = min(t, 1.0) 27 | lr = self.lr_min + 0.5 * (self.lr_max - self.lr_min) * ( 28 | 1 + np.cos(t * np.pi)) 29 | self.last_lr = lr 30 | return lr 31 | 32 | def __call__(self, n): 33 | return self.schedule(n) 34 | 35 | -------------------------------------------------------------------------------- /VPD/stable-diffusion/src/taming-transformers/taming/models/dummy_cond_stage.py: -------------------------------------------------------------------------------- 1 | from torch import Tensor 2 | 3 | 4 | class DummyCondStage: 5 | def __init__(self, conditional_key): 6 | self.conditional_key = conditional_key 7 | self.train = None 8 | 9 | def eval(self): 10 | return self 11 | 12 | @staticmethod 13 | def encode(c: Tensor): 14 | return c, None, (None, None, c) 15 | 16 | @staticmethod 17 | def decode(c: Tensor): 18 | return c 19 | 20 | @staticmethod 21 | def to_rgb(c: Tensor): 22 | return c 23 | -------------------------------------------------------------------------------- /VPD/stable-diffusion/src/taming-transformers/taming/modules/losses/__init__.py: -------------------------------------------------------------------------------- 1 | from taming.modules.losses.vqperceptual import DummyLoss 2 | 3 | -------------------------------------------------------------------------------- /VPD/stable-diffusion/src/taming-transformers/taming/modules/losses/segmentation.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torch.nn.functional as F 3 | 4 | 5 | class BCELoss(nn.Module): 6 | def forward(self, prediction, target): 7 | loss = F.binary_cross_entropy_with_logits(prediction,target) 8 | return loss, {} 9 | 10 | 11 | class 
BCELossWithQuant(nn.Module): 12 | def __init__(self, codebook_weight=1.): 13 | super().__init__() 14 | self.codebook_weight = codebook_weight 15 | 16 | def forward(self, qloss, target, prediction, split): 17 | bce_loss = F.binary_cross_entropy_with_logits(prediction,target) 18 | loss = bce_loss + self.codebook_weight*qloss 19 | return loss, {"{}/total_loss".format(split): loss.clone().detach().mean(), 20 | "{}/bce_loss".format(split): bce_loss.detach().mean(), 21 | "{}/quant_loss".format(split): qloss.detach().mean() 22 | } 23 | -------------------------------------------------------------------------------- /VPD/stable-diffusion/src/taming-transformers/taming/modules/misc/coord.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | class CoordStage(object): 4 | def __init__(self, n_embed, down_factor): 5 | self.n_embed = n_embed 6 | self.down_factor = down_factor 7 | 8 | def eval(self): 9 | return self 10 | 11 | def encode(self, c): 12 | """fake vqmodel interface""" 13 | assert 0.0 <= c.min() and c.max() <= 1.0 14 | b,ch,h,w = c.shape 15 | assert ch == 1 16 | 17 | c = torch.nn.functional.interpolate(c, scale_factor=1/self.down_factor, 18 | mode="area") 19 | c = c.clamp(0.0, 1.0) 20 | c = self.n_embed*c 21 | c_quant = c.round() 22 | c_ind = c_quant.to(dtype=torch.long) 23 | 24 | info = None, None, c_ind 25 | return c_quant, None, info 26 | 27 | def decode(self, c): 28 | c = c/self.n_embed 29 | c = torch.nn.functional.interpolate(c, scale_factor=self.down_factor, 30 | mode="nearest") 31 | return c 32 | -------------------------------------------------------------------------------- /VPD/vpd/__init__.py: -------------------------------------------------------------------------------- 1 | from .models import UNetWrapper, TextAdapter -------------------------------------------------------------------------------- /ZoeDepth/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Intelligent Systems Lab Org 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /ZoeDepth/README.md: -------------------------------------------------------------------------------- 1 | # Benchmark on the paper ZoeDepth 2 | 3 | 1. Download the InSpaceType eval set.
Install [torch and torchvision](https://pytorch.org/get-started/previous-versions/) and the packages: matplotlib, tqdm, tensorboardX, timm, mmcv, opencv-python 4 | 5 | 2. Run the evaluation: 6 | 7 | ``` 8 | python demo.py -i split_files.txt -o outputs/ 9 | ``` 10 | 11 | The command generates report files for the hierarchy (H0-H2). *-all means the overall result, H0-H2 means the level of hierarchy, and H1_xx means the scene space type number. See [space_type_def.yml](https://github.com/DepthComputation/InSpaceType_Benchmark/blob/main/space_type_def.yml) for the space type number definition. 12 | -------------------------------------------------------------------------------- /ZoeDepth/assets/zoedepth-teaser.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DepthComputation/InSpaceType_Benchmark/4ab9760dbdf9153fc2845afd584762509f9fe0e0/ZoeDepth/assets/zoedepth-teaser.png -------------------------------------------------------------------------------- /ZoeDepth/environment.yml: -------------------------------------------------------------------------------- 1 | name: zoe 2 | channels: 3 | - pytorch 4 | - nvidia 5 | - conda-forge 6 | dependencies: 7 | - cuda=11.7.1 8 | - h5py=3.7.0 9 | - hdf5=1.12.2 10 | - matplotlib=3.6.2 11 | - matplotlib-base=3.6.2 12 | - numpy=1.24.1 13 | - opencv=4.6.0 14 | - pip=22.3.1 15 | - python=3.9.7 16 | - pytorch=1.13.1 17 | - pytorch-cuda=11.7 18 | - pytorch-mutex=1.0 19 | - scipy=1.10.0 20 | - torchaudio=0.13.1 21 | - torchvision=0.14.1 22 | - pip: 23 | - huggingface-hub==0.11.1 24 | - timm==0.6.12 25 | - tqdm==4.64.1 26 | - wandb==0.13.9 27 | -------------------------------------------------------------------------------- /ZoeDepth/sanity_hub.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | # Copyright (c) 2022 Intelligent Systems Lab Org 4 | 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE.
22 | 23 | # File author: Shariq Farooq Bhat 24 | 25 | import torch 26 | import numpy as np 27 | from torchvision.transforms import ToTensor 28 | from PIL import Image 29 | from zoedepth.utils.misc import get_image_from_url, colorize 30 | 31 | from zoedepth.models.builder import build_model 32 | from zoedepth.utils.config import get_config 33 | from pprint import pprint 34 | 35 | 36 | 37 | # Trigger reload of MiDaS 38 | torch.hub.help("intel-isl/MiDaS", "DPT_BEiT_L_384", force_reload=True) 39 | 40 | 41 | model = torch.hub.load(".", "ZoeD_K", source="local", pretrained=True) 42 | model = torch.hub.load(".", "ZoeD_NK", source="local", pretrained=True) 43 | model = torch.hub.load(".", "ZoeD_N", source="local", pretrained=True) 44 | -------------------------------------------------------------------------------- /ZoeDepth/ui/ui_requirements.txt: -------------------------------------------------------------------------------- 1 | gradio 2 | trimesh==3.9.42 -------------------------------------------------------------------------------- /ZoeDepth/zoedepth/data/__init__.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | # Copyright (c) 2022 Intelligent Systems Lab Org 4 | 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | # File author: Shariq Farooq Bhat 24 | 25 | -------------------------------------------------------------------------------- /ZoeDepth/zoedepth/models/__init__.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | # Copyright (c) 2022 Intelligent Systems Lab Org 4 | 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | # File author: Shariq Farooq Bhat 24 | 25 | -------------------------------------------------------------------------------- /ZoeDepth/zoedepth/models/base_models/__init__.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | # Copyright (c) 2022 Intelligent Systems Lab Org 4 | 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | # File author: Shariq Farooq Bhat 24 | 25 | -------------------------------------------------------------------------------- /ZoeDepth/zoedepth/models/zoedepth/__init__.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | # Copyright (c) 2022 Intelligent Systems Lab Org 4 | 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 
22 | 23 | # File author: Shariq Farooq Bhat 24 | 25 | from .zoedepth_v1 import ZoeDepth 26 | 27 | all_versions = { 28 | "v1": ZoeDepth, 29 | } 30 | 31 | get_version = lambda v : all_versions[v] -------------------------------------------------------------------------------- /ZoeDepth/zoedepth/models/zoedepth/config_zoedepth.json: -------------------------------------------------------------------------------- 1 | { 2 | "model": { 3 | "name": "ZoeDepth", 4 | "version_name": "v1", 5 | "n_bins": 64, 6 | "bin_embedding_dim": 128, 7 | "bin_centers_type": "softplus", 8 | "n_attractors":[16, 8, 4, 1], 9 | "attractor_alpha": 1000, 10 | "attractor_gamma": 2, 11 | "attractor_kind" : "mean", 12 | "attractor_type" : "inv", 13 | "midas_model_type" : "DPT_BEiT_L_384", 14 | "min_temp": 0.0212, 15 | "max_temp": 50.0, 16 | "output_distribution": "logbinomial", 17 | "memory_efficient": true, 18 | "inverse_midas": false, 19 | "img_size": [384, 512] 20 | }, 21 | 22 | "train": { 23 | "train_midas": true, 24 | "use_pretrained_midas": true, 25 | "trainer": "zoedepth", 26 | "epochs": 5, 27 | "bs": 16, 28 | "optim_kwargs": {"lr": 0.000161, "wd": 0.01}, 29 | "sched_kwargs": {"div_factor": 1, "final_div_factor": 10000, "pct_start": 0.7, "three_phase":false, "cycle_momentum": true}, 30 | "same_lr": false, 31 | "w_si": 1, 32 | "w_domain": 0.2, 33 | "w_reg": 0, 34 | "w_grad": 0, 35 | "avoid_boundary": false, 36 | "random_crop": false, 37 | "input_width": 640, 38 | "input_height": 480, 39 | "midas_lr_factor": 1, 40 | "encoder_lr_factor":10, 41 | "pos_enc_lr_factor":10, 42 | "freeze_midas_bn": true 43 | 44 | }, 45 | 46 | "infer":{ 47 | "train_midas": false, 48 | "use_pretrained_midas": false, 49 | "pretrained_resource" : "url::https://github.com/isl-org/ZoeDepth/releases/download/v1.0/ZoeD_M12_N.pt", 50 | "force_keep_ar": true 51 | }, 52 | 53 | "eval":{ 54 | "train_midas": false, 55 | "use_pretrained_midas": false, 56 | "pretrained_resource" : "url::https://github.com/isl-org/ZoeDepth/releases/download/v1.0/ZoeD_M12_N.pt" 57 | } 58 | } -------------------------------------------------------------------------------- /ZoeDepth/zoedepth/models/zoedepth/config_zoedepth_kitti.json: -------------------------------------------------------------------------------- 1 | { 2 | "model": { 3 | "bin_centers_type": "normed", 4 | "img_size": [384, 768] 5 | }, 6 | 7 | "train": { 8 | }, 9 | 10 | "infer":{ 11 | "train_midas": false, 12 | "use_pretrained_midas": false, 13 | "pretrained_resource" : "url::https://github.com/isl-org/ZoeDepth/releases/download/v1.0/ZoeD_M12_K.pt", 14 | "force_keep_ar": true 15 | }, 16 | 17 | "eval":{ 18 | "train_midas": false, 19 | "use_pretrained_midas": false, 20 | "pretrained_resource" : "url::https://github.com/isl-org/ZoeDepth/releases/download/v1.0/ZoeD_M12_K.pt" 21 | } 22 | } -------------------------------------------------------------------------------- /ZoeDepth/zoedepth/models/zoedepth_nk/__init__.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | # Copyright (c) 2022 Intelligent Systems Lab Org 4 | 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the 
following conditions: 11 | 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | # File author: Shariq Farooq Bhat 24 | 25 | from .zoedepth_nk_v1 import ZoeDepthNK 26 | 27 | all_versions = { 28 | "v1": ZoeDepthNK, 29 | } 30 | 31 | get_version = lambda v : all_versions[v] -------------------------------------------------------------------------------- /ZoeDepth/zoedepth/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | # Copyright (c) 2022 Intelligent Systems Lab Org 4 | 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | # File author: Shariq Farooq Bhat 24 | 25 | -------------------------------------------------------------------------------- /ZoeDepth/zoedepth/utils/arg_utils.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | def infer_type(x): # hacky way to infer type from string args 4 | if not isinstance(x, str): 5 | return x 6 | 7 | try: 8 | x = int(x) 9 | return x 10 | except ValueError: 11 | pass 12 | 13 | try: 14 | x = float(x) 15 | return x 16 | except ValueError: 17 | pass 18 | 19 | return x 20 | 21 | 22 | def parse_unknown(unknown_args): 23 | clean = [] 24 | for a in unknown_args: 25 | if "=" in a: 26 | k, v = a.split("=", 1) # split on the first '=' only, so values may themselves contain '=' 27 | clean.extend([k, v]) 28 | else: 29 | clean.append(a) 30 | 31 | keys = clean[::2] 32 | values = clean[1::2] 33 | return {k.replace("--", ""): infer_type(v) for k, v in zip(keys, values)} 34 | -------------------------------------------------------------------------------- /bts/README.md: -------------------------------------------------------------------------------- 1 | # Benchmark on From Big to Small: Multi-Scale Local Planar Guidance for Monocular Depth Estimation 2 | 3 | 1. Download the InSpaceType eval set. 
Install [torch and torchvision](https://pytorch.org/get-started/previous-versions/) and the packages: matplotlib, tqdm, pandas, opencv-python, tensorboardX 4 | 5 | 2. Download the pretrained model 'bts_nyu_v2_pytorch_densenet161.zip' from the [Official Link](https://cogaplex-bts.s3.ap-northeast-2.amazonaws.com/bts_nyu_v2_pytorch_densenet161.zip) and extract it under 'models' 6 | 7 | 3. Run the evaluation: 8 | 9 | ``` 10 | cd pytorch 11 | python bts_test.py --dataset nyu --filenames_file ../train_test_inputs/split_files.txt --checkpoint_path models/bts_nyu_v2_pytorch_densenet161/model --max_depth 10 --encoder densenet161_bts --model_name bts_nyu_v2_pytorch_densenet161 12 | ``` 13 | 14 | The command generates report files for each level of the hierarchy (H0-H2). *-all means the overall result, H0-H2 indicate the hierarchy level, and H1_xx refers to the scene space type number. See [space_type_def.yml](https://github.com/DepthComputation/InSpaceType_Benchmark/blob/main/space_type_def.yml) for the space type number definition (a short parsing sketch follows the YAML dump below). 15 | 16 | -------------------------------------------------------------------------------- /bts/pytorch/run_bts_live_3d.sh: -------------------------------------------------------------------------------- 1 | python3 bts_live_3d.py --model_name bts_nyu_v2_pytorch_densenet161 --encoder densenet161_bts --checkpoint_path ./models/bts_nyu_v2_pytorch_densenet161/model --max_depth 10 --input_height 480 --input_width 640 2 | -------------------------------------------------------------------------------- /bts/utils/download_from_gdrive.py: -------------------------------------------------------------------------------- 1 | # Source: https://stackoverflow.com/a/39225039 2 | 3 | import requests 4 | 5 | 6 | def download_file_from_google_drive(id, destination): 7 | def get_confirm_token(response): 8 | for key, value in response.cookies.items(): 9 | if key.startswith('download_warning'): 10 | return value 11 | 12 | return None 13 | 14 | def save_response_content(response, destination): 15 | CHUNK_SIZE = 32768 16 | 17 | with open(destination, "wb") as f: 18 | for chunk in response.iter_content(CHUNK_SIZE): 19 | if chunk: # filter out keep-alive new chunks 20 | f.write(chunk) 21 | 22 | URL = "https://docs.google.com/uc?export=download" 23 | 24 | session = requests.Session() 25 | 26 | response = session.get(URL, params = { 'id' : id }, stream = True) 27 | token = get_confirm_token(response) 28 | 29 | if token: 30 | params = { 'id' : id, 'confirm' : token } 31 | response = session.get(URL, params = params, stream = True) 32 | 33 | save_response_content(response, destination) 34 | 35 | 36 | if __name__ == "__main__": 37 | import sys 38 | if len(sys.argv) != 3: 39 | print("Usage: python download_from_gdrive.py drive_file_id destination_file_path") 40 | else: 41 | # TAKE ID FROM SHAREABLE LINK 42 | file_id = sys.argv[1] 43 | # DESTINATION FILE ON YOUR DISK 44 | destination = sys.argv[2] 45 | download_file_from_google_drive(file_id, destination) 46 | -------------------------------------------------------------------------------- /bts/utils/splits.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DepthComputation/InSpaceType_Benchmark/4ab9760dbdf9153fc2845afd584762509f9fe0e0/bts/utils/splits.mat -------------------------------------------------------------------------------- /pics/dataset-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DepthComputation/InSpaceType_Benchmark/4ab9760dbdf9153fc2845afd584762509f9fe0e0/pics/dataset-1.png 
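Aside: the `parse_unknown` helper dumped above in `ZoeDepth/zoedepth/utils/arg_utils.py` is easiest to understand from a concrete call. The sketch below is illustrative only: the declared argparse flag and the command-line values are made up for the example, and the import assumes the script is run from the ZoeDepth directory so that the `zoedepth` package is importable.

```python
# Illustrative use of parse_unknown() from zoedepth/utils/arg_utils.py:
# unknown CLI flags are turned into a {key: value} dict of config overrides,
# with int/float values inferred from the strings by infer_type().
import argparse

from zoedepth.utils.arg_utils import parse_unknown

parser = argparse.ArgumentParser()
parser.add_argument("-m", "--model", default="zoedepth")

# Hypothetical command line: only --model is a declared flag; everything else
# is collected as "unknown" and handed to parse_unknown().
args, unknown = parser.parse_known_args(
    ["--model", "zoedepth_nk", "--bs=8", "--lr", "0.000161", "--dataset", "nyu"]
)
overrides = parse_unknown(unknown)
print(overrides)  # {'bs': 8, 'lr': 0.000161, 'dataset': 'nyu'}
```

Both `--key=value` and `--key value` forms are accepted, which is why the helper first flattens the arguments into alternating key/value entries before pairing them up.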
-------------------------------------------------------------------------------- /pics/dataset-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DepthComputation/InSpaceType_Benchmark/4ab9760dbdf9153fc2845afd584762509f9fe0e0/pics/dataset-2.png -------------------------------------------------------------------------------- /pics/fitting.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DepthComputation/InSpaceType_Benchmark/4ab9760dbdf9153fc2845afd584762509f9fe0e0/pics/fitting.png -------------------------------------------------------------------------------- /pics/group.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DepthComputation/InSpaceType_Benchmark/4ab9760dbdf9153fc2845afd584762509f9fe0e0/pics/group.png -------------------------------------------------------------------------------- /pics/heirarchy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DepthComputation/InSpaceType_Benchmark/4ab9760dbdf9153fc2845afd584762509f9fe0e0/pics/heirarchy.png -------------------------------------------------------------------------------- /pics/mitigation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DepthComputation/InSpaceType_Benchmark/4ab9760dbdf9153fc2845afd584762509f9fe0e0/pics/mitigation.png -------------------------------------------------------------------------------- /pics/overall.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DepthComputation/InSpaceType_Benchmark/4ab9760dbdf9153fc2845afd584762509f9fe0e0/pics/overall.png -------------------------------------------------------------------------------- /pics/type.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DepthComputation/InSpaceType_Benchmark/4ab9760dbdf9153fc2845afd584762509f9fe0e0/pics/type.png -------------------------------------------------------------------------------- /space_type_def.yml: -------------------------------------------------------------------------------- 1 | H0: 2 | 1: Household Space 3 | 2: Workspace 4 | 3: Campus 5 | 4: Functional Space 6 | 7 | H1: 8 | 1: Private 9 | 2: Office 10 | 3: Hallway 11 | 4: Lounge 12 | 5: Meeting 13 | 6: Large 14 | 7: Classroom 15 | 8: Library 16 | 9: Kitchen 17 | 10: Playroom 18 | 11: Living 19 | 12: Bathroom 20 | 21 | H2: 22 | 1: Hotel Room 23 | 2: Western-style Bedroom 24 | 3: Eastern-style Bedroom 25 | 4: Storage Room 26 | 5: Dressing Room 27 | 6: Entrance 28 | 7: Private Chamber 29 | 8: Lab Space 30 | 9: Mail Room 31 | 10: Eastern-style Workspace 32 | 11: Narrow Hallway 33 | 12: Wider Hallway 34 | 13: Lounge 35 | 14: Meeting Room 36 | 15: Auditorium 37 | 16: Reception Desk 38 | 17: Banquet Room 39 | 18: Auditorium Entryway 40 | 19: Classroom 41 | 20: Study Space 42 | 21: Bookshelf 43 | 22: Asian-style Kitchen 44 | 23: Playroom 45 | 24: Asian-style Living Room 46 | 25: American-Style Living Room 47 | 26: Bathroom --------------------------------------------------------------------------------
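To close the loop with the benchmark READMEs above, which label their per-type report columns with codes such as H1_xx, here is a minimal sketch of resolving those codes against `space_type_def.yml`. It assumes PyYAML is installed and the working directory is the repository root; the helper name `type_name` is purely illustrative.

```python
# Minimal sketch: resolve hierarchy codes such as "H1_9" to the space type
# names defined in space_type_def.yml. Assumes PyYAML is installed and the
# script is run from the repository root.
import yaml

with open("space_type_def.yml") as f:
    space_types = yaml.safe_load(f)  # {"H0": {1: "Household Space", ...}, ...}


def type_name(code):
    """Map a code like 'H1_9' to its space type name, e.g. 'Kitchen'."""
    level, idx = code.split("_")
    return space_types[level][int(idx)]


print(type_name("H0_2"))   # Workspace
print(type_name("H1_9"))   # Kitchen
print(type_name("H2_26"))  # Bathroom
```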