├── .gitignore ├── LICENSE ├── README.md ├── app.py ├── checkpoints └── model-weights-here.txt ├── datasets └── youtube_vos.py ├── inference.py ├── inputs ├── dilated-masks │ └── car-turn.png ├── edited-first-frames │ ├── bear-elephant.png │ └── car-turn-inpainted.png ├── frames │ ├── bear │ │ ├── 00000.jpg │ │ ├── 00001.jpg │ │ ├── 00002.jpg │ │ ├── 00003.jpg │ │ ├── 00004.jpg │ │ ├── 00005.jpg │ │ ├── 00006.jpg │ │ ├── 00007.jpg │ │ ├── 00008.jpg │ │ ├── 00009.jpg │ │ ├── 00010.jpg │ │ ├── 00011.jpg │ │ ├── 00012.jpg │ │ ├── 00013.jpg │ │ ├── 00014.jpg │ │ └── 00015.jpg │ └── car-turn │ │ ├── 00000.jpg │ │ ├── 00001.jpg │ │ ├── 00002.jpg │ │ ├── 00003.jpg │ │ ├── 00004.jpg │ │ ├── 00005.jpg │ │ ├── 00006.jpg │ │ ├── 00007.jpg │ │ ├── 00008.jpg │ │ ├── 00009.jpg │ │ ├── 00010.jpg │ │ ├── 00011.jpg │ │ ├── 00012.jpg │ │ ├── 00013.jpg │ │ ├── 00014.jpg │ │ └── 00015.jpg ├── hand-drawn-sketches │ └── bear-elephant-sketch.png ├── masks │ ├── bear.png │ └── car-turn.png └── reference-images │ └── raccoon.jpg ├── install_conda.sh ├── install_pip.sh ├── models ├── __init__.py ├── anydoor │ ├── .gitignore │ ├── LICENSE.txt │ ├── assets │ │ └── Figures │ │ │ ├── Teaser.png │ │ │ ├── gradio.png │ │ │ └── tryon.png │ ├── cldm │ │ ├── cldm.py │ │ ├── ddim_hacked.py │ │ ├── hack.py │ │ ├── logger.py │ │ └── model.py │ ├── cog.yaml │ ├── configs │ │ ├── anydoor.yaml │ │ ├── datasets.yaml │ │ ├── demo.yaml │ │ └── inference.yaml │ ├── datasets │ │ ├── Preprocess │ │ │ ├── mvimagenet.txt │ │ │ └── uvo_process.py │ │ ├── base.py │ │ ├── data_utils.py │ │ ├── dreambooth.py │ │ ├── dresscode.py │ │ ├── fashiontryon.py │ │ ├── lvis.py │ │ ├── mose.py │ │ ├── mvimagenet.py │ │ ├── saliency_modular.py │ │ ├── sam.py │ │ ├── uvo.py │ │ ├── uvo_val.py │ │ ├── vipseg.py │ │ ├── vitonhd.py │ │ ├── ytb_vis.py │ │ └── ytb_vos.py │ ├── dinov2 │ │ ├── .github │ │ │ └── workflows │ │ │ │ └── lint.yaml │ │ ├── .gitignore │ │ ├── CODE_OF_CONDUCT.md │ │ ├── CONTRIBUTING.md │ │ ├── LICENSE │ │ ├── MODEL_CARD.md │ │ ├── README.md │ │ ├── conda.yaml │ │ ├── dinov2 │ │ │ ├── __init__.py │ │ │ ├── configs │ │ │ │ ├── __init__.py │ │ │ │ ├── eval │ │ │ │ │ ├── vitb14_pretrain.yaml │ │ │ │ │ ├── vitg14_pretrain.yaml │ │ │ │ │ ├── vitl14_pretrain.yaml │ │ │ │ │ └── vits14_pretrain.yaml │ │ │ │ ├── ssl_default_config.yaml │ │ │ │ └── train │ │ │ │ │ ├── vitg14.yaml │ │ │ │ │ ├── vitl14.yaml │ │ │ │ │ └── vitl16_short.yaml │ │ │ ├── data │ │ │ │ ├── __init__.py │ │ │ │ ├── adapters.py │ │ │ │ ├── augmentations.py │ │ │ │ ├── collate.py │ │ │ │ ├── datasets │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── decoders.py │ │ │ │ │ ├── extended.py │ │ │ │ │ ├── image_net.py │ │ │ │ │ └── image_net_22k.py │ │ │ │ ├── loaders.py │ │ │ │ ├── masking.py │ │ │ │ ├── samplers.py │ │ │ │ └── transforms.py │ │ │ ├── distributed │ │ │ │ └── __init__.py │ │ │ ├── eval │ │ │ │ ├── __init__.py │ │ │ │ ├── knn.py │ │ │ │ ├── linear.py │ │ │ │ ├── log_regression.py │ │ │ │ ├── metrics.py │ │ │ │ ├── setup.py │ │ │ │ └── utils.py │ │ │ ├── fsdp │ │ │ │ └── __init__.py │ │ │ ├── layers │ │ │ │ ├── __init__.py │ │ │ │ ├── attention.py │ │ │ │ ├── block.py │ │ │ │ ├── dino_head.py │ │ │ │ ├── drop_path.py │ │ │ │ ├── layer_scale.py │ │ │ │ ├── mlp.py │ │ │ │ ├── patch_embed.py │ │ │ │ └── swiglu_ffn.py │ │ │ ├── logging │ │ │ │ ├── __init__.py │ │ │ │ └── helpers.py │ │ │ ├── loss │ │ │ │ ├── __init__.py │ │ │ │ ├── dino_clstoken_loss.py │ │ │ │ ├── ibot_patch_loss.py │ │ │ │ └── koleo_loss.py │ │ │ ├── models │ │ │ │ ├── __init__.py │ │ │ │ └── 
vision_transformer.py │ │ │ ├── run │ │ │ │ ├── __init__.py │ │ │ │ ├── eval │ │ │ │ │ ├── knn.py │ │ │ │ │ ├── linear.py │ │ │ │ │ └── log_regression.py │ │ │ │ ├── submit.py │ │ │ │ └── train │ │ │ │ │ └── train.py │ │ │ ├── train │ │ │ │ ├── __init__.py │ │ │ │ ├── ssl_meta_arch.py │ │ │ │ └── train.py │ │ │ └── utils │ │ │ │ ├── __init__.py │ │ │ │ ├── cluster.py │ │ │ │ ├── config.py │ │ │ │ ├── dtype.py │ │ │ │ ├── param_groups.py │ │ │ │ └── utils.py │ │ ├── hubconf.py │ │ ├── pyproject.toml │ │ ├── requirements-dev.txt │ │ ├── requirements.txt │ │ ├── scripts │ │ │ └── lint.sh │ │ ├── setup.cfg │ │ └── setup.py │ ├── environment.yaml │ ├── examples │ │ ├── Gradio │ │ │ ├── BG │ │ │ │ ├── 00.png │ │ │ │ ├── 01.png │ │ │ │ ├── 02.png │ │ │ │ ├── 03.png │ │ │ │ ├── 04.jpg │ │ │ │ ├── 04.png │ │ │ │ ├── 06.png │ │ │ │ ├── 07.png │ │ │ │ ├── 08.jpg │ │ │ │ ├── 13.jpg │ │ │ │ ├── 17.jpg │ │ │ │ └── 22.png │ │ │ └── FG │ │ │ │ ├── 00.jpg │ │ │ │ ├── 01.jpg │ │ │ │ ├── 04.jpg │ │ │ │ ├── 06.jpg │ │ │ │ ├── 07.png │ │ │ │ ├── 09.jpg │ │ │ │ ├── 18.png │ │ │ │ ├── 22.jpg │ │ │ │ ├── 25.png │ │ │ │ ├── 28.png │ │ │ │ ├── 33.png │ │ │ │ ├── 36.jpg │ │ │ │ ├── 39.jpg │ │ │ │ ├── 43.jpg │ │ │ │ ├── 44.jpg │ │ │ │ └── 50.jpg │ │ └── TestDreamBooth │ │ │ ├── BG │ │ │ ├── 000000047948_GT.png │ │ │ ├── 000000047948_mask.png │ │ │ ├── 000000309203_GT.png │ │ │ └── 000000309203_mask.png │ │ │ ├── FG │ │ │ ├── 00.png │ │ │ ├── 01.png │ │ │ ├── 02.png │ │ │ └── 03.png │ │ │ └── GEN │ │ │ └── gen_res.png │ ├── iseg │ │ ├── coarse_mask_refine.pth │ │ └── coarse_mask_refine_util.py │ ├── ldm │ │ ├── data │ │ │ ├── __init__.py │ │ │ └── util.py │ │ ├── models │ │ │ ├── autoencoder.py │ │ │ └── diffusion │ │ │ │ ├── __init__.py │ │ │ │ ├── ddim.py │ │ │ │ ├── ddpm.py │ │ │ │ ├── dpm_solver │ │ │ │ ├── __init__.py │ │ │ │ ├── dpm_solver.py │ │ │ │ └── sampler.py │ │ │ │ ├── plms.py │ │ │ │ └── sampling_util.py │ │ ├── modules │ │ │ ├── attention.py │ │ │ ├── diffusionmodules │ │ │ │ ├── __init__.py │ │ │ │ ├── model.py │ │ │ │ ├── openaimodel.py │ │ │ │ ├── upscaling.py │ │ │ │ └── util.py │ │ │ ├── distributions │ │ │ │ ├── __init__.py │ │ │ │ └── distributions.py │ │ │ ├── ema.py │ │ │ ├── encoders │ │ │ │ ├── __init__.py │ │ │ │ └── modules.py │ │ │ ├── image_degradation │ │ │ │ ├── __init__.py │ │ │ │ ├── bsrgan.py │ │ │ │ ├── bsrgan_light.py │ │ │ │ ├── utils │ │ │ │ │ └── test.png │ │ │ │ └── utils_image.py │ │ │ └── midas │ │ │ │ ├── __init__.py │ │ │ │ ├── api.py │ │ │ │ ├── midas │ │ │ │ ├── __init__.py │ │ │ │ ├── base_model.py │ │ │ │ ├── blocks.py │ │ │ │ ├── dpt_depth.py │ │ │ │ ├── midas_net.py │ │ │ │ ├── midas_net_custom.py │ │ │ │ ├── transforms.py │ │ │ │ └── vit.py │ │ │ │ └── utils.py │ │ └── util.py │ ├── predict.py │ ├── readme.md │ ├── requirements.txt │ ├── run_dataset_debug.py │ ├── run_gradio_demo.py │ ├── run_inference.py │ ├── run_train_anydoor.py │ ├── scripts │ │ ├── convert_weight.sh │ │ ├── inference.sh │ │ └── train.sh │ └── tool_add_control_sd21.py ├── canny │ ├── __pycache__ │ │ ├── canny_filter.cpython-310.pyc │ │ ├── filter.cpython-310.pyc │ │ ├── gaussian.cpython-310.pyc │ │ ├── kernels.cpython-310.pyc │ │ └── sobel.cpython-310.pyc │ ├── canny_filter.py │ ├── filter.py │ ├── gaussian.py │ ├── kernels.py │ └── sobel.py ├── controlnet_inpaint │ └── pipeline.py ├── ctrl_adapter │ ├── __init__.py │ ├── adapter_spatial_temporal.py │ ├── controlnet.py │ ├── ctrl_adapter.py │ └── resnet_block_2d.py ├── depth_completion_net │ ├── deformation_net.py │ ├── deformconv.py │ └── 
rfc_net.py ├── i2vgenxl │ ├── i2vgenxl_ctrl_adapter_pipeline.py │ └── i2vgenxl_unet.py ├── midas │ ├── backbones │ │ ├── __pycache__ │ │ │ ├── beit.cpython-310.pyc │ │ │ ├── levit.cpython-310.pyc │ │ │ ├── swin.cpython-310.pyc │ │ │ ├── swin2.cpython-310.pyc │ │ │ ├── swin_common.cpython-310.pyc │ │ │ ├── utils.cpython-310.pyc │ │ │ └── vit.cpython-310.pyc │ │ ├── beit.py │ │ ├── levit.py │ │ ├── next_vit.py │ │ ├── swin.py │ │ ├── swin2.py │ │ ├── swin_common.py │ │ ├── utils.py │ │ └── vit.py │ ├── base_model.py │ ├── blocks.py │ ├── dpt_depth.py │ ├── midas.py │ ├── midas_net.py │ ├── midas_net_custom.py │ ├── model_loader.py │ └── transforms.py ├── raft │ ├── __init__.py │ ├── corr.py │ ├── extractor.py │ ├── raft.py │ ├── update.py │ └── utils │ │ ├── __init__.py │ │ ├── __pycache__ │ │ ├── __init__.cpython-310.pyc │ │ ├── flow_viz.cpython-310.pyc │ │ ├── frame_utils.cpython-310.pyc │ │ └── utils.cpython-310.pyc │ │ ├── augmentor.py │ │ ├── flow_viz.py │ │ ├── flow_viz_pt.py │ │ ├── frame_utils.py │ │ └── utils.py └── u2net │ ├── __init__.py │ └── u2net.py ├── runners ├── __init__.py ├── anydoor_inference_runner.py ├── completion_net_inference_runner.py ├── completion_net_train_runner.py ├── controlnet_inpaint_inference_runner.py ├── i2vgenxl_ctrl_adapter_inference_runner.py ├── instructpix2pix_inference_runner.py ├── iterative_warping │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-310.pyc │ │ ├── get_averaged_depths.cpython-310.pyc │ │ ├── run_flow_extraction.cpython-310.pyc │ │ ├── run_torch_average_flow_warping.cpython-310.pyc │ │ ├── run_warp_with_averaged_flow.cpython-310.pyc │ │ └── warp_utils.cpython-310.pyc │ ├── get_averaged_depths.py │ ├── get_editing_region.py │ ├── raft │ │ ├── LICENSE │ │ ├── README.md │ │ ├── alt_cuda_corr │ │ │ ├── correlation.cpp │ │ │ ├── correlation_kernel.cu │ │ │ └── setup.py │ │ ├── chairs_split.txt │ │ ├── core │ │ │ ├── __init__.py │ │ │ ├── __pycache__ │ │ │ │ ├── __init__.cpython-310.pyc │ │ │ │ ├── corr.cpython-310.pyc │ │ │ │ ├── extractor.cpython-310.pyc │ │ │ │ ├── raft.cpython-310.pyc │ │ │ │ └── update.cpython-310.pyc │ │ │ ├── corr.py │ │ │ ├── datasets.py │ │ │ ├── extractor.py │ │ │ ├── raft.py │ │ │ ├── update.py │ │ │ └── utils │ │ │ │ ├── __init__.py │ │ │ │ ├── __pycache__ │ │ │ │ ├── __init__.cpython-310.pyc │ │ │ │ ├── flow_viz.cpython-310.pyc │ │ │ │ └── utils.cpython-310.pyc │ │ │ │ ├── augmentor.py │ │ │ │ ├── flow_viz.py │ │ │ │ ├── frame_utils.py │ │ │ │ └── utils.py │ │ ├── demo.py │ │ ├── download_models.sh │ │ ├── evaluate.py │ │ ├── extract-flow-from-frames.py │ │ ├── extract-flow-from-two-images.py │ │ ├── train.py │ │ ├── train_mixed.sh │ │ └── train_standard.sh │ ├── run_extract_images_depths.py │ ├── run_flow_extraction.py │ ├── run_numpy_average_flow_warping.py │ ├── run_torch_average_flow_warping.py │ ├── run_warp.py │ ├── run_warp_with_averaged_flow.py │ └── warp_utils.py ├── iterative_warping_runner.py ├── midas_depth_estimation_runner.py ├── paint_by_example_inference_runner.py ├── stable_diffusion_inpaint_inference_runner.py └── u2net_saliency_detection_runner.py ├── scripts ├── extract_youtube_vos_depths.py ├── extract_youtube_vos_shapes.py ├── inference_controlnet_inpaint.py └── run_dilate_mask.py ├── train_completion_net.py └── utils ├── __pycache__ ├── file_client.cpython-310.pyc ├── flow_utils.cpython-310.pyc ├── loss_utils.cpython-310.pyc ├── lr_scheduler_utils.cpython-310.pyc ├── mask_utils.cpython-310.pyc └── utils.cpython-310.pyc ├── file_client.py ├── flow_utils.py ├── loss_utils.py ├── 
lr_scheduler_utils.py ├── mask_utils.py └── utils.py /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 USTC-liuchang 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /checkpoints/model-weights-here.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/checkpoints/model-weights-here.txt -------------------------------------------------------------------------------- /inputs/dilated-masks/car-turn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/inputs/dilated-masks/car-turn.png -------------------------------------------------------------------------------- /inputs/edited-first-frames/bear-elephant.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/inputs/edited-first-frames/bear-elephant.png -------------------------------------------------------------------------------- /inputs/edited-first-frames/car-turn-inpainted.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/inputs/edited-first-frames/car-turn-inpainted.png -------------------------------------------------------------------------------- /inputs/frames/bear/00000.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/inputs/frames/bear/00000.jpg -------------------------------------------------------------------------------- /inputs/frames/bear/00001.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/inputs/frames/bear/00001.jpg -------------------------------------------------------------------------------- /inputs/frames/bear/00002.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/inputs/frames/bear/00002.jpg -------------------------------------------------------------------------------- /inputs/frames/bear/00003.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/inputs/frames/bear/00003.jpg -------------------------------------------------------------------------------- /inputs/frames/bear/00004.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/inputs/frames/bear/00004.jpg -------------------------------------------------------------------------------- /inputs/frames/bear/00005.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/inputs/frames/bear/00005.jpg -------------------------------------------------------------------------------- /inputs/frames/bear/00006.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/inputs/frames/bear/00006.jpg -------------------------------------------------------------------------------- /inputs/frames/bear/00007.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/inputs/frames/bear/00007.jpg -------------------------------------------------------------------------------- /inputs/frames/bear/00008.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/inputs/frames/bear/00008.jpg -------------------------------------------------------------------------------- /inputs/frames/bear/00009.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/inputs/frames/bear/00009.jpg -------------------------------------------------------------------------------- /inputs/frames/bear/00010.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/inputs/frames/bear/00010.jpg -------------------------------------------------------------------------------- /inputs/frames/bear/00011.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/inputs/frames/bear/00011.jpg -------------------------------------------------------------------------------- /inputs/frames/bear/00012.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/inputs/frames/bear/00012.jpg -------------------------------------------------------------------------------- /inputs/frames/bear/00013.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/inputs/frames/bear/00013.jpg -------------------------------------------------------------------------------- /inputs/frames/bear/00014.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/inputs/frames/bear/00014.jpg -------------------------------------------------------------------------------- /inputs/frames/bear/00015.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/inputs/frames/bear/00015.jpg -------------------------------------------------------------------------------- /inputs/frames/car-turn/00000.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/inputs/frames/car-turn/00000.jpg -------------------------------------------------------------------------------- /inputs/frames/car-turn/00001.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/inputs/frames/car-turn/00001.jpg -------------------------------------------------------------------------------- /inputs/frames/car-turn/00002.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/inputs/frames/car-turn/00002.jpg -------------------------------------------------------------------------------- /inputs/frames/car-turn/00003.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/inputs/frames/car-turn/00003.jpg -------------------------------------------------------------------------------- /inputs/frames/car-turn/00004.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/inputs/frames/car-turn/00004.jpg -------------------------------------------------------------------------------- /inputs/frames/car-turn/00005.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/inputs/frames/car-turn/00005.jpg -------------------------------------------------------------------------------- /inputs/frames/car-turn/00006.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/inputs/frames/car-turn/00006.jpg -------------------------------------------------------------------------------- /inputs/frames/car-turn/00007.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/inputs/frames/car-turn/00007.jpg -------------------------------------------------------------------------------- /inputs/frames/car-turn/00008.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/inputs/frames/car-turn/00008.jpg -------------------------------------------------------------------------------- /inputs/frames/car-turn/00009.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/inputs/frames/car-turn/00009.jpg -------------------------------------------------------------------------------- /inputs/frames/car-turn/00010.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/inputs/frames/car-turn/00010.jpg -------------------------------------------------------------------------------- /inputs/frames/car-turn/00011.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/inputs/frames/car-turn/00011.jpg -------------------------------------------------------------------------------- /inputs/frames/car-turn/00012.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/inputs/frames/car-turn/00012.jpg -------------------------------------------------------------------------------- /inputs/frames/car-turn/00013.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/inputs/frames/car-turn/00013.jpg -------------------------------------------------------------------------------- /inputs/frames/car-turn/00014.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/inputs/frames/car-turn/00014.jpg -------------------------------------------------------------------------------- /inputs/frames/car-turn/00015.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/inputs/frames/car-turn/00015.jpg -------------------------------------------------------------------------------- /inputs/hand-drawn-sketches/bear-elephant-sketch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/inputs/hand-drawn-sketches/bear-elephant-sketch.png -------------------------------------------------------------------------------- /inputs/masks/bear.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/inputs/masks/bear.png -------------------------------------------------------------------------------- /inputs/masks/car-turn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/inputs/masks/car-turn.png 
--------------------------------------------------------------------------------
/inputs/reference-images/raccoon.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/inputs/reference-images/raccoon.jpg
--------------------------------------------------------------------------------
/install_conda.sh:
--------------------------------------------------------------------------------
1 | conda install pytorch==1.13.1 torchvision==0.14.1 torchaudio==0.13.1 pytorch-cuda=11.7 -c pytorch -c nvidia
2 | conda install opencv-python pillow gradio transformers einops scipy matplotlib omegaconf albumentations accelerate huggingface-hub==0.23.5 diffusers==0.27.2 timm==0.6.7
--------------------------------------------------------------------------------
/install_pip.sh:
--------------------------------------------------------------------------------
1 | pip install opencv-python
2 | pip install pillow
3 | pip install torch==1.13.1+cu117 torchvision==0.14.1+cu117 torchaudio==0.13.1 --extra-index-url https://download.pytorch.org/whl/cu117
4 | pip install diffusers==0.27.2
5 | pip install huggingface-hub==0.23.5
6 | pip install transformers
7 | pip install einops
8 | pip install scipy
9 | pip install timm==0.6.7
10 | pip install matplotlib
11 | pip install omegaconf
12 | pip install albumentations
13 | pip install accelerate
14 | pip install gradio
15 |
--------------------------------------------------------------------------------
/models/__init__.py:
--------------------------------------------------------------------------------
1 | # blank
2 |
3 |
--------------------------------------------------------------------------------
/models/anydoor/.gitignore:
--------------------------------------------------------------------------------
1 | .idea/
2 | **/.DS_Store
3 | training/
4 | lightning_logs/
5 | image_log/
6 |
7 | #*.pth
8 | *.pt
9 | *.ckpt
10 | *.safetensors
11 |
12 | gradio_pose2image_private.py
13 | gradio_canny2image_private.py
14 |
15 | # Byte-compiled / optimized / DLL files
16 | __pycache__/
17 | *.py[cod]
18 | *$py.class
19 |
20 | # C extensions
21 | *.so
22 |
23 | # Distribution / packaging
24 | .Python
25 | build/
26 | develop-eggs/
27 | dist/
28 | downloads/
29 | eggs/
30 | .eggs/
31 | lib/
32 | lib64/
33 | parts/
34 | sdist/
35 | var/
36 | wheels/
37 | pip-wheel-metadata/
38 | share/python-wheels/
39 | *.egg-info/
40 | .installed.cfg
41 | *.egg
42 | MANIFEST
43 |
44 | # PyInstaller
45 | # Usually these files are written by a python script from a template
46 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
47 | *.manifest
48 | *.spec
49 |
50 | # Installer logs
51 | pip-log.txt
52 | pip-delete-this-directory.txt
53 |
54 | # Unit test / coverage reports
55 | htmlcov/
56 | .tox/
57 | .nox/
58 | .coverage
59 | .coverage.*
60 | .cache
61 | nosetests.xml
62 | coverage.xml
63 | *.cover
64 | *.py,cover
65 | .hypothesis/
66 | .pytest_cache/
67 |
68 | # Translations
69 | *.mo
70 | *.pot
71 |
72 | # Django stuff:
73 | *.log
74 | local_settings.py
75 | db.sqlite3
76 | db.sqlite3-journal
77 |
78 | # Flask stuff:
79 | instance/
80 | .webassets-cache
81 |
82 | # Scrapy stuff:
83 | .scrapy
84 |
85 | # Sphinx documentation
86 | docs/_build/
87 |
88 | # PyBuilder
89 | target/
90 |
91 | # Jupyter Notebook
92 | .ipynb_checkpoints
93 |
94 | # IPython
95 | profile_default/
96 | ipython_config.py
97 |
98 | # pyenv
99 | .python-version
100 |
101 | # pipenv
102 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
103 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
104 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
105 | # install all needed dependencies.
106 | #Pipfile.lock
107 |
108 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
109 | __pypackages__/
110 |
111 | # Celery stuff
112 | celerybeat-schedule
113 | celerybeat.pid
114 |
115 | # SageMath parsed files
116 | *.sage.py
117 |
118 | # Environments
119 | .env
120 | .venv
121 | env/
122 | venv/
123 | ENV/
124 | env.bak/
125 | venv.bak/
126 |
127 | # Spyder project settings
128 | .spyderproject
129 | .spyproject
130 |
131 | # Rope project settings
132 | .ropeproject
133 |
134 | # mkdocs documentation
135 | /site
136 |
137 | # mypy
138 | .mypy_cache/
139 | .dmypy.json
140 | dmypy.json
141 |
142 | # Pyre type checker
143 | .pyre/
144 |
--------------------------------------------------------------------------------
/models/anydoor/LICENSE.txt:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2023 DAMO Vision Intelligence Lab
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/models/anydoor/assets/Figures/Teaser.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/models/anydoor/assets/Figures/Teaser.png
--------------------------------------------------------------------------------
/models/anydoor/assets/Figures/gradio.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/models/anydoor/assets/Figures/gradio.png
--------------------------------------------------------------------------------
/models/anydoor/assets/Figures/tryon.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/models/anydoor/assets/Figures/tryon.png
--------------------------------------------------------------------------------
/models/anydoor/cldm/model.py:
--------------------------------------------------------------------------------
1 | import os
2 | import torch
3 |
4 | from omegaconf import OmegaConf
5 | from models.anydoor.ldm.util import instantiate_from_config
6 |
7 |
8 | def get_state_dict(d):
9 |     return d.get('state_dict', d)
10 |
11 |
12 | def load_state_dict(ckpt_path, location='cpu'):
13 |     _, extension = os.path.splitext(ckpt_path)
14 |     if extension.lower() == ".safetensors":
15 |         import safetensors.torch
16 |         state_dict = safetensors.torch.load_file(ckpt_path, device=location)
17 |     else:
18 |         state_dict = get_state_dict(torch.load(ckpt_path, map_location=torch.device(location)))
19 |     state_dict = get_state_dict(state_dict)
20 |     print(f'Loaded state_dict from [{ckpt_path}]')
21 |     return state_dict
22 |
23 |
24 | def create_model(config_path):
25 |     config = OmegaConf.load(config_path)
26 |     model = instantiate_from_config(config.model).cpu()
27 |     print(f'Loaded model config from [{config_path}]')
28 |     return model
29 |
--------------------------------------------------------------------------------
/models/anydoor/cog.yaml:
--------------------------------------------------------------------------------
1 | # Configuration for Cog ⚙️
2 | # Reference: https://github.com/replicate/cog/blob/main/docs/yaml.md
3 | build:
4 |   gpu: true
5 |   system_packages:
6 |     - "mesa-common-dev"
7 |   python_version: "3.8.5"
8 |   python_packages:
9 |     - "albumentations==1.3.0"
10 |     - "einops==0.3.0"
11 |     - "fvcore==0.1.5.post20221221"
12 |     - "gradio==3.39.0"
13 |     - "numpy==1.23.1"
14 |     - "omegaconf==2.1.1"
15 |     - "open_clip_torch==2.17.1"
16 |     - "opencv_python==4.7.0.72"
17 |     - "opencv_python_headless==4.7.0.72"
18 |     - "Pillow==9.4.0"
19 |     - "pytorch_lightning==1.5.0"
20 |     - "safetensors==0.2.7"
21 |     - "scipy==1.9.1"
22 |     - "setuptools==66.0.0"
23 |     - "share==1.0.4"
24 |     - "submitit==1.5.1"
25 |     - "timm==0.6.12"
26 |     - "torch==2.0.0"
27 |     - "torchmetrics==0.6.0"
28 |     - "tqdm==4.65.0"
29 |     - "transformers==4.19.2"
30 |     - "xformers==0.0.18"
31 |
32 |   run:
33 |     - curl -o /usr/local/bin/pget -L "https://github.com/replicate/pget/releases/download/v0.3.1/pget" && chmod +x /usr/local/bin/pget
34 |
35 | # predict.py defines how predictions are run on your model
36 | predict: "predict.py:Predictor"
--------------------------------------------------------------------------------
/models/anydoor/configs/anydoor.yaml:
--------------------------------------------------------------------------------
1 | model:
2 |   target: models.anydoor.cldm.cldm.ControlLDM
3 |   params:
4 |     linear_start: 0.00085
5 |     linear_end: 0.0120
6 |     num_timesteps_cond: 1
7 |     log_every_t: 200
8 |     timesteps: 1000
9 |     first_stage_key: "jpg"
10 |     cond_stage_key: "ref"
11 |     control_key: "hint"
12 |     image_size: 64
13 |     channels: 4
14 |     cond_stage_trainable: false
15 |     conditioning_key: crossattn
16 |     monitor: val/loss_simple_ema
17 |     scale_factor: 0.18215
18 |     use_ema: False
19 |     only_mid_control: False
20 |
21 |     control_stage_config:
22 |       target: models.anydoor.cldm.cldm.ControlNet
23 |       params:
24 |         use_checkpoint: True
25 |         image_size: 32 # unused
26 |         in_channels: 4
27 |         hint_channels: 4 #3
28 |         model_channels: 320
29 |         attention_resolutions: [ 4, 2, 1 ]
30 |         num_res_blocks: 2
31 |         channel_mult: [ 1, 2, 4, 4 ]
32 |         num_head_channels: 64 # need to fix for flash-attn
33 |         use_spatial_transformer: True
34 |         use_linear_in_transformer: True
35 |         transformer_depth: 1
36 |         context_dim: 1024
37 |         legacy: False
38 |
39 |     unet_config:
40 |       target: models.anydoor.cldm.cldm.ControlledUnetModel
41 |       params:
42 |         use_checkpoint: True
43 |         image_size: 32 # unused
44 |         in_channels: 4
45 |         out_channels: 4
46 |         model_channels: 320
47 |         attention_resolutions: [ 4, 2, 1 ]
48 |         num_res_blocks: 2
49 |         channel_mult: [ 1, 2, 4, 4 ]
50 |         num_head_channels: 64 # need to fix for flash-attn
51 |         use_spatial_transformer: True
52 |         use_linear_in_transformer: True
53 |         transformer_depth: 1
54 |         context_dim: 1024
55 |         legacy: False
56 |
57 |     first_stage_config:
58 |       target: models.anydoor.ldm.models.autoencoder.AutoencoderKL
59 |       params:
60 |         embed_dim: 4
61 |         monitor: val/rec_loss
62 |         ddconfig:
63 |           #attn_type: "vanilla-xformers"
64 |           double_z: true
65 |           z_channels: 4
66 |           resolution: 256
67 |           in_channels: 3
68 |           out_ch: 3
69 |           ch: 128
70 |           ch_mult:
71 |           - 1
72 |           - 2
73 |           - 4
74 |           - 4
75 |           num_res_blocks: 2
76 |           attn_resolutions: []
77 |           dropout: 0.0
78 |         lossconfig:
79 |           target: torch.nn.Identity
80 |
81 |     cond_stage_config:
82 |       target: models.anydoor.ldm.modules.encoders.modules.FrozenDinoV2Encoder
83 |       weight: /path/to/dinov2_vitg14_pretrain.pth
84 |
85 |
86 |
--------------------------------------------------------------------------------
/models/anydoor/configs/datasets.yaml:
--------------------------------------------------------------------------------
1 | Train:
2 |   YoutubeVOS:
3 |     image_dir: path/YTBVOS/train/JPEGImages/
4 |     anno: path/YTBVOS/train/Annotations
5 |     meta: path/YTBVOS/train/meta.json
6 |
7 |   YoutubeVIS:
8 |     image_dir: path/youtubevis/train/JPEGImages/
9 |     anno: path/youtubevis/train/Annotations/
10 |     meta: path/youtubevis/train/meta.json
11 |
12 |   VIPSeg:
13 |     image_dir: path/VIPSeg/VIPSeg_720P/images/
14 |     anno: path/VIPSeg/VIPSeg_720P/panomasksRGB/
15 |
16 |   UVO:
17 |     train:
18 |       image_dir: path/UVO/uvo_frames_sparse
19 |       video_json: path/UVO/UVO_sparse_train_video_with_interpolation.json
20 |       image_json: path/UVO/UVO_sparse_train_video_with_interpolation_reorg.json
21 |     val:
22 |       image_dir: path/UVO/uvo_frames_sparse
23 |       video_json: path/UVO/VideoSparseSet/UVO_sparse_val_video_with_interpolation.json
24 |       image_json: path/UVO/VideoSparseSet/UVO_sparse_val_video_interpolation_reorg.json
25 |
26 |   Mose:
27 |     image_dir: path/MOSE/train/JPEGImages/
28 |     anno: path/MOSE/train/Annotations/
29 |
30 |   MVImageNet:
31 |     txt: ./datasets/Preprocess/mvimagenet.txt
32 |     image_dir: /mnt/workspace/xizhi/data/MVImgNet/
33 |
34 |   VitonHD:
35 |     image_dir: path/TryOn/VitonHD/train/cloth/
36 |
37 |   Dresscode:
38 |     image_dir: /mnt/workspace/xizhi/data/dresscode/DressCode/upper_body/label_maps/
39 |
40 |   FashionTryon:
41 |     image_dir: path/TryOn/FashionTryOn/train
42 |
43 |   Lvis:
44 |     image_dir: path/COCO/train2017
45 |     json_path: path/lvis_v1/lvis_v1_train.json
46 |
47 |   SAM:
48 |     sub1: path/SAM/0000
49 |     sub2: path/SAM/0001
50 |     sub3: path/SAM/0002
51 |     sub4: path/SAM/0004
52 |
53 |   Saliency:
54 |     MSRA_root: path/Saliency/MSRA10K_Imgs_GT/
55 |     TR_root: path/Saliency/DUTS-TR/DUTS-TR-Image/
56 |     TE_root: path/Saliency/DUTS-TE/DUTS-TE-Image/
57 |     HFlickr_root: path/HFlickr/masks/
58 |
59 | Test:
60 |   DreamBooth:
61 |     fg_dir: path/DreamBooth/AnyDoor_DreamBooth
62 |     bg_dir: path/DreamBooth/v1_800
63 |
64 |   VitonHDTest:
65 |     image_dir: path/TryOn/VitonHD/test/cloth
66 |
67 |
68 |
69 |
--------------------------------------------------------------------------------
/models/anydoor/configs/demo.yaml:
--------------------------------------------------------------------------------
1 | pretrained_model: path/epoch=1-step=8687.ckpt
2 | config_file: configs/anydoor.yaml
3 | save_memory: False
4 | use_interactive_seg: True
5 |
--------------------------------------------------------------------------------
/models/anydoor/configs/inference.yaml:
--------------------------------------------------------------------------------
1 | pretrained_model: /Users/liuchang/Desktop/Workspaces/checkpoints/anydoor/epoch=1-step=8687.ckpt
2 | config_file: /Users/liuchang/Desktop/Workspaces/code/shape-consistent-video-editing/iterative-warping/models/anydoor/configs/anydoor.yaml
3 | save_memory: False
4 |
--------------------------------------------------------------------------------
/models/anydoor/datasets/Preprocess/uvo_process.py:
--------------------------------------------------------------------------------
1 | import cv2
2 | import json
3 | import os
4 | from pycocotools import mask as mask_utils
5 | import numpy as np
6 | from tqdm import tqdm
7 |
8 | json_path = 'path/UVO/UVO_sparse_train_video_with_interpolation.json'
9 | output_path = "path/UVO/UVO_sparse_train_video_with_interpolation_reorg.json"
10 |
11 | with open(json_path, 'r') as fcc_file:
12 |     data = json.load(fcc_file)
13 |
14 | info = data['info']
15 | videos = data['videos']
16 | print(len(videos))
17 |
18 |
19 | uvo_dict = {}
20 | for video in tqdm(videos):
21 |     vid = video['id']
22 |     file_names = video['file_names']
23 |     uvo_dict[vid] = file_names
24 |
25 |
26 | with open(output_path,"w") as f:
27 |     json.dump(uvo_dict,f)
28 | print('finish')
29 |
30 |
--------------------------------------------------------------------------------
/models/anydoor/datasets/dreambooth.py:
--------------------------------------------------------------------------------
1 | import json
2 | import cv2
3 | import numpy as np
4 | import os
5 | from torch.utils.data import Dataset
6 | from PIL import Image
7 | import cv2
8 | from .data_utils import *
9 | from .base import BaseDataset
10 |
11 | class DreamBoothDataset(BaseDataset):
12 |     def __init__(self, fg_dir, bg_dir):
13 |         self.bg_dir = bg_dir
14 |         bg_data = os.listdir(self.bg_dir)
15 |         self.bg_data = [i for i in bg_data if 'mask' in i]
16 |         self.image_dir = fg_dir
17 |         self.data = os.listdir(self.image_dir)
18 |         self.size = (512,512)
19 |         self.clip_size = (224,224)
20 |         '''
21 |         Dynamic:
22 |         0: Static View, High Quality
23 |         1: Multi-view, Low Quality
24 |         2: Multi-view, High Quality
25 |         '''
26 |         self.dynamic = 1
27 |
28 |     def __len__(self):
29 |         return
len(self.data) 30 | 31 | def __getitem__(self, idx): 32 | idx = np.random.randint(0, len(self.data)-1) 33 | item = self.get_sample(idx) 34 | return item 35 | 36 | def check_region_size(self, image, yyxx, ratio, mode = 'max'): 37 | pass_flag = True 38 | H,W = image.shape[0], image.shape[1] 39 | H,W = H * ratio, W * ratio 40 | y1,y2,x1,x2 = yyxx 41 | h,w = y2-y1,x2-x1 42 | if mode == 'max': 43 | if h > H and w > W: 44 | pass_flag = False 45 | elif mode == 'min': 46 | if h < H and w < W: 47 | pass_flag = False 48 | return pass_flag 49 | 50 | def get_alpha_mask(self, mask_path): 51 | image = cv2.imread( mask_path, cv2.IMREAD_UNCHANGED) 52 | mask = (image[:,:,-1] > 128).astype(np.uint8) 53 | return mask 54 | 55 | def get_sample(self, idx): 56 | dir_name = self.data[idx] 57 | dir_path = os.path.join(self.image_dir, dir_name) 58 | images = os.listdir(dir_path) 59 | image_name = [i for i in images if '.png' in i][0] 60 | image_path = os.path.join(dir_path, image_name) 61 | 62 | image = cv2.imread( image_path, cv2.IMREAD_UNCHANGED) 63 | mask = (image[:,:,-1] > 128).astype(np.uint8) 64 | image = image[:,:,:-1] 65 | 66 | image = cv2.cvtColor(image.copy(), cv2.COLOR_BGR2RGB) 67 | ref_image = image 68 | ref_mask = mask 69 | ref_image, ref_mask = expand_image_mask(image, mask, ratio=1.4) 70 | bg_idx = np.random.randint(0, len(self.bg_data)-1) 71 | 72 | tar_mask_name = self.bg_data[bg_idx] 73 | tar_mask_path = os.path.join(self.bg_dir, tar_mask_name) 74 | tar_image_path = tar_mask_path.replace('_mask','_GT') 75 | 76 | tar_image = cv2.imread(tar_image_path).astype(np.uint8) 77 | tar_image = cv2.cvtColor(tar_image, cv2.COLOR_BGR2RGB) 78 | tar_mask = (cv2.imread(tar_mask_path) > 128).astype(np.uint8)[:,:,0] 79 | 80 | item_with_collage = self.process_pairs(ref_image, ref_mask, tar_image, tar_mask) 81 | sampled_time_steps = self.sample_timestep() 82 | item_with_collage['time_steps'] = sampled_time_steps 83 | return item_with_collage 84 | 85 | -------------------------------------------------------------------------------- /models/anydoor/datasets/dresscode.py: -------------------------------------------------------------------------------- 1 | import json 2 | import cv2 3 | import numpy as np 4 | import os 5 | from torch.utils.data import Dataset 6 | from PIL import Image 7 | import cv2 8 | from .data_utils import * 9 | from .base import BaseDataset 10 | import albumentations as A 11 | 12 | class DresscodeDataset(BaseDataset): 13 | def __init__(self, image_dir): 14 | self.image_root = image_dir 15 | self.data = os.listdir(self.image_root) 16 | self.size = (512,512) 17 | self.clip_size = (224,224) 18 | self.dynamic = 2 19 | 20 | def __len__(self): 21 | return 20000 22 | 23 | def check_region_size(self, image, yyxx, ratio, mode = 'max'): 24 | pass_flag = True 25 | H,W = image.shape[0], image.shape[1] 26 | H,W = H * ratio, W * ratio 27 | y1,y2,x1,x2 = yyxx 28 | h,w = y2-y1,x2-x1 29 | if mode == 'max': 30 | if h > H and w > W: 31 | pass_flag = False 32 | elif mode == 'min': 33 | if h < H and w < W: 34 | pass_flag = False 35 | return pass_flag 36 | 37 | def get_sample(self, idx): 38 | tar_mask_path = os.path.join(self.image_root, self.data[idx]) 39 | tar_image_path = tar_mask_path.replace('label_maps/','images/').replace('_4.png','_0.jpg') 40 | ref_image_path = tar_mask_path.replace('label_maps/','images/').replace('_4.png','_1.jpg') 41 | 42 | # Read Image and Mask 43 | ref_image = cv2.imread(ref_image_path) 44 | ref_image = cv2.cvtColor(ref_image, cv2.COLOR_BGR2RGB) 45 | 46 | tar_image = 
cv2.imread(tar_image_path) 47 | tar_image = cv2.cvtColor(tar_image, cv2.COLOR_BGR2RGB) 48 | 49 | ref_mask = (ref_image < 240).astype(np.uint8)[:,:,0] 50 | 51 | 52 | tar_mask = Image.open(tar_mask_path ).convert('P') 53 | tar_mask= np.array(tar_mask) 54 | tar_mask = tar_mask == 4 55 | 56 | 57 | item_with_collage = self.process_pairs(ref_image, ref_mask, tar_image, tar_mask, max_ratio = 1.0) 58 | sampled_time_steps = self.sample_timestep() 59 | item_with_collage['time_steps'] = sampled_time_steps 60 | return item_with_collage 61 | 62 | -------------------------------------------------------------------------------- /models/anydoor/datasets/fashiontryon.py: -------------------------------------------------------------------------------- 1 | import json 2 | import cv2 3 | import numpy as np 4 | import os 5 | from torch.utils.data import Dataset 6 | from PIL import Image 7 | import cv2 8 | from .data_utils import * 9 | from .base import BaseDataset 10 | import albumentations as A 11 | 12 | class FashionTryonDataset(BaseDataset): 13 | def __init__(self, image_dir): 14 | self.image_root = image_dir 15 | self.data =os.listdir(self.image_root) 16 | self.size = (512,512) 17 | self.clip_size = (224,224) 18 | self.dynamic = 2 19 | 20 | def __len__(self): 21 | return 5000 22 | 23 | def aug_data(self, image): 24 | transform = A.Compose([ 25 | A.RandomBrightnessContrast(p=0.5), 26 | ]) 27 | transformed = transform(image=image.astype(np.uint8)) 28 | transformed_image = transformed["image"] 29 | return transformed_image 30 | 31 | def check_region_size(self, image, yyxx, ratio, mode = 'max'): 32 | pass_flag = True 33 | H,W = image.shape[0], image.shape[1] 34 | H,W = H * ratio, W * ratio 35 | y1,y2,x1,x2 = yyxx 36 | h,w = y2-y1,x2-x1 37 | if mode == 'max': 38 | if h > H and w > W: 39 | pass_flag = False 40 | elif mode == 'min': 41 | if h < H and w < W: 42 | pass_flag = False 43 | return pass_flag 44 | 45 | def get_sample(self, idx): 46 | cloth_dir = os.path.join(self.image_root, self.data[idx]) 47 | ref_image_path = os.path.join(cloth_dir, 'target.jpg') 48 | 49 | ref_image = cv2.imread(ref_image_path) 50 | ref_image = cv2.cvtColor(ref_image.copy(), cv2.COLOR_BGR2RGB) 51 | 52 | ref_mask_path = os.path.join(cloth_dir,'mask.jpg') 53 | ref_mask = cv2.imread(ref_mask_path)[:,:,0] > 128 54 | 55 | target_dirs = [i for i in os.listdir(cloth_dir ) if '.jpg' not in i] 56 | target_dir_name = np.random.choice(target_dirs) 57 | 58 | target_image_path = os.path.join(cloth_dir, target_dir_name + '.jpg') 59 | target_image= cv2.imread(target_image_path) 60 | tar_image = cv2.cvtColor(target_image.copy(), cv2.COLOR_BGR2RGB) 61 | 62 | target_mask_path = os.path.join(cloth_dir, target_dir_name, 'segment.png') 63 | tar_mask= cv2.imread(target_mask_path)[:,:,0] 64 | target_mask = tar_mask == 7 65 | kernel = np.ones((3, 3), dtype=np.uint8) 66 | tar_mask = cv2.erode(target_mask.astype(np.uint8), kernel, iterations=3) 67 | 68 | item_with_collage = self.process_pairs(ref_image, ref_mask, tar_image, tar_mask, max_ratio = 1.0) 69 | sampled_time_steps = self.sample_timestep() 70 | item_with_collage['time_steps'] = sampled_time_steps 71 | return item_with_collage 72 | 73 | 74 | 75 | -------------------------------------------------------------------------------- /models/anydoor/datasets/lvis.py: -------------------------------------------------------------------------------- 1 | import json 2 | import cv2 3 | import numpy as np 4 | import os 5 | from torch.utils.data import Dataset 6 | from PIL import Image 7 | import cv2 8 | from 
.data_utils import * 9 | from .base import BaseDataset 10 | from pycocotools import mask as mask_utils 11 | from lvis import LVIS 12 | 13 | class LvisDataset(BaseDataset): 14 | def __init__(self, image_dir, json_path): 15 | self.image_dir = image_dir 16 | self.json_path = json_path 17 | lvis_api = LVIS(json_path) 18 | img_ids = sorted(lvis_api.imgs.keys()) 19 | imgs = lvis_api.load_imgs(img_ids) 20 | anns = [lvis_api.img_ann_map[img_id] for img_id in img_ids] 21 | self.data = imgs 22 | self.annos = anns 23 | self.lvis_api = lvis_api 24 | self.size = (512,512) 25 | self.clip_size = (224,224) 26 | self.dynamic = 0 27 | 28 | def register_subset(self, path): 29 | data = os.listdir(path) 30 | data = [ os.path.join(path, i) for i in data if '.json' in i] 31 | self.data = self.data + data 32 | 33 | def get_sample(self, idx): 34 | # ==== get pairs ===== 35 | image_name = self.data[idx]['coco_url'].split('/')[-1] 36 | image_path = os.path.join(self.image_dir, image_name) 37 | image = cv2.imread(image_path) 38 | ref_image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) 39 | 40 | anno = self.annos[idx] 41 | obj_ids = [] 42 | for i in range(len(anno)): 43 | obj = anno[i] 44 | area = obj['area'] 45 | if area > 3600: 46 | obj_ids.append(i) 47 | assert len(anno) > 0 48 | obj_id = np.random.choice(obj_ids) 49 | anno = anno[obj_id] 50 | ref_mask = self.lvis_api.ann_to_mask(anno) 51 | 52 | tar_image, tar_mask = ref_image.copy(), ref_mask.copy() 53 | item_with_collage = self.process_pairs(ref_image, ref_mask, tar_image, tar_mask) 54 | sampled_time_steps = self.sample_timestep() 55 | item_with_collage['time_steps'] = sampled_time_steps 56 | return item_with_collage 57 | 58 | def __len__(self): 59 | return 20000 60 | 61 | def check_region_size(self, image, yyxx, ratio, mode = 'max'): 62 | pass_flag = True 63 | H,W = image.shape[0], image.shape[1] 64 | H,W = H * ratio, W * ratio 65 | y1,y2,x1,x2 = yyxx 66 | h,w = y2-y1,x2-x1 67 | if mode == 'max': 68 | if h > H or w > W: 69 | pass_flag = False 70 | elif mode == 'min': 71 | if h < H or w < W: 72 | pass_flag = False 73 | return pass_flag 74 | 75 | 76 | 77 | 78 | -------------------------------------------------------------------------------- /models/anydoor/datasets/mvimagenet.py: -------------------------------------------------------------------------------- 1 | import json 2 | import cv2 3 | import numpy as np 4 | import os 5 | from torch.utils.data import Dataset 6 | from PIL import Image 7 | import cv2 8 | from .data_utils import * 9 | from .base import BaseDataset 10 | 11 | class MVImageNetDataset(BaseDataset): 12 | def __init__(self, txt, image_dir): 13 | with open(txt,"r") as f: 14 | data = f.read().split('\n')[:-1] 15 | self.image_dir = image_dir 16 | self.data = data 17 | self.size = (512,512) 18 | self.clip_size = (224,224) 19 | self.dynamic = 2 20 | 21 | def __len__(self): 22 | return 40000 23 | 24 | def check_region_size(self, image, yyxx, ratio, mode = 'max'): 25 | pass_flag = True 26 | H,W = image.shape[0], image.shape[1] 27 | H,W = H * ratio, W * ratio 28 | y1,y2,x1,x2 = yyxx 29 | h,w = y2-y1,x2-x1 30 | if mode == 'max': 31 | if h > H and w > W: 32 | pass_flag = False 33 | elif mode == 'min': 34 | if h < H and w < W: 35 | pass_flag = False 36 | return pass_flag 37 | 38 | def get_alpha_mask(self, mask_path): 39 | image = cv2.imread( mask_path, cv2.IMREAD_UNCHANGED) 40 | mask = (image[:,:,-1] > 128).astype(np.uint8) 41 | return mask 42 | 43 | def get_sample(self, idx): 44 | object_dir = self.data[idx].replace('MVDir/', self.image_dir) 45 | frames = 
os.listdir(object_dir) 46 | frames = [ i for i in frames if '.png' in i] 47 | 48 | # Sampling frames 49 | min_interval = len(frames) // 8 50 | start_frame_index = np.random.randint(low=0, high=len(frames) - min_interval) 51 | end_frame_index = start_frame_index + np.random.randint(min_interval, len(frames) - start_frame_index ) 52 | end_frame_index = min(end_frame_index, len(frames) - 1) 53 | 54 | # Get image path 55 | ref_mask_name = frames[start_frame_index] 56 | tar_mask_name = frames[end_frame_index] 57 | 58 | ref_image_name = ref_mask_name.split('_')[0] + '.jpg' 59 | tar_image_name = tar_mask_name.split('_')[0] + '.jpg' 60 | 61 | ref_mask_path = os.path.join(object_dir, ref_mask_name) 62 | tar_mask_path = os.path.join(object_dir, tar_mask_name) 63 | ref_image_path = os.path.join(object_dir, ref_image_name) 64 | tar_image_path = os.path.join(object_dir, tar_image_name) 65 | 66 | # Read Image and Mask 67 | ref_image = cv2.imread(ref_image_path).astype(np.uint8) 68 | ref_image = cv2.cvtColor(ref_image, cv2.COLOR_BGR2RGB) 69 | 70 | tar_image = cv2.imread(tar_image_path).astype(np.uint8) 71 | tar_image = cv2.cvtColor(tar_image, cv2.COLOR_BGR2RGB) 72 | 73 | ref_mask = self.get_alpha_mask(ref_mask_path) 74 | tar_mask = self.get_alpha_mask(tar_mask_path) 75 | 76 | item_with_collage = self.process_pairs(ref_image, ref_mask, tar_image, tar_mask) 77 | sampled_time_steps = self.sample_timestep() 78 | item_with_collage['time_steps'] = sampled_time_steps 79 | 80 | return item_with_collage 81 | 82 | -------------------------------------------------------------------------------- /models/anydoor/datasets/sam.py: -------------------------------------------------------------------------------- 1 | import json 2 | import cv2 3 | import numpy as np 4 | import os 5 | from torch.utils.data import Dataset 6 | from PIL import Image 7 | import cv2 8 | from .data_utils import * 9 | from .base import BaseDataset 10 | from pycocotools import mask as mask_utils 11 | 12 | class SAMDataset(BaseDataset): 13 | def __init__(self, sub1, sub2, sub3, sub4): 14 | image_mask_dict = {} 15 | self.data = [] 16 | self.register_subset(sub1) 17 | self.register_subset(sub2) 18 | self.register_subset(sub3) 19 | self.register_subset(sub4) 20 | self.size = (512,512) 21 | self.clip_size = (224,224) 22 | self.dynamic = 0 23 | 24 | def register_subset(self, path): 25 | data = os.listdir(path) 26 | data = [ os.path.join(path, i) for i in data if '.json' in i] 27 | self.data = self.data + data 28 | 29 | def get_sample(self, idx): 30 | # ==== get pairs ===== 31 | json_path = self.data[idx] 32 | image_path = json_path.replace('.json', '.jpg') 33 | 34 | with open(json_path, 'r') as json_file: 35 | data = json.load(json_file) 36 | annotation = data['annotations'] 37 | 38 | valid_ids = [] 39 | for i in range(len(annotation)): 40 | area = annotation[i]['area'] 41 | if area > 100 * 100 * 5: 42 | valid_ids.append(i) 43 | 44 | chosen_id = np.random.choice(valid_ids) 45 | mask = mask_utils.decode(annotation[chosen_id]["segmentation"] ) 46 | # ====================== 47 | 48 | image = cv2.imread(image_path) 49 | ref_image = cv2.cvtColor(image.copy(), cv2.COLOR_BGR2RGB) 50 | tar_image = ref_image 51 | 52 | ref_mask = mask 53 | tar_mask = mask 54 | item_with_collage = self.process_pairs(ref_image, ref_mask, tar_image, tar_mask) 55 | sampled_time_steps = self.sample_timestep() 56 | item_with_collage['time_steps'] = sampled_time_steps 57 | return item_with_collage 58 | 59 | def __len__(self): 60 | return 20000 61 | 62 | def check_region_size(self, 
image, yyxx, ratio, mode = 'max'): 63 | pass_flag = True 64 | H,W = image.shape[0], image.shape[1] 65 | H,W = H * ratio, W * ratio 66 | y1,y2,x1,x2 = yyxx 67 | h,w = y2-y1,x2-x1 68 | if mode == 'max': 69 | if h > H or w > W: 70 | pass_flag = False 71 | elif mode == 'min': 72 | if h < H or w < W: 73 | pass_flag = False 74 | return pass_flag 75 | 76 | 77 | 78 | 79 | -------------------------------------------------------------------------------- /models/anydoor/datasets/uvo.py: -------------------------------------------------------------------------------- 1 | import json 2 | import cv2 3 | import numpy as np 4 | import os 5 | from torch.utils.data import Dataset 6 | from PIL import Image 7 | import cv2 8 | from .data_utils import * 9 | from .base import BaseDataset 10 | from pycocotools import mask as mask_utils 11 | 12 | class UVODataset(BaseDataset): 13 | def __init__(self, image_dir, video_json, image_json): 14 | json_path = video_json 15 | with open(json_path, 'r') as fcc_file: 16 | data = json.load(fcc_file) 17 | 18 | image_json_path = image_json 19 | with open(image_json_path , 'r') as image_file: 20 | video_dict = json.load(image_file) 21 | 22 | self.image_root = image_dir 23 | self.data = data['annotations'] 24 | self.video_dict = video_dict 25 | self.size = (512,512) 26 | self.clip_size = (224,224) 27 | self.dynamic = 1 28 | 29 | def __len__(self): 30 | return 25000 31 | 32 | def check_region_size(self, image, yyxx, ratio, mode = 'max'): 33 | pass_flag = True 34 | H,W = image.shape[0], image.shape[1] 35 | H,W = H * ratio, W * ratio 36 | y1,y2,x1,x2 = yyxx 37 | h,w = y2-y1,x2-x1 38 | if mode == 'max': 39 | if h > H and w > W: 40 | pass_flag = False 41 | elif mode == 'min': 42 | if h < H and w < W: 43 | pass_flag = False 44 | return pass_flag 45 | 46 | def get_sample(self, idx): 47 | ins_anno = self.data[idx] 48 | video_id = str(ins_anno['video_id']) 49 | video_names = self.video_dict[video_id] 50 | masks = ins_anno['segmentations'] 51 | frames = video_names 52 | 53 | # Sampling frames 54 | min_interval = len(frames) // 10 55 | start_frame_index = np.random.randint(low=0, high=len(frames) - min_interval) 56 | end_frame_index = start_frame_index + np.random.randint(min_interval, len(frames) - start_frame_index ) 57 | end_frame_index = min(end_frame_index, len(frames) - 1) 58 | 59 | # Get image path 60 | ref_image_name = frames[start_frame_index] 61 | tar_image_name = frames[end_frame_index] 62 | ref_image_path = os.path.join(self.image_root, ref_image_name) 63 | tar_image_path = os.path.join(self.image_root, tar_image_name) 64 | 65 | # Read Image and Mask 66 | ref_image = cv2.imread(ref_image_path) 67 | ref_image = cv2.cvtColor(ref_image, cv2.COLOR_BGR2RGB) 68 | 69 | tar_image = cv2.imread(tar_image_path) 70 | tar_image = cv2.cvtColor(tar_image, cv2.COLOR_BGR2RGB) 71 | 72 | ref_mask = mask_utils.decode(masks[start_frame_index]) 73 | tar_mask = mask_utils.decode(masks[end_frame_index]) 74 | 75 | item_with_collage = self.process_pairs(ref_image, ref_mask, tar_image, tar_mask) 76 | sampled_time_steps = self.sample_timestep() 77 | item_with_collage['time_steps'] = sampled_time_steps 78 | return item_with_collage 79 | 80 | -------------------------------------------------------------------------------- /models/anydoor/datasets/uvo_val.py: -------------------------------------------------------------------------------- 1 | import json 2 | import cv2 3 | import numpy as np 4 | import os 5 | from torch.utils.data import Dataset 6 | from PIL import Image 7 | import cv2 8 | from .data_utils 
import *
from .base import BaseDataset
from pycocotools import mask as mask_utils


class UVOValDataset(BaseDataset):
    def __init__(self, image_dir, video_json, image_json):
        json_path = video_json
        with open(json_path, 'r') as fcc_file:
            data = json.load(fcc_file)
        image_json_path = image_json
        with open(image_json_path, 'r') as image_file:
            video_dict = json.load(image_file)
        self.image_root = image_dir
        self.data = data['annotations']
        self.video_dict = video_dict
        self.size = (512, 512)
        self.clip_size = (224, 224)
        self.dynamic = 1

    def __len__(self):
        return 8000

    def __getitem__(self, idx):
        while True:
            idx = np.random.randint(0, len(self.data) - 1)
            try:
                item = self.get_sample(idx)
                return item
            except Exception:
                idx = np.random.randint(0, len(self.data) - 1)

    def check_region_size(self, image, yyxx, ratio, mode='max'):
        pass_flag = True
        H, W = image.shape[0], image.shape[1]
        H, W = H * ratio, W * ratio
        y1, y2, x1, x2 = yyxx
        h, w = y2 - y1, x2 - x1
        if mode == 'max':
            if h > H and w > W:
                pass_flag = False
        elif mode == 'min':
            if h < H and w < W:
                pass_flag = False
        return pass_flag

    def get_sample(self, idx):
        ins_anno = self.data[idx]
        video_id = str(ins_anno['video_id'])

        video_names = self.video_dict[video_id]
        masks = ins_anno['segmentations']
        frames = video_names

        # Sampling frames
        min_interval = len(frames) // 5
        start_frame_index = np.random.randint(low=0, high=len(frames) - min_interval)
        end_frame_index = start_frame_index + np.random.randint(min_interval, len(frames) - start_frame_index)
        end_frame_index = min(end_frame_index, len(frames) - 1)

        # Get image path
        ref_image_name = frames[start_frame_index]
        tar_image_name = frames[end_frame_index]
        ref_image_path = os.path.join(self.image_root, ref_image_name)
        tar_image_path = os.path.join(self.image_root, tar_image_name)

        # Read Image and Mask
        ref_image = cv2.imread(ref_image_path)
        ref_image = cv2.cvtColor(ref_image, cv2.COLOR_BGR2RGB)

        tar_image = cv2.imread(tar_image_path)
        tar_image = cv2.cvtColor(tar_image, cv2.COLOR_BGR2RGB)

        ref_mask = mask_utils.decode(masks[start_frame_index])
        tar_mask = mask_utils.decode(masks[end_frame_index])

        item_with_collage = self.process_pairs(ref_image, ref_mask, tar_image, tar_mask)
        sampled_time_steps = self.sample_timestep()
        item_with_collage['time_steps'] = sampled_time_steps
        return item_with_collage

-------------------------------------------------------------------------------- /models/anydoor/datasets/vitonhd.py: --------------------------------------------------------------------------------

import json
import cv2
import numpy as np
import os
from torch.utils.data import Dataset
from PIL import Image
from .data_utils import *
from .base import BaseDataset
import albumentations as A


class VitonHDDataset(BaseDataset):
    def __init__(self, image_dir):
        self.image_root = image_dir
        self.data = os.listdir(self.image_root)
        self.size = (512, 512)
        self.clip_size = (224, 224)
        self.dynamic = 2

    def __len__(self):
        return 20000

    def check_region_size(self, image, yyxx, ratio, mode='max'):
        pass_flag = True
        H, W = image.shape[0], image.shape[1]
        H, W = H *
ratio, W * ratio 27 | y1,y2,x1,x2 = yyxx 28 | h,w = y2-y1,x2-x1 29 | if mode == 'max': 30 | if h > H and w > W: 31 | pass_flag = False 32 | elif mode == 'min': 33 | if h < H and w < W: 34 | pass_flag = False 35 | return pass_flag 36 | 37 | def get_sample(self, idx): 38 | 39 | ref_image_path = os.path.join(self.image_root, self.data[idx]) 40 | tar_image_path = ref_image_path.replace('/cloth/', '/image/') 41 | ref_mask_path = ref_image_path.replace('/cloth/','/cloth-mask/') 42 | tar_mask_path = ref_image_path.replace('/cloth/', '/image-parse-v3/').replace('.jpg','.png') 43 | 44 | # Read Image and Mask 45 | ref_image = cv2.imread(ref_image_path) 46 | ref_image = cv2.cvtColor(ref_image, cv2.COLOR_BGR2RGB) 47 | 48 | tar_image = cv2.imread(tar_image_path) 49 | tar_image = cv2.cvtColor(tar_image, cv2.COLOR_BGR2RGB) 50 | 51 | ref_mask = (cv2.imread(ref_mask_path) > 128).astype(np.uint8)[:,:,0] 52 | 53 | tar_mask = Image.open(tar_mask_path ).convert('P') 54 | tar_mask= np.array(tar_mask) 55 | tar_mask = tar_mask == 5 56 | 57 | item_with_collage = self.process_pairs(ref_image, ref_mask, tar_image, tar_mask, max_ratio = 1.0) 58 | sampled_time_steps = self.sample_timestep() 59 | item_with_collage['time_steps'] = sampled_time_steps 60 | return item_with_collage 61 | 62 | -------------------------------------------------------------------------------- /models/anydoor/dinov2/.github/workflows/lint.yaml: -------------------------------------------------------------------------------- 1 | name: Lint 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | pull_request: 8 | branches: 9 | - master 10 | - 'gh/**' 11 | 12 | jobs: 13 | run-linters: 14 | name: Run linters 15 | runs-on: ubuntu-20.04 16 | 17 | steps: 18 | - name: Checkout repository 19 | uses: actions/checkout@v3 20 | - name: Set up Python 21 | uses: actions/setup-python@v4 22 | with: 23 | python-version: 3.9 24 | cache: 'pip' 25 | cache-dependency-path: '**/requirements*.txt' 26 | - name: Install Python (development) dependencies 27 | run: | 28 | pip install -r requirements-dev.txt 29 | - name: Run flake8 30 | run: | 31 | flake8 32 | - name: Run black 33 | if: always() 34 | run: | 35 | black --check dinov2 36 | - name: Run pylint 37 | if: always() 38 | run: | 39 | pylint --exit-zero dinov2 40 | -------------------------------------------------------------------------------- /models/anydoor/dinov2/.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | dist/ 3 | *.egg-info/ 4 | **/__pycache__/ 5 | 6 | **/.ipynb_checkpoints 7 | **/.ipynb_checkpoints/** 8 | 9 | **/notebooks 10 | 11 | *.swp 12 | 13 | .vscode/ 14 | -------------------------------------------------------------------------------- /models/anydoor/dinov2/CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to DINOv2 2 | We want to make contributing to this project as easy and transparent as 3 | possible. 4 | 5 | ## Pull Requests 6 | We actively welcome your pull requests. 7 | 8 | 1. Fork the repo and create your branch from `main`. 9 | 2. If you've added code that should be tested, add tests. 10 | 3. If you've changed APIs, update the documentation. 11 | 4. Ensure the test suite passes. 12 | 5. Make sure your code lints. 13 | 6. If you haven't already, complete the Contributor License Agreement ("CLA"). 14 | 15 | ## Contributor License Agreement ("CLA") 16 | In order to accept your pull request, we need you to submit a CLA. 
You only need 17 | to do this once to work on any of Meta's open source projects. 18 | 19 | Complete your CLA here: 20 | 21 | ## Issues 22 | We use GitHub issues to track public bugs. Please ensure your description is 23 | clear and has sufficient instructions to be able to reproduce the issue. 24 | 25 | Meta has a [bounty program](https://www.facebook.com/whitehat/) for the safe 26 | disclosure of security bugs. In those cases, please go through the process 27 | outlined on that page and do not file a public issue. 28 | 29 | ## License 30 | By contributing to DINOv2, you agree that your contributions will be licensed 31 | under the LICENSE file in the root directory of this source tree. 32 | -------------------------------------------------------------------------------- /models/anydoor/dinov2/conda.yaml: -------------------------------------------------------------------------------- 1 | name: dinov2 2 | channels: 3 | - defaults 4 | - pytorch 5 | - nvidia 6 | - xformers 7 | - conda-forge 8 | dependencies: 9 | - python=3.9 10 | - pytorch::pytorch=2.0.0 11 | - pytorch::pytorch-cuda=11.7.0 12 | - pytorch::torchvision=0.15.0 13 | - omegaconf 14 | - torchmetrics=0.10.3 15 | - fvcore 16 | - iopath 17 | - xformers::xformers=0.0.18 18 | - pip 19 | - pip: 20 | - git+https://github.com/facebookincubator/submitit 21 | - --extra-index-url https://pypi.nvidia.com 22 | - cuml-cu11 23 | -------------------------------------------------------------------------------- /models/anydoor/dinov2/dinov2/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | __version__ = "0.0.1" 8 | -------------------------------------------------------------------------------- /models/anydoor/dinov2/dinov2/configs/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
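# Usage sketch (illustrative): the helpers in this module resolve YAML files that live
# next to the package and merge them over the shared ssl_default_config. Assuming the
# bundled layout (e.g. configs/eval/vitl14_pretrain.yaml), a caller might do:
#
#   from dinov2.configs import load_and_merge_config
#   cfg = load_and_merge_config("eval/vitl14_pretrain")
#   print(cfg.student.arch, cfg.crops.global_crops_size)   # vit_large 518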
6 | 7 | import pathlib 8 | 9 | from omegaconf import OmegaConf 10 | 11 | 12 | def load_config(config_name: str): 13 | config_filename = config_name + ".yaml" 14 | return OmegaConf.load(pathlib.Path(__file__).parent.resolve() / config_filename) 15 | 16 | 17 | dinov2_default_config = load_config("ssl_default_config") 18 | 19 | 20 | def load_and_merge_config(config_name: str): 21 | default_config = OmegaConf.create(dinov2_default_config) 22 | loaded_config = load_config(config_name) 23 | return OmegaConf.merge(default_config, loaded_config) 24 | -------------------------------------------------------------------------------- /models/anydoor/dinov2/dinov2/configs/eval/vitb14_pretrain.yaml: -------------------------------------------------------------------------------- 1 | student: 2 | arch: vit_base 3 | patch_size: 14 4 | crops: 5 | global_crops_size: 518 # this is to set up the position embeddings properly 6 | local_crops_size: 98 -------------------------------------------------------------------------------- /models/anydoor/dinov2/dinov2/configs/eval/vitg14_pretrain.yaml: -------------------------------------------------------------------------------- 1 | student: 2 | arch: vit_giant2 3 | patch_size: 14 4 | ffn_layer: swiglufused 5 | crops: 6 | global_crops_size: 518 # this is to set up the position embeddings properly 7 | local_crops_size: 98 -------------------------------------------------------------------------------- /models/anydoor/dinov2/dinov2/configs/eval/vitl14_pretrain.yaml: -------------------------------------------------------------------------------- 1 | student: 2 | arch: vit_large 3 | patch_size: 14 4 | crops: 5 | global_crops_size: 518 # this is to set up the position embeddings properly 6 | local_crops_size: 98 -------------------------------------------------------------------------------- /models/anydoor/dinov2/dinov2/configs/eval/vits14_pretrain.yaml: -------------------------------------------------------------------------------- 1 | student: 2 | arch: vit_small 3 | patch_size: 14 4 | crops: 5 | global_crops_size: 518 # this is to set up the position embeddings properly 6 | local_crops_size: 98 -------------------------------------------------------------------------------- /models/anydoor/dinov2/dinov2/configs/ssl_default_config.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | WEIGHTS: '' 3 | compute_precision: 4 | grad_scaler: true 5 | teacher: 6 | backbone: 7 | sharding_strategy: SHARD_GRAD_OP 8 | mixed_precision: 9 | param_dtype: fp16 10 | reduce_dtype: fp16 11 | buffer_dtype: fp32 12 | dino_head: 13 | sharding_strategy: SHARD_GRAD_OP 14 | mixed_precision: 15 | param_dtype: fp16 16 | reduce_dtype: fp16 17 | buffer_dtype: fp32 18 | ibot_head: 19 | sharding_strategy: SHARD_GRAD_OP 20 | mixed_precision: 21 | param_dtype: fp16 22 | reduce_dtype: fp16 23 | buffer_dtype: fp32 24 | student: 25 | backbone: 26 | sharding_strategy: SHARD_GRAD_OP 27 | mixed_precision: 28 | param_dtype: fp16 29 | reduce_dtype: fp16 30 | buffer_dtype: fp32 31 | dino_head: 32 | sharding_strategy: SHARD_GRAD_OP 33 | mixed_precision: 34 | param_dtype: fp16 35 | reduce_dtype: fp32 36 | buffer_dtype: fp32 37 | ibot_head: 38 | sharding_strategy: SHARD_GRAD_OP 39 | mixed_precision: 40 | param_dtype: fp16 41 | reduce_dtype: fp32 42 | buffer_dtype: fp32 43 | dino: 44 | loss_weight: 1.0 45 | head_n_prototypes: 65536 46 | head_bottleneck_dim: 256 47 | head_nlayers: 3 48 | head_hidden_dim: 2048 49 | koleo_loss_weight: 0.1 50 | ibot: 51 | loss_weight: 
1.0 52 | mask_sample_probability: 0.5 53 | mask_ratio_min_max: 54 | - 0.1 55 | - 0.5 56 | separate_head: false 57 | head_n_prototypes: 65536 58 | head_bottleneck_dim: 256 59 | head_nlayers: 3 60 | head_hidden_dim: 2048 61 | train: 62 | batch_size_per_gpu: 64 63 | dataset_path: ImageNet:split=TRAIN 64 | output_dir: . 65 | saveckp_freq: 20 66 | seed: 0 67 | num_workers: 10 68 | OFFICIAL_EPOCH_LENGTH: 1250 69 | cache_dataset: true 70 | centering: "centering" # or "sinkhorn_knopp" 71 | student: 72 | arch: vit_large 73 | patch_size: 16 74 | drop_path_rate: 0.3 75 | layerscale: 1.0e-05 76 | drop_path_uniform: true 77 | pretrained_weights: '' 78 | ffn_layer: "mlp" 79 | block_chunks: 0 80 | qkv_bias: true 81 | proj_bias: true 82 | ffn_bias: true 83 | teacher: 84 | momentum_teacher: 0.992 85 | final_momentum_teacher: 1 86 | warmup_teacher_temp: 0.04 87 | teacher_temp: 0.07 88 | warmup_teacher_temp_epochs: 30 89 | optim: 90 | epochs: 100 91 | weight_decay: 0.04 92 | weight_decay_end: 0.4 93 | base_lr: 0.004 # learning rate for a batch size of 1024 94 | lr: 0. # will be set after applying scaling rule 95 | warmup_epochs: 10 96 | min_lr: 1.0e-06 97 | clip_grad: 3.0 98 | freeze_last_layer_epochs: 1 99 | scaling_rule: sqrt_wrt_1024 100 | patch_embed_lr_mult: 0.2 101 | layerwise_decay: 0.9 102 | adamw_beta1: 0.9 103 | adamw_beta2: 0.999 104 | crops: 105 | global_crops_scale: 106 | - 0.32 107 | - 1.0 108 | local_crops_number: 8 109 | local_crops_scale: 110 | - 0.05 111 | - 0.32 112 | global_crops_size: 224 113 | local_crops_size: 96 114 | evaluation: 115 | eval_period_iterations: 12500 116 | -------------------------------------------------------------------------------- /models/anydoor/dinov2/dinov2/configs/train/vitg14.yaml: -------------------------------------------------------------------------------- 1 | dino: 2 | head_n_prototypes: 131072 3 | head_bottleneck_dim: 384 4 | ibot: 5 | separate_head: true 6 | head_n_prototypes: 131072 7 | train: 8 | batch_size_per_gpu: 12 9 | dataset_path: ImageNet22k 10 | centering: sinkhorn_knopp 11 | student: 12 | arch: vit_giant2 13 | patch_size: 14 14 | drop_path_rate: 0.4 15 | ffn_layer: swiglufused 16 | block_chunks: 4 17 | teacher: 18 | momentum_teacher: 0.994 19 | optim: 20 | epochs: 500 21 | weight_decay_end: 0.2 22 | base_lr: 2.0e-04 # learning rate for a batch size of 1024 23 | warmup_epochs: 80 24 | layerwise_decay: 1.0 25 | crops: 26 | local_crops_size: 98 -------------------------------------------------------------------------------- /models/anydoor/dinov2/dinov2/configs/train/vitl14.yaml: -------------------------------------------------------------------------------- 1 | dino: 2 | head_n_prototypes: 131072 3 | head_bottleneck_dim: 384 4 | ibot: 5 | separate_head: true 6 | head_n_prototypes: 131072 7 | train: 8 | batch_size_per_gpu: 32 9 | dataset_path: ImageNet22k 10 | centering: sinkhorn_knopp 11 | student: 12 | arch: vit_large 13 | patch_size: 14 14 | drop_path_rate: 0.4 15 | ffn_layer: swiglufused 16 | block_chunks: 4 17 | teacher: 18 | momentum_teacher: 0.994 19 | optim: 20 | epochs: 500 21 | weight_decay_end: 0.2 22 | base_lr: 2.0e-04 # learning rate for a batch size of 1024 23 | warmup_epochs: 80 24 | layerwise_decay: 1.0 25 | crops: 26 | local_crops_size: 98 -------------------------------------------------------------------------------- /models/anydoor/dinov2/dinov2/configs/train/vitl16_short.yaml: -------------------------------------------------------------------------------- 1 | # this corresponds to the default config 2 | train: 3 | 
dataset_path: ImageNet:split=TRAIN 4 | batch_size_per_gpu: 64 5 | student: 6 | block_chunks: 4 7 | -------------------------------------------------------------------------------- /models/anydoor/dinov2/dinov2/data/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | from .adapters import DatasetWithEnumeratedTargets 8 | from .loaders import make_data_loader, make_dataset, SamplerType 9 | from .collate import collate_data_and_cast 10 | from .masking import MaskingGenerator 11 | from .augmentations import DataAugmentationDINO 12 | -------------------------------------------------------------------------------- /models/anydoor/dinov2/dinov2/data/adapters.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | from typing import Any, Tuple 8 | 9 | from torch.utils.data import Dataset 10 | 11 | 12 | class DatasetWithEnumeratedTargets(Dataset): 13 | def __init__(self, dataset): 14 | self._dataset = dataset 15 | 16 | def get_image_data(self, index: int) -> bytes: 17 | return self._dataset.get_image_data(index) 18 | 19 | def get_target(self, index: int) -> Tuple[Any, int]: 20 | target = self._dataset.get_target(index) 21 | return (index, target) 22 | 23 | def get_sample_decoder(self, index: int) -> Any: 24 | return self._dataset.get_sample_decoder(index) 25 | 26 | def __getitem__(self, index: int) -> Tuple[Any, Tuple[Any, int]]: 27 | image, target = self._dataset[index] 28 | target = index if target is None else target 29 | return image, (index, target) 30 | 31 | def __len__(self) -> int: 32 | return len(self._dataset) 33 | -------------------------------------------------------------------------------- /models/anydoor/dinov2/dinov2/data/collate.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
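# Usage sketch (illustrative; names other than collate_data_and_cast / MaskingGenerator
# are assumptions). Each sample is expected to be (crops_dict, target), where crops_dict
# carries the "global_crops" and "local_crops" lists produced by the DINO augmentation.
# With 2 global crops per sample and a 16x16 patch grid (n_tokens = 256):
#
#   collated = collate_data_and_cast(
#       samples_list,
#       mask_ratio_tuple=(0.1, 0.5),
#       mask_probability=0.5,
#       dtype=torch.half,
#       n_tokens=256,
#       mask_generator=MaskingGenerator(input_size=16, max_num_patches=128),
#   )
#   # collated["collated_global_crops"]: (2 * batch_size, 3, H, W), cast to dtype
#   # collated["collated_masks"]:        (2 * batch_size, 256) boolean patch masks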
6 | 7 | import torch 8 | import random 9 | 10 | 11 | def collate_data_and_cast(samples_list, mask_ratio_tuple, mask_probability, dtype, n_tokens=None, mask_generator=None): 12 | # dtype = torch.half # TODO: Remove 13 | 14 | n_global_crops = len(samples_list[0][0]["global_crops"]) 15 | n_local_crops = len(samples_list[0][0]["local_crops"]) 16 | 17 | collated_global_crops = torch.stack([s[0]["global_crops"][i] for i in range(n_global_crops) for s in samples_list]) 18 | 19 | collated_local_crops = torch.stack([s[0]["local_crops"][i] for i in range(n_local_crops) for s in samples_list]) 20 | 21 | B = len(collated_global_crops) 22 | N = n_tokens 23 | n_samples_masked = int(B * mask_probability) 24 | probs = torch.linspace(*mask_ratio_tuple, n_samples_masked + 1) 25 | upperbound = 0 26 | masks_list = [] 27 | for i in range(0, n_samples_masked): 28 | prob_min = probs[i] 29 | prob_max = probs[i + 1] 30 | masks_list.append(torch.BoolTensor(mask_generator(int(N * random.uniform(prob_min, prob_max))))) 31 | upperbound += int(N * prob_max) 32 | for i in range(n_samples_masked, B): 33 | masks_list.append(torch.BoolTensor(mask_generator(0))) 34 | 35 | random.shuffle(masks_list) 36 | 37 | collated_masks = torch.stack(masks_list).flatten(1) 38 | mask_indices_list = collated_masks.flatten().nonzero().flatten() 39 | 40 | masks_weight = (1 / collated_masks.sum(-1).clamp(min=1.0)).unsqueeze(-1).expand_as(collated_masks)[collated_masks] 41 | 42 | return { 43 | "collated_global_crops": collated_global_crops.to(dtype), 44 | "collated_local_crops": collated_local_crops.to(dtype), 45 | "collated_masks": collated_masks, 46 | "mask_indices_list": mask_indices_list, 47 | "masks_weight": masks_weight, 48 | "upperbound": upperbound, 49 | "n_masked_patches": torch.full((1,), fill_value=mask_indices_list.shape[0], dtype=torch.long), 50 | } 51 | -------------------------------------------------------------------------------- /models/anydoor/dinov2/dinov2/data/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | from .image_net import ImageNet 8 | from .image_net_22k import ImageNet22k 9 | -------------------------------------------------------------------------------- /models/anydoor/dinov2/dinov2/data/datasets/decoders.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | from io import BytesIO 8 | from typing import Any, Tuple 9 | 10 | from PIL import Image 11 | 12 | 13 | class Decoder: 14 | def decode(self) -> Any: 15 | raise NotImplementedError 16 | 17 | 18 | class ImageDataDecoder(Decoder): 19 | def __init__(self, image_data: bytes) -> None: 20 | self._image_data = image_data 21 | 22 | def decode(self) -> Image: 23 | f = BytesIO(self._image_data) 24 | return Image.open(f).convert(mode="RGB") 25 | 26 | 27 | class TargetDecoder(Decoder): 28 | def __init__(self, target: Any): 29 | self._target = target 30 | 31 | def decode(self) -> Any: 32 | return self._target 33 | 34 | 35 | class TupleDecoder(Decoder): 36 | def __init__(self, *decoders: Decoder): 37 | self._decoders: Tuple[Decoder, ...] 
= decoders 38 | 39 | def decode(self) -> Any: 40 | return (decoder.decode() for decoder in self._decoders) 41 | -------------------------------------------------------------------------------- /models/anydoor/dinov2/dinov2/data/datasets/extended.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | from typing import Any, Tuple 8 | 9 | from torchvision.datasets import VisionDataset 10 | 11 | from .decoders import Decoder, TargetDecoder, ImageDataDecoder, TupleDecoder 12 | 13 | 14 | class ExtendedVisionDataset(VisionDataset): 15 | def __init__(self, *args, **kwargs) -> None: 16 | super().__init__(*args, **kwargs) # type: ignore 17 | 18 | def get_image_data(self, index: int) -> bytes: 19 | raise NotImplementedError 20 | 21 | def get_target(self, index: int) -> Any: 22 | raise NotImplementedError 23 | 24 | def __getitem__(self, index: int) -> Tuple[Any, Any]: 25 | try: 26 | image_data = self.get_image_data(index) 27 | image = ImageDataDecoder(image_data).decode() 28 | except Exception as e: 29 | raise RuntimeError(f"can not read image for sample {index}") from e 30 | target = self.get_target(index) 31 | target = TargetDecoder(target).decode() 32 | 33 | if self.transforms is not None: 34 | image, target = self.transforms(image, target) 35 | 36 | return image, target 37 | 38 | def get_sample_decoder(self, index: int) -> Decoder: 39 | image_data = self.get_image_data(index) 40 | target = self.get_target(index) 41 | return TupleDecoder( 42 | ImageDataDecoder(image_data), 43 | TargetDecoder(target), 44 | ) 45 | 46 | def __len__(self) -> int: 47 | raise NotImplementedError 48 | -------------------------------------------------------------------------------- /models/anydoor/dinov2/dinov2/data/masking.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
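# Usage sketch (illustrative): for a 224x224 image with 14x14 patches the grid is 16x16,
# so one generator call produces a (16, 16) boolean array covering roughly the requested
# number of patches (never more), built from random rectangles of bounded aspect ratio:
#
#   gen = MaskingGenerator(input_size=16, max_num_patches=128)
#   mask = gen(num_masking_patches=77)   # np.ndarray of shape (16, 16), dtype bool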
6 | 7 | import random 8 | import math 9 | import numpy as np 10 | 11 | 12 | class MaskingGenerator: 13 | def __init__( 14 | self, 15 | input_size, 16 | num_masking_patches=None, 17 | min_num_patches=4, 18 | max_num_patches=None, 19 | min_aspect=0.3, 20 | max_aspect=None, 21 | ): 22 | if not isinstance(input_size, tuple): 23 | input_size = (input_size,) * 2 24 | self.height, self.width = input_size 25 | 26 | self.num_patches = self.height * self.width 27 | self.num_masking_patches = num_masking_patches 28 | 29 | self.min_num_patches = min_num_patches 30 | self.max_num_patches = num_masking_patches if max_num_patches is None else max_num_patches 31 | 32 | max_aspect = max_aspect or 1 / min_aspect 33 | self.log_aspect_ratio = (math.log(min_aspect), math.log(max_aspect)) 34 | 35 | def __repr__(self): 36 | repr_str = "Generator(%d, %d -> [%d ~ %d], max = %d, %.3f ~ %.3f)" % ( 37 | self.height, 38 | self.width, 39 | self.min_num_patches, 40 | self.max_num_patches, 41 | self.num_masking_patches, 42 | self.log_aspect_ratio[0], 43 | self.log_aspect_ratio[1], 44 | ) 45 | return repr_str 46 | 47 | def get_shape(self): 48 | return self.height, self.width 49 | 50 | def _mask(self, mask, max_mask_patches): 51 | delta = 0 52 | for _ in range(10): 53 | target_area = random.uniform(self.min_num_patches, max_mask_patches) 54 | aspect_ratio = math.exp(random.uniform(*self.log_aspect_ratio)) 55 | h = int(round(math.sqrt(target_area * aspect_ratio))) 56 | w = int(round(math.sqrt(target_area / aspect_ratio))) 57 | if w < self.width and h < self.height: 58 | top = random.randint(0, self.height - h) 59 | left = random.randint(0, self.width - w) 60 | 61 | num_masked = mask[top : top + h, left : left + w].sum() 62 | # Overlap 63 | if 0 < h * w - num_masked <= max_mask_patches: 64 | for i in range(top, top + h): 65 | for j in range(left, left + w): 66 | if mask[i, j] == 0: 67 | mask[i, j] = 1 68 | delta += 1 69 | 70 | if delta > 0: 71 | break 72 | return delta 73 | 74 | def __call__(self, num_masking_patches=0): 75 | mask = np.zeros(shape=self.get_shape(), dtype=bool) 76 | mask_count = 0 77 | while mask_count < num_masking_patches: 78 | max_mask_patches = num_masking_patches - mask_count 79 | max_mask_patches = min(max_mask_patches, self.max_num_patches) 80 | 81 | delta = self._mask(mask, max_mask_patches) 82 | if delta == 0: 83 | break 84 | else: 85 | mask_count += delta 86 | 87 | return mask 88 | -------------------------------------------------------------------------------- /models/anydoor/dinov2/dinov2/eval/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | -------------------------------------------------------------------------------- /models/anydoor/dinov2/dinov2/eval/setup.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
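# Usage sketch (illustrative): an eval entry point would typically parse the arguments
# defined below and then build the teacher backbone once, in eval mode on GPU:
#
#   args = get_args_parser(description="DINOv2 eval").parse_args()
#   model, autocast_dtype = setup_and_build_model(args)
#   # autocast_dtype mirrors compute_precision.teacher.backbone.mixed_precision.param_dtype
#   # (fp16 -> torch.half, bf16 -> torch.bfloat16, otherwise torch.float).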
6 | 7 | import argparse 8 | from typing import Any, List, Optional, Tuple 9 | 10 | import torch 11 | import torch.backends.cudnn as cudnn 12 | 13 | from dinov2.models import build_model_from_cfg 14 | from dinov2.utils.config import setup 15 | import dinov2.utils.utils as dinov2_utils 16 | 17 | 18 | def get_args_parser( 19 | description: Optional[str] = None, 20 | parents: Optional[List[argparse.ArgumentParser]] = [], 21 | add_help: bool = True, 22 | ): 23 | parser = argparse.ArgumentParser( 24 | description=description, 25 | parents=parents, 26 | add_help=add_help, 27 | ) 28 | parser.add_argument( 29 | "--config-file", 30 | type=str, 31 | help="Model configuration file", 32 | ) 33 | parser.add_argument( 34 | "--pretrained-weights", 35 | type=str, 36 | help="Pretrained model weights", 37 | ) 38 | parser.add_argument( 39 | "--output-dir", 40 | default="", 41 | type=str, 42 | help="Output directory to write results and logs", 43 | ) 44 | parser.add_argument( 45 | "--opts", 46 | help="Extra configuration options", 47 | default=[], 48 | nargs="+", 49 | ) 50 | return parser 51 | 52 | 53 | def get_autocast_dtype(config): 54 | teacher_dtype_str = config.compute_precision.teacher.backbone.mixed_precision.param_dtype 55 | if teacher_dtype_str == "fp16": 56 | return torch.half 57 | elif teacher_dtype_str == "bf16": 58 | return torch.bfloat16 59 | else: 60 | return torch.float 61 | 62 | 63 | def build_model_for_eval(config, pretrained_weights): 64 | model, _ = build_model_from_cfg(config, only_teacher=True) 65 | dinov2_utils.load_pretrained_weights(model, pretrained_weights, "teacher") 66 | model.eval() 67 | model.cuda() 68 | return model 69 | 70 | 71 | def setup_and_build_model(args) -> Tuple[Any, torch.dtype]: 72 | cudnn.benchmark = True 73 | config = setup(args) 74 | model = build_model_for_eval(config, args.pretrained_weights) 75 | autocast_dtype = get_autocast_dtype(config) 76 | return model, autocast_dtype 77 | -------------------------------------------------------------------------------- /models/anydoor/dinov2/dinov2/layers/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | from .dino_head import DINOHead 8 | from .mlp import Mlp 9 | from .patch_embed import PatchEmbed 10 | from .swiglu_ffn import SwiGLUFFN, SwiGLUFFNFused 11 | from .block import NestedTensorBlock 12 | from .attention import MemEffAttention 13 | -------------------------------------------------------------------------------- /models/anydoor/dinov2/dinov2/layers/attention.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
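# Usage sketch (illustrative): both attention variants below map (B, N, C) token
# sequences to (B, N, C), computing softmax(q @ k^T * head_dim**-0.5) @ v per head;
# MemEffAttention routes through xformers' memory_efficient_attention when available
# and otherwise falls back to the plain Attention path (which requires attn_bias=None).
#
#   attn = MemEffAttention(dim=768, num_heads=12)
#   y = attn(torch.randn(2, 197, 768))   # -> torch.Size([2, 197, 768])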
6 | 7 | # References: 8 | # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py 9 | # https://github.com/rwightman/pytorch-image-models/tree/master/timm/models/vision_transformer.py 10 | 11 | import logging 12 | 13 | from torch import Tensor 14 | from torch import nn 15 | 16 | 17 | logger = logging.getLogger("dinov2") 18 | 19 | 20 | try: 21 | from xformers.ops import memory_efficient_attention, unbind, fmha 22 | 23 | XFORMERS_AVAILABLE = True 24 | except ImportError: 25 | logger.warning("xFormers not available") 26 | XFORMERS_AVAILABLE = False 27 | 28 | 29 | class Attention(nn.Module): 30 | def __init__( 31 | self, 32 | dim: int, 33 | num_heads: int = 8, 34 | qkv_bias: bool = False, 35 | proj_bias: bool = True, 36 | attn_drop: float = 0.0, 37 | proj_drop: float = 0.0, 38 | ) -> None: 39 | super().__init__() 40 | self.num_heads = num_heads 41 | head_dim = dim // num_heads 42 | self.scale = head_dim**-0.5 43 | 44 | self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) 45 | self.attn_drop = nn.Dropout(attn_drop) 46 | self.proj = nn.Linear(dim, dim, bias=proj_bias) 47 | self.proj_drop = nn.Dropout(proj_drop) 48 | 49 | def forward(self, x: Tensor) -> Tensor: 50 | B, N, C = x.shape 51 | qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4) 52 | 53 | q, k, v = qkv[0] * self.scale, qkv[1], qkv[2] 54 | attn = q @ k.transpose(-2, -1) 55 | 56 | attn = attn.softmax(dim=-1) 57 | attn = self.attn_drop(attn) 58 | 59 | x = (attn @ v).transpose(1, 2).reshape(B, N, C) 60 | x = self.proj(x) 61 | x = self.proj_drop(x) 62 | return x 63 | 64 | 65 | class MemEffAttention(Attention): 66 | def forward(self, x: Tensor, attn_bias=None) -> Tensor: 67 | if not XFORMERS_AVAILABLE: 68 | assert attn_bias is None, "xFormers is required for nested tensors usage" 69 | return super().forward(x) 70 | 71 | B, N, C = x.shape 72 | qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads) 73 | 74 | q, k, v = unbind(qkv, 2) 75 | 76 | if attn_bias is not None: 77 | self_att_op = fmha.MemoryEfficientAttentionFlashAttentionOp 78 | else: 79 | self_att_op = None 80 | x = memory_efficient_attention(q, k, v, attn_bias=attn_bias, op=self_att_op) 81 | x = x.reshape([B, N, C]) 82 | 83 | x = self.proj(x) 84 | x = self.proj_drop(x) 85 | return x 86 | -------------------------------------------------------------------------------- /models/anydoor/dinov2/dinov2/layers/dino_head.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
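# Usage sketch (illustrative, with the prototype count used in ssl_default_config.yaml):
#
#   head = DINOHead(in_dim=1024, out_dim=65536)    # 3-layer MLP -> 256-d bottleneck
#   logits = head(torch.randn(4, 1024))            # -> torch.Size([4, 65536])
#
# The bottleneck output is L2-normalised before the weight-normalised last layer, so the
# prototype scores depend on the direction of the embedding rather than its magnitude.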
6 | 7 | import torch 8 | import torch.nn as nn 9 | from torch.nn.init import trunc_normal_ 10 | from torch.nn.utils import weight_norm 11 | 12 | 13 | class DINOHead(nn.Module): 14 | def __init__( 15 | self, 16 | in_dim, 17 | out_dim, 18 | use_bn=False, 19 | nlayers=3, 20 | hidden_dim=2048, 21 | bottleneck_dim=256, 22 | mlp_bias=True, 23 | ): 24 | super().__init__() 25 | nlayers = max(nlayers, 1) 26 | self.mlp = _build_mlp(nlayers, in_dim, bottleneck_dim, hidden_dim=hidden_dim, use_bn=use_bn, bias=mlp_bias) 27 | self.apply(self._init_weights) 28 | self.last_layer = weight_norm(nn.Linear(bottleneck_dim, out_dim, bias=False)) 29 | self.last_layer.weight_g.data.fill_(1) 30 | 31 | def _init_weights(self, m): 32 | if isinstance(m, nn.Linear): 33 | trunc_normal_(m.weight, std=0.02) 34 | if isinstance(m, nn.Linear) and m.bias is not None: 35 | nn.init.constant_(m.bias, 0) 36 | 37 | def forward(self, x): 38 | x = self.mlp(x) 39 | eps = 1e-6 if x.dtype == torch.float16 else 1e-12 40 | x = nn.functional.normalize(x, dim=-1, p=2, eps=eps) 41 | x = self.last_layer(x) 42 | return x 43 | 44 | 45 | def _build_mlp(nlayers, in_dim, bottleneck_dim, hidden_dim=None, use_bn=False, bias=True): 46 | if nlayers == 1: 47 | return nn.Linear(in_dim, bottleneck_dim, bias=bias) 48 | else: 49 | layers = [nn.Linear(in_dim, hidden_dim, bias=bias)] 50 | if use_bn: 51 | layers.append(nn.BatchNorm1d(hidden_dim)) 52 | layers.append(nn.GELU()) 53 | for _ in range(nlayers - 2): 54 | layers.append(nn.Linear(hidden_dim, hidden_dim, bias=bias)) 55 | if use_bn: 56 | layers.append(nn.BatchNorm1d(hidden_dim)) 57 | layers.append(nn.GELU()) 58 | layers.append(nn.Linear(hidden_dim, bottleneck_dim, bias=bias)) 59 | return nn.Sequential(*layers) 60 | -------------------------------------------------------------------------------- /models/anydoor/dinov2/dinov2/layers/drop_path.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | # References: 8 | # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py 9 | # https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/drop.py 10 | 11 | 12 | from torch import nn 13 | 14 | 15 | def drop_path(x, drop_prob: float = 0.0, training: bool = False): 16 | if drop_prob == 0.0 or not training: 17 | return x 18 | keep_prob = 1 - drop_prob 19 | shape = (x.shape[0],) + (1,) * (x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets 20 | random_tensor = x.new_empty(shape).bernoulli_(keep_prob) 21 | if keep_prob > 0.0: 22 | random_tensor.div_(keep_prob) 23 | output = x * random_tensor 24 | return output 25 | 26 | 27 | class DropPath(nn.Module): 28 | """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" 29 | 30 | def __init__(self, drop_prob=None): 31 | super(DropPath, self).__init__() 32 | self.drop_prob = drop_prob 33 | 34 | def forward(self, x): 35 | return drop_path(x, self.drop_prob, self.training) 36 | -------------------------------------------------------------------------------- /models/anydoor/dinov2/dinov2/layers/layer_scale.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 
3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | # Modified from: https://github.com/huggingface/pytorch-image-models/blob/main/timm/models/vision_transformer.py#L103-L110 8 | 9 | from typing import Union 10 | 11 | import torch 12 | from torch import Tensor 13 | from torch import nn 14 | 15 | 16 | class LayerScale(nn.Module): 17 | def __init__( 18 | self, 19 | dim: int, 20 | init_values: Union[float, Tensor] = 1e-5, 21 | inplace: bool = False, 22 | ) -> None: 23 | super().__init__() 24 | self.inplace = inplace 25 | self.gamma = nn.Parameter(init_values * torch.ones(dim)) 26 | 27 | def forward(self, x: Tensor) -> Tensor: 28 | return x.mul_(self.gamma) if self.inplace else x * self.gamma 29 | -------------------------------------------------------------------------------- /models/anydoor/dinov2/dinov2/layers/mlp.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | # References: 8 | # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py 9 | # https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/mlp.py 10 | 11 | 12 | from typing import Callable, Optional 13 | 14 | from torch import Tensor, nn 15 | 16 | 17 | class Mlp(nn.Module): 18 | def __init__( 19 | self, 20 | in_features: int, 21 | hidden_features: Optional[int] = None, 22 | out_features: Optional[int] = None, 23 | act_layer: Callable[..., nn.Module] = nn.GELU, 24 | drop: float = 0.0, 25 | bias: bool = True, 26 | ) -> None: 27 | super().__init__() 28 | out_features = out_features or in_features 29 | hidden_features = hidden_features or in_features 30 | self.fc1 = nn.Linear(in_features, hidden_features, bias=bias) 31 | self.act = act_layer() 32 | self.fc2 = nn.Linear(hidden_features, out_features, bias=bias) 33 | self.drop = nn.Dropout(drop) 34 | 35 | def forward(self, x: Tensor) -> Tensor: 36 | x = self.fc1(x) 37 | x = self.act(x) 38 | x = self.drop(x) 39 | x = self.fc2(x) 40 | x = self.drop(x) 41 | return x 42 | -------------------------------------------------------------------------------- /models/anydoor/dinov2/dinov2/layers/patch_embed.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | # References: 8 | # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py 9 | # https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/patch_embed.py 10 | 11 | from typing import Callable, Optional, Tuple, Union 12 | 13 | from torch import Tensor 14 | import torch.nn as nn 15 | 16 | 17 | def make_2tuple(x): 18 | if isinstance(x, tuple): 19 | assert len(x) == 2 20 | return x 21 | 22 | assert isinstance(x, int) 23 | return (x, x) 24 | 25 | 26 | class PatchEmbed(nn.Module): 27 | """ 28 | 2D image to patch embedding: (B,C,H,W) -> (B,N,D) 29 | 30 | Args: 31 | img_size: Image size. 32 | patch_size: Patch token size. 33 | in_chans: Number of input image channels. 34 | embed_dim: Number of linear projection output channels. 35 | norm_layer: Normalization layer. 
36 | """ 37 | 38 | def __init__( 39 | self, 40 | img_size: Union[int, Tuple[int, int]] = 224, 41 | patch_size: Union[int, Tuple[int, int]] = 16, 42 | in_chans: int = 3, 43 | embed_dim: int = 768, 44 | norm_layer: Optional[Callable] = None, 45 | flatten_embedding: bool = True, 46 | ) -> None: 47 | super().__init__() 48 | 49 | image_HW = make_2tuple(img_size) 50 | patch_HW = make_2tuple(patch_size) 51 | patch_grid_size = ( 52 | image_HW[0] // patch_HW[0], 53 | image_HW[1] // patch_HW[1], 54 | ) 55 | 56 | self.img_size = image_HW 57 | self.patch_size = patch_HW 58 | self.patches_resolution = patch_grid_size 59 | self.num_patches = patch_grid_size[0] * patch_grid_size[1] 60 | 61 | self.in_chans = in_chans 62 | self.embed_dim = embed_dim 63 | 64 | self.flatten_embedding = flatten_embedding 65 | 66 | self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_HW, stride=patch_HW) 67 | self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity() 68 | 69 | def forward(self, x: Tensor) -> Tensor: 70 | _, _, H, W = x.shape 71 | patch_H, patch_W = self.patch_size 72 | 73 | assert H % patch_H == 0, f"Input image height {H} is not a multiple of patch height {patch_H}" 74 | assert W % patch_W == 0, f"Input image width {W} is not a multiple of patch width: {patch_W}" 75 | 76 | x = self.proj(x) # B C H W 77 | H, W = x.size(2), x.size(3) 78 | x = x.flatten(2).transpose(1, 2) # B HW C 79 | x = self.norm(x) 80 | if not self.flatten_embedding: 81 | x = x.reshape(-1, H, W, self.embed_dim) # B H W C 82 | return x 83 | 84 | def flops(self) -> float: 85 | Ho, Wo = self.patches_resolution 86 | flops = Ho * Wo * self.embed_dim * self.in_chans * (self.patch_size[0] * self.patch_size[1]) 87 | if self.norm is not None: 88 | flops += Ho * Wo * self.embed_dim 89 | return flops 90 | -------------------------------------------------------------------------------- /models/anydoor/dinov2/dinov2/layers/swiglu_ffn.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
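# Usage sketch (illustrative): the FFN below projects to 2 * hidden_features, splits the
# result in half, and gates one half with SiLU of the other before projecting back:
#
#   ffn = SwiGLUFFN(in_features=384, hidden_features=1024)
#   y = ffn(torch.randn(2, 16, 384))   # -> torch.Size([2, 16, 384])
#
# SwiGLUFFNFused keeps the same interface but rounds int(hidden_features * 2 / 3) up to a
# multiple of 8, so the fused xformers SwiGLU kernel can be used when xformers is installed.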
6 | 7 | from typing import Callable, Optional 8 | 9 | from torch import Tensor, nn 10 | import torch.nn.functional as F 11 | 12 | 13 | class SwiGLUFFN(nn.Module): 14 | def __init__( 15 | self, 16 | in_features: int, 17 | hidden_features: Optional[int] = None, 18 | out_features: Optional[int] = None, 19 | act_layer: Callable[..., nn.Module] = None, 20 | drop: float = 0.0, 21 | bias: bool = True, 22 | ) -> None: 23 | super().__init__() 24 | out_features = out_features or in_features 25 | hidden_features = hidden_features or in_features 26 | self.w12 = nn.Linear(in_features, 2 * hidden_features, bias=bias) 27 | self.w3 = nn.Linear(hidden_features, out_features, bias=bias) 28 | 29 | def forward(self, x: Tensor) -> Tensor: 30 | x12 = self.w12(x) 31 | x1, x2 = x12.chunk(2, dim=-1) 32 | hidden = F.silu(x1) * x2 33 | return self.w3(hidden) 34 | 35 | 36 | try: 37 | from xformers.ops import SwiGLU 38 | 39 | XFORMERS_AVAILABLE = True 40 | except ImportError: 41 | SwiGLU = SwiGLUFFN 42 | XFORMERS_AVAILABLE = False 43 | 44 | 45 | class SwiGLUFFNFused(SwiGLU): 46 | def __init__( 47 | self, 48 | in_features: int, 49 | hidden_features: Optional[int] = None, 50 | out_features: Optional[int] = None, 51 | act_layer: Callable[..., nn.Module] = None, 52 | drop: float = 0.0, 53 | bias: bool = True, 54 | ) -> None: 55 | out_features = out_features or in_features 56 | hidden_features = hidden_features or in_features 57 | hidden_features = (int(hidden_features * 2 / 3) + 7) // 8 * 8 58 | super().__init__( 59 | in_features=in_features, 60 | hidden_features=hidden_features, 61 | out_features=out_features, 62 | bias=bias, 63 | ) 64 | -------------------------------------------------------------------------------- /models/anydoor/dinov2/dinov2/loss/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | from .dino_clstoken_loss import DINOLoss 8 | from .ibot_patch_loss import iBOTPatchLoss 9 | from .koleo_loss import KoLeoLoss 10 | -------------------------------------------------------------------------------- /models/anydoor/dinov2/dinov2/loss/koleo_loss.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | import logging 8 | 9 | import torch 10 | import torch.nn as nn 11 | import torch.nn.functional as F 12 | 13 | # import torch.distributed as dist 14 | 15 | 16 | logger = logging.getLogger("dinov2") 17 | 18 | 19 | class KoLeoLoss(nn.Module): 20 | """Kozachenko-Leonenko entropic loss regularizer from Sablayrolles et al. - 2018 - Spreading vectors for similarity search""" 21 | 22 | def __init__(self): 23 | super().__init__() 24 | self.pdist = nn.PairwiseDistance(2, eps=1e-8) 25 | 26 | def pairwise_NNs_inner(self, x): 27 | """ 28 | Pairwise nearest neighbors for L2-normalized vectors. 29 | Uses Torch rather than Faiss to remain on GPU. 
30 | """ 31 | # parwise dot products (= inverse distance) 32 | dots = torch.mm(x, x.t()) 33 | n = x.shape[0] 34 | dots.view(-1)[:: (n + 1)].fill_(-1) # Trick to fill diagonal with -1 35 | # max inner prod -> min distance 36 | _, I = torch.max(dots, dim=1) # noqa: E741 37 | return I 38 | 39 | def forward(self, student_output, eps=1e-8): 40 | """ 41 | Args: 42 | student_output (BxD): backbone output of student 43 | """ 44 | with torch.cuda.amp.autocast(enabled=False): 45 | student_output = F.normalize(student_output, eps=eps, p=2, dim=-1) 46 | I = self.pairwise_NNs_inner(student_output) # noqa: E741 47 | distances = self.pdist(student_output, student_output[I]) # BxD, BxD -> B 48 | loss = -torch.log(distances + eps).mean() 49 | return loss 50 | -------------------------------------------------------------------------------- /models/anydoor/dinov2/dinov2/models/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | import logging 8 | 9 | from . import vision_transformer as vits 10 | 11 | 12 | logger = logging.getLogger("dinov2") 13 | 14 | 15 | def build_model(args, only_teacher=False, img_size=224): 16 | args.arch = args.arch.removesuffix("_memeff") 17 | if "vit" in args.arch: 18 | vit_kwargs = dict( 19 | img_size=img_size, 20 | patch_size=args.patch_size, 21 | init_values=args.layerscale, 22 | ffn_layer=args.ffn_layer, 23 | block_chunks=args.block_chunks, 24 | qkv_bias=args.qkv_bias, 25 | proj_bias=args.proj_bias, 26 | ffn_bias=args.ffn_bias, 27 | ) 28 | teacher = vits.__dict__[args.arch](**vit_kwargs) 29 | if only_teacher: 30 | return teacher, teacher.embed_dim 31 | student = vits.__dict__[args.arch]( 32 | **vit_kwargs, 33 | drop_path_rate=args.drop_path_rate, 34 | drop_path_uniform=args.drop_path_uniform, 35 | ) 36 | embed_dim = student.embed_dim 37 | return student, teacher, embed_dim 38 | 39 | 40 | def build_model_from_cfg(cfg, only_teacher=False): 41 | return build_model(cfg.student, only_teacher=only_teacher, img_size=cfg.crops.global_crops_size) 42 | -------------------------------------------------------------------------------- /models/anydoor/dinov2/dinov2/run/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | -------------------------------------------------------------------------------- /models/anydoor/dinov2/dinov2/run/eval/knn.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | 7 | import logging 8 | import os 9 | import sys 10 | 11 | from dinov2.eval.knn import get_args_parser as get_knn_args_parser 12 | from dinov2.logging import setup_logging 13 | from dinov2.run.submit import get_args_parser, submit_jobs 14 | 15 | 16 | logger = logging.getLogger("dinov2") 17 | 18 | 19 | class Evaluator: 20 | def __init__(self, args): 21 | self.args = args 22 | 23 | def __call__(self): 24 | from dinov2.eval.knn import main as knn_main 25 | 26 | self._setup_args() 27 | knn_main(self.args) 28 | 29 | def checkpoint(self): 30 | import submitit 31 | 32 | logger.info(f"Requeuing {self.args}") 33 | empty = type(self)(self.args) 34 | return submitit.helpers.DelayedSubmission(empty) 35 | 36 | def _setup_args(self): 37 | import submitit 38 | 39 | job_env = submitit.JobEnvironment() 40 | self.args.output_dir = self.args.output_dir.replace("%j", str(job_env.job_id)) 41 | logger.info(f"Process group: {job_env.num_tasks} tasks, rank: {job_env.global_rank}") 42 | logger.info(f"Args: {self.args}") 43 | 44 | 45 | def main(): 46 | description = "Submitit launcher for DINOv2 k-NN evaluation" 47 | knn_args_parser = get_knn_args_parser(add_help=False) 48 | parents = [knn_args_parser] 49 | args_parser = get_args_parser(description=description, parents=parents) 50 | args = args_parser.parse_args() 51 | 52 | setup_logging() 53 | 54 | assert os.path.exists(args.config_file), "Configuration file does not exist!" 55 | submit_jobs(Evaluator, args, name="dinov2:knn") 56 | return 0 57 | 58 | 59 | if __name__ == "__main__": 60 | sys.exit(main()) 61 | -------------------------------------------------------------------------------- /models/anydoor/dinov2/dinov2/run/eval/linear.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | import logging 8 | import os 9 | import sys 10 | 11 | from dinov2.eval.linear import get_args_parser as get_linear_args_parser 12 | from dinov2.logging import setup_logging 13 | from dinov2.run.submit import get_args_parser, submit_jobs 14 | 15 | 16 | logger = logging.getLogger("dinov2") 17 | 18 | 19 | class Evaluator: 20 | def __init__(self, args): 21 | self.args = args 22 | 23 | def __call__(self): 24 | from dinov2.eval.linear import main as linear_main 25 | 26 | self._setup_args() 27 | linear_main(self.args) 28 | 29 | def checkpoint(self): 30 | import submitit 31 | 32 | logger.info(f"Requeuing {self.args}") 33 | empty = type(self)(self.args) 34 | return submitit.helpers.DelayedSubmission(empty) 35 | 36 | def _setup_args(self): 37 | import submitit 38 | 39 | job_env = submitit.JobEnvironment() 40 | self.args.output_dir = self.args.output_dir.replace("%j", str(job_env.job_id)) 41 | logger.info(f"Process group: {job_env.num_tasks} tasks, rank: {job_env.global_rank}") 42 | logger.info(f"Args: {self.args}") 43 | 44 | 45 | def main(): 46 | description = "Submitit launcher for DINOv2 linear evaluation" 47 | linear_args_parser = get_linear_args_parser(add_help=False) 48 | parents = [linear_args_parser] 49 | args_parser = get_args_parser(description=description, parents=parents) 50 | args = args_parser.parse_args() 51 | 52 | setup_logging() 53 | 54 | assert os.path.exists(args.config_file), "Configuration file does not exist!" 
55 | submit_jobs(Evaluator, args, name="dinov2:linear") 56 | return 0 57 | 58 | 59 | if __name__ == "__main__": 60 | sys.exit(main()) 61 | -------------------------------------------------------------------------------- /models/anydoor/dinov2/dinov2/run/eval/log_regression.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | import logging 8 | import os 9 | import sys 10 | 11 | from dinov2.eval.log_regression import get_args_parser as get_log_regression_args_parser 12 | from dinov2.logging import setup_logging 13 | from dinov2.run.submit import get_args_parser, submit_jobs 14 | 15 | 16 | logger = logging.getLogger("dinov2") 17 | 18 | 19 | class Evaluator: 20 | def __init__(self, args): 21 | self.args = args 22 | 23 | def __call__(self): 24 | from dinov2.eval.log_regression import main as log_regression_main 25 | 26 | self._setup_args() 27 | log_regression_main(self.args) 28 | 29 | def checkpoint(self): 30 | import submitit 31 | 32 | logger.info(f"Requeuing {self.args}") 33 | empty = type(self)(self.args) 34 | return submitit.helpers.DelayedSubmission(empty) 35 | 36 | def _setup_args(self): 37 | import submitit 38 | 39 | job_env = submitit.JobEnvironment() 40 | self.args.output_dir = self.args.output_dir.replace("%j", str(job_env.job_id)) 41 | logger.info(f"Process group: {job_env.num_tasks} tasks, rank: {job_env.global_rank}") 42 | logger.info(f"Args: {self.args}") 43 | 44 | 45 | def main(): 46 | description = "Submitit launcher for DINOv2 logistic evaluation" 47 | log_regression_args_parser = get_log_regression_args_parser(add_help=False) 48 | parents = [log_regression_args_parser] 49 | args_parser = get_args_parser(description=description, parents=parents) 50 | args = args_parser.parse_args() 51 | 52 | setup_logging() 53 | 54 | assert os.path.exists(args.config_file), "Configuration file does not exist!" 55 | submit_jobs(Evaluator, args, name="dinov2:logreg") 56 | return 0 57 | 58 | 59 | if __name__ == "__main__": 60 | sys.exit(main()) 61 | -------------------------------------------------------------------------------- /models/anydoor/dinov2/dinov2/run/train/train.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | 7 | import logging 8 | import os 9 | import sys 10 | 11 | from dinov2.logging import setup_logging 12 | from dinov2.train import get_args_parser as get_train_args_parser 13 | from dinov2.run.submit import get_args_parser, submit_jobs 14 | 15 | 16 | logger = logging.getLogger("dinov2") 17 | 18 | 19 | class Trainer(object): 20 | def __init__(self, args): 21 | self.args = args 22 | 23 | def __call__(self): 24 | from dinov2.train import main as train_main 25 | 26 | self._setup_args() 27 | train_main(self.args) 28 | 29 | def checkpoint(self): 30 | import submitit 31 | 32 | logger.info(f"Requeuing {self.args}") 33 | empty = type(self)(self.args) 34 | return submitit.helpers.DelayedSubmission(empty) 35 | 36 | def _setup_args(self): 37 | import submitit 38 | 39 | job_env = submitit.JobEnvironment() 40 | self.args.output_dir = self.args.output_dir.replace("%j", str(job_env.job_id)) 41 | logger.info(f"Process group: {job_env.num_tasks} tasks, rank: {job_env.global_rank}") 42 | logger.info(f"Args: {self.args}") 43 | 44 | 45 | def main(): 46 | description = "Submitit launcher for DINOv2 training" 47 | train_args_parser = get_train_args_parser(add_help=False) 48 | parents = [train_args_parser] 49 | args_parser = get_args_parser(description=description, parents=parents) 50 | args = args_parser.parse_args() 51 | 52 | setup_logging() 53 | 54 | assert os.path.exists(args.config_file), "Configuration file does not exist!" 55 | submit_jobs(Trainer, args, name="dinov2:train") 56 | return 0 57 | 58 | 59 | if __name__ == "__main__": 60 | sys.exit(main()) 61 | -------------------------------------------------------------------------------- /models/anydoor/dinov2/dinov2/train/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | from .train import get_args_parser, main 8 | from .ssl_meta_arch import SSLMetaArch 9 | -------------------------------------------------------------------------------- /models/anydoor/dinov2/dinov2/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | -------------------------------------------------------------------------------- /models/anydoor/dinov2/dinov2/utils/config.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
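# Worked example (illustrative) for the sqrt_wrt_1024 scaling rule applied below:
# with base_lr = 0.004, batch_size_per_gpu = 64 and 32 GPUs, the global batch is 2048,
# so optim.lr = 0.004 * sqrt(2048 / 1024) ~= 0.00566.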
6 | 7 | import math 8 | import logging 9 | import os 10 | 11 | from omegaconf import OmegaConf 12 | 13 | import dinov2.distributed as distributed 14 | from dinov2.logging import setup_logging 15 | from dinov2.utils import utils 16 | from dinov2.configs import dinov2_default_config 17 | 18 | 19 | logger = logging.getLogger("dinov2") 20 | 21 | 22 | def apply_scaling_rules_to_cfg(cfg): # to fix 23 | if cfg.optim.scaling_rule == "sqrt_wrt_1024": 24 | base_lr = cfg.optim.base_lr 25 | cfg.optim.lr = base_lr 26 | cfg.optim.lr *= math.sqrt(cfg.train.batch_size_per_gpu * distributed.get_global_size() / 1024.0) 27 | logger.info(f"sqrt scaling learning rate; base: {base_lr}, new: {cfg.optim.lr}") 28 | else: 29 | raise NotImplementedError 30 | return cfg 31 | 32 | 33 | def write_config(cfg, output_dir, name="config.yaml"): 34 | logger.info(OmegaConf.to_yaml(cfg)) 35 | saved_cfg_path = os.path.join(output_dir, name) 36 | with open(saved_cfg_path, "w") as f: 37 | OmegaConf.save(config=cfg, f=f) 38 | return saved_cfg_path 39 | 40 | 41 | def get_cfg_from_args(args): 42 | args.output_dir = os.path.abspath(args.output_dir) 43 | args.opts += [f"train.output_dir={args.output_dir}"] 44 | default_cfg = OmegaConf.create(dinov2_default_config) 45 | cfg = OmegaConf.load(args.config_file) 46 | cfg = OmegaConf.merge(default_cfg, cfg, OmegaConf.from_cli(args.opts)) 47 | return cfg 48 | 49 | 50 | def default_setup(args): 51 | distributed.enable(overwrite=True) 52 | seed = getattr(args, "seed", 0) 53 | rank = distributed.get_global_rank() 54 | 55 | global logger 56 | setup_logging(output=args.output_dir, level=logging.INFO) 57 | logger = logging.getLogger("dinov2") 58 | 59 | utils.fix_random_seeds(seed + rank) 60 | logger.info("git:\n {}\n".format(utils.get_sha())) 61 | logger.info("\n".join("%s: %s" % (k, str(v)) for k, v in sorted(dict(vars(args)).items()))) 62 | 63 | 64 | def setup(args): 65 | """ 66 | Create configs and perform basic setups. 67 | """ 68 | cfg = get_cfg_from_args(args) 69 | os.makedirs(args.output_dir, exist_ok=True) 70 | default_setup(args) 71 | apply_scaling_rules_to_cfg(cfg) 72 | write_config(cfg, args.output_dir) 73 | return cfg 74 | -------------------------------------------------------------------------------- /models/anydoor/dinov2/dinov2/utils/dtype.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | 7 | 8 | from typing import Dict, Union 9 | 10 | import numpy as np 11 | import torch 12 | 13 | 14 | TypeSpec = Union[str, np.dtype, torch.dtype] 15 | 16 | 17 | _NUMPY_TO_TORCH_DTYPE: Dict[np.dtype, torch.dtype] = { 18 | np.dtype("bool"): torch.bool, 19 | np.dtype("uint8"): torch.uint8, 20 | np.dtype("int8"): torch.int8, 21 | np.dtype("int16"): torch.int16, 22 | np.dtype("int32"): torch.int32, 23 | np.dtype("int64"): torch.int64, 24 | np.dtype("float16"): torch.float16, 25 | np.dtype("float32"): torch.float32, 26 | np.dtype("float64"): torch.float64, 27 | np.dtype("complex64"): torch.complex64, 28 | np.dtype("complex128"): torch.complex128, 29 | } 30 | 31 | 32 | def as_torch_dtype(dtype: TypeSpec) -> torch.dtype: 33 | if isinstance(dtype, torch.dtype): 34 | return dtype 35 | if isinstance(dtype, str): 36 | dtype = np.dtype(dtype) 37 | assert isinstance(dtype, np.dtype), f"Expected an instance of numpy dtype, got {type(dtype)}" 38 | return _NUMPY_TO_TORCH_DTYPE[dtype] 39 | -------------------------------------------------------------------------------- /models/anydoor/dinov2/pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.black] 2 | line-length = 120 3 | 4 | [tool.pylint.master] 5 | persistent = false 6 | score = false 7 | 8 | [tool.pylint.messages_control] 9 | disable = "all" 10 | enable = [ 11 | "miscellaneous", 12 | "similarities", 13 | ] 14 | 15 | [tool.pylint.similarities] 16 | ignore-comments = true 17 | ignore-docstrings = true 18 | ignore-imports = true 19 | min-similarity-lines = 8 20 | 21 | [tool.pylint.reports] 22 | reports = false 23 | 24 | [tool.pylint.miscellaneous] 25 | notes = [ 26 | "FIXME", 27 | "XXX", 28 | "TODO", 29 | ] 30 | -------------------------------------------------------------------------------- /models/anydoor/dinov2/requirements-dev.txt: -------------------------------------------------------------------------------- 1 | black==22.6.0 2 | flake8==5.0.4 3 | pylint==2.15.0 4 | -------------------------------------------------------------------------------- /models/anydoor/dinov2/requirements.txt: -------------------------------------------------------------------------------- 1 | --extra-index-url https://download.pytorch.org/whl/cu117 2 | torch==2.0.0 3 | torchvision==0.15.0 4 | omegaconf 5 | torchmetrics==0.10.3 6 | fvcore 7 | iopath 8 | xformers==0.0.18 9 | submitit 10 | --extra-index-url https://pypi.nvidia.com 11 | cuml-cu11 12 | -------------------------------------------------------------------------------- /models/anydoor/dinov2/scripts/lint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | if [ -n "$1" ]; then 4 | echo "linting \"$1\"" 5 | fi 6 | 7 | echo "running black" 8 | if [ -n "$1" ]; then 9 | black "$1" 10 | else 11 | black dinov2 12 | fi 13 | 14 | echo "running flake8" 15 | if [ -n "$1" ]; then 16 | flake8 "$1" 17 | else 18 | flake8 19 | fi 20 | 21 | echo "running pylint" 22 | if [ -n "$1" ]; then 23 | pylint "$1" 24 | else 25 | pylint dinov2 26 | fi 27 | 28 | exit 0 29 | -------------------------------------------------------------------------------- /models/anydoor/dinov2/setup.cfg: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 120 3 | ignore = E203,E501,W503 4 | per-file-ignores = 5 | __init__.py:F401 6 | -------------------------------------------------------------------------------- /models/anydoor/dinov2/setup.py:
-------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | from pathlib import Path 8 | import re 9 | from typing import List, Tuple 10 | 11 | from setuptools import setup, find_packages 12 | 13 | 14 | NAME = "dinov2" 15 | DESCRIPTION = "PyTorch code and models for the DINOv2 self-supervised learning method." 16 | 17 | URL = "https://github.com/facebookresearch/dinov2" 18 | AUTHOR = "FAIR" 19 | REQUIRES_PYTHON = ">=3.9.0" 20 | HERE = Path(__file__).parent 21 | 22 | 23 | try: 24 | with open(HERE / "README.md", encoding="utf-8") as f: 25 | long_description = "\n" + f.read() 26 | except FileNotFoundError: 27 | long_description = DESCRIPTION 28 | 29 | 30 | def get_requirements(path: str = HERE / "requirements.txt") -> Tuple[List[str], List[str]]: 31 | requirements = [] 32 | extra_indices = [] 33 | with open(path) as f: 34 | for line in f.readlines(): 35 | line = line.rstrip("\r\n") 36 | if line.startswith("--extra-index-url "): 37 | extra_indices.append(line[18:]) 38 | continue 39 | requirements.append(line) 40 | return requirements, extra_indices 41 | 42 | 43 | def get_package_version() -> str: 44 | with open(HERE / "dinov2/__init__.py") as f: 45 | result = re.search(r"^__version__ = ['\"]([^'\"]*)['\"]", f.read(), re.M) 46 | if result: 47 | return result.group(1) 48 | raise RuntimeError("Can't get package version") 49 | 50 | 51 | requirements, extra_indices = get_requirements() 52 | version = get_package_version() 53 | dev_requirements, _ = get_requirements(HERE / "requirements-dev.txt") 54 | 55 | 56 | setup( 57 | name=NAME, 58 | version=version, 59 | description=DESCRIPTION, 60 | long_description=long_description, 61 | long_description_content_type="text/markdown", 62 | author=AUTHOR, 63 | python_requires=REQUIRES_PYTHON, 64 | url=URL, 65 | packages=find_packages(), 66 | package_data={ 67 | "": ["*.yaml"], 68 | }, 69 | install_requires=requirements, 70 | dependency_links=extra_indices, 71 | extras_require={ 72 | "dev": dev_requirements, 73 | }, 74 | install_package_data=True, 75 | license="CC-BY-NC", 76 | license_files=("LICENSE",), 77 | classifiers=[ 78 | # Trove classifiers: https://github.com/pypa/trove-classifiers/blob/main/src/trove_classifiers/__init__.py 79 | "Development Status :: 3 - Alpha", 80 | "Intended Audience :: Developers", 81 | "Intended Audience :: Science/Research", 82 | "License :: Other/Proprietary License", 83 | "Programming Language :: Python :: 3.9", 84 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 85 | "Topic :: Software Development :: Libraries :: Python Modules", 86 | ], 87 | ) 88 | -------------------------------------------------------------------------------- /models/anydoor/examples/Gradio/BG/00.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/models/anydoor/examples/Gradio/BG/00.png -------------------------------------------------------------------------------- /models/anydoor/examples/Gradio/BG/01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/models/anydoor/examples/Gradio/BG/01.png 
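The get_requirements helper in the dinov2 setup.py above separates "--extra-index-url" lines from ordinary version pins so the URLs can be handed to setup() as dependency_links while everything else becomes install_requires. A minimal standalone sketch of that parsing, run against the requirements.txt listed above (REQUIREMENTS_TXT and parse_requirements are illustrative names, not part of the repository):

from typing import List, Tuple

# Mirrors the requirements.txt shown above; the names below are illustrative, not repository code.
REQUIREMENTS_TXT = """\
--extra-index-url https://download.pytorch.org/whl/cu117
torch==2.0.0
torchvision==0.15.0
omegaconf
torchmetrics==0.10.3
fvcore
iopath
xformers==0.0.18
submitit
--extra-index-url https://pypi.nvidia.com
cuml-cu11
"""


def parse_requirements(text: str) -> Tuple[List[str], List[str]]:
    requirements, extra_indices = [], []
    for line in text.splitlines():
        # "--extra-index-url <url>" lines become dependency_links; everything else is a normal pin.
        if line.startswith("--extra-index-url "):
            extra_indices.append(line[len("--extra-index-url "):])
        else:
            requirements.append(line)
    return requirements, extra_indices


if __name__ == "__main__":
    reqs, indices = parse_requirements(REQUIREMENTS_TXT)
    print(reqs)     # ['torch==2.0.0', 'torchvision==0.15.0', 'omegaconf', ...]
    print(indices)  # ['https://download.pytorch.org/whl/cu117', 'https://pypi.nvidia.com']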
-------------------------------------------------------------------------------- /models/anydoor/examples/Gradio/BG/02.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/models/anydoor/examples/Gradio/BG/02.png -------------------------------------------------------------------------------- /models/anydoor/examples/Gradio/BG/03.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/models/anydoor/examples/Gradio/BG/03.png -------------------------------------------------------------------------------- /models/anydoor/examples/Gradio/BG/04.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/models/anydoor/examples/Gradio/BG/04.jpg -------------------------------------------------------------------------------- /models/anydoor/examples/Gradio/BG/04.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/models/anydoor/examples/Gradio/BG/04.png -------------------------------------------------------------------------------- /models/anydoor/examples/Gradio/BG/06.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/models/anydoor/examples/Gradio/BG/06.png -------------------------------------------------------------------------------- /models/anydoor/examples/Gradio/BG/07.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/models/anydoor/examples/Gradio/BG/07.png -------------------------------------------------------------------------------- /models/anydoor/examples/Gradio/BG/08.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/models/anydoor/examples/Gradio/BG/08.jpg -------------------------------------------------------------------------------- /models/anydoor/examples/Gradio/BG/13.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/models/anydoor/examples/Gradio/BG/13.jpg -------------------------------------------------------------------------------- /models/anydoor/examples/Gradio/BG/17.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/models/anydoor/examples/Gradio/BG/17.jpg -------------------------------------------------------------------------------- /models/anydoor/examples/Gradio/BG/22.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/models/anydoor/examples/Gradio/BG/22.png -------------------------------------------------------------------------------- 
/models/anydoor/examples/Gradio/FG/00.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/models/anydoor/examples/Gradio/FG/00.jpg -------------------------------------------------------------------------------- /models/anydoor/examples/Gradio/FG/01.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/models/anydoor/examples/Gradio/FG/01.jpg -------------------------------------------------------------------------------- /models/anydoor/examples/Gradio/FG/04.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/models/anydoor/examples/Gradio/FG/04.jpg -------------------------------------------------------------------------------- /models/anydoor/examples/Gradio/FG/06.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/models/anydoor/examples/Gradio/FG/06.jpg -------------------------------------------------------------------------------- /models/anydoor/examples/Gradio/FG/07.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/models/anydoor/examples/Gradio/FG/07.png -------------------------------------------------------------------------------- /models/anydoor/examples/Gradio/FG/09.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/models/anydoor/examples/Gradio/FG/09.jpg -------------------------------------------------------------------------------- /models/anydoor/examples/Gradio/FG/18.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/models/anydoor/examples/Gradio/FG/18.png -------------------------------------------------------------------------------- /models/anydoor/examples/Gradio/FG/22.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/models/anydoor/examples/Gradio/FG/22.jpg -------------------------------------------------------------------------------- /models/anydoor/examples/Gradio/FG/25.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/models/anydoor/examples/Gradio/FG/25.png -------------------------------------------------------------------------------- /models/anydoor/examples/Gradio/FG/28.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/models/anydoor/examples/Gradio/FG/28.png -------------------------------------------------------------------------------- /models/anydoor/examples/Gradio/FG/33.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/models/anydoor/examples/Gradio/FG/33.png -------------------------------------------------------------------------------- /models/anydoor/examples/Gradio/FG/36.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/models/anydoor/examples/Gradio/FG/36.jpg -------------------------------------------------------------------------------- /models/anydoor/examples/Gradio/FG/39.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/models/anydoor/examples/Gradio/FG/39.jpg -------------------------------------------------------------------------------- /models/anydoor/examples/Gradio/FG/43.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/models/anydoor/examples/Gradio/FG/43.jpg -------------------------------------------------------------------------------- /models/anydoor/examples/Gradio/FG/44.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/models/anydoor/examples/Gradio/FG/44.jpg -------------------------------------------------------------------------------- /models/anydoor/examples/Gradio/FG/50.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/models/anydoor/examples/Gradio/FG/50.jpg -------------------------------------------------------------------------------- /models/anydoor/examples/TestDreamBooth/BG/000000047948_GT.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/models/anydoor/examples/TestDreamBooth/BG/000000047948_GT.png -------------------------------------------------------------------------------- /models/anydoor/examples/TestDreamBooth/BG/000000047948_mask.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/models/anydoor/examples/TestDreamBooth/BG/000000047948_mask.png -------------------------------------------------------------------------------- /models/anydoor/examples/TestDreamBooth/BG/000000309203_GT.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/models/anydoor/examples/TestDreamBooth/BG/000000309203_GT.png -------------------------------------------------------------------------------- /models/anydoor/examples/TestDreamBooth/BG/000000309203_mask.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/models/anydoor/examples/TestDreamBooth/BG/000000309203_mask.png 
-------------------------------------------------------------------------------- /models/anydoor/examples/TestDreamBooth/FG/00.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/models/anydoor/examples/TestDreamBooth/FG/00.png -------------------------------------------------------------------------------- /models/anydoor/examples/TestDreamBooth/FG/01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/models/anydoor/examples/TestDreamBooth/FG/01.png -------------------------------------------------------------------------------- /models/anydoor/examples/TestDreamBooth/FG/02.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/models/anydoor/examples/TestDreamBooth/FG/02.png -------------------------------------------------------------------------------- /models/anydoor/examples/TestDreamBooth/FG/03.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/models/anydoor/examples/TestDreamBooth/FG/03.png -------------------------------------------------------------------------------- /models/anydoor/examples/TestDreamBooth/GEN/gen_res.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/models/anydoor/examples/TestDreamBooth/GEN/gen_res.png -------------------------------------------------------------------------------- /models/anydoor/iseg/coarse_mask_refine.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/models/anydoor/iseg/coarse_mask_refine.pth -------------------------------------------------------------------------------- /models/anydoor/ldm/data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/models/anydoor/ldm/data/__init__.py -------------------------------------------------------------------------------- /models/anydoor/ldm/data/util.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from ldm.modules.midas.api import load_midas_transform 4 | 5 | 6 | class AddMiDaS(object): 7 | def __init__(self, model_type): 8 | super().__init__() 9 | self.transform = load_midas_transform(model_type) 10 | 11 | def pt2np(self, x): 12 | x = ((x + 1.0) * .5).detach().cpu().numpy() 13 | return x 14 | 15 | def np2pt(self, x): 16 | x = torch.from_numpy(x) * 2 - 1. 
17 | return x 18 | 19 | def __call__(self, sample): 20 | # sample['jpg'] is tensor hwc in [-1, 1] at this point 21 | x = self.pt2np(sample['jpg']) 22 | x = self.transform({"image": x})["image"] 23 | sample['midas_in'] = x 24 | return sample -------------------------------------------------------------------------------- /models/anydoor/ldm/models/diffusion/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/models/anydoor/ldm/models/diffusion/__init__.py -------------------------------------------------------------------------------- /models/anydoor/ldm/models/diffusion/dpm_solver/__init__.py: -------------------------------------------------------------------------------- 1 | from .sampler import DPMSolverSampler -------------------------------------------------------------------------------- /models/anydoor/ldm/models/diffusion/dpm_solver/sampler.py: -------------------------------------------------------------------------------- 1 | """SAMPLING ONLY.""" 2 | import torch 3 | 4 | from .dpm_solver import NoiseScheduleVP, model_wrapper, DPM_Solver 5 | 6 | 7 | MODEL_TYPES = { 8 | "eps": "noise", 9 | "v": "v" 10 | } 11 | 12 | 13 | class DPMSolverSampler(object): 14 | def __init__(self, model, **kwargs): 15 | super().__init__() 16 | self.model = model 17 | to_torch = lambda x: x.clone().detach().to(torch.float32).to(model.device) 18 | self.register_buffer('alphas_cumprod', to_torch(model.alphas_cumprod)) 19 | 20 | def register_buffer(self, name, attr): 21 | if type(attr) == torch.Tensor: 22 | if attr.device != torch.device("cuda"): 23 | attr = attr.to(torch.device("cuda")) 24 | setattr(self, name, attr) 25 | 26 | @torch.no_grad() 27 | def sample(self, 28 | S, 29 | batch_size, 30 | shape, 31 | conditioning=None, 32 | callback=None, 33 | normals_sequence=None, 34 | img_callback=None, 35 | quantize_x0=False, 36 | eta=0., 37 | mask=None, 38 | x0=None, 39 | temperature=1., 40 | noise_dropout=0., 41 | score_corrector=None, 42 | corrector_kwargs=None, 43 | verbose=True, 44 | x_T=None, 45 | log_every_t=100, 46 | unconditional_guidance_scale=1., 47 | unconditional_conditioning=None, 48 | # this has to come in the same format as the conditioning, # e.g. as encoded tokens, ... 
49 | **kwargs 50 | ): 51 | if conditioning is not None: 52 | if isinstance(conditioning, dict): 53 | cbs = conditioning[list(conditioning.keys())[0]].shape[0] 54 | if cbs != batch_size: 55 | print(f"Warning: Got {cbs} conditionings but batch-size is {batch_size}") 56 | else: 57 | if conditioning.shape[0] != batch_size: 58 | print(f"Warning: Got {conditioning.shape[0]} conditionings but batch-size is {batch_size}") 59 | 60 | # sampling 61 | C, H, W = shape 62 | size = (batch_size, C, H, W) 63 | 64 | print(f'Data shape for DPM-Solver sampling is {size}, sampling steps {S}') 65 | 66 | device = self.model.betas.device 67 | if x_T is None: 68 | img = torch.randn(size, device=device) 69 | else: 70 | img = x_T 71 | 72 | ns = NoiseScheduleVP('discrete', alphas_cumprod=self.alphas_cumprod) 73 | 74 | model_fn = model_wrapper( 75 | lambda x, t, c: self.model.apply_model(x, t, c), 76 | ns, 77 | model_type=MODEL_TYPES[self.model.parameterization], 78 | guidance_type="classifier-free", 79 | condition=conditioning, 80 | unconditional_condition=unconditional_conditioning, 81 | guidance_scale=unconditional_guidance_scale, 82 | ) 83 | 84 | dpm_solver = DPM_Solver(model_fn, ns, predict_x0=True, thresholding=False) 85 | x = dpm_solver.sample(img, steps=S, skip_type="time_uniform", method="multistep", order=2, lower_order_final=True) 86 | 87 | return x.to(device), None -------------------------------------------------------------------------------- /models/anydoor/ldm/models/diffusion/sampling_util.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | 4 | 5 | def append_dims(x, target_dims): 6 | """Appends dimensions to the end of a tensor until it has target_dims dimensions. 7 | From https://github.com/crowsonkb/k-diffusion/blob/master/k_diffusion/utils.py""" 8 | dims_to_append = target_dims - x.ndim 9 | if dims_to_append < 0: 10 | raise ValueError(f'input has {x.ndim} dims but target_dims is {target_dims}, which is less') 11 | return x[(...,) + (None,) * dims_to_append] 12 | 13 | 14 | def norm_thresholding(x0, value): 15 | s = append_dims(x0.pow(2).flatten(1).mean(1).sqrt().clamp(min=value), x0.ndim) 16 | return x0 * (value / s) 17 | 18 | 19 | def spatial_norm_thresholding(x0, value): 20 | # b c h w 21 | s = x0.pow(2).mean(1, keepdim=True).sqrt().clamp(min=value) 22 | return x0 * (value / s) -------------------------------------------------------------------------------- /models/anydoor/ldm/modules/diffusionmodules/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/models/anydoor/ldm/modules/diffusionmodules/__init__.py -------------------------------------------------------------------------------- /models/anydoor/ldm/modules/distributions/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/models/anydoor/ldm/modules/distributions/__init__.py -------------------------------------------------------------------------------- /models/anydoor/ldm/modules/encoders/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/models/anydoor/ldm/modules/encoders/__init__.py 
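append_dims in sampling_util.py above pads trailing singleton dimensions onto a tensor so that a per-sample scalar can broadcast against a (B, C, H, W) latent, which is how norm_thresholding rescales x0 by a per-sample norm. A small self-contained sketch of that broadcasting (it restates append_dims from the file above so the snippet runs with only torch installed):

import torch


def append_dims(x, target_dims):
    # Same logic as in sampling_util.py above: append trailing singleton dimensions.
    dims_to_append = target_dims - x.ndim
    if dims_to_append < 0:
        raise ValueError(f"input has {x.ndim} dims but target_dims is {target_dims}, which is less")
    return x[(...,) + (None,) * dims_to_append]


x0 = torch.randn(4, 4, 64, 64)                         # batch of latents, (B, C, H, W)
per_sample_scale = torch.tensor([0.5, 1.0, 1.5, 2.0])  # one scalar per batch element, shape (B,)

scale = append_dims(per_sample_scale, x0.ndim)
print(scale.shape)          # torch.Size([4, 1, 1, 1])
print((x0 * scale).shape)   # torch.Size([4, 4, 64, 64]); each sample is scaled by its own factor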
-------------------------------------------------------------------------------- /models/anydoor/ldm/modules/image_degradation/__init__.py: -------------------------------------------------------------------------------- 1 | from ldm.modules.image_degradation.bsrgan import degradation_bsrgan_variant as degradation_fn_bsr 2 | from ldm.modules.image_degradation.bsrgan_light import degradation_bsrgan_variant as degradation_fn_bsr_light 3 | -------------------------------------------------------------------------------- /models/anydoor/ldm/modules/image_degradation/utils/test.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/models/anydoor/ldm/modules/image_degradation/utils/test.png -------------------------------------------------------------------------------- /models/anydoor/ldm/modules/midas/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/models/anydoor/ldm/modules/midas/__init__.py -------------------------------------------------------------------------------- /models/anydoor/ldm/modules/midas/midas/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/models/anydoor/ldm/modules/midas/midas/__init__.py -------------------------------------------------------------------------------- /models/anydoor/ldm/modules/midas/midas/base_model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | class BaseModel(torch.nn.Module): 5 | def load(self, path): 6 | """Load model from file. 7 | 8 | Args: 9 | path (str): file path 10 | """ 11 | parameters = torch.load(path, map_location=torch.device('cpu')) 12 | 13 | if "optimizer" in parameters: 14 | parameters = parameters["model"] 15 | 16 | self.load_state_dict(parameters) 17 | -------------------------------------------------------------------------------- /models/anydoor/ldm/modules/midas/midas/midas_net.py: -------------------------------------------------------------------------------- 1 | """MidashNet: Network for monocular depth estimation trained by mixing several datasets. 2 | This file contains code that is adapted from 3 | https://github.com/thomasjpfan/pytorch_refinenet/blob/master/pytorch_refinenet/refinenet/refinenet_4cascade.py 4 | """ 5 | import torch 6 | import torch.nn as nn 7 | 8 | from .base_model import BaseModel 9 | from .blocks import FeatureFusionBlock, Interpolate, _make_encoder 10 | 11 | 12 | class MidasNet(BaseModel): 13 | """Network for monocular depth estimation. 14 | """ 15 | 16 | def __init__(self, path=None, features=256, non_negative=True): 17 | """Init. 18 | 19 | Args: 20 | path (str, optional): Path to saved model. Defaults to None. 21 | features (int, optional): Number of features. Defaults to 256. 22 | backbone (str, optional): Backbone network for encoder. 
Defaults to resnet50 23 | """ 24 | print("Loading weights: ", path) 25 | 26 | super(MidasNet, self).__init__() 27 | 28 | use_pretrained = False if path is None else True 29 | 30 | self.pretrained, self.scratch = _make_encoder(backbone="resnext101_wsl", features=features, use_pretrained=use_pretrained) 31 | 32 | self.scratch.refinenet4 = FeatureFusionBlock(features) 33 | self.scratch.refinenet3 = FeatureFusionBlock(features) 34 | self.scratch.refinenet2 = FeatureFusionBlock(features) 35 | self.scratch.refinenet1 = FeatureFusionBlock(features) 36 | 37 | self.scratch.output_conv = nn.Sequential( 38 | nn.Conv2d(features, 128, kernel_size=3, stride=1, padding=1), 39 | Interpolate(scale_factor=2, mode="bilinear"), 40 | nn.Conv2d(128, 32, kernel_size=3, stride=1, padding=1), 41 | nn.ReLU(True), 42 | nn.Conv2d(32, 1, kernel_size=1, stride=1, padding=0), 43 | nn.ReLU(True) if non_negative else nn.Identity(), 44 | ) 45 | 46 | if path: 47 | self.load(path) 48 | 49 | def forward(self, x): 50 | """Forward pass. 51 | 52 | Args: 53 | x (tensor): input data (image) 54 | 55 | Returns: 56 | tensor: depth 57 | """ 58 | 59 | layer_1 = self.pretrained.layer1(x) 60 | layer_2 = self.pretrained.layer2(layer_1) 61 | layer_3 = self.pretrained.layer3(layer_2) 62 | layer_4 = self.pretrained.layer4(layer_3) 63 | 64 | layer_1_rn = self.scratch.layer1_rn(layer_1) 65 | layer_2_rn = self.scratch.layer2_rn(layer_2) 66 | layer_3_rn = self.scratch.layer3_rn(layer_3) 67 | layer_4_rn = self.scratch.layer4_rn(layer_4) 68 | 69 | path_4 = self.scratch.refinenet4(layer_4_rn) 70 | path_3 = self.scratch.refinenet3(path_4, layer_3_rn) 71 | path_2 = self.scratch.refinenet2(path_3, layer_2_rn) 72 | path_1 = self.scratch.refinenet1(path_2, layer_1_rn) 73 | 74 | out = self.scratch.output_conv(path_1) 75 | 76 | return torch.squeeze(out, dim=1) 77 | -------------------------------------------------------------------------------- /models/anydoor/requirements.txt: -------------------------------------------------------------------------------- 1 | albumentations==1.3.0 2 | einops==0.3.0 3 | fvcore==0.1.5.post20221221 4 | gradio==3.39.0 5 | numpy==1.23.1 6 | omegaconf==2.1.1 7 | open_clip_torch==2.17.1 8 | opencv_contrib_python==4.3.0.36 9 | opencv_python==4.7.0.72 10 | opencv_python_headless==4.7.0.72 11 | Pillow==9.4.0 12 | pytorch_lightning==1.5.0 13 | safetensors==0.2.7 14 | scipy==1.9.1 15 | setuptools==66.0.0 16 | share==1.0.4 17 | submitit==1.5.1 18 | timm==0.6.12 19 | torch==2.0.0 20 | torchmetrics==0.6.0 21 | tqdm==4.65.0 22 | transformers==4.19.2 23 | xformers==0.0.18 24 | -------------------------------------------------------------------------------- /models/anydoor/run_dataset_debug.py: -------------------------------------------------------------------------------- 1 | from datasets.ytb_vos import YoutubeVOSDataset 2 | from datasets.ytb_vis import YoutubeVISDataset 3 | from datasets.saliency_modular import SaliencyDataset 4 | from datasets.vipseg import VIPSegDataset 5 | from datasets.mvimagenet import MVImageNetDataset 6 | from datasets.sam import SAMDataset 7 | from datasets.dreambooth import DreamBoothDataset 8 | from datasets.uvo import UVODataset 9 | from datasets.uvo_val import UVOValDataset 10 | from datasets.mose import MoseDataset 11 | from datasets.vitonhd import VitonHDDataset 12 | from datasets.fashiontryon import FashionTryonDataset 13 | from datasets.lvis import LvisDataset 14 | from torch.utils.data import ConcatDataset 15 | from torch.utils.data import DataLoader 16 | import numpy as np 17 | import cv2 18 | from 
omegaconf import OmegaConf 19 | 20 | # Datasets 21 | DConf = OmegaConf.load('./configs/datasets.yaml') 22 | dataset1 = YoutubeVOSDataset(**DConf.Train.YoutubeVOS) 23 | dataset2 = SaliencyDataset(**DConf.Train.Saliency) 24 | dataset3 = VIPSegDataset(**DConf.Train.VIPSeg) 25 | dataset4 = YoutubeVISDataset(**DConf.Train.YoutubeVIS) 26 | dataset5 = MVImageNetDataset(**DConf.Train.MVImageNet) 27 | dataset6 = SAMDataset(**DConf.Train.SAM) 28 | dataset7 = UVODataset(**DConf.Train.UVO.train) 29 | dataset8 = VitonHDDataset(**DConf.Train.VitonHD) 30 | dataset9 = UVOValDataset(**DConf.Train.UVO.val) 31 | dataset10 = MoseDataset(**DConf.Train.Mose) 32 | dataset11 = FashionTryonDataset(**DConf.Train.FashionTryon) 33 | dataset12 = LvisDataset(**DConf.Train.Lvis) 34 | 35 | dataset = dataset5 36 | 37 | 38 | def vis_sample(item): 39 | ref = item['ref']* 255 40 | tar = item['jpg'] * 127.5 + 127.5 41 | hint = item['hint'] * 127.5 + 127.5 42 | step = item['time_steps'] 43 | print(ref.shape, tar.shape, hint.shape, step.shape) 44 | 45 | ref = ref[0].numpy() 46 | tar = tar[0].numpy() 47 | hint_image = hint[0, :,:,:-1].numpy() 48 | hint_mask = hint[0, :,:,-1].numpy() 49 | hint_mask = np.stack([hint_mask,hint_mask,hint_mask],-1) 50 | ref = cv2.resize(ref.astype(np.uint8), (512,512)) 51 | vis = cv2.hconcat([ref.astype(np.float32), hint_image.astype(np.float32), hint_mask.astype(np.float32), tar.astype(np.float32) ]) 52 | cv2.imwrite('sample_vis.jpg',vis[:,:,::-1]) 53 | 54 | 55 | dataloader = DataLoader(dataset, num_workers=8, batch_size=4, shuffle=True) 56 | print('len dataloader: ', len(dataloader)) 57 | for data in dataloader: 58 | vis_sample(data) 59 | 60 | 61 | -------------------------------------------------------------------------------- /models/anydoor/run_train_anydoor.py: -------------------------------------------------------------------------------- 1 | import pytorch_lightning as pl 2 | from torch.utils.data import DataLoader 3 | from datasets.ytb_vos import YoutubeVOSDataset 4 | from datasets.ytb_vis import YoutubeVISDataset 5 | from datasets.saliency_modular import SaliencyDataset 6 | from datasets.vipseg import VIPSegDataset 7 | from datasets.mvimagenet import MVImageNetDataset 8 | from datasets.sam import SAMDataset 9 | from datasets.uvo import UVODataset 10 | from datasets.uvo_val import UVOValDataset 11 | from datasets.mose import MoseDataset 12 | from datasets.vitonhd import VitonHDDataset 13 | from datasets.fashiontryon import FashionTryonDataset 14 | from datasets.lvis import LvisDataset 15 | from cldm.logger import ImageLogger 16 | from cldm.model import create_model, load_state_dict 17 | from torch.utils.data import ConcatDataset 18 | from cldm.hack import disable_verbosity, enable_sliced_attention 19 | from omegaconf import OmegaConf 20 | 21 | save_memory = False 22 | disable_verbosity() 23 | if save_memory: 24 | enable_sliced_attention() 25 | 26 | # Configs 27 | resume_path = 'path/to/weight' 28 | batch_size = 16 29 | logger_freq = 1000 30 | learning_rate = 1e-5 31 | sd_locked = False 32 | only_mid_control = False 33 | n_gpus = 2 34 | accumulate_grad_batches=1 35 | 36 | # First use cpu to load models. Pytorch Lightning will automatically move it to GPUs. 
37 | model = create_model('./configs/anydoor.yaml').cpu() 38 | model.load_state_dict(load_state_dict(resume_path, location='cpu')) 39 | model.learning_rate = learning_rate 40 | model.sd_locked = sd_locked 41 | model.only_mid_control = only_mid_control 42 | 43 | # Datasets 44 | DConf = OmegaConf.load('./configs/datasets.yaml') 45 | dataset1 = YoutubeVOSDataset(**DConf.Train.YoutubeVOS) 46 | dataset2 = SaliencyDataset(**DConf.Train.Saliency) 47 | dataset3 = VIPSegDataset(**DConf.Train.VIPSeg) 48 | dataset4 = YoutubeVISDataset(**DConf.Train.YoutubeVIS) 49 | dataset5 = MVImageNetDataset(**DConf.Train.MVImageNet) 50 | dataset6 = SAMDataset(**DConf.Train.SAM) 51 | dataset7 = UVODataset(**DConf.Train.UVO.train) 52 | dataset8 = VitonHDDataset(**DConf.Train.VitonHD) 53 | dataset9 = UVOValDataset(**DConf.Train.UVO.val) 54 | dataset10 = MoseDataset(**DConf.Train.Mose) 55 | dataset11 = FashionTryonDataset(**DConf.Train.FashionTryon) 56 | dataset12 = LvisDataset(**DConf.Train.Lvis) 57 | 58 | image_data = [dataset2, dataset6, dataset12] 59 | video_data = [dataset1, dataset3, dataset4, dataset7, dataset9, dataset10 ] 60 | tryon_data = [dataset8, dataset11] 61 | threed_data = [dataset5] 62 | 63 | # The ratio of each dataset is adjusted by setting the __len__ 64 | dataset = ConcatDataset( image_data + video_data + tryon_data + threed_data + video_data + tryon_data + threed_data ) 65 | dataloader = DataLoader(dataset, num_workers=8, batch_size=batch_size, shuffle=True) 66 | logger = ImageLogger(batch_frequency=logger_freq) 67 | trainer = pl.Trainer(gpus=n_gpus, strategy="ddp", precision=16, accelerator="gpu", callbacks=[logger], progress_bar_refresh_rate=1, accumulate_grad_batches=accumulate_grad_batches) 68 | 69 | # Train! 70 | trainer.fit(model, dataloader) 71 | -------------------------------------------------------------------------------- /models/anydoor/scripts/convert_weight.sh: -------------------------------------------------------------------------------- 1 | python tool_add_control_sd21.py path/v2-1_512-ema-pruned.ckpt path/control_sd21_ini.ckpt -------------------------------------------------------------------------------- /models/anydoor/scripts/inference.sh: -------------------------------------------------------------------------------- 1 | unset WORLD_SIZE 2 | python run_inference.py -------------------------------------------------------------------------------- /models/anydoor/scripts/train.sh: -------------------------------------------------------------------------------- 1 | unset WORLD_SIZE 2 | python run_train_anydoor.py -------------------------------------------------------------------------------- /models/anydoor/tool_add_control_sd21.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | 4 | assert len(sys.argv) == 3, 'Args are wrong.' 5 | 6 | input_path = sys.argv[1] 7 | output_path = sys.argv[2] 8 | 9 | assert os.path.exists(input_path), 'Input model does not exist.' 10 | assert not os.path.exists(output_path), 'Output filename already exists.' 11 | assert os.path.exists(os.path.dirname(output_path)), 'Output path is not valid.' 
12 | 13 | import torch 14 | from share import * 15 | from cldm.model import create_model 16 | 17 | 18 | def get_node_name(name, parent_name): 19 | if len(name) <= len(parent_name): 20 | return False, '' 21 | p = name[:len(parent_name)] 22 | if p != parent_name: 23 | return False, '' 24 | return True, name[len(parent_name):] 25 | 26 | 27 | model = create_model(config_path='./models/anydoor.yaml') 28 | 29 | pretrained_weights = torch.load(input_path) 30 | if 'state_dict' in pretrained_weights: 31 | pretrained_weights = pretrained_weights['state_dict'] 32 | 33 | scratch_dict = model.state_dict() 34 | 35 | target_dict = {} 36 | for k in scratch_dict.keys(): 37 | 38 | is_control, name = get_node_name(k, 'control_') 39 | if 'control_model.input_blocks.0.0' in k: 40 | print('skipped key: ', k) 41 | continue 42 | 43 | if is_control: 44 | copy_k = 'model.diffusion_' + name 45 | else: 46 | copy_k = k 47 | if copy_k in pretrained_weights: 48 | target_dict[k] = pretrained_weights[copy_k].clone() 49 | else: 50 | target_dict[k] = scratch_dict[k].clone() 51 | print(f'These weights are newly added: {k}') 52 | 53 | model.load_state_dict(target_dict, strict=False) 54 | torch.save(model.state_dict(), output_path) 55 | print('Done.') 56 | -------------------------------------------------------------------------------- /models/canny/__pycache__/canny_filter.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/models/canny/__pycache__/canny_filter.cpython-310.pyc -------------------------------------------------------------------------------- /models/canny/__pycache__/filter.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/models/canny/__pycache__/filter.cpython-310.pyc -------------------------------------------------------------------------------- /models/canny/__pycache__/gaussian.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/models/canny/__pycache__/gaussian.cpython-310.pyc -------------------------------------------------------------------------------- /models/canny/__pycache__/kernels.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/models/canny/__pycache__/kernels.cpython-310.pyc -------------------------------------------------------------------------------- /models/canny/__pycache__/sobel.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/models/canny/__pycache__/sobel.cpython-310.pyc -------------------------------------------------------------------------------- /models/ctrl_adapter/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/models/ctrl_adapter/__init__.py -------------------------------------------------------------------------------- /models/depth_completion_net/deformconv.py: 
-------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torch.nn import init as init 4 | from torch.nn.modules.utils import _pair, _single 5 | import math 6 | 7 | class ModulatedDeformConv2d(nn.Module): 8 | def __init__(self, 9 | in_channels, 10 | out_channels, 11 | kernel_size, 12 | stride=1, 13 | padding=0, 14 | dilation=1, 15 | groups=1, 16 | deform_groups=1, 17 | bias=True): 18 | super(ModulatedDeformConv2d, self).__init__() 19 | 20 | self.in_channels = in_channels 21 | self.out_channels = out_channels 22 | self.kernel_size = _pair(kernel_size) 23 | self.stride = stride 24 | self.padding = padding 25 | self.dilation = dilation 26 | self.groups = groups 27 | self.deform_groups = deform_groups 28 | self.with_bias = bias 29 | # enable compatibility with nn.Conv2d 30 | self.transposed = False 31 | self.output_padding = _single(0) 32 | 33 | self.weight = nn.Parameter(torch.Tensor(out_channels, in_channels // groups, *self.kernel_size)) 34 | if bias: 35 | self.bias = nn.Parameter(torch.Tensor(out_channels)) 36 | else: 37 | self.register_parameter('bias', None) 38 | self.init_weights() 39 | 40 | def init_weights(self): 41 | n = self.in_channels 42 | for k in self.kernel_size: 43 | n *= k 44 | stdv = 1. / math.sqrt(n) 45 | self.weight.data.uniform_(-stdv, stdv) 46 | if self.bias is not None: 47 | self.bias.data.zero_() 48 | 49 | if hasattr(self, 'conv_offset'): 50 | self.conv_offset.weight.data.zero_() 51 | self.conv_offset.bias.data.zero_() 52 | 53 | def forward(self, x, offset, mask): 54 | pass -------------------------------------------------------------------------------- /models/midas/backbones/__pycache__/beit.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/models/midas/backbones/__pycache__/beit.cpython-310.pyc -------------------------------------------------------------------------------- /models/midas/backbones/__pycache__/levit.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/models/midas/backbones/__pycache__/levit.cpython-310.pyc -------------------------------------------------------------------------------- /models/midas/backbones/__pycache__/swin.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/models/midas/backbones/__pycache__/swin.cpython-310.pyc -------------------------------------------------------------------------------- /models/midas/backbones/__pycache__/swin2.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/models/midas/backbones/__pycache__/swin2.cpython-310.pyc -------------------------------------------------------------------------------- /models/midas/backbones/__pycache__/swin_common.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/models/midas/backbones/__pycache__/swin_common.cpython-310.pyc 
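ModulatedDeformConv2d in deformconv.py above only defines the weights and their initialization; forward(x, offset, mask) is left as a stub for subclasses in the depth-completion network to fill in. As a rough illustration of the tensor shapes such a subclass would have to produce (not the repository's actual forward), the offsets carry 2 * deform_groups * kH * kW channels and the modulation masks deform_groups * kH * kW channels, which is the layout expected by torchvision's deform_conv2d. A sketch assuming torchvision >= 0.10 (the version that added the mask argument):

import torch
import torchvision.ops as ops

# Illustrative shapes only; this is not the repository's forward implementation.
B, C_in, C_out, H, W = 1, 8, 16, 32, 32
kH = kW = 3
deform_groups = 1

x = torch.randn(B, C_in, H, W)
weight = torch.randn(C_out, C_in, kH, kW)                            # matches ModulatedDeformConv2d.weight
offset = torch.zeros(B, 2 * deform_groups * kH * kW, H, W)           # (dy, dx) per kernel tap and location
mask = torch.sigmoid(torch.zeros(B, deform_groups * kH * kW, H, W))  # modulation weights in (0, 1)

# With zero offsets and a constant 0.5 mask this reduces to a scaled ordinary 3x3 convolution.
out = ops.deform_conv2d(x, offset, weight, padding=1, mask=mask)
print(out.shape)  # torch.Size([1, 16, 32, 32])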
-------------------------------------------------------------------------------- /models/midas/backbones/__pycache__/utils.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/models/midas/backbones/__pycache__/utils.cpython-310.pyc -------------------------------------------------------------------------------- /models/midas/backbones/__pycache__/vit.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/models/midas/backbones/__pycache__/vit.cpython-310.pyc -------------------------------------------------------------------------------- /models/midas/backbones/next_vit.py: -------------------------------------------------------------------------------- 1 | import timm 2 | 3 | import torch.nn as nn 4 | 5 | from pathlib import Path 6 | from .utils import activations, forward_default, get_activation 7 | 8 | from ..external.next_vit.classification.nextvit import * 9 | 10 | 11 | def forward_next_vit(pretrained, x): 12 | return forward_default(pretrained, x, "forward") 13 | 14 | 15 | def _make_next_vit_backbone( 16 | model, 17 | hooks=[2, 6, 36, 39], 18 | ): 19 | pretrained = nn.Module() 20 | 21 | pretrained.model = model 22 | pretrained.model.features[hooks[0]].register_forward_hook(get_activation("1")) 23 | pretrained.model.features[hooks[1]].register_forward_hook(get_activation("2")) 24 | pretrained.model.features[hooks[2]].register_forward_hook(get_activation("3")) 25 | pretrained.model.features[hooks[3]].register_forward_hook(get_activation("4")) 26 | 27 | pretrained.activations = activations 28 | 29 | return pretrained 30 | 31 | 32 | def _make_pretrained_next_vit_large_6m(hooks=None): 33 | model = timm.create_model("nextvit_large") 34 | 35 | hooks = [2, 6, 36, 39] if hooks == None else hooks 36 | return _make_next_vit_backbone( 37 | model, 38 | hooks=hooks, 39 | ) 40 | -------------------------------------------------------------------------------- /models/midas/backbones/swin.py: -------------------------------------------------------------------------------- 1 | import timm 2 | 3 | from .swin_common import _make_swin_backbone 4 | 5 | 6 | def _make_pretrained_swinl12_384(pretrained, hooks=None): 7 | model = timm.create_model("swin_large_patch4_window12_384", pretrained=pretrained) 8 | 9 | hooks = [1, 1, 17, 1] if hooks == None else hooks 10 | return _make_swin_backbone( 11 | model, 12 | hooks=hooks 13 | ) 14 | -------------------------------------------------------------------------------- /models/midas/backbones/swin2.py: -------------------------------------------------------------------------------- 1 | import timm 2 | 3 | from .swin_common import _make_swin_backbone 4 | 5 | 6 | def _make_pretrained_swin2l24_384(pretrained, hooks=None): 7 | model = timm.create_model("swinv2_large_window12to24_192to384_22kft1k", pretrained=pretrained) 8 | 9 | hooks = [1, 1, 17, 1] if hooks == None else hooks 10 | return _make_swin_backbone( 11 | model, 12 | hooks=hooks 13 | ) 14 | 15 | 16 | def _make_pretrained_swin2b24_384(pretrained, hooks=None): 17 | model = timm.create_model("swinv2_base_window12to24_192to384_22kft1k", pretrained=pretrained) 18 | 19 | hooks = [1, 1, 17, 1] if hooks == None else hooks 20 | return _make_swin_backbone( 21 | model, 22 | hooks=hooks 23 | ) 24 | 25 | 26 | def 
_make_pretrained_swin2t16_256(pretrained, hooks=None): 27 | model = timm.create_model("swinv2_tiny_window16_256", pretrained=pretrained) 28 | 29 | hooks = [1, 1, 5, 1] if hooks == None else hooks 30 | return _make_swin_backbone( 31 | model, 32 | hooks=hooks, 33 | patch_grid=[64, 64] 34 | ) 35 | -------------------------------------------------------------------------------- /models/midas/backbones/swin_common.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | import torch.nn as nn 4 | import numpy as np 5 | 6 | from .utils import activations, forward_default, get_activation, Transpose 7 | 8 | 9 | def forward_swin(pretrained, x): 10 | return forward_default(pretrained, x) 11 | 12 | 13 | def _make_swin_backbone( 14 | model, 15 | hooks=[1, 1, 17, 1], 16 | patch_grid=[96, 96] 17 | ): 18 | pretrained = nn.Module() 19 | 20 | pretrained.model = model 21 | pretrained.model.layers[0].blocks[hooks[0]].register_forward_hook(get_activation("1")) 22 | pretrained.model.layers[1].blocks[hooks[1]].register_forward_hook(get_activation("2")) 23 | pretrained.model.layers[2].blocks[hooks[2]].register_forward_hook(get_activation("3")) 24 | pretrained.model.layers[3].blocks[hooks[3]].register_forward_hook(get_activation("4")) 25 | 26 | pretrained.activations = activations 27 | 28 | if hasattr(model, "patch_grid"): 29 | used_patch_grid = model.patch_grid 30 | else: 31 | used_patch_grid = patch_grid 32 | 33 | patch_grid_size = np.array(used_patch_grid, dtype=int) 34 | 35 | pretrained.act_postprocess1 = nn.Sequential( 36 | Transpose(1, 2), 37 | nn.Unflatten(2, torch.Size(patch_grid_size.tolist())) 38 | ) 39 | pretrained.act_postprocess2 = nn.Sequential( 40 | Transpose(1, 2), 41 | nn.Unflatten(2, torch.Size((patch_grid_size // 2).tolist())) 42 | ) 43 | pretrained.act_postprocess3 = nn.Sequential( 44 | Transpose(1, 2), 45 | nn.Unflatten(2, torch.Size((patch_grid_size // 4).tolist())) 46 | ) 47 | pretrained.act_postprocess4 = nn.Sequential( 48 | Transpose(1, 2), 49 | nn.Unflatten(2, torch.Size((patch_grid_size // 8).tolist())) 50 | ) 51 | 52 | return pretrained 53 | -------------------------------------------------------------------------------- /models/midas/base_model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | class BaseModel(torch.nn.Module): 5 | def load(self, path): 6 | """Load model from file. 7 | 8 | Args: 9 | path (str): file path 10 | """ 11 | parameters = torch.load(path, map_location=torch.device('cpu')) 12 | 13 | if "optimizer" in parameters: 14 | parameters = parameters["model"] 15 | 16 | self.load_state_dict(parameters) 17 | -------------------------------------------------------------------------------- /models/midas/midas_net.py: -------------------------------------------------------------------------------- 1 | """MidashNet: Network for monocular depth estimation trained by mixing several datasets. 2 | This file contains code that is adapted from 3 | https://github.com/thomasjpfan/pytorch_refinenet/blob/master/pytorch_refinenet/refinenet/refinenet_4cascade.py 4 | """ 5 | import torch 6 | import torch.nn as nn 7 | 8 | from .base_model import BaseModel 9 | from .blocks import FeatureFusionBlock, Interpolate, _make_encoder 10 | 11 | 12 | class MidasNet(BaseModel): 13 | """Network for monocular depth estimation. 14 | """ 15 | 16 | def __init__(self, path=None, features=256, non_negative=True): 17 | """Init. 18 | 19 | Args: 20 | path (str, optional): Path to saved model. 
Defaults to None. 21 | features (int, optional): Number of features. Defaults to 256. 22 | backbone (str, optional): Backbone network for encoder. Defaults to resnet50 23 | """ 24 | print("Loading weights: ", path) 25 | 26 | super(MidasNet, self).__init__() 27 | 28 | use_pretrained = False if path is None else True 29 | 30 | self.pretrained, self.scratch = _make_encoder(backbone="resnext101_wsl", features=features, use_pretrained=use_pretrained) 31 | 32 | self.scratch.refinenet4 = FeatureFusionBlock(features) 33 | self.scratch.refinenet3 = FeatureFusionBlock(features) 34 | self.scratch.refinenet2 = FeatureFusionBlock(features) 35 | self.scratch.refinenet1 = FeatureFusionBlock(features) 36 | 37 | self.scratch.output_conv = nn.Sequential( 38 | nn.Conv2d(features, 128, kernel_size=3, stride=1, padding=1), 39 | Interpolate(scale_factor=2, mode="bilinear"), 40 | nn.Conv2d(128, 32, kernel_size=3, stride=1, padding=1), 41 | nn.ReLU(True), 42 | nn.Conv2d(32, 1, kernel_size=1, stride=1, padding=0), 43 | nn.ReLU(True) if non_negative else nn.Identity(), 44 | ) 45 | 46 | if path: 47 | self.load(path) 48 | 49 | def forward(self, x): 50 | """Forward pass. 51 | 52 | Args: 53 | x (tensor): input data (image) 54 | 55 | Returns: 56 | tensor: depth 57 | """ 58 | 59 | layer_1 = self.pretrained.layer1(x) 60 | layer_2 = self.pretrained.layer2(layer_1) 61 | layer_3 = self.pretrained.layer3(layer_2) 62 | layer_4 = self.pretrained.layer4(layer_3) 63 | 64 | layer_1_rn = self.scratch.layer1_rn(layer_1) 65 | layer_2_rn = self.scratch.layer2_rn(layer_2) 66 | layer_3_rn = self.scratch.layer3_rn(layer_3) 67 | layer_4_rn = self.scratch.layer4_rn(layer_4) 68 | 69 | path_4 = self.scratch.refinenet4(layer_4_rn) 70 | path_3 = self.scratch.refinenet3(path_4, layer_3_rn) 71 | path_2 = self.scratch.refinenet2(path_3, layer_2_rn) 72 | path_1 = self.scratch.refinenet1(path_2, layer_1_rn) 73 | 74 | out = self.scratch.output_conv(path_1) 75 | 76 | return torch.squeeze(out, dim=1) 77 | -------------------------------------------------------------------------------- /models/raft/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/models/raft/__init__.py -------------------------------------------------------------------------------- /models/raft/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .flow_viz import flow_to_image 2 | from .frame_utils import writeFlow 3 | -------------------------------------------------------------------------------- /models/raft/utils/__pycache__/__init__.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/models/raft/utils/__pycache__/__init__.cpython-310.pyc -------------------------------------------------------------------------------- /models/raft/utils/__pycache__/flow_viz.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/models/raft/utils/__pycache__/flow_viz.cpython-310.pyc -------------------------------------------------------------------------------- /models/raft/utils/__pycache__/frame_utils.cpython-310.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/models/raft/utils/__pycache__/frame_utils.cpython-310.pyc -------------------------------------------------------------------------------- /models/raft/utils/__pycache__/utils.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/models/raft/utils/__pycache__/utils.cpython-310.pyc -------------------------------------------------------------------------------- /models/raft/utils/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | import numpy as np 4 | from scipy import interpolate 5 | 6 | 7 | class InputPadder: 8 | """ Pads images such that dimensions are divisible by 8 """ 9 | def __init__(self, dims, mode='sintel'): 10 | self.ht, self.wd = dims[-2:] 11 | pad_ht = (((self.ht // 8) + 1) * 8 - self.ht) % 8 12 | pad_wd = (((self.wd // 8) + 1) * 8 - self.wd) % 8 13 | if mode == 'sintel': 14 | self._pad = [pad_wd//2, pad_wd - pad_wd//2, pad_ht//2, pad_ht - pad_ht//2] 15 | else: 16 | self._pad = [pad_wd//2, pad_wd - pad_wd//2, 0, pad_ht] 17 | 18 | def pad(self, *inputs): 19 | return [F.pad(x, self._pad, mode='replicate') for x in inputs] 20 | 21 | def unpad(self,x): 22 | ht, wd = x.shape[-2:] 23 | c = [self._pad[2], ht-self._pad[3], self._pad[0], wd-self._pad[1]] 24 | return x[..., c[0]:c[1], c[2]:c[3]] 25 | 26 | def forward_interpolate(flow): 27 | flow = flow.detach().cpu().numpy() 28 | dx, dy = flow[0], flow[1] 29 | 30 | ht, wd = dx.shape 31 | x0, y0 = np.meshgrid(np.arange(wd), np.arange(ht)) 32 | 33 | x1 = x0 + dx 34 | y1 = y0 + dy 35 | 36 | x1 = x1.reshape(-1) 37 | y1 = y1.reshape(-1) 38 | dx = dx.reshape(-1) 39 | dy = dy.reshape(-1) 40 | 41 | valid = (x1 > 0) & (x1 < wd) & (y1 > 0) & (y1 < ht) 42 | x1 = x1[valid] 43 | y1 = y1[valid] 44 | dx = dx[valid] 45 | dy = dy[valid] 46 | 47 | flow_x = interpolate.griddata( 48 | (x1, y1), dx, (x0, y0), method='nearest', fill_value=0) 49 | 50 | flow_y = interpolate.griddata( 51 | (x1, y1), dy, (x0, y0), method='nearest', fill_value=0) 52 | 53 | flow = np.stack([flow_x, flow_y], axis=0) 54 | return torch.from_numpy(flow).float() 55 | 56 | 57 | def bilinear_sampler(img, coords, mode='bilinear', mask=False): 58 | """ Wrapper for grid_sample, uses pixel coordinates """ 59 | H, W = img.shape[-2:] 60 | xgrid, ygrid = coords.split([1,1], dim=-1) 61 | xgrid = 2*xgrid/(W-1) - 1 62 | ygrid = 2*ygrid/(H-1) - 1 63 | 64 | grid = torch.cat([xgrid, ygrid], dim=-1) 65 | img = F.grid_sample(img, grid, align_corners=True) 66 | 67 | if mask: 68 | mask = (xgrid > -1) & (ygrid > -1) & (xgrid < 1) & (ygrid < 1) 69 | return img, mask.float() 70 | 71 | return img 72 | 73 | 74 | def coords_grid(batch, ht, wd): 75 | coords = torch.meshgrid(torch.arange(ht), torch.arange(wd)) 76 | coords = torch.stack(coords[::-1], dim=0).float() 77 | return coords[None].repeat(batch, 1, 1, 1) 78 | 79 | 80 | def upflow8(flow, mode='bilinear'): 81 | new_size = (8 * flow.shape[2], 8 * flow.shape[3]) 82 | return 8 * F.interpolate(flow, size=new_size, mode=mode, align_corners=True) 83 | -------------------------------------------------------------------------------- /models/u2net/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/models/u2net/__init__.py -------------------------------------------------------------------------------- /runners/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/runners/__init__.py -------------------------------------------------------------------------------- /runners/instructpix2pix_inference_runner.py: -------------------------------------------------------------------------------- 1 | import os 2 | from PIL import Image 3 | import torch 4 | from diffusers import StableDiffusionInstructPix2PixPipeline, EulerAncestralDiscreteScheduler 5 | 6 | 7 | def instructpix2pix_inference_runner(args): 8 | device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu" 9 | 10 | # Weight dtype 11 | weight_dtype = torch.float32 12 | if args.mixed_precision == "fp16": 13 | weight_dtype = torch.float16 14 | elif args.mixed_precision == "bf16": 15 | weight_dtype = torch.bfloat16 16 | 17 | # Define pipeline 18 | pipeline = StableDiffusionInstructPix2PixPipeline.from_pretrained( 19 | args.instructpix2pix_checkpoint_path, 20 | torch_dtype=weight_dtype, 21 | safety_checker=None) 22 | pipeline.to(device) 23 | pipeline.scheduler = EulerAncestralDiscreteScheduler.from_config(pipeline.scheduler.config) 24 | 25 | # Load the first-frame input 26 | input_list = sorted(os.listdir(args.source_video_frames)) 27 | image = Image.open(os.path.join(args.source_video_frames, input_list[0])) 28 | image = image.resize((args.height, args.width)) 29 | 30 | # Forward 31 | images = pipeline(args.external_guidance, 32 | image=image, 33 | seed=args.seed, 34 | guidance_scale=args.guidance_scale, 35 | negative_prompt=args.negative_prompt, 36 | num_inference_steps=args.num_inference_steps, 37 | image_guidance_scale=args.image_guidance_scale).images[0] 38 | 39 | # Save image 40 | save_path = os.path.join(args.outdir, 'image_editing_results') 41 | os.makedirs(save_path, exist_ok=True) 42 | filename = args.prompt.lower().replace('.', '').replace(' ', '_') 43 | images.save(os.path.join(save_path, f'{filename}.png')) -------------------------------------------------------------------------------- /runners/iterative_warping/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/runners/iterative_warping/__init__.py -------------------------------------------------------------------------------- /runners/iterative_warping/__pycache__/__init__.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/runners/iterative_warping/__pycache__/__init__.cpython-310.pyc -------------------------------------------------------------------------------- /runners/iterative_warping/__pycache__/get_averaged_depths.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/runners/iterative_warping/__pycache__/get_averaged_depths.cpython-310.pyc -------------------------------------------------------------------------------- 
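For reference, `instructpix2pix_inference_runner` above is driven entirely by a flat argparse-style namespace. The sketch below is not part of the repository; it only lists the attributes that runner actually reads, with placeholder values (checkpoint location, frame directory, prompts, sampler settings) that you would substitute for your own setup.

```python
from types import SimpleNamespace

from runners.instructpix2pix_inference_runner import instructpix2pix_inference_runner

# Minimal sketch: every field below is an attribute the runner accesses.
# All concrete values are illustrative placeholders, not project defaults.
args = SimpleNamespace(
    instructpix2pix_checkpoint_path="path/or/hub-id/of/instruct-pix2pix",  # assumed checkpoint location
    mixed_precision="fp16",                      # "fp32" | "fp16" | "bf16"
    source_video_frames="path/to/video_frames",  # the first (sorted) frame is the one edited
    height=512,
    width=512,
    external_guidance="add a hat to the person",  # editing instruction passed to the pipeline
    prompt="add a hat to the person",             # only used to build the output filename
    negative_prompt="",
    seed=42,
    guidance_scale=7.5,
    image_guidance_scale=1.5,
    num_inference_steps=20,
    outdir="outputs",
)

instructpix2pix_inference_runner(args)  # writes outputs/image_editing_results/<prompt>.png
```

In the full pipeline these fields would normally come from the project's own argument parser; the namespace here only documents what this particular runner consumes.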
/runners/iterative_warping/__pycache__/run_flow_extraction.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/runners/iterative_warping/__pycache__/run_flow_extraction.cpython-310.pyc -------------------------------------------------------------------------------- /runners/iterative_warping/__pycache__/run_torch_average_flow_warping.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/runners/iterative_warping/__pycache__/run_torch_average_flow_warping.cpython-310.pyc -------------------------------------------------------------------------------- /runners/iterative_warping/__pycache__/run_warp_with_averaged_flow.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/runners/iterative_warping/__pycache__/run_warp_with_averaged_flow.cpython-310.pyc -------------------------------------------------------------------------------- /runners/iterative_warping/__pycache__/warp_utils.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/runners/iterative_warping/__pycache__/warp_utils.cpython-310.pyc -------------------------------------------------------------------------------- /runners/iterative_warping/get_averaged_depths.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import cv2 3 | import torch 4 | import os 5 | import numpy as np 6 | from tqdm import tqdm 7 | 8 | def load_image(path): 9 | img = cv2.imread(path, cv2.IMREAD_GRAYSCALE) 10 | return torch.from_numpy(img.astype(np.float32) / 255.).unsqueeze(0).unsqueeze(0) 11 | 12 | def get_averaged_depths_main_func(args): 13 | depth_dir = os.path.join(args.outdir, 'iterative_warping', 'depth_maps') 14 | output_dir = os.path.join(args.outdir, 'iterative_warping', 'averaged_depths') 15 | object_mask_dir = os.path.join(args.outdir, 'iterative_warping', 'object_masks') 16 | editing_mask_dir = os.path.join(args.outdir, 'iterative_warping', 'warped_masks') 17 | os.makedirs(output_dir, exist_ok=True) 18 | 19 | # Get sorted lists of all files in each input directory 20 | depth_files = sorted([f for f in os.listdir(depth_dir) if f.endswith('.png') or f.endswith('.jpg')])[:args.n_sample_frames] 21 | object_mask_files = sorted([f for f in os.listdir(object_mask_dir) if f.endswith('.png') or f.endswith('.jpg')])[:args.n_sample_frames] 22 | editing_mask_files = sorted([f for f in os.listdir(editing_mask_dir) if f.endswith('.png') or f.endswith('.jpg')])[:args.n_sample_frames] 23 | 24 | for i, (depth_file, object_mask_file, editing_mask_file) in enumerate(tqdm(zip(depth_files, object_mask_files, editing_mask_files), total=len(depth_files))): 25 | # 1. Load depth map 26 | depth_map = load_image(os.path.join(depth_dir, depth_file)) 27 | _, _, H, W = depth_map.shape 28 | 29 | # 2. 
Load object mask 30 | object_mask = load_image(os.path.join(object_mask_dir, object_mask_file)) 31 | object_mask = cv2.resize(object_mask.squeeze().numpy(), (W, H)) 32 | object_mask = torch.from_numpy(object_mask).unsqueeze(0).unsqueeze(0) 33 | object_mask[object_mask > 0.5] = 1 34 | object_mask[object_mask <= 0.5] = 0 35 | 36 | # 3. Load editing mask 37 | editing_mask = load_image(os.path.join(editing_mask_dir, editing_mask_file)) 38 | editing_mask = cv2.resize(editing_mask.squeeze().numpy(), (W, H)) 39 | editing_mask = torch.from_numpy(editing_mask).unsqueeze(0).unsqueeze(0) 40 | editing_mask[editing_mask > 0.5] = 1 41 | editing_mask[editing_mask <= 0.5] = 0 42 | 43 | # 4. Compute average depth within the object mask 44 | object_masked_depth = object_mask * depth_map 45 | total_depth = object_masked_depth.sum() 46 | num_pixels = object_mask.sum() 47 | average_depth = total_depth / num_pixels 48 | 49 | # 5. Apply average depth to depths within the editing mask 50 | averaged_depth_map = torch.where(editing_mask == 1, average_depth, depth_map) 51 | 52 | # 6. Save the result 53 | output_depth = (averaged_depth_map.squeeze().cpu().numpy() * 255).astype(np.uint8) 54 | output_filename = f'{i:05d}.png' 55 | cv2.imwrite(os.path.join(output_dir, output_filename), output_depth) 56 | -------------------------------------------------------------------------------- /runners/iterative_warping/get_editing_region.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import cv2 3 | import os 4 | 5 | def get_editing_region(src_mask, approximate_mask): 6 | src_mask = src_mask / 255. 7 | approximate_mask = approximate_mask / 255. 8 | editing_region = src_mask * (1 - approximate_mask) 9 | return (editing_region * 255.).astype('uint8') 10 | 11 | def main(args): 12 | # Create output directory if it doesn't exist 13 | os.makedirs(args.output_dir, exist_ok=True) 14 | 15 | # Get list of all files in the source mask directory 16 | src_mask_files = sorted([f for f in os.listdir(args.src_mask_dir) if f.endswith('.png')]) 17 | approx_mask_files = sorted([f for f in os.listdir(args.approx_mask_dir) if f.endswith('.png')]) 18 | 19 | for count, (src_filename, approx_filename) in enumerate(zip(src_mask_files, approx_mask_files)): 20 | # Load source mask 21 | src_mask_path = os.path.join(args.src_mask_dir, src_filename) 22 | src_mask = cv2.imread(src_mask_path) 23 | 24 | # Load approximate mask 25 | approx_mask_path = os.path.join(args.approx_mask_dir, approx_filename) 26 | approximate_mask = cv2.imread(approx_mask_path) 27 | 28 | # Resize masks to the same size 29 | if src_mask.shape != approximate_mask.shape: 30 | height, width = src_mask.shape[:2] 31 | approximate_mask = cv2.resize(approximate_mask, (width, height), interpolation=cv2.INTER_NEAREST) 32 | 33 | # Get editing region 34 | editing_region = get_editing_region(src_mask, approximate_mask) 35 | 36 | # Save editing region mask 37 | output_path = os.path.join(args.output_dir, f'{count:05d}.png') 38 | cv2.imwrite(output_path, editing_region) 39 | 40 | print(f"Progress: {count + 1}") 41 | 42 | if __name__ == "__main__": 43 | parser = argparse.ArgumentParser(description="Generate editing region masks for video frames") 44 | parser.add_argument("--src-mask-dir", type=str, required=True, help="Directory containing source mask frames") 45 | parser.add_argument("--approx-mask-dir", type=str, required=True, help="Directory containing approximate mask frames") 46 | parser.add_argument("--output-dir", type=str, 
default="editing_regions", help="Output directory for editing region masks") 47 | 48 | args = parser.parse_args() 49 | main(args) -------------------------------------------------------------------------------- /runners/iterative_warping/raft/LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2020, princeton-vl 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | * Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | * Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | -------------------------------------------------------------------------------- /runners/iterative_warping/raft/README.md: -------------------------------------------------------------------------------- 1 | # RAFT 2 | This repository contains the source code for our paper: 3 | 4 | [RAFT: Recurrent All Pairs Field Transforms for Optical Flow](https://arxiv.org/pdf/2003.12039.pdf)
5 | ECCV 2020
6 | Zachary Teed and Jia Deng
7 | 8 | 9 | 10 | ## Requirements 11 | The code has been tested with PyTorch 1.6 and Cuda 10.1. 12 | ```Shell 13 | conda create --name raft 14 | conda activate raft 15 | conda install pytorch=1.6.0 torchvision=0.7.0 cudatoolkit=10.1 matplotlib tensorboard scipy opencv -c pytorch 16 | ``` 17 | 18 | ## Demos 19 | Pretrained models can be downloaded by running 20 | ```Shell 21 | ./download_models.sh 22 | ``` 23 | or downloaded from [google drive](https://drive.google.com/drive/folders/1sWDsfuZ3Up38EUQt7-JDTT1HcGHuJgvT?usp=sharing) 24 | 25 | You can demo a trained model on a sequence of frames 26 | ```Shell 27 | python demo.py --model=models/raft-things.pth --path=demo-frames 28 | ``` 29 | 30 | ## Required Data 31 | To evaluate/train RAFT, you will need to download the required datasets. 32 | * [FlyingChairs](https://lmb.informatik.uni-freiburg.de/resources/datasets/FlyingChairs.en.html#flyingchairs) 33 | * [FlyingThings3D](https://lmb.informatik.uni-freiburg.de/resources/datasets/SceneFlowDatasets.en.html) 34 | * [Sintel](http://sintel.is.tue.mpg.de/) 35 | * [KITTI](http://www.cvlibs.net/datasets/kitti/eval_scene_flow.php?benchmark=flow) 36 | * [HD1K](http://hci-benchmark.iwr.uni-heidelberg.de/) (optional) 37 | 38 | 39 | By default `datasets.py` will search for the datasets in these locations. You can create symbolic links to wherever the datasets were downloaded in the `datasets` folder 40 | 41 | ```Shell 42 | ├── datasets 43 | ├── Sintel 44 | ├── test 45 | ├── training 46 | ├── KITTI 47 | ├── testing 48 | ├── training 49 | ├── devkit 50 | ├── FlyingChairs_release 51 | ├── data 52 | ├── FlyingThings3D 53 | ├── frames_cleanpass 54 | ├── frames_finalpass 55 | ├── optical_flow 56 | ``` 57 | 58 | ## Evaluation 59 | You can evaluate a trained model using `evaluate.py` 60 | ```Shell 61 | python evaluate.py --model=models/raft-things.pth --dataset=sintel --mixed_precision 62 | ``` 63 | 64 | ## Training 65 | We used the following training schedule in our paper (2 GPUs). Training logs will be written to the `runs` which can be visualized using tensorboard 66 | ```Shell 67 | ./train_standard.sh 68 | ``` 69 | 70 | If you have a RTX GPU, training can be accelerated using mixed precision. You can expect similiar results in this setting (1 GPU) 71 | ```Shell 72 | ./train_mixed.sh 73 | ``` 74 | 75 | ## (Optional) Efficent Implementation 76 | You can optionally use our alternate (efficent) implementation by compiling the provided cuda extension 77 | ```Shell 78 | cd alt_cuda_corr && python setup.py install && cd .. 79 | ``` 80 | and running `demo.py` and `evaluate.py` with the `--alternate_corr` flag Note, this implementation is somewhat slower than all-pairs, but uses significantly less GPU memory during the forward pass. 
81 | -------------------------------------------------------------------------------- /runners/iterative_warping/raft/alt_cuda_corr/correlation.cpp: -------------------------------------------------------------------------------- 1 | #include <torch/extension.h> 2 | #include <vector> 3 | 4 | // CUDA forward declarations 5 | std::vector<torch::Tensor> corr_cuda_forward( 6 | torch::Tensor fmap1, 7 | torch::Tensor fmap2, 8 | torch::Tensor coords, 9 | int radius); 10 | 11 | std::vector<torch::Tensor> corr_cuda_backward( 12 | torch::Tensor fmap1, 13 | torch::Tensor fmap2, 14 | torch::Tensor coords, 15 | torch::Tensor corr_grad, 16 | int radius); 17 | 18 | // C++ interface 19 | #define CHECK_CUDA(x) TORCH_CHECK(x.type().is_cuda(), #x " must be a CUDA tensor") 20 | #define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous") 21 | #define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x) 22 | 23 | std::vector<torch::Tensor> corr_forward( 24 | torch::Tensor fmap1, 25 | torch::Tensor fmap2, 26 | torch::Tensor coords, 27 | int radius) { 28 | CHECK_INPUT(fmap1); 29 | CHECK_INPUT(fmap2); 30 | CHECK_INPUT(coords); 31 | 32 | return corr_cuda_forward(fmap1, fmap2, coords, radius); 33 | } 34 | 35 | 36 | std::vector<torch::Tensor> corr_backward( 37 | torch::Tensor fmap1, 38 | torch::Tensor fmap2, 39 | torch::Tensor coords, 40 | torch::Tensor corr_grad, 41 | int radius) { 42 | CHECK_INPUT(fmap1); 43 | CHECK_INPUT(fmap2); 44 | CHECK_INPUT(coords); 45 | CHECK_INPUT(corr_grad); 46 | 47 | return corr_cuda_backward(fmap1, fmap2, coords, corr_grad, radius); 48 | } 49 | 50 | 51 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 52 | m.def("forward", &corr_forward, "CORR forward"); 53 | m.def("backward", &corr_backward, "CORR backward"); 54 | } -------------------------------------------------------------------------------- /runners/iterative_warping/raft/alt_cuda_corr/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | from torch.utils.cpp_extension import BuildExtension, CUDAExtension 3 | 4 | 5 | setup( 6 | name='correlation', 7 | ext_modules=[ 8 | CUDAExtension('alt_cuda_corr', 9 | sources=['correlation.cpp', 'correlation_kernel.cu'], 10 | extra_compile_args={'cxx': [], 'nvcc': ['-O3']}), 11 | ], 12 | cmdclass={ 13 | 'build_ext': BuildExtension 14 | }) 15 | 16 | -------------------------------------------------------------------------------- /runners/iterative_warping/raft/core/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/runners/iterative_warping/raft/core/__init__.py -------------------------------------------------------------------------------- /runners/iterative_warping/raft/core/__pycache__/__init__.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/runners/iterative_warping/raft/core/__pycache__/__init__.cpython-310.pyc -------------------------------------------------------------------------------- /runners/iterative_warping/raft/core/__pycache__/corr.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/runners/iterative_warping/raft/core/__pycache__/corr.cpython-310.pyc --------------------------------------------------------------------------------
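Once the extension above has been built with its `setup.py`, it is importable under the name given to `CUDAExtension('alt_cuda_corr', ...)`, and the two entry points bound in `PYBIND11_MODULE` become `alt_cuda_corr.forward` / `alt_cuda_corr.backward`. The sketch below only illustrates that calling convention; the channels-last shapes are an assumption about how RAFT's alternate correlation block lays out its feature maps and lookup coordinates, not something specified by the files above.

```python
import torch
import alt_cuda_corr  # module name comes from CUDAExtension('alt_cuda_corr', ...) in setup.py

# Assumed, illustrative layout: contiguous CUDA tensors, feature maps in
# (B, H, W, C) order and one (x, y) lookup coordinate per pixel.
B, H, W, C = 1, 46, 62, 256
fmap1 = torch.randn(B, H, W, C, device="cuda").contiguous()
fmap2 = torch.randn(B, H, W, C, device="cuda").contiguous()
coords = torch.zeros(B, H, W, 2, device="cuda").contiguous()
radius = 4

# The binding returns a std::vector<torch::Tensor>, which pybind11 exposes as a Python list.
outputs = alt_cuda_corr.forward(fmap1, fmap2, coords, radius)
print([o.shape for o in outputs])
```

The `CHECK_INPUT` macros in `correlation.cpp` are why the inputs must be CUDA tensors and contiguous; anything else fails the `TORCH_CHECK` assertions instead of silently mis-reading memory.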
/runners/iterative_warping/raft/core/__pycache__/extractor.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/runners/iterative_warping/raft/core/__pycache__/extractor.cpython-310.pyc -------------------------------------------------------------------------------- /runners/iterative_warping/raft/core/__pycache__/raft.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/runners/iterative_warping/raft/core/__pycache__/raft.cpython-310.pyc -------------------------------------------------------------------------------- /runners/iterative_warping/raft/core/__pycache__/update.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/runners/iterative_warping/raft/core/__pycache__/update.cpython-310.pyc -------------------------------------------------------------------------------- /runners/iterative_warping/raft/core/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/runners/iterative_warping/raft/core/utils/__init__.py -------------------------------------------------------------------------------- /runners/iterative_warping/raft/core/utils/__pycache__/__init__.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/runners/iterative_warping/raft/core/utils/__pycache__/__init__.cpython-310.pyc -------------------------------------------------------------------------------- /runners/iterative_warping/raft/core/utils/__pycache__/flow_viz.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/runners/iterative_warping/raft/core/utils/__pycache__/flow_viz.cpython-310.pyc -------------------------------------------------------------------------------- /runners/iterative_warping/raft/core/utils/__pycache__/utils.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/runners/iterative_warping/raft/core/utils/__pycache__/utils.cpython-310.pyc -------------------------------------------------------------------------------- /runners/iterative_warping/raft/core/utils/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | import numpy as np 4 | from scipy import interpolate 5 | 6 | 7 | class InputPadder: 8 | """ Pads images such that dimensions are divisible by 8 """ 9 | def __init__(self, dims, mode='sintel'): 10 | self.ht, self.wd = dims[-2:] 11 | pad_ht = (((self.ht // 8) + 1) * 8 - self.ht) % 8 12 | pad_wd = (((self.wd // 8) + 1) * 8 - self.wd) % 8 13 | if mode == 'sintel': 14 | self._pad = [pad_wd//2, pad_wd - pad_wd//2, pad_ht//2, pad_ht - pad_ht//2] 15 | else: 16 | self._pad = [pad_wd//2, pad_wd - pad_wd//2, 0, pad_ht] 17 | 
18 | def pad(self, *inputs): 19 | return [F.pad(x, self._pad, mode='replicate') for x in inputs] 20 | 21 | def unpad(self,x): 22 | ht, wd = x.shape[-2:] 23 | c = [self._pad[2], ht-self._pad[3], self._pad[0], wd-self._pad[1]] 24 | return x[..., c[0]:c[1], c[2]:c[3]] 25 | 26 | def forward_interpolate(flow): 27 | flow = flow.detach().cpu().numpy() 28 | dx, dy = flow[0], flow[1] 29 | 30 | ht, wd = dx.shape 31 | x0, y0 = np.meshgrid(np.arange(wd), np.arange(ht)) 32 | 33 | x1 = x0 + dx 34 | y1 = y0 + dy 35 | 36 | x1 = x1.reshape(-1) 37 | y1 = y1.reshape(-1) 38 | dx = dx.reshape(-1) 39 | dy = dy.reshape(-1) 40 | 41 | valid = (x1 > 0) & (x1 < wd) & (y1 > 0) & (y1 < ht) 42 | x1 = x1[valid] 43 | y1 = y1[valid] 44 | dx = dx[valid] 45 | dy = dy[valid] 46 | 47 | flow_x = interpolate.griddata( 48 | (x1, y1), dx, (x0, y0), method='nearest', fill_value=0) 49 | 50 | flow_y = interpolate.griddata( 51 | (x1, y1), dy, (x0, y0), method='nearest', fill_value=0) 52 | 53 | flow = np.stack([flow_x, flow_y], axis=0) 54 | return torch.from_numpy(flow).float() 55 | 56 | 57 | def bilinear_sampler(img, coords, mode='bilinear', mask=False): 58 | """ Wrapper for grid_sample, uses pixel coordinates """ 59 | H, W = img.shape[-2:] 60 | xgrid, ygrid = coords.split([1,1], dim=-1) 61 | xgrid = 2*xgrid/(W-1) - 1 62 | ygrid = 2*ygrid/(H-1) - 1 63 | 64 | grid = torch.cat([xgrid, ygrid], dim=-1) 65 | img = F.grid_sample(img, grid, align_corners=True) 66 | 67 | if mask: 68 | mask = (xgrid > -1) & (ygrid > -1) & (xgrid < 1) & (ygrid < 1) 69 | return img, mask.float() 70 | 71 | return img 72 | 73 | 74 | def coords_grid(batch, ht, wd, device): 75 | coords = torch.meshgrid(torch.arange(ht, device=device), torch.arange(wd, device=device)) 76 | coords = torch.stack(coords[::-1], dim=0).float() 77 | return coords[None].repeat(batch, 1, 1, 1) 78 | 79 | 80 | def upflow8(flow, mode='bilinear'): 81 | new_size = (8 * flow.shape[2], 8 * flow.shape[3]) 82 | return 8 * F.interpolate(flow, size=new_size, mode=mode, align_corners=True) 83 | -------------------------------------------------------------------------------- /runners/iterative_warping/raft/demo.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append('core') 3 | 4 | import argparse 5 | import os 6 | import cv2 7 | import glob 8 | import numpy as np 9 | import torch 10 | from PIL import Image 11 | 12 | from raft import RAFT 13 | from utils import flow_viz 14 | from utils.utils import InputPadder 15 | 16 | 17 | 18 | DEVICE = 'cpu' 19 | 20 | def load_image(imfile): 21 | img = np.array(Image.open(imfile)).astype(np.uint8) 22 | img = torch.from_numpy(img).permute(2, 0, 1).float() 23 | return img[None].to(DEVICE) 24 | 25 | 26 | def viz(img, flo, count): 27 | img = img[0].permute(1,2,0).cpu().numpy() 28 | flo = flo[0].permute(1,2,0).cpu().numpy() 29 | 30 | # map flow to rgb image 31 | flo = flow_viz.flow_to_image(flo) 32 | img_flo = np.concatenate([img, flo], axis=0) 33 | 34 | # import matplotlib.pyplot as plt 35 | # plt.imshow(img_flo / 255.0) 36 | # plt.show() 37 | 38 | cv2.imwrite(f'outputs/visualization/{count:05d}.png', img_flo[:, :, [2,1,0]]) # /255.0 39 | 40 | 41 | def demo(args): 42 | model = torch.nn.DataParallel(RAFT(args)) 43 | model.load_state_dict(torch.load(args.model, map_location='cpu')) 44 | 45 | model = model.module 46 | model.to(DEVICE) 47 | model.eval() 48 | 49 | with torch.no_grad(): 50 | images = glob.glob(os.path.join(args.W, '*.png')) + \ 51 | glob.glob(os.path.join(args.path, '*.jpg')) 52 | 53 | images = 
sorted(images) 54 | count = 0 55 | for imfile1, imfile2 in zip(images[:-1], images[1:]): 56 | image1 = load_image(imfile1) 57 | image2 = load_image(imfile2) 58 | 59 | padder = InputPadder(image1.shape) 60 | image1, image2 = padder.pad(image1, image2) 61 | 62 | flow_low, flow_up = model(image1, image2, iters=20, test_mode=True) 63 | np.save(os.path.join('outputs/optical-flow-up', f'{count:05d}'), flow_up.cpu()) 64 | np.save(os.path.join('outputs/optical-flow-low', f'{count:05d}'), flow_low.cpu()) 65 | viz(image1, flow_up, count) 66 | count += 1 67 | print(f'Progress: {count}/{len(images)}') 68 | 69 | 70 | if __name__ == '__main__': 71 | parser = argparse.ArgumentParser() 72 | parser.add_argument('--model', help="restore checkpoint") 73 | parser.add_argument('--path', help="dataset for evaluation") 74 | parser.add_argument('--small', action='store_true', help='use small model') 75 | parser.add_argument('--mixed_precision', action='store_true', help='use mixed precision') 76 | parser.add_argument('--alternate_corr', action='store_true', help='use efficent correlation implementation') 77 | args = parser.parse_args() 78 | 79 | os.makedirs('outputs/visualization', exist_ok=True) 80 | os.makedirs('outputs/optical-flow-up', exist_ok=True) 81 | os.makedirs('outputs/optical-flow-low', exist_ok=True) 82 | demo(args) 83 | -------------------------------------------------------------------------------- /runners/iterative_warping/raft/download_models.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | wget https://dl.dropboxusercontent.com/s/4j4z58wuv8o0mfz/models.zip 3 | unzip models.zip 4 | -------------------------------------------------------------------------------- /runners/iterative_warping/raft/extract-flow-from-frames.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append('core') 3 | 4 | import argparse 5 | import os 6 | import cv2 7 | import glob 8 | import numpy as np 9 | import torch 10 | from PIL import Image 11 | 12 | from raft import RAFT 13 | from utils import flow_viz 14 | from utils.utils import InputPadder 15 | 16 | 17 | 18 | DEVICE = 'cpu' 19 | 20 | def load_image(imfile): 21 | img = np.array(Image.open(imfile)).astype(np.uint8) 22 | img = torch.from_numpy(img).permute(2, 0, 1).float() 23 | return img[None].to(DEVICE) 24 | 25 | 26 | def viz(img, flo, outdir, index): 27 | img = img[0].permute(1,2,0).cpu().numpy() 28 | flo = flo[0].permute(1,2,0).cpu().numpy() 29 | 30 | # map flow to rgb image 31 | flo = flow_viz.flow_to_image(flo) 32 | img_flo = np.concatenate([img, flo], axis=0) 33 | 34 | # import matplotlib.pyplot as plt 35 | # plt.imshow(img_flo / 255.0) 36 | # plt.show() 37 | 38 | cv2.imwrite(os.path.join(outdir, 'visualization', f'{index:05d}.png'), img_flo[:, :, [2,1,0]]) # /255.0 39 | 40 | 41 | def demo(args): 42 | # 0. Define RAFT model 43 | model = torch.nn.DataParallel(RAFT(args)) 44 | model.load_state_dict(torch.load(args.model, map_location='cpu')) 45 | 46 | model = model.module 47 | model.to(DEVICE) 48 | model.eval() 49 | 50 | # 1. Load in frames path 51 | frames_path = [] 52 | for path in sorted(os.listdir(args.path)): 53 | frames_path.append(os.path.join(args.path, path)) 54 | 55 | 56 | # 2. 
Start extracting optical flows 57 | with torch.no_grad(): 58 | for index in range(len(frames_path)): 59 | if index + 1 < len(frames_path): 60 | image1 = load_image(frames_path[index + 1]) 61 | image2 = load_image(frames_path[index]) 62 | 63 | padder = InputPadder(image1.shape) 64 | image1, image2 = padder.pad(image1, image2) 65 | 66 | flow_low, flow_up = model(image1, image2, iters=20, test_mode=True) 67 | np.save(os.path.join(args.outdir, 'flow-up', f'{index:05d}'), flow_up.cpu()) 68 | np.save(os.path.join(args.outdir, 'flow-low', f'{index:05d}'), flow_low.cpu()) 69 | viz(image1, flow_up, args.outdir, index) 70 | 71 | 72 | if __name__ == '__main__': 73 | parser = argparse.ArgumentParser() 74 | parser.add_argument('--model', default='/Users/liuchang/Desktop/Workspaces/checkpoints/raft/raft-things.pth', help="restore checkpoint") 75 | parser.add_argument('--path', type=str, help='path of video frames') 76 | parser.add_argument('--outdir', type=str, default='outputs', help='output directory') 77 | parser.add_argument('--small', action='store_true', help='use small model') 78 | parser.add_argument('--mixed_precision', action='store_true', help='use mixed precision') 79 | parser.add_argument('--alternate_corr', action='store_true', help='use efficent correlation implementation') 80 | args = parser.parse_args() 81 | 82 | os.makedirs(args.outdir, exist_ok=True) 83 | os.makedirs(os.path.join(args.outdir, 'visualization'), exist_ok=True) 84 | os.makedirs(os.path.join(args.outdir, 'flow-up'), exist_ok=True) 85 | os.makedirs(os.path.join(args.outdir, 'flow-low'), exist_ok=True) 86 | demo(args) 87 | -------------------------------------------------------------------------------- /runners/iterative_warping/raft/extract-flow-from-two-images.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append('core') 3 | 4 | import argparse 5 | import os 6 | import cv2 7 | import glob 8 | import numpy as np 9 | import torch 10 | from PIL import Image 11 | 12 | from raft import RAFT 13 | from utils import flow_viz 14 | from utils.utils import InputPadder 15 | 16 | 17 | 18 | DEVICE = 'cpu' 19 | 20 | def load_image(imfile): 21 | img = np.array(Image.open(imfile)).astype(np.uint8) 22 | img = torch.from_numpy(img).permute(2, 0, 1).float() 23 | return img[None].to(DEVICE) 24 | 25 | 26 | def viz(img, flo): 27 | img = img[0].permute(1,2,0).cpu().numpy() 28 | flo = flo[0].permute(1,2,0).cpu().numpy() 29 | 30 | # map flow to rgb image 31 | flo = flow_viz.flow_to_image(flo) 32 | img_flo = np.concatenate([img, flo], axis=0) 33 | 34 | # import matplotlib.pyplot as plt 35 | # plt.imshow(img_flo / 255.0) 36 | # plt.show() 37 | 38 | cv2.imwrite(f'outputs/visualization.png', img_flo[:, :, [2,1,0]]) # /255.0 39 | 40 | 41 | def demo(args): 42 | model = torch.nn.DataParallel(RAFT(args)) 43 | model.load_state_dict(torch.load(args.model, map_location='cpu')) 44 | 45 | model = model.module 46 | model.to(DEVICE) 47 | model.eval() 48 | 49 | with torch.no_grad(): 50 | image1 = load_image(args.image1) 51 | image2 = load_image(args.image2) 52 | 53 | padder = InputPadder(image1.shape) 54 | image1, image2 = padder.pad(image1, image2) 55 | 56 | flow_low, flow_up = model(image1, image2, iters=20, test_mode=True) 57 | np.save(os.path.join('outputs', 'flow-up'), flow_up.cpu()) 58 | np.save(os.path.join('outputs', 'flow-low'), flow_low.cpu()) 59 | viz(image1, flow_up) 60 | 61 | 62 | if __name__ == '__main__': 63 | parser = argparse.ArgumentParser() 64 | parser.add_argument('--model', 
default='/Users/liuchang/Desktop/Workspaces/checkpoints/raft/raft-things.pth', help="restore checkpoint") 65 | parser.add_argument('--image1', type=str, help='the first image') 66 | parser.add_argument('--image2', type=str, help='the second image') 67 | parser.add_argument('--small', action='store_true', help='use small model') 68 | parser.add_argument('--mixed_precision', action='store_true', help='use mixed precision') 69 | parser.add_argument('--alternate_corr', action='store_true', help='use efficent correlation implementation') 70 | args = parser.parse_args() 71 | 72 | os.makedirs('outputs', exist_ok=True) 73 | demo(args) 74 | -------------------------------------------------------------------------------- /runners/iterative_warping/raft/train_mixed.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | mkdir -p checkpoints 3 | python -u train.py --name raft-chairs --stage chairs --validation chairs --gpus 0 --num_steps 120000 --batch_size 8 --lr 0.00025 --image_size 368 496 --wdecay 0.0001 --mixed_precision 4 | python -u train.py --name raft-things --stage things --validation sintel --restore_ckpt checkpoints/raft-chairs.pth --gpus 0 --num_steps 120000 --batch_size 5 --lr 0.0001 --image_size 400 720 --wdecay 0.0001 --mixed_precision 5 | python -u train.py --name raft-sintel --stage sintel --validation sintel --restore_ckpt checkpoints/raft-things.pth --gpus 0 --num_steps 120000 --batch_size 5 --lr 0.0001 --image_size 368 768 --wdecay 0.00001 --gamma=0.85 --mixed_precision 6 | python -u train.py --name raft-kitti --stage kitti --validation kitti --restore_ckpt checkpoints/raft-sintel.pth --gpus 0 --num_steps 50000 --batch_size 5 --lr 0.0001 --image_size 288 960 --wdecay 0.00001 --gamma=0.85 --mixed_precision 7 | -------------------------------------------------------------------------------- /runners/iterative_warping/raft/train_standard.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | mkdir -p checkpoints 3 | python -u train.py --name raft-chairs --stage chairs --validation chairs --gpus 0 1 --num_steps 100000 --batch_size 10 --lr 0.0004 --image_size 368 496 --wdecay 0.0001 4 | python -u train.py --name raft-things --stage things --validation sintel --restore_ckpt checkpoints/raft-chairs.pth --gpus 0 1 --num_steps 100000 --batch_size 6 --lr 0.000125 --image_size 400 720 --wdecay 0.0001 5 | python -u train.py --name raft-sintel --stage sintel --validation sintel --restore_ckpt checkpoints/raft-things.pth --gpus 0 1 --num_steps 100000 --batch_size 6 --lr 0.000125 --image_size 368 768 --wdecay 0.00001 --gamma=0.85 6 | python -u train.py --name raft-kitti --stage kitti --validation kitti --restore_ckpt checkpoints/raft-sintel.pth --gpus 0 1 --num_steps 50000 --batch_size 6 --lr 0.0001 --image_size 288 960 --wdecay 0.00001 --gamma=0.85 7 | -------------------------------------------------------------------------------- /runners/iterative_warping/run_extract_images_depths.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | from PIL import Image 4 | from tqdm import tqdm 5 | from models.midas.midas import DepthMidas 6 | import torch 7 | 8 | def parse_args(): 9 | parser = argparse.ArgumentParser(description="Extract depth maps from images") 10 | parser.add_argument("--device", default='cuda' if torch.cuda.is_available() else 'cpu', 11 | help="Device to use for computation") 12 | parser.add_argument("--midas_path", 
default='/Users/liuchang/Desktop/Workspaces/checkpoints/dpt_swin2_large_384.pt', 13 | help="Path to MiDaS model") 14 | parser.add_argument("--input_dir", default='inpainted_outputs', 15 | help="Directory containing input images") 16 | parser.add_argument("--output_dir", default='experimental_scripts/output_depth_examples', 17 | help="Directory to save output depth maps") 18 | return parser.parse_args() 19 | 20 | def main(args): 21 | os.makedirs(args.output_dir, exist_ok=True) 22 | 23 | depth_estimator = DepthMidas(model_path=args.midas_path, device=args.device) 24 | 25 | # Get all image files 26 | image_files = [f for f in os.listdir(args.input_dir) if f.lower().endswith(('.png', '.jpg', '.jpeg'))] 27 | 28 | progress_bar = tqdm(total=len(image_files)) 29 | for image_file in image_files: 30 | progress_bar.update(1) 31 | 32 | # Load image 33 | image_path = os.path.join(args.input_dir, image_file) 34 | image = Image.open(image_path).convert('RGB') 35 | 36 | # Estimate depth 37 | depth = depth_estimator.estimate([image])[0] 38 | 39 | # Save depth map 40 | output_path = os.path.join(args.output_dir, f"{image_file}") 41 | depth.save(output_path) 42 | 43 | progress_bar.close() 44 | print("All images processed.") 45 | 46 | if __name__ == "__main__": 47 | args = parse_args() 48 | main(args) -------------------------------------------------------------------------------- /runners/iterative_warping/run_flow_extraction.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append('core') 3 | 4 | import os 5 | import cv2 6 | import numpy as np 7 | import torch 8 | from PIL import Image 9 | 10 | from models.raft.raft import RAFT 11 | from runners.iterative_warping.raft.core.utils.flow_viz import flow_to_image 12 | from runners.iterative_warping.raft.core.utils.utils import InputPadder 13 | 14 | 15 | 16 | device = "cuda" if torch.cuda.is_available() else "cpu" 17 | 18 | def load_image(imfile): 19 | img = np.array(Image.open(imfile).convert('RGB')).astype(np.uint8) 20 | img = cv2.resize(img, (512, 512)) 21 | img = torch.from_numpy(img).permute(2, 0, 1).float() 22 | return img[None].to(device) 23 | 24 | 25 | def viz(img, flo, outdir, index): 26 | img = img[0].permute(1,2,0).cpu().numpy() 27 | flo = flo[0].permute(1,2,0).cpu().numpy() 28 | 29 | # map flow to rgb image 30 | flo = flow_to_image(flo) 31 | img_flo = np.concatenate([img, flo], axis=0) 32 | 33 | # import matplotlib.pyplot as plt 34 | # plt.imshow(img_flo / 255.0) 35 | # plt.show() 36 | 37 | cv2.imwrite(os.path.join(outdir, 'visualization', f'{index:05d}.png'), img_flo[:, :, [2,1,0]]) # /255.0 38 | 39 | 40 | def raft_flow_extraction_runner(args): 41 | # 0. Define RAFT model 42 | model = torch.nn.DataParallel(RAFT()) 43 | model.load_state_dict(torch.load(args.raft_checkpoint_path, map_location='cpu')) 44 | 45 | model = model.module 46 | model.to(device) 47 | model.eval() 48 | os.makedirs(os.path.join(args.outdir, 'iterative_warping', 'optical_flows'), exist_ok=True) 49 | 50 | # 1. Load in frames path 51 | frames_path = [] 52 | for path in sorted(os.listdir(args.source_video_frames)): 53 | frames_path.append(os.path.join(args.source_video_frames, path)) 54 | 55 | 56 | # 2. 
Start extracting optical flows 57 | with torch.no_grad(): 58 | for index in range(min(args.n_sample_frames, len(frames_path))): 59 | if index + 1 < len(frames_path): 60 | image1 = load_image(frames_path[index + 1]) 61 | image2 = load_image(frames_path[index]) 62 | 63 | padder = InputPadder(image1.shape) 64 | image1, image2 = padder.pad(image1, image2) 65 | 66 | flow_low, flow_up = model(image1, image2, iters=20, test_mode=True) 67 | np.save(os.path.join(args.outdir, 'iterative_warping', 'optical_flows', f'{index:05d}'), flow_up.cpu()) 68 | -------------------------------------------------------------------------------- /runners/iterative_warping/run_warp.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import cv2 3 | import torch 4 | import os 5 | import numpy as np 6 | from typing import List 7 | from io import BytesIO 8 | from runners.iterative_warping.warp_utils import optical_flow_warping 9 | 10 | 11 | def images_to_gif_bytes(images: List, duration: int = 1000) -> bytes: 12 | with BytesIO() as output_buffer: 13 | # Save the first image 14 | images[0].save(output_buffer, 15 | format='GIF', 16 | save_all=True, 17 | append_images=images[1:], 18 | duration=duration, 19 | loop=0) # 0 means the GIF will loop indefinitely 20 | 21 | # Get the byte array from the buffer 22 | gif_bytes = output_buffer.getvalue() 23 | 24 | return gif_bytes 25 | 26 | 27 | def save_as_gif(images: List, file_path: str, duration: int = 1000): 28 | with open(file_path, "wb") as f: 29 | f.write(images_to_gif_bytes(images, duration)) 30 | 31 | def warp(init_frame_path, flows): 32 | 33 | # 2. Load in initial frame 34 | init_frame = cv2.imread(init_frame_path) 35 | init_frame = cv2.resize(init_frame, (W, H)) # Resize to make sure that resolution is aligned 36 | init_frame = init_frame / 255.0 37 | init_frame = torch.from_numpy(init_frame).float() 38 | init_frame = init_frame.permute(2, 0, 1).unsqueeze(0) 39 | 40 | # 3. Warping 41 | warped_frames = [] 42 | for index in range(len(optical_flows)): 43 | current_frame = init_frame if index == 0 else warped_frame_tensor 44 | if len(current_frame.shape) == 3: 45 | current_frame = current_frame.unsqueeze(0) 46 | warped_frame_tensor = optical_flow_warping(current_frame, optical_flows[index])[0] 47 | warped_frame = warped_frame_tensor.permute(1, 2, 0).numpy() 48 | warped_frames.append(warped_frame * 255) 49 | cv2.imwrite(os.path.join(args.outdir, f'{index:05d}.png'), warped_frame * 255) 50 | 51 | # TODO: 4. Save gif output 52 | # pil_warped_frames = [] 53 | # for warped_frame in warped_frames: 54 | # pil_warped_frame = Image.fromarray(warped_frame) 55 | # pil_warped_frames.append(pil_warped_frame) 56 | # save_as_gif(pil_warped_frames, os.path.join(args.outdir, 'result.gif')) 57 | 58 | if __name__ == "__main__": 59 | parser = argparse.ArgumentParser() 60 | parser.add_argument('--init-frame', type=str, default='') 61 | parser.add_argument('--optical-flow', type=str, default='') 62 | parser.add_argument('--outdir', type=str, default='warped-outputs') 63 | args = parser.parse_args() 64 | 65 | # 0. Create directories 66 | os.makedirs(args.outdir, exist_ok=True) 67 | 68 | # 1. 
Load in pre-extracted optical flows 69 | optical_flow_paths = os.listdir(args.optical_flow) 70 | optical_flows = [] 71 | for optical_flow_path in optical_flow_paths: 72 | optical_flow = np.load(os.path.join(args.optical_flow, optical_flow_path)) 73 | # optical_flow = cv2.medianBlur(optical_flow, ksize=23) 74 | optical_flow = torch.from_numpy(optical_flow) 75 | optical_flows.append(optical_flow) 76 | _, C, H, W = optical_flows[0].shape 77 | 78 | 79 | warp(init_frame_path=args.init_frame, 80 | flows=optical_flows) -------------------------------------------------------------------------------- /runners/iterative_warping_runner.py: -------------------------------------------------------------------------------- 1 | from runners.iterative_warping.run_warp_with_averaged_flow import iterative_warp_with_averaged_flow 2 | from runners.iterative_warping.get_averaged_depths import get_averaged_depths_main_func 3 | 4 | def iterative_warping_runner(args): 5 | # 1. Get averaged flows 6 | iterative_warp_with_averaged_flow(args) 7 | 8 | # 2. Get averaged depths 9 | get_averaged_depths_main_func(args) 10 | 11 | -------------------------------------------------------------------------------- /runners/midas_depth_estimation_runner.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | from PIL import Image 4 | from tqdm import tqdm 5 | from models.midas.midas import DepthMidas 6 | 7 | device = 'cuda' if torch.cuda.is_available() else 'cpu' 8 | 9 | def midas_depth_estimation_runner(args): 10 | depth_dir = os.path.join(args.outdir, 'iterative_warping', 'depth_maps') 11 | os.makedirs(depth_dir, exist_ok=True) 12 | 13 | depth_estimator = DepthMidas(model_path=args.midas_checkpoint_path, device=device) 14 | 15 | # Get all image files 16 | image_files = [f for f in sorted(os.listdir(args.source_video_frames))[:args.n_sample_frames] if f.lower().endswith(('.png', '.jpg', '.jpeg'))] 17 | 18 | progress_bar = tqdm(total=len(image_files)) 19 | for image_file in image_files: 20 | progress_bar.update(1) 21 | 22 | # Load image 23 | image_path = os.path.join(args.source_video_frames, image_file) 24 | image = Image.open(image_path).convert('RGB') 25 | 26 | # Estimate depth 27 | depth = depth_estimator.estimate([image])[0] 28 | 29 | # Save depth map 30 | output_path = os.path.join(depth_dir, f"{image_file}") 31 | depth.save(output_path) 32 | 33 | progress_bar.close() 34 | 35 | -------------------------------------------------------------------------------- /runners/paint_by_example_inference_runner.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import os 3 | import PIL 4 | import torch 5 | import numpy as np 6 | from diffusers import PaintByExamplePipeline 7 | 8 | device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu" 9 | 10 | def paint_by_example_inference_runner(args): 11 | model_path = args.paint_by_example_checkpoint_path 12 | reference_image_path = args.external_guidance 13 | outdir = args.outdir 14 | height = args.height 15 | width = args.width 16 | 17 | # Create output directory if not existed 18 | os.makedirs(outdir, exist_ok=True) 19 | 20 | # Prepare inputs 21 | image_path = sorted(os.listdir(args.source_video_frames))[0] 22 | mask_path = sorted(os.listdir(args.input_masks))[0] 23 | init_image = PIL.Image.open(os.path.join(args.source_video_frames, image_path)).resize((height, width)) 24 | mask_image = PIL.Image.open(os.path.join(args.input_masks, 
mask_path)).resize((height, width)) 25 | reference_image = PIL.Image.open(reference_image_path).resize((height, width)) 26 | 27 | 28 | # Dilate the mask to ensure that it covers the original object 29 | mask_np = np.array(mask_image) 30 | kernel = np.ones((args.kernel_size, args.kernel_size), np.uint8) 31 | dilated_mask = cv2.dilate(mask_np, kernel, iterations=args.dilation_iteration) 32 | mask_image = PIL.Image.fromarray(dilated_mask) 33 | 34 | 35 | 36 | # Prepare pipeline 37 | torch_dtype = torch.float32 38 | if args.mixed_precision == "fp32": 39 | torch_dtype = torch.float32 40 | elif args.mixed_precision == "fp16": 41 | torch_dtype = torch.float16 42 | elif args.mixed_precision == "bf16": 43 | torch_dtype = torch.bfloat16 44 | pipe = PaintByExamplePipeline.from_pretrained( 45 | model_path, 46 | torch_dtype=torch_dtype, 47 | ) 48 | pipe = pipe.to(device) 49 | 50 | # Send inputs into the pipeline 51 | image = pipe(image=init_image, 52 | mask_image=mask_image, 53 | example_image=reference_image, 54 | guidance_scale=args.guidance_scale, 55 | negative_prompt=args.negative_prompt).images[0] 56 | 57 | # Save image 58 | save_path = os.path.join(outdir, 'image_editing_results') 59 | os.makedirs(save_path, exist_ok=True) 60 | filename = args.prompt.lower().replace('.', '').replace(' ', '_') 61 | image.save(os.path.join(save_path, f'{filename}.png')) 62 | -------------------------------------------------------------------------------- /runners/stable_diffusion_inpaint_inference_runner.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import os 3 | import cv2 4 | import numpy as np 5 | from PIL import Image 6 | from diffusers import StableDiffusionInpaintPipeline 7 | 8 | 9 | 10 | def stable_diffusion_inpaint_inference_runner(args): 11 | device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu" 12 | 13 | # Define weight dtype 14 | if args.mixed_precision == "fp16": 15 | weight_dtype = torch.float16 16 | elif args.mixed_precision == "bf16": 17 | weight_dtype = torch.bfloat16 18 | else: 19 | weight_dtype = torch.float32 20 | 21 | pipe = StableDiffusionInpaintPipeline.from_pretrained( 22 | args.stable_diffusion_inpaint_checkpoint_path, 23 | torch_dtype=weight_dtype, 24 | safety_checker=None, 25 | ) 26 | pipe = pipe.to(device, dtype=weight_dtype) 27 | 28 | # Load the first frame from the source video frames 29 | image = Image.open(os.path.join(args.source_video_frames, sorted(os.listdir(args.source_video_frames))[0])).convert("RGB") 30 | # Load the first mask from the input masks 31 | mask = Image.open(os.path.join(args.input_masks, sorted(os.listdir(args.input_masks))[0])).convert("RGB") 32 | 33 | # Convert mask to numpy array 34 | mask_np = np.array(mask) 35 | 36 | # Create a kernel for dilation 37 | kernel = np.ones((19, 19), np.uint8) 38 | 39 | # Dilate the mask 40 | dilated_mask = cv2.dilate(mask_np, kernel, iterations=9) 41 | 42 | # Convert back to PIL Image 43 | mask_image = Image.fromarray(dilated_mask) 44 | 45 | generator = torch.Generator().manual_seed(args.seed) 46 | 47 | output_image = pipe( 48 | prompt=args.prompt, 49 | image=image, 50 | mask_image=mask_image, 51 | negative_prompt=args.negative_prompt, 52 | num_inference_steps=args.num_inference_steps, 53 | guidance_scale=args.guidance_scale, 54 | height=args.height, 55 | width=args.width, 56 | generator=generator 57 | ).images[0] 58 | 59 | # Save image 60 | save_path = os.path.join(args.outdir, 'image_editing_results') 61 | 
os.makedirs(save_path, exist_ok=True) 62 | filename = args.prompt.replace('.', '').replace(' ', '_') 63 | output_image.save(os.path.join(save_path, f'{filename}.png')) -------------------------------------------------------------------------------- /scripts/extract_youtube_vos_depths.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import argparse 4 | from PIL import Image 5 | from tqdm import tqdm 6 | from models.midas.midas import DepthMidas 7 | 8 | device = 'cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu' 9 | 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument('--midas-path', type=str, default='', help='Path to MiDaS model weights') 12 | parser.add_argument('--dataset-path', type=str, default='input_image_examples', help='Path to input image dataset') 13 | parser.add_argument('--outdir', type=str, default='output_depth_examples', help='Output directory for depth maps') 14 | parser.add_argument('--device', type=str, default=device, help='Device to use for computation') 15 | args = parser.parse_args() 16 | 17 | device = args.device 18 | midas_path = args.midas_path 19 | dataset_path = args.dataset_path 20 | outdir = args.outdir 21 | os.makedirs(outdir, exist_ok=True) 22 | 23 | depth_estimator = DepthMidas(model_path=midas_path, 24 | device=device) 25 | video_paths = os.listdir(dataset_path) 26 | progress_bar = tqdm(total=len(video_paths)) 27 | for video_path in video_paths: 28 | progress_bar.update(1) 29 | os.makedirs(os.path.join(outdir, video_path), exist_ok=True) 30 | frame_paths = os.listdir(os.path.join(dataset_path, video_path)) 31 | frames_pil = [] 32 | frame_name_list = [] 33 | for frame_path in frame_paths: 34 | video_frame = Image.open(os.path.join(dataset_path, video_path, frame_path)) 35 | frames_pil.append(video_frame) 36 | frame_name_list.append(frame_path.split('.')[0]) 37 | depths_pil = depth_estimator.estimate(frames_pil) 38 | for depth, frame_name in zip(depths_pil, frame_name_list): 39 | depth.save(os.path.join(outdir, video_path, frame_name + '.png')) -------------------------------------------------------------------------------- /scripts/run_dilate_mask.py: -------------------------------------------------------------------------------- 1 | import os 2 | import cv2 3 | import numpy as np 4 | from tqdm import tqdm 5 | 6 | def dilate_mask(mask, kernel_size=11, iterations=1): 7 | kernel = np.ones((kernel_size, kernel_size), np.uint8) 8 | dilated_mask = cv2.dilate(mask, kernel, iterations=iterations) 9 | return dilated_mask 10 | 11 | def process_masks(input_folder, output_folder, kernel_size=5, iterations=1): 12 | # Create output folder if it doesn't exist 13 | os.makedirs(output_folder, exist_ok=True) 14 | 15 | # Get all files in the input folder 16 | if input_folder.endswith('.png') or input_folder.endswith('.jpg'): 17 | mask_files = [input_folder] 18 | else: 19 | mask_files = [f for f in os.listdir(input_folder) if f.endswith('.png') or f.endswith('.jpg')] 20 | 21 | for mask_file in tqdm(mask_files, desc="Processing masks"): 22 | # Read the mask 23 | mask_path = os.path.join(input_folder, mask_file) 24 | mask = cv2.imread(mask_path, cv2.IMREAD_GRAYSCALE) 25 | 26 | # Dilate the mask 27 | dilated_mask = dilate_mask(mask, kernel_size, iterations) 28 | 29 | 30 | # Save the dilated mask 31 | output_path = os.path.join(output_folder, mask_file) 32 | cv2.imwrite(output_path, dilated_mask) 33 | 34 | if __name__ == "__main__": 35 | import argparse 36 | 37 | parser = argparse.ArgumentParser(description="Dilate masks and save them in a new 
folder") 38 | parser.add_argument("--input-folder", type=str, required=True, help="Path to the folder containing input masks") 39 | parser.add_argument("--output-folder", type=str, required=True, help="Path to the folder to save dilated masks") 40 | parser.add_argument("--kernel-size", type=int, default=15, help="Kernel size for dilation (default: 15)") 41 | parser.add_argument("--iterations", type=int, default=1, help="Number of dilation iterations (default: 1)") 42 | 43 | args = parser.parse_args() 44 | 45 | process_masks(args.input_folder, args.output_folder, args.kernel_size, args.iterations) 46 | print(f"Dilated masks saved in {args.output_folder}.") 47 | -------------------------------------------------------------------------------- /utils/__pycache__/file_client.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/utils/__pycache__/file_client.cpython-310.pyc -------------------------------------------------------------------------------- /utils/__pycache__/flow_utils.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/utils/__pycache__/flow_utils.cpython-310.pyc -------------------------------------------------------------------------------- /utils/__pycache__/loss_utils.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/utils/__pycache__/loss_utils.cpython-310.pyc -------------------------------------------------------------------------------- /utils/__pycache__/lr_scheduler_utils.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/utils/__pycache__/lr_scheduler_utils.cpython-310.pyc -------------------------------------------------------------------------------- /utils/__pycache__/mask_utils.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/utils/__pycache__/mask_utils.cpython-310.pyc -------------------------------------------------------------------------------- /utils/__pycache__/utils.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/utils/__pycache__/utils.cpython-310.pyc --------------------------------------------------------------------------------
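The mask-dilation helper in `scripts/run_dilate_mask.py` above can also be called directly from Python rather than through its command line. Below is a minimal sketch, assuming the repository root is on `PYTHONPATH` so the script is importable; the paths are placeholders, and the kernel size and iteration count simply mirror the script's own CLI defaults (15 and 1).

```python
import cv2

from scripts.run_dilate_mask import dilate_mask, process_masks

# Dilate every .png/.jpg mask in a folder and write the results to another folder.
process_masks(
    input_folder="path/to/masks",           # placeholder directory of binary masks
    output_folder="path/to/dilated_masks",  # placeholder output directory
    kernel_size=15,
    iterations=1,
)

# Or dilate a single mask array in memory.
mask = cv2.imread("path/to/masks/00000.png", cv2.IMREAD_GRAYSCALE)
dilated = dilate_mask(mask, kernel_size=15, iterations=1)
cv2.imwrite("path/to/dilated_masks/00000.png", dilated)
```

This is the same morphological dilation that several runners above apply inline (for example before Paint-by-Example and Stable Diffusion inpainting) so that the mask fully covers the object being replaced.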