├── .gitignore ├── LICENSE ├── README.md ├── app.py ├── checkpoints └── model-weights-here.txt ├── datasets └── youtube_vos.py ├── inference.py ├── inputs ├── dilated-masks │ └── car-turn.png ├── edited-first-frames │ ├── bear-elephant.png │ └── car-turn-inpainted.png ├── frames │ ├── bear │ │ ├── 00000.jpg │ │ ├── 00001.jpg │ │ ├── 00002.jpg │ │ ├── 00003.jpg │ │ ├── 00004.jpg │ │ ├── 00005.jpg │ │ ├── 00006.jpg │ │ ├── 00007.jpg │ │ ├── 00008.jpg │ │ ├── 00009.jpg │ │ ├── 00010.jpg │ │ ├── 00011.jpg │ │ ├── 00012.jpg │ │ ├── 00013.jpg │ │ ├── 00014.jpg │ │ └── 00015.jpg │ └── car-turn │ │ ├── 00000.jpg │ │ ├── 00001.jpg │ │ ├── 00002.jpg │ │ ├── 00003.jpg │ │ ├── 00004.jpg │ │ ├── 00005.jpg │ │ ├── 00006.jpg │ │ ├── 00007.jpg │ │ ├── 00008.jpg │ │ ├── 00009.jpg │ │ ├── 00010.jpg │ │ ├── 00011.jpg │ │ ├── 00012.jpg │ │ ├── 00013.jpg │ │ ├── 00014.jpg │ │ └── 00015.jpg ├── hand-drawn-sketches │ └── bear-elephant-sketch.png ├── masks │ ├── bear.png │ └── car-turn.png └── reference-images │ └── raccoon.jpg ├── install_conda.sh ├── install_pip.sh ├── models ├── __init__.py ├── anydoor │ ├── .gitignore │ ├── LICENSE.txt │ ├── assets │ │ └── Figures │ │ │ ├── Teaser.png │ │ │ ├── gradio.png │ │ │ └── tryon.png │ ├── cldm │ │ ├── cldm.py │ │ ├── ddim_hacked.py │ │ ├── hack.py │ │ ├── logger.py │ │ └── model.py │ ├── cog.yaml │ ├── configs │ │ ├── anydoor.yaml │ │ ├── datasets.yaml │ │ ├── demo.yaml │ │ └── inference.yaml │ ├── datasets │ │ ├── Preprocess │ │ │ ├── mvimagenet.txt │ │ │ └── uvo_process.py │ │ ├── base.py │ │ ├── data_utils.py │ │ ├── dreambooth.py │ │ ├── dresscode.py │ │ ├── fashiontryon.py │ │ ├── lvis.py │ │ ├── mose.py │ │ ├── mvimagenet.py │ │ ├── saliency_modular.py │ │ ├── sam.py │ │ ├── uvo.py │ │ ├── uvo_val.py │ │ ├── vipseg.py │ │ ├── vitonhd.py │ │ ├── ytb_vis.py │ │ └── ytb_vos.py │ ├── dinov2 │ │ ├── .github │ │ │ └── workflows │ │ │ │ └── lint.yaml │ │ ├── .gitignore │ │ ├── CODE_OF_CONDUCT.md │ │ ├── CONTRIBUTING.md │ │ ├── LICENSE │ │ ├── MODEL_CARD.md │ │ ├── README.md │ │ ├── conda.yaml │ │ ├── dinov2 │ │ │ ├── __init__.py │ │ │ ├── configs │ │ │ │ ├── __init__.py │ │ │ │ ├── eval │ │ │ │ │ ├── vitb14_pretrain.yaml │ │ │ │ │ ├── vitg14_pretrain.yaml │ │ │ │ │ ├── vitl14_pretrain.yaml │ │ │ │ │ └── vits14_pretrain.yaml │ │ │ │ ├── ssl_default_config.yaml │ │ │ │ └── train │ │ │ │ │ ├── vitg14.yaml │ │ │ │ │ ├── vitl14.yaml │ │ │ │ │ └── vitl16_short.yaml │ │ │ ├── data │ │ │ │ ├── __init__.py │ │ │ │ ├── adapters.py │ │ │ │ ├── augmentations.py │ │ │ │ ├── collate.py │ │ │ │ ├── datasets │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── decoders.py │ │ │ │ │ ├── extended.py │ │ │ │ │ ├── image_net.py │ │ │ │ │ └── image_net_22k.py │ │ │ │ ├── loaders.py │ │ │ │ ├── masking.py │ │ │ │ ├── samplers.py │ │ │ │ └── transforms.py │ │ │ ├── distributed │ │ │ │ └── __init__.py │ │ │ ├── eval │ │ │ │ ├── __init__.py │ │ │ │ ├── knn.py │ │ │ │ ├── linear.py │ │ │ │ ├── log_regression.py │ │ │ │ ├── metrics.py │ │ │ │ ├── setup.py │ │ │ │ └── utils.py │ │ │ ├── fsdp │ │ │ │ └── __init__.py │ │ │ ├── layers │ │ │ │ ├── __init__.py │ │ │ │ ├── attention.py │ │ │ │ ├── block.py │ │ │ │ ├── dino_head.py │ │ │ │ ├── drop_path.py │ │ │ │ ├── layer_scale.py │ │ │ │ ├── mlp.py │ │ │ │ ├── patch_embed.py │ │ │ │ └── swiglu_ffn.py │ │ │ ├── logging │ │ │ │ ├── __init__.py │ │ │ │ └── helpers.py │ │ │ ├── loss │ │ │ │ ├── __init__.py │ │ │ │ ├── dino_clstoken_loss.py │ │ │ │ ├── ibot_patch_loss.py │ │ │ │ └── koleo_loss.py │ │ │ ├── models │ │ │ │ ├── __init__.py │ │ │ │ └── 
vision_transformer.py │ │ │ ├── run │ │ │ │ ├── __init__.py │ │ │ │ ├── eval │ │ │ │ │ ├── knn.py │ │ │ │ │ ├── linear.py │ │ │ │ │ └── log_regression.py │ │ │ │ ├── submit.py │ │ │ │ └── train │ │ │ │ │ └── train.py │ │ │ ├── train │ │ │ │ ├── __init__.py │ │ │ │ ├── ssl_meta_arch.py │ │ │ │ └── train.py │ │ │ └── utils │ │ │ │ ├── __init__.py │ │ │ │ ├── cluster.py │ │ │ │ ├── config.py │ │ │ │ ├── dtype.py │ │ │ │ ├── param_groups.py │ │ │ │ └── utils.py │ │ ├── hubconf.py │ │ ├── pyproject.toml │ │ ├── requirements-dev.txt │ │ ├── requirements.txt │ │ ├── scripts │ │ │ └── lint.sh │ │ ├── setup.cfg │ │ └── setup.py │ ├── environment.yaml │ ├── examples │ │ ├── Gradio │ │ │ ├── BG │ │ │ │ ├── 00.png │ │ │ │ ├── 01.png │ │ │ │ ├── 02.png │ │ │ │ ├── 03.png │ │ │ │ ├── 04.jpg │ │ │ │ ├── 04.png │ │ │ │ ├── 06.png │ │ │ │ ├── 07.png │ │ │ │ ├── 08.jpg │ │ │ │ ├── 13.jpg │ │ │ │ ├── 17.jpg │ │ │ │ └── 22.png │ │ │ └── FG │ │ │ │ ├── 00.jpg │ │ │ │ ├── 01.jpg │ │ │ │ ├── 04.jpg │ │ │ │ ├── 06.jpg │ │ │ │ ├── 07.png │ │ │ │ ├── 09.jpg │ │ │ │ ├── 18.png │ │ │ │ ├── 22.jpg │ │ │ │ ├── 25.png │ │ │ │ ├── 28.png │ │ │ │ ├── 33.png │ │ │ │ ├── 36.jpg │ │ │ │ ├── 39.jpg │ │ │ │ ├── 43.jpg │ │ │ │ ├── 44.jpg │ │ │ │ └── 50.jpg │ │ └── TestDreamBooth │ │ │ ├── BG │ │ │ ├── 000000047948_GT.png │ │ │ ├── 000000047948_mask.png │ │ │ ├── 000000309203_GT.png │ │ │ └── 000000309203_mask.png │ │ │ ├── FG │ │ │ ├── 00.png │ │ │ ├── 01.png │ │ │ ├── 02.png │ │ │ └── 03.png │ │ │ └── GEN │ │ │ └── gen_res.png │ ├── iseg │ │ ├── coarse_mask_refine.pth │ │ └── coarse_mask_refine_util.py │ ├── ldm │ │ ├── data │ │ │ ├── __init__.py │ │ │ └── util.py │ │ ├── models │ │ │ ├── autoencoder.py │ │ │ └── diffusion │ │ │ │ ├── __init__.py │ │ │ │ ├── ddim.py │ │ │ │ ├── ddpm.py │ │ │ │ ├── dpm_solver │ │ │ │ ├── __init__.py │ │ │ │ ├── dpm_solver.py │ │ │ │ └── sampler.py │ │ │ │ ├── plms.py │ │ │ │ └── sampling_util.py │ │ ├── modules │ │ │ ├── attention.py │ │ │ ├── diffusionmodules │ │ │ │ ├── __init__.py │ │ │ │ ├── model.py │ │ │ │ ├── openaimodel.py │ │ │ │ ├── upscaling.py │ │ │ │ └── util.py │ │ │ ├── distributions │ │ │ │ ├── __init__.py │ │ │ │ └── distributions.py │ │ │ ├── ema.py │ │ │ ├── encoders │ │ │ │ ├── __init__.py │ │ │ │ └── modules.py │ │ │ ├── image_degradation │ │ │ │ ├── __init__.py │ │ │ │ ├── bsrgan.py │ │ │ │ ├── bsrgan_light.py │ │ │ │ ├── utils │ │ │ │ │ └── test.png │ │ │ │ └── utils_image.py │ │ │ └── midas │ │ │ │ ├── __init__.py │ │ │ │ ├── api.py │ │ │ │ ├── midas │ │ │ │ ├── __init__.py │ │ │ │ ├── base_model.py │ │ │ │ ├── blocks.py │ │ │ │ ├── dpt_depth.py │ │ │ │ ├── midas_net.py │ │ │ │ ├── midas_net_custom.py │ │ │ │ ├── transforms.py │ │ │ │ └── vit.py │ │ │ │ └── utils.py │ │ └── util.py │ ├── predict.py │ ├── readme.md │ ├── requirements.txt │ ├── run_dataset_debug.py │ ├── run_gradio_demo.py │ ├── run_inference.py │ ├── run_train_anydoor.py │ ├── scripts │ │ ├── convert_weight.sh │ │ ├── inference.sh │ │ └── train.sh │ └── tool_add_control_sd21.py ├── canny │ ├── __pycache__ │ │ ├── canny_filter.cpython-310.pyc │ │ ├── filter.cpython-310.pyc │ │ ├── gaussian.cpython-310.pyc │ │ ├── kernels.cpython-310.pyc │ │ └── sobel.cpython-310.pyc │ ├── canny_filter.py │ ├── filter.py │ ├── gaussian.py │ ├── kernels.py │ └── sobel.py ├── controlnet_inpaint │ └── pipeline.py ├── ctrl_adapter │ ├── __init__.py │ ├── adapter_spatial_temporal.py │ ├── controlnet.py │ ├── ctrl_adapter.py │ └── resnet_block_2d.py ├── depth_completion_net │ ├── deformation_net.py │ ├── deformconv.py │ └── 
rfc_net.py ├── i2vgenxl │ ├── i2vgenxl_ctrl_adapter_pipeline.py │ └── i2vgenxl_unet.py ├── midas │ ├── backbones │ │ ├── __pycache__ │ │ │ ├── beit.cpython-310.pyc │ │ │ ├── levit.cpython-310.pyc │ │ │ ├── swin.cpython-310.pyc │ │ │ ├── swin2.cpython-310.pyc │ │ │ ├── swin_common.cpython-310.pyc │ │ │ ├── utils.cpython-310.pyc │ │ │ └── vit.cpython-310.pyc │ │ ├── beit.py │ │ ├── levit.py │ │ ├── next_vit.py │ │ ├── swin.py │ │ ├── swin2.py │ │ ├── swin_common.py │ │ ├── utils.py │ │ └── vit.py │ ├── base_model.py │ ├── blocks.py │ ├── dpt_depth.py │ ├── midas.py │ ├── midas_net.py │ ├── midas_net_custom.py │ ├── model_loader.py │ └── transforms.py ├── raft │ ├── __init__.py │ ├── corr.py │ ├── extractor.py │ ├── raft.py │ ├── update.py │ └── utils │ │ ├── __init__.py │ │ ├── __pycache__ │ │ ├── __init__.cpython-310.pyc │ │ ├── flow_viz.cpython-310.pyc │ │ ├── frame_utils.cpython-310.pyc │ │ └── utils.cpython-310.pyc │ │ ├── augmentor.py │ │ ├── flow_viz.py │ │ ├── flow_viz_pt.py │ │ ├── frame_utils.py │ │ └── utils.py └── u2net │ ├── __init__.py │ └── u2net.py ├── runners ├── __init__.py ├── anydoor_inference_runner.py ├── completion_net_inference_runner.py ├── completion_net_train_runner.py ├── controlnet_inpaint_inference_runner.py ├── i2vgenxl_ctrl_adapter_inference_runner.py ├── instructpix2pix_inference_runner.py ├── iterative_warping │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-310.pyc │ │ ├── get_averaged_depths.cpython-310.pyc │ │ ├── run_flow_extraction.cpython-310.pyc │ │ ├── run_torch_average_flow_warping.cpython-310.pyc │ │ ├── run_warp_with_averaged_flow.cpython-310.pyc │ │ └── warp_utils.cpython-310.pyc │ ├── get_averaged_depths.py │ ├── get_editing_region.py │ ├── raft │ │ ├── LICENSE │ │ ├── README.md │ │ ├── alt_cuda_corr │ │ │ ├── correlation.cpp │ │ │ ├── correlation_kernel.cu │ │ │ └── setup.py │ │ ├── chairs_split.txt │ │ ├── core │ │ │ ├── __init__.py │ │ │ ├── __pycache__ │ │ │ │ ├── __init__.cpython-310.pyc │ │ │ │ ├── corr.cpython-310.pyc │ │ │ │ ├── extractor.cpython-310.pyc │ │ │ │ ├── raft.cpython-310.pyc │ │ │ │ └── update.cpython-310.pyc │ │ │ ├── corr.py │ │ │ ├── datasets.py │ │ │ ├── extractor.py │ │ │ ├── raft.py │ │ │ ├── update.py │ │ │ └── utils │ │ │ │ ├── __init__.py │ │ │ │ ├── __pycache__ │ │ │ │ ├── __init__.cpython-310.pyc │ │ │ │ ├── flow_viz.cpython-310.pyc │ │ │ │ └── utils.cpython-310.pyc │ │ │ │ ├── augmentor.py │ │ │ │ ├── flow_viz.py │ │ │ │ ├── frame_utils.py │ │ │ │ └── utils.py │ │ ├── demo.py │ │ ├── download_models.sh │ │ ├── evaluate.py │ │ ├── extract-flow-from-frames.py │ │ ├── extract-flow-from-two-images.py │ │ ├── train.py │ │ ├── train_mixed.sh │ │ └── train_standard.sh │ ├── run_extract_images_depths.py │ ├── run_flow_extraction.py │ ├── run_numpy_average_flow_warping.py │ ├── run_torch_average_flow_warping.py │ ├── run_warp.py │ ├── run_warp_with_averaged_flow.py │ └── warp_utils.py ├── iterative_warping_runner.py ├── midas_depth_estimation_runner.py ├── paint_by_example_inference_runner.py ├── stable_diffusion_inpaint_inference_runner.py └── u2net_saliency_detection_runner.py ├── scripts ├── extract_youtube_vos_depths.py ├── extract_youtube_vos_shapes.py ├── inference_controlnet_inpaint.py └── run_dilate_mask.py ├── train_completion_net.py └── utils ├── __pycache__ ├── file_client.cpython-310.pyc ├── flow_utils.cpython-310.pyc ├── loss_utils.cpython-310.pyc ├── lr_scheduler_utils.cpython-310.pyc ├── mask_utils.cpython-310.pyc └── utils.cpython-310.pyc ├── file_client.py ├── flow_utils.py ├── loss_utils.py ├── 
lr_scheduler_utils.py ├── mask_utils.py └── utils.py /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 USTC-liuchang 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /checkpoints/model-weights-here.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/checkpoints/model-weights-here.txt -------------------------------------------------------------------------------- /inputs/dilated-masks/car-turn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/inputs/dilated-masks/car-turn.png -------------------------------------------------------------------------------- /inputs/edited-first-frames/bear-elephant.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/inputs/edited-first-frames/bear-elephant.png -------------------------------------------------------------------------------- /inputs/edited-first-frames/car-turn-inpainted.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/inputs/edited-first-frames/car-turn-inpainted.png -------------------------------------------------------------------------------- /inputs/frames/bear/00000.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/inputs/frames/bear/00000.jpg -------------------------------------------------------------------------------- /inputs/frames/bear/00001.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/inputs/frames/bear/00001.jpg -------------------------------------------------------------------------------- /inputs/frames/bear/00002.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/inputs/frames/bear/00002.jpg -------------------------------------------------------------------------------- /inputs/frames/bear/00003.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/inputs/frames/bear/00003.jpg -------------------------------------------------------------------------------- /inputs/frames/bear/00004.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/inputs/frames/bear/00004.jpg -------------------------------------------------------------------------------- /inputs/frames/bear/00005.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/inputs/frames/bear/00005.jpg -------------------------------------------------------------------------------- /inputs/frames/bear/00006.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/inputs/frames/bear/00006.jpg -------------------------------------------------------------------------------- /inputs/frames/bear/00007.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/inputs/frames/bear/00007.jpg -------------------------------------------------------------------------------- /inputs/frames/bear/00008.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/inputs/frames/bear/00008.jpg -------------------------------------------------------------------------------- /inputs/frames/bear/00009.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/inputs/frames/bear/00009.jpg -------------------------------------------------------------------------------- /inputs/frames/bear/00010.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/inputs/frames/bear/00010.jpg -------------------------------------------------------------------------------- /inputs/frames/bear/00011.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/inputs/frames/bear/00011.jpg -------------------------------------------------------------------------------- /inputs/frames/bear/00012.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/inputs/frames/bear/00012.jpg -------------------------------------------------------------------------------- /inputs/frames/bear/00013.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/inputs/frames/bear/00013.jpg -------------------------------------------------------------------------------- /inputs/frames/bear/00014.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/inputs/frames/bear/00014.jpg -------------------------------------------------------------------------------- /inputs/frames/bear/00015.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/inputs/frames/bear/00015.jpg -------------------------------------------------------------------------------- /inputs/frames/car-turn/00000.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/inputs/frames/car-turn/00000.jpg -------------------------------------------------------------------------------- /inputs/frames/car-turn/00001.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/inputs/frames/car-turn/00001.jpg -------------------------------------------------------------------------------- /inputs/frames/car-turn/00002.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/inputs/frames/car-turn/00002.jpg -------------------------------------------------------------------------------- /inputs/frames/car-turn/00003.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/inputs/frames/car-turn/00003.jpg -------------------------------------------------------------------------------- /inputs/frames/car-turn/00004.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/inputs/frames/car-turn/00004.jpg -------------------------------------------------------------------------------- /inputs/frames/car-turn/00005.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/inputs/frames/car-turn/00005.jpg -------------------------------------------------------------------------------- /inputs/frames/car-turn/00006.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/inputs/frames/car-turn/00006.jpg -------------------------------------------------------------------------------- /inputs/frames/car-turn/00007.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/inputs/frames/car-turn/00007.jpg -------------------------------------------------------------------------------- /inputs/frames/car-turn/00008.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/inputs/frames/car-turn/00008.jpg -------------------------------------------------------------------------------- /inputs/frames/car-turn/00009.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/inputs/frames/car-turn/00009.jpg -------------------------------------------------------------------------------- /inputs/frames/car-turn/00010.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/inputs/frames/car-turn/00010.jpg -------------------------------------------------------------------------------- /inputs/frames/car-turn/00011.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/inputs/frames/car-turn/00011.jpg -------------------------------------------------------------------------------- /inputs/frames/car-turn/00012.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/inputs/frames/car-turn/00012.jpg -------------------------------------------------------------------------------- /inputs/frames/car-turn/00013.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/inputs/frames/car-turn/00013.jpg -------------------------------------------------------------------------------- /inputs/frames/car-turn/00014.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/inputs/frames/car-turn/00014.jpg -------------------------------------------------------------------------------- /inputs/frames/car-turn/00015.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/inputs/frames/car-turn/00015.jpg -------------------------------------------------------------------------------- /inputs/hand-drawn-sketches/bear-elephant-sketch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/inputs/hand-drawn-sketches/bear-elephant-sketch.png -------------------------------------------------------------------------------- /inputs/masks/bear.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/inputs/masks/bear.png -------------------------------------------------------------------------------- /inputs/masks/car-turn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/inputs/masks/car-turn.png 
--------------------------------------------------------------------------------
/inputs/reference-images/raccoon.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/inputs/reference-images/raccoon.jpg
--------------------------------------------------------------------------------
/install_conda.sh:
--------------------------------------------------------------------------------
1 | conda install pytorch==1.13.1 torchvision==0.14.1 torchaudio==0.13.1 pytorch-cuda=11.7 -c pytorch -c nvidia
2 | conda install opencv-python pillow gradio transformers einops scipy matplotlib omegaconf albumentations accelerate huggingface-hub==0.23.5 diffusers==0.27.2 timm==0.6.7
--------------------------------------------------------------------------------
/install_pip.sh:
--------------------------------------------------------------------------------
1 | pip install opencv-python
2 | pip install pillow
3 | pip install torch==1.13.1+cu117 torchvision==0.14.1+cu117 torchaudio==0.13.1 --extra-index-url https://download.pytorch.org/whl/cu117
4 | pip install diffusers==0.27.2
5 | pip install huggingface-hub==0.23.5
6 | pip install transformers
7 | pip install einops
8 | pip install scipy
9 | pip install timm==0.6.7
10 | pip install matplotlib
11 | pip install omegaconf
12 | pip install albumentations
13 | pip install accelerate
14 | pip install gradio
15 |
--------------------------------------------------------------------------------
/models/__init__.py:
--------------------------------------------------------------------------------
1 | # blank
2 |
3 |
--------------------------------------------------------------------------------
/models/anydoor/.gitignore:
--------------------------------------------------------------------------------
1 | .idea/
2 | **/.DS_Store
3 | training/
4 | lightning_logs/
5 | image_log/
6 |
7 | #*.pth
8 | *.pt
9 | *.ckpt
10 | *.safetensors
11 |
12 | gradio_pose2image_private.py
13 | gradio_canny2image_private.py
14 |
15 | # Byte-compiled / optimized / DLL files
16 | __pycache__/
17 | *.py[cod]
18 | *$py.class
19 |
20 | # C extensions
21 | *.so
22 |
23 | # Distribution / packaging
24 | .Python
25 | build/
26 | develop-eggs/
27 | dist/
28 | downloads/
29 | eggs/
30 | .eggs/
31 | lib/
32 | lib64/
33 | parts/
34 | sdist/
35 | var/
36 | wheels/
37 | pip-wheel-metadata/
38 | share/python-wheels/
39 | *.egg-info/
40 | .installed.cfg
41 | *.egg
42 | MANIFEST
43 |
44 | # PyInstaller
45 | # Usually these files are written by a python script from a template
46 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
47 | *.manifest
48 | *.spec
49 |
50 | # Installer logs
51 | pip-log.txt
52 | pip-delete-this-directory.txt
53 |
54 | # Unit test / coverage reports
55 | htmlcov/
56 | .tox/
57 | .nox/
58 | .coverage
59 | .coverage.*
60 | .cache
61 | nosetests.xml
62 | coverage.xml
63 | *.cover
64 | *.py,cover
65 | .hypothesis/
66 | .pytest_cache/
67 |
68 | # Translations
69 | *.mo
70 | *.pot
71 |
72 | # Django stuff:
73 | *.log
74 | local_settings.py
75 | db.sqlite3
76 | db.sqlite3-journal
77 |
78 | # Flask stuff:
79 | instance/
80 | .webassets-cache
81 |
82 | # Scrapy stuff:
83 | .scrapy
84 |
85 | # Sphinx documentation
86 | docs/_build/
87 |
88 | # PyBuilder
89 | target/
90 |
91 | # Jupyter Notebook
92 | .ipynb_checkpoints
93 |
94 | # IPython
95 | profile_default/
96 | ipython_config.py
97 |
98 | # pyenv
99 | .python-version
100 |
101 | # pipenv
102 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
103 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
104 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
105 | # install all needed dependencies.
106 | #Pipfile.lock
107 |
108 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
109 | __pypackages__/
110 |
111 | # Celery stuff
112 | celerybeat-schedule
113 | celerybeat.pid
114 |
115 | # SageMath parsed files
116 | *.sage.py
117 |
118 | # Environments
119 | .env
120 | .venv
121 | env/
122 | venv/
123 | ENV/
124 | env.bak/
125 | venv.bak/
126 |
127 | # Spyder project settings
128 | .spyderproject
129 | .spyproject
130 |
131 | # Rope project settings
132 | .ropeproject
133 |
134 | # mkdocs documentation
135 | /site
136 |
137 | # mypy
138 | .mypy_cache/
139 | .dmypy.json
140 | dmypy.json
141 |
142 | # Pyre type checker
143 | .pyre/
144 |
--------------------------------------------------------------------------------
/models/anydoor/LICENSE.txt:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2023 DAMO Vision Intelligence Lab
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/models/anydoor/assets/Figures/Teaser.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/models/anydoor/assets/Figures/Teaser.png
--------------------------------------------------------------------------------
/models/anydoor/assets/Figures/gradio.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/models/anydoor/assets/Figures/gradio.png
--------------------------------------------------------------------------------
/models/anydoor/assets/Figures/tryon.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/models/anydoor/assets/Figures/tryon.png
--------------------------------------------------------------------------------
/models/anydoor/cldm/model.py:
--------------------------------------------------------------------------------
1 | import os
2 | import torch
3 |
4 | from omegaconf import OmegaConf
5 | from models.anydoor.ldm.util import instantiate_from_config
6 |
7 |
8 | def get_state_dict(d):
9 |     return d.get('state_dict', d)
10 |
11 |
12 | def load_state_dict(ckpt_path, location='cpu'):
13 |     _, extension = os.path.splitext(ckpt_path)
14 |     if extension.lower() == ".safetensors":
15 |         import safetensors.torch
16 |         state_dict = safetensors.torch.load_file(ckpt_path, device=location)
17 |     else:
18 |         state_dict = get_state_dict(torch.load(ckpt_path, map_location=torch.device(location)))
19 |     state_dict = get_state_dict(state_dict)
20 |     print(f'Loaded state_dict from [{ckpt_path}]')
21 |     return state_dict
22 |
23 |
24 | def create_model(config_path):
25 |     config = OmegaConf.load(config_path)
26 |     model = instantiate_from_config(config.model).cpu()
27 |     print(f'Loaded model config from [{config_path}]')
28 |     return model
29 |
--------------------------------------------------------------------------------
/models/anydoor/cog.yaml:
--------------------------------------------------------------------------------
1 | # Configuration for Cog ⚙️
2 | # Reference: https://github.com/replicate/cog/blob/main/docs/yaml.md
3 | build:
4 |   gpu: true
5 |   system_packages:
6 |     - "mesa-common-dev"
7 |   python_version: "3.8.5"
8 |   python_packages:
9 |     - "albumentations==1.3.0"
10 |     - "einops==0.3.0"
11 |     - "fvcore==0.1.5.post20221221"
12 |     - "gradio==3.39.0"
13 |     - "numpy==1.23.1"
14 |     - "omegaconf==2.1.1"
15 |     - "open_clip_torch==2.17.1"
16 |     - "opencv_python==4.7.0.72"
17 |     - "opencv_python_headless==4.7.0.72"
18 |     - "Pillow==9.4.0"
19 |     - "pytorch_lightning==1.5.0"
20 |     - "safetensors==0.2.7"
21 |     - "scipy==1.9.1"
22 |     - "setuptools==66.0.0"
23 |     - "share==1.0.4"
24 |     - "submitit==1.5.1"
25 |     - "timm==0.6.12"
26 |     - "torch==2.0.0"
27 |     - "torchmetrics==0.6.0"
28 |     - "tqdm==4.65.0"
29 |     - "transformers==4.19.2"
30 |     - "xformers==0.0.18"
31 |
32 |   run:
33 |     - curl -o /usr/local/bin/pget -L "https://github.com/replicate/pget/releases/download/v0.3.1/pget" && chmod +x /usr/local/bin/pget
34 |
35 | # predict.py defines how predictions are run on your model
36 | predict: "predict.py:Predictor"
--------------------------------------------------------------------------------
/models/anydoor/configs/anydoor.yaml:
--------------------------------------------------------------------------------
1 | model:
2 |   target: models.anydoor.cldm.cldm.ControlLDM
3 |   params:
4 |     linear_start: 0.00085
5 |     linear_end: 0.0120
6 |     num_timesteps_cond: 1
7 |     log_every_t: 200
8 |     timesteps: 1000
9 |     first_stage_key: "jpg"
10 |     cond_stage_key: "ref"
11 |     control_key: "hint"
12 |     image_size: 64
13 |     channels: 4
14 |     cond_stage_trainable: false
15 |     conditioning_key: crossattn
16 |     monitor: val/loss_simple_ema
17 |     scale_factor: 0.18215
18 |     use_ema: False
19 |     only_mid_control: False
20 |
21 |     control_stage_config:
22 |       target: models.anydoor.cldm.cldm.ControlNet
23 |       params:
24 |         use_checkpoint: True
25 |         image_size: 32 # unused
26 |         in_channels: 4
27 |         hint_channels: 4 #3
28 |         model_channels: 320
29 |         attention_resolutions: [ 4, 2, 1 ]
30 |         num_res_blocks: 2
31 |         channel_mult: [ 1, 2, 4, 4 ]
32 |         num_head_channels: 64 # need to fix for flash-attn
33 |         use_spatial_transformer: True
34 |         use_linear_in_transformer: True
35 |         transformer_depth: 1
36 |         context_dim: 1024
37 |         legacy: False
38 |
39 |     unet_config:
40 |       target: models.anydoor.cldm.cldm.ControlledUnetModel
41 |       params:
42 |         use_checkpoint: True
43 |         image_size: 32 # unused
44 |         in_channels: 4
45 |         out_channels: 4
46 |         model_channels: 320
47 |         attention_resolutions: [ 4, 2, 1 ]
48 |         num_res_blocks: 2
49 |         channel_mult: [ 1, 2, 4, 4 ]
50 |         num_head_channels: 64 # need to fix for flash-attn
51 |         use_spatial_transformer: True
52 |         use_linear_in_transformer: True
53 |         transformer_depth: 1
54 |         context_dim: 1024
55 |         legacy: False
56 |
57 |     first_stage_config:
58 |       target: models.anydoor.ldm.models.autoencoder.AutoencoderKL
59 |       params:
60 |         embed_dim: 4
61 |         monitor: val/rec_loss
62 |         ddconfig:
63 |           #attn_type: "vanilla-xformers"
64 |           double_z: true
65 |           z_channels: 4
66 |           resolution: 256
67 |           in_channels: 3
68 |           out_ch: 3
69 |           ch: 128
70 |           ch_mult:
71 |           - 1
72 |           - 2
73 |           - 4
74 |           - 4
75 |           num_res_blocks: 2
76 |           attn_resolutions: []
77 |           dropout: 0.0
78 |         lossconfig:
79 |           target: torch.nn.Identity
80 |
81 |     cond_stage_config:
82 |       target: models.anydoor.ldm.modules.encoders.modules.FrozenDinoV2Encoder
83 |       weight: /path/to/dinov2_vitg14_pretrain.pth
84 |
85 |
86 |
--------------------------------------------------------------------------------
/models/anydoor/configs/datasets.yaml:
--------------------------------------------------------------------------------
1 | Train:
2 |   YoutubeVOS:
3 |     image_dir: path/YTBVOS/train/JPEGImages/
4 |     anno: path/YTBVOS/train/Annotations
5 |     meta: path/YTBVOS/train/meta.json
6 |
7 |   YoutubeVIS:
8 |     image_dir: path/youtubevis/train/JPEGImages/
9 |     anno: path/youtubevis/train/Annotations/
10 |     meta: path/youtubevis/train/meta.json
11 |
12 |   VIPSeg:
13 |     image_dir: path/VIPSeg/VIPSeg_720P/images/
14 |     anno: path/VIPSeg/VIPSeg_720P/panomasksRGB/
15 |
16 |   UVO:
17 |     train:
18 |       image_dir: path/UVO/uvo_frames_sparse
19 |       video_json: path/UVO/UVO_sparse_train_video_with_interpolation.json
20 |       image_json: path/UVO/UVO_sparse_train_video_with_interpolation_reorg.json
21 |     val:
22 |       image_dir: path/UVO/uvo_frames_sparse
23 |       video_json: path/UVO/VideoSparseSet/UVO_sparse_val_video_with_interpolation.json
24 |       image_json: path/UVO/VideoSparseSet/UVO_sparse_val_video_interpolation_reorg.json
25 |
26 |   Mose:
27 |     image_dir: path/MOSE/train/JPEGImages/
28 |     anno: path/MOSE/train/Annotations/
29 |
30 |   MVImageNet:
31 |     txt: ./datasets/Preprocess/mvimagenet.txt
32 |     image_dir: /mnt/workspace/xizhi/data/MVImgNet/
33 |
34 |   VitonHD:
35 |     image_dir: path/TryOn/VitonHD/train/cloth/
36 |
37 |   Dresscode:
38 |     image_dir: /mnt/workspace/xizhi/data/dresscode/DressCode/upper_body/label_maps/
39 |
40 |   FashionTryon:
41 |     image_dir: path/TryOn/FashionTryOn/train
42 |
43 |   Lvis:
44 |     image_dir: path/COCO/train2017
45 |     json_path: path/lvis_v1/lvis_v1_train.json
46 |
47 |   SAM:
48 |     sub1: path/SAM/0000
49 |     sub2: path/SAM/0001
50 |     sub3: path/SAM/0002
51 |     sub4: path/SAM/0004
52 |
53 |   Saliency:
54 |     MSRA_root: path/Saliency/MSRA10K_Imgs_GT/
55 |     TR_root: path/Saliency/DUTS-TR/DUTS-TR-Image/
56 |     TE_root: path/Saliency/DUTS-TE/DUTS-TE-Image/
57 |     HFlickr_root: path/HFlickr/masks/
58 |
59 | Test:
60 |   DreamBooth:
61 |     fg_dir: path/DreamBooth/AnyDoor_DreamBooth
62 |     bg_dir: path/DreamBooth/v1_800
63 |
64 |   VitonHDTest:
65 |     image_dir: path/TryOn/VitonHD/test/cloth
66 |
67 |
68 |
69 |
--------------------------------------------------------------------------------
/models/anydoor/configs/demo.yaml:
--------------------------------------------------------------------------------
1 | pretrained_model: path/epoch=1-step=8687.ckpt
2 | config_file: configs/anydoor.yaml
3 | save_memory: False
4 | use_interactive_seg: True
5 |
--------------------------------------------------------------------------------
/models/anydoor/configs/inference.yaml:
--------------------------------------------------------------------------------
1 | pretrained_model: /Users/liuchang/Desktop/Workspaces/checkpoints/anydoor/epoch=1-step=8687.ckpt
2 | config_file: /Users/liuchang/Desktop/Workspaces/code/shape-consistent-video-editing/iterative-warping/models/anydoor/configs/anydoor.yaml
3 | save_memory: False
4 |
--------------------------------------------------------------------------------
/models/anydoor/datasets/Preprocess/uvo_process.py:
--------------------------------------------------------------------------------
1 | import cv2
2 | import json
3 | import os
4 | from pycocotools import mask as mask_utils
5 | import numpy as np
6 | from tqdm import tqdm
7 |
8 | json_path = 'path/UVO/UVO_sparse_train_video_with_interpolation.json'
9 | output_path = "path/UVO/UVO_sparse_train_video_with_interpolation_reorg.json"
10 |
11 | with open(json_path, 'r') as fcc_file:
12 |     data = json.load(fcc_file)
13 |
14 | info = data['info']
15 | videos = data['videos']
16 | print(len(videos))
17 |
18 |
19 | uvo_dict = {}
20 | for video in tqdm(videos):
21 |     vid = video['id']
22 |     file_names = video['file_names']
23 |     uvo_dict[vid] = file_names
24 |
25 |
26 | with open(output_path,"w") as f:
27 |     json.dump(uvo_dict,f)
28 | print('finish')
29 |
30 |
--------------------------------------------------------------------------------
/models/anydoor/datasets/dreambooth.py:
--------------------------------------------------------------------------------
1 | import json
2 | import cv2
3 | import numpy as np
4 | import os
5 | from torch.utils.data import Dataset
6 | from PIL import Image
7 | import cv2
8 | from .data_utils import *
9 | from .base import BaseDataset
10 |
11 | class DreamBoothDataset(BaseDataset):
12 |     def __init__(self, fg_dir, bg_dir):
13 |         self.bg_dir = bg_dir
14 |         bg_data = os.listdir(self.bg_dir)
15 |         self.bg_data = [i for i in bg_data if 'mask' in i]
16 |         self.image_dir = fg_dir
17 |         self.data = os.listdir(self.image_dir)
18 |         self.size = (512,512)
19 |         self.clip_size = (224,224)
20 |         '''
21 |         Dynamic:
22 |         0: Static View, High Quality
23 |         1: Multi-view, Low Quality
24 |         2: Multi-view, High Quality
25 |         '''
26 |         self.dynamic = 1
27 |
28 |     def __len__(self):
29 |         return
len(self.data) 30 | 31 | def __getitem__(self, idx): 32 | idx = np.random.randint(0, len(self.data)-1) 33 | item = self.get_sample(idx) 34 | return item 35 | 36 | def check_region_size(self, image, yyxx, ratio, mode = 'max'): 37 | pass_flag = True 38 | H,W = image.shape[0], image.shape[1] 39 | H,W = H * ratio, W * ratio 40 | y1,y2,x1,x2 = yyxx 41 | h,w = y2-y1,x2-x1 42 | if mode == 'max': 43 | if h > H and w > W: 44 | pass_flag = False 45 | elif mode == 'min': 46 | if h < H and w < W: 47 | pass_flag = False 48 | return pass_flag 49 | 50 | def get_alpha_mask(self, mask_path): 51 | image = cv2.imread( mask_path, cv2.IMREAD_UNCHANGED) 52 | mask = (image[:,:,-1] > 128).astype(np.uint8) 53 | return mask 54 | 55 | def get_sample(self, idx): 56 | dir_name = self.data[idx] 57 | dir_path = os.path.join(self.image_dir, dir_name) 58 | images = os.listdir(dir_path) 59 | image_name = [i for i in images if '.png' in i][0] 60 | image_path = os.path.join(dir_path, image_name) 61 | 62 | image = cv2.imread( image_path, cv2.IMREAD_UNCHANGED) 63 | mask = (image[:,:,-1] > 128).astype(np.uint8) 64 | image = image[:,:,:-1] 65 | 66 | image = cv2.cvtColor(image.copy(), cv2.COLOR_BGR2RGB) 67 | ref_image = image 68 | ref_mask = mask 69 | ref_image, ref_mask = expand_image_mask(image, mask, ratio=1.4) 70 | bg_idx = np.random.randint(0, len(self.bg_data)-1) 71 | 72 | tar_mask_name = self.bg_data[bg_idx] 73 | tar_mask_path = os.path.join(self.bg_dir, tar_mask_name) 74 | tar_image_path = tar_mask_path.replace('_mask','_GT') 75 | 76 | tar_image = cv2.imread(tar_image_path).astype(np.uint8) 77 | tar_image = cv2.cvtColor(tar_image, cv2.COLOR_BGR2RGB) 78 | tar_mask = (cv2.imread(tar_mask_path) > 128).astype(np.uint8)[:,:,0] 79 | 80 | item_with_collage = self.process_pairs(ref_image, ref_mask, tar_image, tar_mask) 81 | sampled_time_steps = self.sample_timestep() 82 | item_with_collage['time_steps'] = sampled_time_steps 83 | return item_with_collage 84 | 85 | -------------------------------------------------------------------------------- /models/anydoor/datasets/dresscode.py: -------------------------------------------------------------------------------- 1 | import json 2 | import cv2 3 | import numpy as np 4 | import os 5 | from torch.utils.data import Dataset 6 | from PIL import Image 7 | import cv2 8 | from .data_utils import * 9 | from .base import BaseDataset 10 | import albumentations as A 11 | 12 | class DresscodeDataset(BaseDataset): 13 | def __init__(self, image_dir): 14 | self.image_root = image_dir 15 | self.data = os.listdir(self.image_root) 16 | self.size = (512,512) 17 | self.clip_size = (224,224) 18 | self.dynamic = 2 19 | 20 | def __len__(self): 21 | return 20000 22 | 23 | def check_region_size(self, image, yyxx, ratio, mode = 'max'): 24 | pass_flag = True 25 | H,W = image.shape[0], image.shape[1] 26 | H,W = H * ratio, W * ratio 27 | y1,y2,x1,x2 = yyxx 28 | h,w = y2-y1,x2-x1 29 | if mode == 'max': 30 | if h > H and w > W: 31 | pass_flag = False 32 | elif mode == 'min': 33 | if h < H and w < W: 34 | pass_flag = False 35 | return pass_flag 36 | 37 | def get_sample(self, idx): 38 | tar_mask_path = os.path.join(self.image_root, self.data[idx]) 39 | tar_image_path = tar_mask_path.replace('label_maps/','images/').replace('_4.png','_0.jpg') 40 | ref_image_path = tar_mask_path.replace('label_maps/','images/').replace('_4.png','_1.jpg') 41 | 42 | # Read Image and Mask 43 | ref_image = cv2.imread(ref_image_path) 44 | ref_image = cv2.cvtColor(ref_image, cv2.COLOR_BGR2RGB) 45 | 46 | tar_image = 
cv2.imread(tar_image_path) 47 | tar_image = cv2.cvtColor(tar_image, cv2.COLOR_BGR2RGB) 48 | 49 | ref_mask = (ref_image < 240).astype(np.uint8)[:,:,0] 50 | 51 | 52 | tar_mask = Image.open(tar_mask_path ).convert('P') 53 | tar_mask= np.array(tar_mask) 54 | tar_mask = tar_mask == 4 55 | 56 | 57 | item_with_collage = self.process_pairs(ref_image, ref_mask, tar_image, tar_mask, max_ratio = 1.0) 58 | sampled_time_steps = self.sample_timestep() 59 | item_with_collage['time_steps'] = sampled_time_steps 60 | return item_with_collage 61 | 62 | -------------------------------------------------------------------------------- /models/anydoor/datasets/fashiontryon.py: -------------------------------------------------------------------------------- 1 | import json 2 | import cv2 3 | import numpy as np 4 | import os 5 | from torch.utils.data import Dataset 6 | from PIL import Image 7 | import cv2 8 | from .data_utils import * 9 | from .base import BaseDataset 10 | import albumentations as A 11 | 12 | class FashionTryonDataset(BaseDataset): 13 | def __init__(self, image_dir): 14 | self.image_root = image_dir 15 | self.data =os.listdir(self.image_root) 16 | self.size = (512,512) 17 | self.clip_size = (224,224) 18 | self.dynamic = 2 19 | 20 | def __len__(self): 21 | return 5000 22 | 23 | def aug_data(self, image): 24 | transform = A.Compose([ 25 | A.RandomBrightnessContrast(p=0.5), 26 | ]) 27 | transformed = transform(image=image.astype(np.uint8)) 28 | transformed_image = transformed["image"] 29 | return transformed_image 30 | 31 | def check_region_size(self, image, yyxx, ratio, mode = 'max'): 32 | pass_flag = True 33 | H,W = image.shape[0], image.shape[1] 34 | H,W = H * ratio, W * ratio 35 | y1,y2,x1,x2 = yyxx 36 | h,w = y2-y1,x2-x1 37 | if mode == 'max': 38 | if h > H and w > W: 39 | pass_flag = False 40 | elif mode == 'min': 41 | if h < H and w < W: 42 | pass_flag = False 43 | return pass_flag 44 | 45 | def get_sample(self, idx): 46 | cloth_dir = os.path.join(self.image_root, self.data[idx]) 47 | ref_image_path = os.path.join(cloth_dir, 'target.jpg') 48 | 49 | ref_image = cv2.imread(ref_image_path) 50 | ref_image = cv2.cvtColor(ref_image.copy(), cv2.COLOR_BGR2RGB) 51 | 52 | ref_mask_path = os.path.join(cloth_dir,'mask.jpg') 53 | ref_mask = cv2.imread(ref_mask_path)[:,:,0] > 128 54 | 55 | target_dirs = [i for i in os.listdir(cloth_dir ) if '.jpg' not in i] 56 | target_dir_name = np.random.choice(target_dirs) 57 | 58 | target_image_path = os.path.join(cloth_dir, target_dir_name + '.jpg') 59 | target_image= cv2.imread(target_image_path) 60 | tar_image = cv2.cvtColor(target_image.copy(), cv2.COLOR_BGR2RGB) 61 | 62 | target_mask_path = os.path.join(cloth_dir, target_dir_name, 'segment.png') 63 | tar_mask= cv2.imread(target_mask_path)[:,:,0] 64 | target_mask = tar_mask == 7 65 | kernel = np.ones((3, 3), dtype=np.uint8) 66 | tar_mask = cv2.erode(target_mask.astype(np.uint8), kernel, iterations=3) 67 | 68 | item_with_collage = self.process_pairs(ref_image, ref_mask, tar_image, tar_mask, max_ratio = 1.0) 69 | sampled_time_steps = self.sample_timestep() 70 | item_with_collage['time_steps'] = sampled_time_steps 71 | return item_with_collage 72 | 73 | 74 | 75 | -------------------------------------------------------------------------------- /models/anydoor/datasets/lvis.py: -------------------------------------------------------------------------------- 1 | import json 2 | import cv2 3 | import numpy as np 4 | import os 5 | from torch.utils.data import Dataset 6 | from PIL import Image 7 | import cv2 8 | from 
.data_utils import * 9 | from .base import BaseDataset 10 | from pycocotools import mask as mask_utils 11 | from lvis import LVIS 12 | 13 | class LvisDataset(BaseDataset): 14 | def __init__(self, image_dir, json_path): 15 | self.image_dir = image_dir 16 | self.json_path = json_path 17 | lvis_api = LVIS(json_path) 18 | img_ids = sorted(lvis_api.imgs.keys()) 19 | imgs = lvis_api.load_imgs(img_ids) 20 | anns = [lvis_api.img_ann_map[img_id] for img_id in img_ids] 21 | self.data = imgs 22 | self.annos = anns 23 | self.lvis_api = lvis_api 24 | self.size = (512,512) 25 | self.clip_size = (224,224) 26 | self.dynamic = 0 27 | 28 | def register_subset(self, path): 29 | data = os.listdir(path) 30 | data = [ os.path.join(path, i) for i in data if '.json' in i] 31 | self.data = self.data + data 32 | 33 | def get_sample(self, idx): 34 | # ==== get pairs ===== 35 | image_name = self.data[idx]['coco_url'].split('/')[-1] 36 | image_path = os.path.join(self.image_dir, image_name) 37 | image = cv2.imread(image_path) 38 | ref_image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) 39 | 40 | anno = self.annos[idx] 41 | obj_ids = [] 42 | for i in range(len(anno)): 43 | obj = anno[i] 44 | area = obj['area'] 45 | if area > 3600: 46 | obj_ids.append(i) 47 | assert len(anno) > 0 48 | obj_id = np.random.choice(obj_ids) 49 | anno = anno[obj_id] 50 | ref_mask = self.lvis_api.ann_to_mask(anno) 51 | 52 | tar_image, tar_mask = ref_image.copy(), ref_mask.copy() 53 | item_with_collage = self.process_pairs(ref_image, ref_mask, tar_image, tar_mask) 54 | sampled_time_steps = self.sample_timestep() 55 | item_with_collage['time_steps'] = sampled_time_steps 56 | return item_with_collage 57 | 58 | def __len__(self): 59 | return 20000 60 | 61 | def check_region_size(self, image, yyxx, ratio, mode = 'max'): 62 | pass_flag = True 63 | H,W = image.shape[0], image.shape[1] 64 | H,W = H * ratio, W * ratio 65 | y1,y2,x1,x2 = yyxx 66 | h,w = y2-y1,x2-x1 67 | if mode == 'max': 68 | if h > H or w > W: 69 | pass_flag = False 70 | elif mode == 'min': 71 | if h < H or w < W: 72 | pass_flag = False 73 | return pass_flag 74 | 75 | 76 | 77 | 78 | -------------------------------------------------------------------------------- /models/anydoor/datasets/mvimagenet.py: -------------------------------------------------------------------------------- 1 | import json 2 | import cv2 3 | import numpy as np 4 | import os 5 | from torch.utils.data import Dataset 6 | from PIL import Image 7 | import cv2 8 | from .data_utils import * 9 | from .base import BaseDataset 10 | 11 | class MVImageNetDataset(BaseDataset): 12 | def __init__(self, txt, image_dir): 13 | with open(txt,"r") as f: 14 | data = f.read().split('\n')[:-1] 15 | self.image_dir = image_dir 16 | self.data = data 17 | self.size = (512,512) 18 | self.clip_size = (224,224) 19 | self.dynamic = 2 20 | 21 | def __len__(self): 22 | return 40000 23 | 24 | def check_region_size(self, image, yyxx, ratio, mode = 'max'): 25 | pass_flag = True 26 | H,W = image.shape[0], image.shape[1] 27 | H,W = H * ratio, W * ratio 28 | y1,y2,x1,x2 = yyxx 29 | h,w = y2-y1,x2-x1 30 | if mode == 'max': 31 | if h > H and w > W: 32 | pass_flag = False 33 | elif mode == 'min': 34 | if h < H and w < W: 35 | pass_flag = False 36 | return pass_flag 37 | 38 | def get_alpha_mask(self, mask_path): 39 | image = cv2.imread( mask_path, cv2.IMREAD_UNCHANGED) 40 | mask = (image[:,:,-1] > 128).astype(np.uint8) 41 | return mask 42 | 43 | def get_sample(self, idx): 44 | object_dir = self.data[idx].replace('MVDir/', self.image_dir) 45 | frames = 
os.listdir(object_dir) 46 | frames = [ i for i in frames if '.png' in i] 47 | 48 | # Sampling frames 49 | min_interval = len(frames) // 8 50 | start_frame_index = np.random.randint(low=0, high=len(frames) - min_interval) 51 | end_frame_index = start_frame_index + np.random.randint(min_interval, len(frames) - start_frame_index ) 52 | end_frame_index = min(end_frame_index, len(frames) - 1) 53 | 54 | # Get image path 55 | ref_mask_name = frames[start_frame_index] 56 | tar_mask_name = frames[end_frame_index] 57 | 58 | ref_image_name = ref_mask_name.split('_')[0] + '.jpg' 59 | tar_image_name = tar_mask_name.split('_')[0] + '.jpg' 60 | 61 | ref_mask_path = os.path.join(object_dir, ref_mask_name) 62 | tar_mask_path = os.path.join(object_dir, tar_mask_name) 63 | ref_image_path = os.path.join(object_dir, ref_image_name) 64 | tar_image_path = os.path.join(object_dir, tar_image_name) 65 | 66 | # Read Image and Mask 67 | ref_image = cv2.imread(ref_image_path).astype(np.uint8) 68 | ref_image = cv2.cvtColor(ref_image, cv2.COLOR_BGR2RGB) 69 | 70 | tar_image = cv2.imread(tar_image_path).astype(np.uint8) 71 | tar_image = cv2.cvtColor(tar_image, cv2.COLOR_BGR2RGB) 72 | 73 | ref_mask = self.get_alpha_mask(ref_mask_path) 74 | tar_mask = self.get_alpha_mask(tar_mask_path) 75 | 76 | item_with_collage = self.process_pairs(ref_image, ref_mask, tar_image, tar_mask) 77 | sampled_time_steps = self.sample_timestep() 78 | item_with_collage['time_steps'] = sampled_time_steps 79 | 80 | return item_with_collage 81 | 82 | -------------------------------------------------------------------------------- /models/anydoor/datasets/sam.py: -------------------------------------------------------------------------------- 1 | import json 2 | import cv2 3 | import numpy as np 4 | import os 5 | from torch.utils.data import Dataset 6 | from PIL import Image 7 | import cv2 8 | from .data_utils import * 9 | from .base import BaseDataset 10 | from pycocotools import mask as mask_utils 11 | 12 | class SAMDataset(BaseDataset): 13 | def __init__(self, sub1, sub2, sub3, sub4): 14 | image_mask_dict = {} 15 | self.data = [] 16 | self.register_subset(sub1) 17 | self.register_subset(sub2) 18 | self.register_subset(sub3) 19 | self.register_subset(sub4) 20 | self.size = (512,512) 21 | self.clip_size = (224,224) 22 | self.dynamic = 0 23 | 24 | def register_subset(self, path): 25 | data = os.listdir(path) 26 | data = [ os.path.join(path, i) for i in data if '.json' in i] 27 | self.data = self.data + data 28 | 29 | def get_sample(self, idx): 30 | # ==== get pairs ===== 31 | json_path = self.data[idx] 32 | image_path = json_path.replace('.json', '.jpg') 33 | 34 | with open(json_path, 'r') as json_file: 35 | data = json.load(json_file) 36 | annotation = data['annotations'] 37 | 38 | valid_ids = [] 39 | for i in range(len(annotation)): 40 | area = annotation[i]['area'] 41 | if area > 100 * 100 * 5: 42 | valid_ids.append(i) 43 | 44 | chosen_id = np.random.choice(valid_ids) 45 | mask = mask_utils.decode(annotation[chosen_id]["segmentation"] ) 46 | # ====================== 47 | 48 | image = cv2.imread(image_path) 49 | ref_image = cv2.cvtColor(image.copy(), cv2.COLOR_BGR2RGB) 50 | tar_image = ref_image 51 | 52 | ref_mask = mask 53 | tar_mask = mask 54 | item_with_collage = self.process_pairs(ref_image, ref_mask, tar_image, tar_mask) 55 | sampled_time_steps = self.sample_timestep() 56 | item_with_collage['time_steps'] = sampled_time_steps 57 | return item_with_collage 58 | 59 | def __len__(self): 60 | return 20000 61 | 62 | def check_region_size(self, 
image, yyxx, ratio, mode = 'max'): 63 | pass_flag = True 64 | H,W = image.shape[0], image.shape[1] 65 | H,W = H * ratio, W * ratio 66 | y1,y2,x1,x2 = yyxx 67 | h,w = y2-y1,x2-x1 68 | if mode == 'max': 69 | if h > H or w > W: 70 | pass_flag = False 71 | elif mode == 'min': 72 | if h < H or w < W: 73 | pass_flag = False 74 | return pass_flag 75 | 76 | 77 | 78 | 79 | -------------------------------------------------------------------------------- /models/anydoor/datasets/uvo.py: -------------------------------------------------------------------------------- 1 | import json 2 | import cv2 3 | import numpy as np 4 | import os 5 | from torch.utils.data import Dataset 6 | from PIL import Image 7 | import cv2 8 | from .data_utils import * 9 | from .base import BaseDataset 10 | from pycocotools import mask as mask_utils 11 | 12 | class UVODataset(BaseDataset): 13 | def __init__(self, image_dir, video_json, image_json): 14 | json_path = video_json 15 | with open(json_path, 'r') as fcc_file: 16 | data = json.load(fcc_file) 17 | 18 | image_json_path = image_json 19 | with open(image_json_path , 'r') as image_file: 20 | video_dict = json.load(image_file) 21 | 22 | self.image_root = image_dir 23 | self.data = data['annotations'] 24 | self.video_dict = video_dict 25 | self.size = (512,512) 26 | self.clip_size = (224,224) 27 | self.dynamic = 1 28 | 29 | def __len__(self): 30 | return 25000 31 | 32 | def check_region_size(self, image, yyxx, ratio, mode = 'max'): 33 | pass_flag = True 34 | H,W = image.shape[0], image.shape[1] 35 | H,W = H * ratio, W * ratio 36 | y1,y2,x1,x2 = yyxx 37 | h,w = y2-y1,x2-x1 38 | if mode == 'max': 39 | if h > H and w > W: 40 | pass_flag = False 41 | elif mode == 'min': 42 | if h < H and w < W: 43 | pass_flag = False 44 | return pass_flag 45 | 46 | def get_sample(self, idx): 47 | ins_anno = self.data[idx] 48 | video_id = str(ins_anno['video_id']) 49 | video_names = self.video_dict[video_id] 50 | masks = ins_anno['segmentations'] 51 | frames = video_names 52 | 53 | # Sampling frames 54 | min_interval = len(frames) // 10 55 | start_frame_index = np.random.randint(low=0, high=len(frames) - min_interval) 56 | end_frame_index = start_frame_index + np.random.randint(min_interval, len(frames) - start_frame_index ) 57 | end_frame_index = min(end_frame_index, len(frames) - 1) 58 | 59 | # Get image path 60 | ref_image_name = frames[start_frame_index] 61 | tar_image_name = frames[end_frame_index] 62 | ref_image_path = os.path.join(self.image_root, ref_image_name) 63 | tar_image_path = os.path.join(self.image_root, tar_image_name) 64 | 65 | # Read Image and Mask 66 | ref_image = cv2.imread(ref_image_path) 67 | ref_image = cv2.cvtColor(ref_image, cv2.COLOR_BGR2RGB) 68 | 69 | tar_image = cv2.imread(tar_image_path) 70 | tar_image = cv2.cvtColor(tar_image, cv2.COLOR_BGR2RGB) 71 | 72 | ref_mask = mask_utils.decode(masks[start_frame_index]) 73 | tar_mask = mask_utils.decode(masks[end_frame_index]) 74 | 75 | item_with_collage = self.process_pairs(ref_image, ref_mask, tar_image, tar_mask) 76 | sampled_time_steps = self.sample_timestep() 77 | item_with_collage['time_steps'] = sampled_time_steps 78 | return item_with_collage 79 | 80 | -------------------------------------------------------------------------------- /models/anydoor/datasets/uvo_val.py: -------------------------------------------------------------------------------- 1 | import json 2 | import cv2 3 | import numpy as np 4 | import os 5 | from torch.utils.data import Dataset 6 | from PIL import Image 7 | import cv2 8 | from .data_utils 
import *
from .base import BaseDataset
from pycocotools import mask as mask_utils


class UVOValDataset(BaseDataset):
    def __init__(self, image_dir, video_json, image_json):
        json_path = video_json
        with open(json_path, 'r') as fcc_file:
            data = json.load(fcc_file)
        image_json_path = image_json
        with open(image_json_path, 'r') as image_file:
            video_dict = json.load(image_file)
        self.image_root = image_dir
        self.data = data['annotations']
        self.video_dict = video_dict
        self.size = (512, 512)
        self.clip_size = (224, 224)
        self.dynamic = 1

    def __len__(self):
        return 8000

    def __getitem__(self, idx):
        while True:
            idx = np.random.randint(0, len(self.data) - 1)
            try:
                item = self.get_sample(idx)
                return item
            except Exception:
                idx = np.random.randint(0, len(self.data) - 1)

    def check_region_size(self, image, yyxx, ratio, mode='max'):
        pass_flag = True
        H, W = image.shape[0], image.shape[1]
        H, W = H * ratio, W * ratio
        y1, y2, x1, x2 = yyxx
        h, w = y2 - y1, x2 - x1
        if mode == 'max':
            if h > H and w > W:
                pass_flag = False
        elif mode == 'min':
            if h < H and w < W:
                pass_flag = False
        return pass_flag

    def get_sample(self, idx):
        ins_anno = self.data[idx]
        video_id = str(ins_anno['video_id'])

        video_names = self.video_dict[video_id]
        masks = ins_anno['segmentations']
        frames = video_names

        # Sampling frames
        min_interval = len(frames) // 5
        start_frame_index = np.random.randint(low=0, high=len(frames) - min_interval)
        end_frame_index = start_frame_index + np.random.randint(min_interval, len(frames) - start_frame_index)
        end_frame_index = min(end_frame_index, len(frames) - 1)

        # Get image path
        ref_image_name = frames[start_frame_index]
        tar_image_name = frames[end_frame_index]
        ref_image_path = os.path.join(self.image_root, ref_image_name)
        tar_image_path = os.path.join(self.image_root, tar_image_name)

        # Read Image and Mask
        ref_image = cv2.imread(ref_image_path)
        ref_image = cv2.cvtColor(ref_image, cv2.COLOR_BGR2RGB)

        tar_image = cv2.imread(tar_image_path)
        tar_image = cv2.cvtColor(tar_image, cv2.COLOR_BGR2RGB)

        ref_mask = mask_utils.decode(masks[start_frame_index])
        tar_mask = mask_utils.decode(masks[end_frame_index])

        item_with_collage = self.process_pairs(ref_image, ref_mask, tar_image, tar_mask)
        sampled_time_steps = self.sample_timestep()
        item_with_collage['time_steps'] = sampled_time_steps
        return item_with_collage

-------------------------------------------------------------------------------- /models/anydoor/datasets/vitonhd.py: --------------------------------------------------------------------------------

import json
import cv2
import numpy as np
import os
from torch.utils.data import Dataset
from PIL import Image
from .data_utils import *
from .base import BaseDataset
import albumentations as A


class VitonHDDataset(BaseDataset):
    def __init__(self, image_dir):
        self.image_root = image_dir
        self.data = os.listdir(self.image_root)
        self.size = (512, 512)
        self.clip_size = (224, 224)
        self.dynamic = 2

    def __len__(self):
        return 20000

    def check_region_size(self, image, yyxx, ratio, mode='max'):
        pass_flag = True
        H, W = image.shape[0], image.shape[1]
        H, W = H *
ratio, W * ratio 27 | y1,y2,x1,x2 = yyxx 28 | h,w = y2-y1,x2-x1 29 | if mode == 'max': 30 | if h > H and w > W: 31 | pass_flag = False 32 | elif mode == 'min': 33 | if h < H and w < W: 34 | pass_flag = False 35 | return pass_flag 36 | 37 | def get_sample(self, idx): 38 | 39 | ref_image_path = os.path.join(self.image_root, self.data[idx]) 40 | tar_image_path = ref_image_path.replace('/cloth/', '/image/') 41 | ref_mask_path = ref_image_path.replace('/cloth/','/cloth-mask/') 42 | tar_mask_path = ref_image_path.replace('/cloth/', '/image-parse-v3/').replace('.jpg','.png') 43 | 44 | # Read Image and Mask 45 | ref_image = cv2.imread(ref_image_path) 46 | ref_image = cv2.cvtColor(ref_image, cv2.COLOR_BGR2RGB) 47 | 48 | tar_image = cv2.imread(tar_image_path) 49 | tar_image = cv2.cvtColor(tar_image, cv2.COLOR_BGR2RGB) 50 | 51 | ref_mask = (cv2.imread(ref_mask_path) > 128).astype(np.uint8)[:,:,0] 52 | 53 | tar_mask = Image.open(tar_mask_path ).convert('P') 54 | tar_mask= np.array(tar_mask) 55 | tar_mask = tar_mask == 5 56 | 57 | item_with_collage = self.process_pairs(ref_image, ref_mask, tar_image, tar_mask, max_ratio = 1.0) 58 | sampled_time_steps = self.sample_timestep() 59 | item_with_collage['time_steps'] = sampled_time_steps 60 | return item_with_collage 61 | 62 | -------------------------------------------------------------------------------- /models/anydoor/dinov2/.github/workflows/lint.yaml: -------------------------------------------------------------------------------- 1 | name: Lint 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | pull_request: 8 | branches: 9 | - master 10 | - 'gh/**' 11 | 12 | jobs: 13 | run-linters: 14 | name: Run linters 15 | runs-on: ubuntu-20.04 16 | 17 | steps: 18 | - name: Checkout repository 19 | uses: actions/checkout@v3 20 | - name: Set up Python 21 | uses: actions/setup-python@v4 22 | with: 23 | python-version: 3.9 24 | cache: 'pip' 25 | cache-dependency-path: '**/requirements*.txt' 26 | - name: Install Python (development) dependencies 27 | run: | 28 | pip install -r requirements-dev.txt 29 | - name: Run flake8 30 | run: | 31 | flake8 32 | - name: Run black 33 | if: always() 34 | run: | 35 | black --check dinov2 36 | - name: Run pylint 37 | if: always() 38 | run: | 39 | pylint --exit-zero dinov2 40 | -------------------------------------------------------------------------------- /models/anydoor/dinov2/.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | dist/ 3 | *.egg-info/ 4 | **/__pycache__/ 5 | 6 | **/.ipynb_checkpoints 7 | **/.ipynb_checkpoints/** 8 | 9 | **/notebooks 10 | 11 | *.swp 12 | 13 | .vscode/ 14 | -------------------------------------------------------------------------------- /models/anydoor/dinov2/CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to DINOv2 2 | We want to make contributing to this project as easy and transparent as 3 | possible. 4 | 5 | ## Pull Requests 6 | We actively welcome your pull requests. 7 | 8 | 1. Fork the repo and create your branch from `main`. 9 | 2. If you've added code that should be tested, add tests. 10 | 3. If you've changed APIs, update the documentation. 11 | 4. Ensure the test suite passes. 12 | 5. Make sure your code lints. 13 | 6. If you haven't already, complete the Contributor License Agreement ("CLA"). 14 | 15 | ## Contributor License Agreement ("CLA") 16 | In order to accept your pull request, we need you to submit a CLA. 
You only need 17 | to do this once to work on any of Meta's open source projects. 18 | 19 | Complete your CLA here: 20 | 21 | ## Issues 22 | We use GitHub issues to track public bugs. Please ensure your description is 23 | clear and has sufficient instructions to be able to reproduce the issue. 24 | 25 | Meta has a [bounty program](https://www.facebook.com/whitehat/) for the safe 26 | disclosure of security bugs. In those cases, please go through the process 27 | outlined on that page and do not file a public issue. 28 | 29 | ## License 30 | By contributing to DINOv2, you agree that your contributions will be licensed 31 | under the LICENSE file in the root directory of this source tree. 32 | -------------------------------------------------------------------------------- /models/anydoor/dinov2/conda.yaml: -------------------------------------------------------------------------------- 1 | name: dinov2 2 | channels: 3 | - defaults 4 | - pytorch 5 | - nvidia 6 | - xformers 7 | - conda-forge 8 | dependencies: 9 | - python=3.9 10 | - pytorch::pytorch=2.0.0 11 | - pytorch::pytorch-cuda=11.7.0 12 | - pytorch::torchvision=0.15.0 13 | - omegaconf 14 | - torchmetrics=0.10.3 15 | - fvcore 16 | - iopath 17 | - xformers::xformers=0.0.18 18 | - pip 19 | - pip: 20 | - git+https://github.com/facebookincubator/submitit 21 | - --extra-index-url https://pypi.nvidia.com 22 | - cuml-cu11 23 | -------------------------------------------------------------------------------- /models/anydoor/dinov2/dinov2/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | __version__ = "0.0.1" 8 | -------------------------------------------------------------------------------- /models/anydoor/dinov2/dinov2/configs/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
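# Usage sketch (illustrative): the helpers in this module resolve YAML files that live
# next to the package and merge them over the shared ssl_default_config. Assuming the
# bundled layout (e.g. configs/eval/vitl14_pretrain.yaml), a caller might do:
#
#   from dinov2.configs import load_and_merge_config
#   cfg = load_and_merge_config("eval/vitl14_pretrain")
#   print(cfg.student.arch, cfg.crops.global_crops_size)   # vit_large 518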
6 | 7 | import pathlib 8 | 9 | from omegaconf import OmegaConf 10 | 11 | 12 | def load_config(config_name: str): 13 | config_filename = config_name + ".yaml" 14 | return OmegaConf.load(pathlib.Path(__file__).parent.resolve() / config_filename) 15 | 16 | 17 | dinov2_default_config = load_config("ssl_default_config") 18 | 19 | 20 | def load_and_merge_config(config_name: str): 21 | default_config = OmegaConf.create(dinov2_default_config) 22 | loaded_config = load_config(config_name) 23 | return OmegaConf.merge(default_config, loaded_config) 24 | -------------------------------------------------------------------------------- /models/anydoor/dinov2/dinov2/configs/eval/vitb14_pretrain.yaml: -------------------------------------------------------------------------------- 1 | student: 2 | arch: vit_base 3 | patch_size: 14 4 | crops: 5 | global_crops_size: 518 # this is to set up the position embeddings properly 6 | local_crops_size: 98 -------------------------------------------------------------------------------- /models/anydoor/dinov2/dinov2/configs/eval/vitg14_pretrain.yaml: -------------------------------------------------------------------------------- 1 | student: 2 | arch: vit_giant2 3 | patch_size: 14 4 | ffn_layer: swiglufused 5 | crops: 6 | global_crops_size: 518 # this is to set up the position embeddings properly 7 | local_crops_size: 98 -------------------------------------------------------------------------------- /models/anydoor/dinov2/dinov2/configs/eval/vitl14_pretrain.yaml: -------------------------------------------------------------------------------- 1 | student: 2 | arch: vit_large 3 | patch_size: 14 4 | crops: 5 | global_crops_size: 518 # this is to set up the position embeddings properly 6 | local_crops_size: 98 -------------------------------------------------------------------------------- /models/anydoor/dinov2/dinov2/configs/eval/vits14_pretrain.yaml: -------------------------------------------------------------------------------- 1 | student: 2 | arch: vit_small 3 | patch_size: 14 4 | crops: 5 | global_crops_size: 518 # this is to set up the position embeddings properly 6 | local_crops_size: 98 -------------------------------------------------------------------------------- /models/anydoor/dinov2/dinov2/configs/ssl_default_config.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | WEIGHTS: '' 3 | compute_precision: 4 | grad_scaler: true 5 | teacher: 6 | backbone: 7 | sharding_strategy: SHARD_GRAD_OP 8 | mixed_precision: 9 | param_dtype: fp16 10 | reduce_dtype: fp16 11 | buffer_dtype: fp32 12 | dino_head: 13 | sharding_strategy: SHARD_GRAD_OP 14 | mixed_precision: 15 | param_dtype: fp16 16 | reduce_dtype: fp16 17 | buffer_dtype: fp32 18 | ibot_head: 19 | sharding_strategy: SHARD_GRAD_OP 20 | mixed_precision: 21 | param_dtype: fp16 22 | reduce_dtype: fp16 23 | buffer_dtype: fp32 24 | student: 25 | backbone: 26 | sharding_strategy: SHARD_GRAD_OP 27 | mixed_precision: 28 | param_dtype: fp16 29 | reduce_dtype: fp16 30 | buffer_dtype: fp32 31 | dino_head: 32 | sharding_strategy: SHARD_GRAD_OP 33 | mixed_precision: 34 | param_dtype: fp16 35 | reduce_dtype: fp32 36 | buffer_dtype: fp32 37 | ibot_head: 38 | sharding_strategy: SHARD_GRAD_OP 39 | mixed_precision: 40 | param_dtype: fp16 41 | reduce_dtype: fp32 42 | buffer_dtype: fp32 43 | dino: 44 | loss_weight: 1.0 45 | head_n_prototypes: 65536 46 | head_bottleneck_dim: 256 47 | head_nlayers: 3 48 | head_hidden_dim: 2048 49 | koleo_loss_weight: 0.1 50 | ibot: 51 | loss_weight: 
1.0 52 | mask_sample_probability: 0.5 53 | mask_ratio_min_max: 54 | - 0.1 55 | - 0.5 56 | separate_head: false 57 | head_n_prototypes: 65536 58 | head_bottleneck_dim: 256 59 | head_nlayers: 3 60 | head_hidden_dim: 2048 61 | train: 62 | batch_size_per_gpu: 64 63 | dataset_path: ImageNet:split=TRAIN 64 | output_dir: . 65 | saveckp_freq: 20 66 | seed: 0 67 | num_workers: 10 68 | OFFICIAL_EPOCH_LENGTH: 1250 69 | cache_dataset: true 70 | centering: "centering" # or "sinkhorn_knopp" 71 | student: 72 | arch: vit_large 73 | patch_size: 16 74 | drop_path_rate: 0.3 75 | layerscale: 1.0e-05 76 | drop_path_uniform: true 77 | pretrained_weights: '' 78 | ffn_layer: "mlp" 79 | block_chunks: 0 80 | qkv_bias: true 81 | proj_bias: true 82 | ffn_bias: true 83 | teacher: 84 | momentum_teacher: 0.992 85 | final_momentum_teacher: 1 86 | warmup_teacher_temp: 0.04 87 | teacher_temp: 0.07 88 | warmup_teacher_temp_epochs: 30 89 | optim: 90 | epochs: 100 91 | weight_decay: 0.04 92 | weight_decay_end: 0.4 93 | base_lr: 0.004 # learning rate for a batch size of 1024 94 | lr: 0. # will be set after applying scaling rule 95 | warmup_epochs: 10 96 | min_lr: 1.0e-06 97 | clip_grad: 3.0 98 | freeze_last_layer_epochs: 1 99 | scaling_rule: sqrt_wrt_1024 100 | patch_embed_lr_mult: 0.2 101 | layerwise_decay: 0.9 102 | adamw_beta1: 0.9 103 | adamw_beta2: 0.999 104 | crops: 105 | global_crops_scale: 106 | - 0.32 107 | - 1.0 108 | local_crops_number: 8 109 | local_crops_scale: 110 | - 0.05 111 | - 0.32 112 | global_crops_size: 224 113 | local_crops_size: 96 114 | evaluation: 115 | eval_period_iterations: 12500 116 | -------------------------------------------------------------------------------- /models/anydoor/dinov2/dinov2/configs/train/vitg14.yaml: -------------------------------------------------------------------------------- 1 | dino: 2 | head_n_prototypes: 131072 3 | head_bottleneck_dim: 384 4 | ibot: 5 | separate_head: true 6 | head_n_prototypes: 131072 7 | train: 8 | batch_size_per_gpu: 12 9 | dataset_path: ImageNet22k 10 | centering: sinkhorn_knopp 11 | student: 12 | arch: vit_giant2 13 | patch_size: 14 14 | drop_path_rate: 0.4 15 | ffn_layer: swiglufused 16 | block_chunks: 4 17 | teacher: 18 | momentum_teacher: 0.994 19 | optim: 20 | epochs: 500 21 | weight_decay_end: 0.2 22 | base_lr: 2.0e-04 # learning rate for a batch size of 1024 23 | warmup_epochs: 80 24 | layerwise_decay: 1.0 25 | crops: 26 | local_crops_size: 98 -------------------------------------------------------------------------------- /models/anydoor/dinov2/dinov2/configs/train/vitl14.yaml: -------------------------------------------------------------------------------- 1 | dino: 2 | head_n_prototypes: 131072 3 | head_bottleneck_dim: 384 4 | ibot: 5 | separate_head: true 6 | head_n_prototypes: 131072 7 | train: 8 | batch_size_per_gpu: 32 9 | dataset_path: ImageNet22k 10 | centering: sinkhorn_knopp 11 | student: 12 | arch: vit_large 13 | patch_size: 14 14 | drop_path_rate: 0.4 15 | ffn_layer: swiglufused 16 | block_chunks: 4 17 | teacher: 18 | momentum_teacher: 0.994 19 | optim: 20 | epochs: 500 21 | weight_decay_end: 0.2 22 | base_lr: 2.0e-04 # learning rate for a batch size of 1024 23 | warmup_epochs: 80 24 | layerwise_decay: 1.0 25 | crops: 26 | local_crops_size: 98 -------------------------------------------------------------------------------- /models/anydoor/dinov2/dinov2/configs/train/vitl16_short.yaml: -------------------------------------------------------------------------------- 1 | # this corresponds to the default config 2 | train: 3 | 
dataset_path: ImageNet:split=TRAIN 4 | batch_size_per_gpu: 64 5 | student: 6 | block_chunks: 4 7 | -------------------------------------------------------------------------------- /models/anydoor/dinov2/dinov2/data/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | from .adapters import DatasetWithEnumeratedTargets 8 | from .loaders import make_data_loader, make_dataset, SamplerType 9 | from .collate import collate_data_and_cast 10 | from .masking import MaskingGenerator 11 | from .augmentations import DataAugmentationDINO 12 | -------------------------------------------------------------------------------- /models/anydoor/dinov2/dinov2/data/adapters.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | from typing import Any, Tuple 8 | 9 | from torch.utils.data import Dataset 10 | 11 | 12 | class DatasetWithEnumeratedTargets(Dataset): 13 | def __init__(self, dataset): 14 | self._dataset = dataset 15 | 16 | def get_image_data(self, index: int) -> bytes: 17 | return self._dataset.get_image_data(index) 18 | 19 | def get_target(self, index: int) -> Tuple[Any, int]: 20 | target = self._dataset.get_target(index) 21 | return (index, target) 22 | 23 | def get_sample_decoder(self, index: int) -> Any: 24 | return self._dataset.get_sample_decoder(index) 25 | 26 | def __getitem__(self, index: int) -> Tuple[Any, Tuple[Any, int]]: 27 | image, target = self._dataset[index] 28 | target = index if target is None else target 29 | return image, (index, target) 30 | 31 | def __len__(self) -> int: 32 | return len(self._dataset) 33 | -------------------------------------------------------------------------------- /models/anydoor/dinov2/dinov2/data/collate.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
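# Usage sketch (illustrative; names other than collate_data_and_cast / MaskingGenerator
# are assumptions). Each sample is expected to be (crops_dict, target), where crops_dict
# carries the "global_crops" and "local_crops" lists produced by the DINO augmentation.
# With 2 global crops per sample and a 16x16 patch grid (n_tokens = 256):
#
#   collated = collate_data_and_cast(
#       samples_list,
#       mask_ratio_tuple=(0.1, 0.5),
#       mask_probability=0.5,
#       dtype=torch.half,
#       n_tokens=256,
#       mask_generator=MaskingGenerator(input_size=16, max_num_patches=128),
#   )
#   # collated["collated_global_crops"]: (2 * batch_size, 3, H, W), cast to dtype
#   # collated["collated_masks"]:        (2 * batch_size, 256) boolean patch masks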
6 | 7 | import torch 8 | import random 9 | 10 | 11 | def collate_data_and_cast(samples_list, mask_ratio_tuple, mask_probability, dtype, n_tokens=None, mask_generator=None): 12 | # dtype = torch.half # TODO: Remove 13 | 14 | n_global_crops = len(samples_list[0][0]["global_crops"]) 15 | n_local_crops = len(samples_list[0][0]["local_crops"]) 16 | 17 | collated_global_crops = torch.stack([s[0]["global_crops"][i] for i in range(n_global_crops) for s in samples_list]) 18 | 19 | collated_local_crops = torch.stack([s[0]["local_crops"][i] for i in range(n_local_crops) for s in samples_list]) 20 | 21 | B = len(collated_global_crops) 22 | N = n_tokens 23 | n_samples_masked = int(B * mask_probability) 24 | probs = torch.linspace(*mask_ratio_tuple, n_samples_masked + 1) 25 | upperbound = 0 26 | masks_list = [] 27 | for i in range(0, n_samples_masked): 28 | prob_min = probs[i] 29 | prob_max = probs[i + 1] 30 | masks_list.append(torch.BoolTensor(mask_generator(int(N * random.uniform(prob_min, prob_max))))) 31 | upperbound += int(N * prob_max) 32 | for i in range(n_samples_masked, B): 33 | masks_list.append(torch.BoolTensor(mask_generator(0))) 34 | 35 | random.shuffle(masks_list) 36 | 37 | collated_masks = torch.stack(masks_list).flatten(1) 38 | mask_indices_list = collated_masks.flatten().nonzero().flatten() 39 | 40 | masks_weight = (1 / collated_masks.sum(-1).clamp(min=1.0)).unsqueeze(-1).expand_as(collated_masks)[collated_masks] 41 | 42 | return { 43 | "collated_global_crops": collated_global_crops.to(dtype), 44 | "collated_local_crops": collated_local_crops.to(dtype), 45 | "collated_masks": collated_masks, 46 | "mask_indices_list": mask_indices_list, 47 | "masks_weight": masks_weight, 48 | "upperbound": upperbound, 49 | "n_masked_patches": torch.full((1,), fill_value=mask_indices_list.shape[0], dtype=torch.long), 50 | } 51 | -------------------------------------------------------------------------------- /models/anydoor/dinov2/dinov2/data/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | from .image_net import ImageNet 8 | from .image_net_22k import ImageNet22k 9 | -------------------------------------------------------------------------------- /models/anydoor/dinov2/dinov2/data/datasets/decoders.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | from io import BytesIO 8 | from typing import Any, Tuple 9 | 10 | from PIL import Image 11 | 12 | 13 | class Decoder: 14 | def decode(self) -> Any: 15 | raise NotImplementedError 16 | 17 | 18 | class ImageDataDecoder(Decoder): 19 | def __init__(self, image_data: bytes) -> None: 20 | self._image_data = image_data 21 | 22 | def decode(self) -> Image: 23 | f = BytesIO(self._image_data) 24 | return Image.open(f).convert(mode="RGB") 25 | 26 | 27 | class TargetDecoder(Decoder): 28 | def __init__(self, target: Any): 29 | self._target = target 30 | 31 | def decode(self) -> Any: 32 | return self._target 33 | 34 | 35 | class TupleDecoder(Decoder): 36 | def __init__(self, *decoders: Decoder): 37 | self._decoders: Tuple[Decoder, ...] 
= decoders 38 | 39 | def decode(self) -> Any: 40 | return (decoder.decode() for decoder in self._decoders) 41 | -------------------------------------------------------------------------------- /models/anydoor/dinov2/dinov2/data/datasets/extended.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | from typing import Any, Tuple 8 | 9 | from torchvision.datasets import VisionDataset 10 | 11 | from .decoders import Decoder, TargetDecoder, ImageDataDecoder, TupleDecoder 12 | 13 | 14 | class ExtendedVisionDataset(VisionDataset): 15 | def __init__(self, *args, **kwargs) -> None: 16 | super().__init__(*args, **kwargs) # type: ignore 17 | 18 | def get_image_data(self, index: int) -> bytes: 19 | raise NotImplementedError 20 | 21 | def get_target(self, index: int) -> Any: 22 | raise NotImplementedError 23 | 24 | def __getitem__(self, index: int) -> Tuple[Any, Any]: 25 | try: 26 | image_data = self.get_image_data(index) 27 | image = ImageDataDecoder(image_data).decode() 28 | except Exception as e: 29 | raise RuntimeError(f"can not read image for sample {index}") from e 30 | target = self.get_target(index) 31 | target = TargetDecoder(target).decode() 32 | 33 | if self.transforms is not None: 34 | image, target = self.transforms(image, target) 35 | 36 | return image, target 37 | 38 | def get_sample_decoder(self, index: int) -> Decoder: 39 | image_data = self.get_image_data(index) 40 | target = self.get_target(index) 41 | return TupleDecoder( 42 | ImageDataDecoder(image_data), 43 | TargetDecoder(target), 44 | ) 45 | 46 | def __len__(self) -> int: 47 | raise NotImplementedError 48 | -------------------------------------------------------------------------------- /models/anydoor/dinov2/dinov2/data/masking.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
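# Usage sketch (illustrative): for a 224x224 image with 14x14 patches the grid is 16x16,
# so one generator call produces a (16, 16) boolean array covering roughly the requested
# number of patches (never more), built from random rectangles of bounded aspect ratio:
#
#   gen = MaskingGenerator(input_size=16, max_num_patches=128)
#   mask = gen(num_masking_patches=77)   # np.ndarray of shape (16, 16), dtype bool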
6 | 7 | import random 8 | import math 9 | import numpy as np 10 | 11 | 12 | class MaskingGenerator: 13 | def __init__( 14 | self, 15 | input_size, 16 | num_masking_patches=None, 17 | min_num_patches=4, 18 | max_num_patches=None, 19 | min_aspect=0.3, 20 | max_aspect=None, 21 | ): 22 | if not isinstance(input_size, tuple): 23 | input_size = (input_size,) * 2 24 | self.height, self.width = input_size 25 | 26 | self.num_patches = self.height * self.width 27 | self.num_masking_patches = num_masking_patches 28 | 29 | self.min_num_patches = min_num_patches 30 | self.max_num_patches = num_masking_patches if max_num_patches is None else max_num_patches 31 | 32 | max_aspect = max_aspect or 1 / min_aspect 33 | self.log_aspect_ratio = (math.log(min_aspect), math.log(max_aspect)) 34 | 35 | def __repr__(self): 36 | repr_str = "Generator(%d, %d -> [%d ~ %d], max = %d, %.3f ~ %.3f)" % ( 37 | self.height, 38 | self.width, 39 | self.min_num_patches, 40 | self.max_num_patches, 41 | self.num_masking_patches, 42 | self.log_aspect_ratio[0], 43 | self.log_aspect_ratio[1], 44 | ) 45 | return repr_str 46 | 47 | def get_shape(self): 48 | return self.height, self.width 49 | 50 | def _mask(self, mask, max_mask_patches): 51 | delta = 0 52 | for _ in range(10): 53 | target_area = random.uniform(self.min_num_patches, max_mask_patches) 54 | aspect_ratio = math.exp(random.uniform(*self.log_aspect_ratio)) 55 | h = int(round(math.sqrt(target_area * aspect_ratio))) 56 | w = int(round(math.sqrt(target_area / aspect_ratio))) 57 | if w < self.width and h < self.height: 58 | top = random.randint(0, self.height - h) 59 | left = random.randint(0, self.width - w) 60 | 61 | num_masked = mask[top : top + h, left : left + w].sum() 62 | # Overlap 63 | if 0 < h * w - num_masked <= max_mask_patches: 64 | for i in range(top, top + h): 65 | for j in range(left, left + w): 66 | if mask[i, j] == 0: 67 | mask[i, j] = 1 68 | delta += 1 69 | 70 | if delta > 0: 71 | break 72 | return delta 73 | 74 | def __call__(self, num_masking_patches=0): 75 | mask = np.zeros(shape=self.get_shape(), dtype=bool) 76 | mask_count = 0 77 | while mask_count < num_masking_patches: 78 | max_mask_patches = num_masking_patches - mask_count 79 | max_mask_patches = min(max_mask_patches, self.max_num_patches) 80 | 81 | delta = self._mask(mask, max_mask_patches) 82 | if delta == 0: 83 | break 84 | else: 85 | mask_count += delta 86 | 87 | return mask 88 | -------------------------------------------------------------------------------- /models/anydoor/dinov2/dinov2/eval/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | -------------------------------------------------------------------------------- /models/anydoor/dinov2/dinov2/eval/setup.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
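# Usage sketch (illustrative): an eval entry point would typically parse the arguments
# defined below and then build the teacher backbone once, in eval mode on GPU:
#
#   args = get_args_parser(description="DINOv2 eval").parse_args()
#   model, autocast_dtype = setup_and_build_model(args)
#   # autocast_dtype mirrors compute_precision.teacher.backbone.mixed_precision.param_dtype
#   # (fp16 -> torch.half, bf16 -> torch.bfloat16, otherwise torch.float).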
6 | 7 | import argparse 8 | from typing import Any, List, Optional, Tuple 9 | 10 | import torch 11 | import torch.backends.cudnn as cudnn 12 | 13 | from dinov2.models import build_model_from_cfg 14 | from dinov2.utils.config import setup 15 | import dinov2.utils.utils as dinov2_utils 16 | 17 | 18 | def get_args_parser( 19 | description: Optional[str] = None, 20 | parents: Optional[List[argparse.ArgumentParser]] = [], 21 | add_help: bool = True, 22 | ): 23 | parser = argparse.ArgumentParser( 24 | description=description, 25 | parents=parents, 26 | add_help=add_help, 27 | ) 28 | parser.add_argument( 29 | "--config-file", 30 | type=str, 31 | help="Model configuration file", 32 | ) 33 | parser.add_argument( 34 | "--pretrained-weights", 35 | type=str, 36 | help="Pretrained model weights", 37 | ) 38 | parser.add_argument( 39 | "--output-dir", 40 | default="", 41 | type=str, 42 | help="Output directory to write results and logs", 43 | ) 44 | parser.add_argument( 45 | "--opts", 46 | help="Extra configuration options", 47 | default=[], 48 | nargs="+", 49 | ) 50 | return parser 51 | 52 | 53 | def get_autocast_dtype(config): 54 | teacher_dtype_str = config.compute_precision.teacher.backbone.mixed_precision.param_dtype 55 | if teacher_dtype_str == "fp16": 56 | return torch.half 57 | elif teacher_dtype_str == "bf16": 58 | return torch.bfloat16 59 | else: 60 | return torch.float 61 | 62 | 63 | def build_model_for_eval(config, pretrained_weights): 64 | model, _ = build_model_from_cfg(config, only_teacher=True) 65 | dinov2_utils.load_pretrained_weights(model, pretrained_weights, "teacher") 66 | model.eval() 67 | model.cuda() 68 | return model 69 | 70 | 71 | def setup_and_build_model(args) -> Tuple[Any, torch.dtype]: 72 | cudnn.benchmark = True 73 | config = setup(args) 74 | model = build_model_for_eval(config, args.pretrained_weights) 75 | autocast_dtype = get_autocast_dtype(config) 76 | return model, autocast_dtype 77 | -------------------------------------------------------------------------------- /models/anydoor/dinov2/dinov2/layers/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | from .dino_head import DINOHead 8 | from .mlp import Mlp 9 | from .patch_embed import PatchEmbed 10 | from .swiglu_ffn import SwiGLUFFN, SwiGLUFFNFused 11 | from .block import NestedTensorBlock 12 | from .attention import MemEffAttention 13 | -------------------------------------------------------------------------------- /models/anydoor/dinov2/dinov2/layers/attention.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
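# Usage sketch (illustrative): both attention variants below map (B, N, C) token
# sequences to (B, N, C), computing softmax(q @ k^T * head_dim**-0.5) @ v per head;
# MemEffAttention routes through xformers' memory_efficient_attention when available
# and otherwise falls back to the plain Attention path (which requires attn_bias=None).
#
#   attn = MemEffAttention(dim=768, num_heads=12)
#   y = attn(torch.randn(2, 197, 768))   # -> torch.Size([2, 197, 768])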
6 | 7 | # References: 8 | # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py 9 | # https://github.com/rwightman/pytorch-image-models/tree/master/timm/models/vision_transformer.py 10 | 11 | import logging 12 | 13 | from torch import Tensor 14 | from torch import nn 15 | 16 | 17 | logger = logging.getLogger("dinov2") 18 | 19 | 20 | try: 21 | from xformers.ops import memory_efficient_attention, unbind, fmha 22 | 23 | XFORMERS_AVAILABLE = True 24 | except ImportError: 25 | logger.warning("xFormers not available") 26 | XFORMERS_AVAILABLE = False 27 | 28 | 29 | class Attention(nn.Module): 30 | def __init__( 31 | self, 32 | dim: int, 33 | num_heads: int = 8, 34 | qkv_bias: bool = False, 35 | proj_bias: bool = True, 36 | attn_drop: float = 0.0, 37 | proj_drop: float = 0.0, 38 | ) -> None: 39 | super().__init__() 40 | self.num_heads = num_heads 41 | head_dim = dim // num_heads 42 | self.scale = head_dim**-0.5 43 | 44 | self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) 45 | self.attn_drop = nn.Dropout(attn_drop) 46 | self.proj = nn.Linear(dim, dim, bias=proj_bias) 47 | self.proj_drop = nn.Dropout(proj_drop) 48 | 49 | def forward(self, x: Tensor) -> Tensor: 50 | B, N, C = x.shape 51 | qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4) 52 | 53 | q, k, v = qkv[0] * self.scale, qkv[1], qkv[2] 54 | attn = q @ k.transpose(-2, -1) 55 | 56 | attn = attn.softmax(dim=-1) 57 | attn = self.attn_drop(attn) 58 | 59 | x = (attn @ v).transpose(1, 2).reshape(B, N, C) 60 | x = self.proj(x) 61 | x = self.proj_drop(x) 62 | return x 63 | 64 | 65 | class MemEffAttention(Attention): 66 | def forward(self, x: Tensor, attn_bias=None) -> Tensor: 67 | if not XFORMERS_AVAILABLE: 68 | assert attn_bias is None, "xFormers is required for nested tensors usage" 69 | return super().forward(x) 70 | 71 | B, N, C = x.shape 72 | qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads) 73 | 74 | q, k, v = unbind(qkv, 2) 75 | 76 | if attn_bias is not None: 77 | self_att_op = fmha.MemoryEfficientAttentionFlashAttentionOp 78 | else: 79 | self_att_op = None 80 | x = memory_efficient_attention(q, k, v, attn_bias=attn_bias, op=self_att_op) 81 | x = x.reshape([B, N, C]) 82 | 83 | x = self.proj(x) 84 | x = self.proj_drop(x) 85 | return x 86 | -------------------------------------------------------------------------------- /models/anydoor/dinov2/dinov2/layers/dino_head.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
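# Usage sketch (illustrative, with the prototype count used in ssl_default_config.yaml):
#
#   head = DINOHead(in_dim=1024, out_dim=65536)    # 3-layer MLP -> 256-d bottleneck
#   logits = head(torch.randn(4, 1024))            # -> torch.Size([4, 65536])
#
# The bottleneck output is L2-normalised before the weight-normalised last layer, so the
# prototype scores depend on the direction of the embedding rather than its magnitude.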
6 | 7 | import torch 8 | import torch.nn as nn 9 | from torch.nn.init import trunc_normal_ 10 | from torch.nn.utils import weight_norm 11 | 12 | 13 | class DINOHead(nn.Module): 14 | def __init__( 15 | self, 16 | in_dim, 17 | out_dim, 18 | use_bn=False, 19 | nlayers=3, 20 | hidden_dim=2048, 21 | bottleneck_dim=256, 22 | mlp_bias=True, 23 | ): 24 | super().__init__() 25 | nlayers = max(nlayers, 1) 26 | self.mlp = _build_mlp(nlayers, in_dim, bottleneck_dim, hidden_dim=hidden_dim, use_bn=use_bn, bias=mlp_bias) 27 | self.apply(self._init_weights) 28 | self.last_layer = weight_norm(nn.Linear(bottleneck_dim, out_dim, bias=False)) 29 | self.last_layer.weight_g.data.fill_(1) 30 | 31 | def _init_weights(self, m): 32 | if isinstance(m, nn.Linear): 33 | trunc_normal_(m.weight, std=0.02) 34 | if isinstance(m, nn.Linear) and m.bias is not None: 35 | nn.init.constant_(m.bias, 0) 36 | 37 | def forward(self, x): 38 | x = self.mlp(x) 39 | eps = 1e-6 if x.dtype == torch.float16 else 1e-12 40 | x = nn.functional.normalize(x, dim=-1, p=2, eps=eps) 41 | x = self.last_layer(x) 42 | return x 43 | 44 | 45 | def _build_mlp(nlayers, in_dim, bottleneck_dim, hidden_dim=None, use_bn=False, bias=True): 46 | if nlayers == 1: 47 | return nn.Linear(in_dim, bottleneck_dim, bias=bias) 48 | else: 49 | layers = [nn.Linear(in_dim, hidden_dim, bias=bias)] 50 | if use_bn: 51 | layers.append(nn.BatchNorm1d(hidden_dim)) 52 | layers.append(nn.GELU()) 53 | for _ in range(nlayers - 2): 54 | layers.append(nn.Linear(hidden_dim, hidden_dim, bias=bias)) 55 | if use_bn: 56 | layers.append(nn.BatchNorm1d(hidden_dim)) 57 | layers.append(nn.GELU()) 58 | layers.append(nn.Linear(hidden_dim, bottleneck_dim, bias=bias)) 59 | return nn.Sequential(*layers) 60 | -------------------------------------------------------------------------------- /models/anydoor/dinov2/dinov2/layers/drop_path.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | # References: 8 | # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py 9 | # https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/drop.py 10 | 11 | 12 | from torch import nn 13 | 14 | 15 | def drop_path(x, drop_prob: float = 0.0, training: bool = False): 16 | if drop_prob == 0.0 or not training: 17 | return x 18 | keep_prob = 1 - drop_prob 19 | shape = (x.shape[0],) + (1,) * (x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets 20 | random_tensor = x.new_empty(shape).bernoulli_(keep_prob) 21 | if keep_prob > 0.0: 22 | random_tensor.div_(keep_prob) 23 | output = x * random_tensor 24 | return output 25 | 26 | 27 | class DropPath(nn.Module): 28 | """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" 29 | 30 | def __init__(self, drop_prob=None): 31 | super(DropPath, self).__init__() 32 | self.drop_prob = drop_prob 33 | 34 | def forward(self, x): 35 | return drop_path(x, self.drop_prob, self.training) 36 | -------------------------------------------------------------------------------- /models/anydoor/dinov2/dinov2/layers/layer_scale.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 
3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | # Modified from: https://github.com/huggingface/pytorch-image-models/blob/main/timm/models/vision_transformer.py#L103-L110 8 | 9 | from typing import Union 10 | 11 | import torch 12 | from torch import Tensor 13 | from torch import nn 14 | 15 | 16 | class LayerScale(nn.Module): 17 | def __init__( 18 | self, 19 | dim: int, 20 | init_values: Union[float, Tensor] = 1e-5, 21 | inplace: bool = False, 22 | ) -> None: 23 | super().__init__() 24 | self.inplace = inplace 25 | self.gamma = nn.Parameter(init_values * torch.ones(dim)) 26 | 27 | def forward(self, x: Tensor) -> Tensor: 28 | return x.mul_(self.gamma) if self.inplace else x * self.gamma 29 | -------------------------------------------------------------------------------- /models/anydoor/dinov2/dinov2/layers/mlp.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | # References: 8 | # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py 9 | # https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/mlp.py 10 | 11 | 12 | from typing import Callable, Optional 13 | 14 | from torch import Tensor, nn 15 | 16 | 17 | class Mlp(nn.Module): 18 | def __init__( 19 | self, 20 | in_features: int, 21 | hidden_features: Optional[int] = None, 22 | out_features: Optional[int] = None, 23 | act_layer: Callable[..., nn.Module] = nn.GELU, 24 | drop: float = 0.0, 25 | bias: bool = True, 26 | ) -> None: 27 | super().__init__() 28 | out_features = out_features or in_features 29 | hidden_features = hidden_features or in_features 30 | self.fc1 = nn.Linear(in_features, hidden_features, bias=bias) 31 | self.act = act_layer() 32 | self.fc2 = nn.Linear(hidden_features, out_features, bias=bias) 33 | self.drop = nn.Dropout(drop) 34 | 35 | def forward(self, x: Tensor) -> Tensor: 36 | x = self.fc1(x) 37 | x = self.act(x) 38 | x = self.drop(x) 39 | x = self.fc2(x) 40 | x = self.drop(x) 41 | return x 42 | -------------------------------------------------------------------------------- /models/anydoor/dinov2/dinov2/layers/patch_embed.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | # References: 8 | # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py 9 | # https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/patch_embed.py 10 | 11 | from typing import Callable, Optional, Tuple, Union 12 | 13 | from torch import Tensor 14 | import torch.nn as nn 15 | 16 | 17 | def make_2tuple(x): 18 | if isinstance(x, tuple): 19 | assert len(x) == 2 20 | return x 21 | 22 | assert isinstance(x, int) 23 | return (x, x) 24 | 25 | 26 | class PatchEmbed(nn.Module): 27 | """ 28 | 2D image to patch embedding: (B,C,H,W) -> (B,N,D) 29 | 30 | Args: 31 | img_size: Image size. 32 | patch_size: Patch token size. 33 | in_chans: Number of input image channels. 34 | embed_dim: Number of linear projection output channels. 35 | norm_layer: Normalization layer. 
36 | """ 37 | 38 | def __init__( 39 | self, 40 | img_size: Union[int, Tuple[int, int]] = 224, 41 | patch_size: Union[int, Tuple[int, int]] = 16, 42 | in_chans: int = 3, 43 | embed_dim: int = 768, 44 | norm_layer: Optional[Callable] = None, 45 | flatten_embedding: bool = True, 46 | ) -> None: 47 | super().__init__() 48 | 49 | image_HW = make_2tuple(img_size) 50 | patch_HW = make_2tuple(patch_size) 51 | patch_grid_size = ( 52 | image_HW[0] // patch_HW[0], 53 | image_HW[1] // patch_HW[1], 54 | ) 55 | 56 | self.img_size = image_HW 57 | self.patch_size = patch_HW 58 | self.patches_resolution = patch_grid_size 59 | self.num_patches = patch_grid_size[0] * patch_grid_size[1] 60 | 61 | self.in_chans = in_chans 62 | self.embed_dim = embed_dim 63 | 64 | self.flatten_embedding = flatten_embedding 65 | 66 | self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_HW, stride=patch_HW) 67 | self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity() 68 | 69 | def forward(self, x: Tensor) -> Tensor: 70 | _, _, H, W = x.shape 71 | patch_H, patch_W = self.patch_size 72 | 73 | assert H % patch_H == 0, f"Input image height {H} is not a multiple of patch height {patch_H}" 74 | assert W % patch_W == 0, f"Input image width {W} is not a multiple of patch width: {patch_W}" 75 | 76 | x = self.proj(x) # B C H W 77 | H, W = x.size(2), x.size(3) 78 | x = x.flatten(2).transpose(1, 2) # B HW C 79 | x = self.norm(x) 80 | if not self.flatten_embedding: 81 | x = x.reshape(-1, H, W, self.embed_dim) # B H W C 82 | return x 83 | 84 | def flops(self) -> float: 85 | Ho, Wo = self.patches_resolution 86 | flops = Ho * Wo * self.embed_dim * self.in_chans * (self.patch_size[0] * self.patch_size[1]) 87 | if self.norm is not None: 88 | flops += Ho * Wo * self.embed_dim 89 | return flops 90 | -------------------------------------------------------------------------------- /models/anydoor/dinov2/dinov2/layers/swiglu_ffn.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
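# Usage sketch (illustrative): the FFN below projects to 2 * hidden_features, splits the
# result in half, and gates one half with SiLU of the other before projecting back:
#
#   ffn = SwiGLUFFN(in_features=384, hidden_features=1024)
#   y = ffn(torch.randn(2, 16, 384))   # -> torch.Size([2, 16, 384])
#
# SwiGLUFFNFused keeps the same interface but rounds int(hidden_features * 2 / 3) up to a
# multiple of 8, so the fused xformers SwiGLU kernel can be used when xformers is installed.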
6 | 7 | from typing import Callable, Optional 8 | 9 | from torch import Tensor, nn 10 | import torch.nn.functional as F 11 | 12 | 13 | class SwiGLUFFN(nn.Module): 14 | def __init__( 15 | self, 16 | in_features: int, 17 | hidden_features: Optional[int] = None, 18 | out_features: Optional[int] = None, 19 | act_layer: Callable[..., nn.Module] = None, 20 | drop: float = 0.0, 21 | bias: bool = True, 22 | ) -> None: 23 | super().__init__() 24 | out_features = out_features or in_features 25 | hidden_features = hidden_features or in_features 26 | self.w12 = nn.Linear(in_features, 2 * hidden_features, bias=bias) 27 | self.w3 = nn.Linear(hidden_features, out_features, bias=bias) 28 | 29 | def forward(self, x: Tensor) -> Tensor: 30 | x12 = self.w12(x) 31 | x1, x2 = x12.chunk(2, dim=-1) 32 | hidden = F.silu(x1) * x2 33 | return self.w3(hidden) 34 | 35 | 36 | try: 37 | from xformers.ops import SwiGLU 38 | 39 | XFORMERS_AVAILABLE = True 40 | except ImportError: 41 | SwiGLU = SwiGLUFFN 42 | XFORMERS_AVAILABLE = False 43 | 44 | 45 | class SwiGLUFFNFused(SwiGLU): 46 | def __init__( 47 | self, 48 | in_features: int, 49 | hidden_features: Optional[int] = None, 50 | out_features: Optional[int] = None, 51 | act_layer: Callable[..., nn.Module] = None, 52 | drop: float = 0.0, 53 | bias: bool = True, 54 | ) -> None: 55 | out_features = out_features or in_features 56 | hidden_features = hidden_features or in_features 57 | hidden_features = (int(hidden_features * 2 / 3) + 7) // 8 * 8 58 | super().__init__( 59 | in_features=in_features, 60 | hidden_features=hidden_features, 61 | out_features=out_features, 62 | bias=bias, 63 | ) 64 | -------------------------------------------------------------------------------- /models/anydoor/dinov2/dinov2/loss/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | from .dino_clstoken_loss import DINOLoss 8 | from .ibot_patch_loss import iBOTPatchLoss 9 | from .koleo_loss import KoLeoLoss 10 | -------------------------------------------------------------------------------- /models/anydoor/dinov2/dinov2/loss/koleo_loss.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | import logging 8 | 9 | import torch 10 | import torch.nn as nn 11 | import torch.nn.functional as F 12 | 13 | # import torch.distributed as dist 14 | 15 | 16 | logger = logging.getLogger("dinov2") 17 | 18 | 19 | class KoLeoLoss(nn.Module): 20 | """Kozachenko-Leonenko entropic loss regularizer from Sablayrolles et al. - 2018 - Spreading vectors for similarity search""" 21 | 22 | def __init__(self): 23 | super().__init__() 24 | self.pdist = nn.PairwiseDistance(2, eps=1e-8) 25 | 26 | def pairwise_NNs_inner(self, x): 27 | """ 28 | Pairwise nearest neighbors for L2-normalized vectors. 29 | Uses Torch rather than Faiss to remain on GPU. 
30 | """ 31 | # parwise dot products (= inverse distance) 32 | dots = torch.mm(x, x.t()) 33 | n = x.shape[0] 34 | dots.view(-1)[:: (n + 1)].fill_(-1) # Trick to fill diagonal with -1 35 | # max inner prod -> min distance 36 | _, I = torch.max(dots, dim=1) # noqa: E741 37 | return I 38 | 39 | def forward(self, student_output, eps=1e-8): 40 | """ 41 | Args: 42 | student_output (BxD): backbone output of student 43 | """ 44 | with torch.cuda.amp.autocast(enabled=False): 45 | student_output = F.normalize(student_output, eps=eps, p=2, dim=-1) 46 | I = self.pairwise_NNs_inner(student_output) # noqa: E741 47 | distances = self.pdist(student_output, student_output[I]) # BxD, BxD -> B 48 | loss = -torch.log(distances + eps).mean() 49 | return loss 50 | -------------------------------------------------------------------------------- /models/anydoor/dinov2/dinov2/models/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | import logging 8 | 9 | from . import vision_transformer as vits 10 | 11 | 12 | logger = logging.getLogger("dinov2") 13 | 14 | 15 | def build_model(args, only_teacher=False, img_size=224): 16 | args.arch = args.arch.removesuffix("_memeff") 17 | if "vit" in args.arch: 18 | vit_kwargs = dict( 19 | img_size=img_size, 20 | patch_size=args.patch_size, 21 | init_values=args.layerscale, 22 | ffn_layer=args.ffn_layer, 23 | block_chunks=args.block_chunks, 24 | qkv_bias=args.qkv_bias, 25 | proj_bias=args.proj_bias, 26 | ffn_bias=args.ffn_bias, 27 | ) 28 | teacher = vits.__dict__[args.arch](**vit_kwargs) 29 | if only_teacher: 30 | return teacher, teacher.embed_dim 31 | student = vits.__dict__[args.arch]( 32 | **vit_kwargs, 33 | drop_path_rate=args.drop_path_rate, 34 | drop_path_uniform=args.drop_path_uniform, 35 | ) 36 | embed_dim = student.embed_dim 37 | return student, teacher, embed_dim 38 | 39 | 40 | def build_model_from_cfg(cfg, only_teacher=False): 41 | return build_model(cfg.student, only_teacher=only_teacher, img_size=cfg.crops.global_crops_size) 42 | -------------------------------------------------------------------------------- /models/anydoor/dinov2/dinov2/run/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | -------------------------------------------------------------------------------- /models/anydoor/dinov2/dinov2/run/eval/knn.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | 7 | import logging 8 | import os 9 | import sys 10 | 11 | from dinov2.eval.knn import get_args_parser as get_knn_args_parser 12 | from dinov2.logging import setup_logging 13 | from dinov2.run.submit import get_args_parser, submit_jobs 14 | 15 | 16 | logger = logging.getLogger("dinov2") 17 | 18 | 19 | class Evaluator: 20 | def __init__(self, args): 21 | self.args = args 22 | 23 | def __call__(self): 24 | from dinov2.eval.knn import main as knn_main 25 | 26 | self._setup_args() 27 | knn_main(self.args) 28 | 29 | def checkpoint(self): 30 | import submitit 31 | 32 | logger.info(f"Requeuing {self.args}") 33 | empty = type(self)(self.args) 34 | return submitit.helpers.DelayedSubmission(empty) 35 | 36 | def _setup_args(self): 37 | import submitit 38 | 39 | job_env = submitit.JobEnvironment() 40 | self.args.output_dir = self.args.output_dir.replace("%j", str(job_env.job_id)) 41 | logger.info(f"Process group: {job_env.num_tasks} tasks, rank: {job_env.global_rank}") 42 | logger.info(f"Args: {self.args}") 43 | 44 | 45 | def main(): 46 | description = "Submitit launcher for DINOv2 k-NN evaluation" 47 | knn_args_parser = get_knn_args_parser(add_help=False) 48 | parents = [knn_args_parser] 49 | args_parser = get_args_parser(description=description, parents=parents) 50 | args = args_parser.parse_args() 51 | 52 | setup_logging() 53 | 54 | assert os.path.exists(args.config_file), "Configuration file does not exist!" 55 | submit_jobs(Evaluator, args, name="dinov2:knn") 56 | return 0 57 | 58 | 59 | if __name__ == "__main__": 60 | sys.exit(main()) 61 | -------------------------------------------------------------------------------- /models/anydoor/dinov2/dinov2/run/eval/linear.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | import logging 8 | import os 9 | import sys 10 | 11 | from dinov2.eval.linear import get_args_parser as get_linear_args_parser 12 | from dinov2.logging import setup_logging 13 | from dinov2.run.submit import get_args_parser, submit_jobs 14 | 15 | 16 | logger = logging.getLogger("dinov2") 17 | 18 | 19 | class Evaluator: 20 | def __init__(self, args): 21 | self.args = args 22 | 23 | def __call__(self): 24 | from dinov2.eval.linear import main as linear_main 25 | 26 | self._setup_args() 27 | linear_main(self.args) 28 | 29 | def checkpoint(self): 30 | import submitit 31 | 32 | logger.info(f"Requeuing {self.args}") 33 | empty = type(self)(self.args) 34 | return submitit.helpers.DelayedSubmission(empty) 35 | 36 | def _setup_args(self): 37 | import submitit 38 | 39 | job_env = submitit.JobEnvironment() 40 | self.args.output_dir = self.args.output_dir.replace("%j", str(job_env.job_id)) 41 | logger.info(f"Process group: {job_env.num_tasks} tasks, rank: {job_env.global_rank}") 42 | logger.info(f"Args: {self.args}") 43 | 44 | 45 | def main(): 46 | description = "Submitit launcher for DINOv2 linear evaluation" 47 | linear_args_parser = get_linear_args_parser(add_help=False) 48 | parents = [linear_args_parser] 49 | args_parser = get_args_parser(description=description, parents=parents) 50 | args = args_parser.parse_args() 51 | 52 | setup_logging() 53 | 54 | assert os.path.exists(args.config_file), "Configuration file does not exist!" 
55 | submit_jobs(Evaluator, args, name="dinov2:linear") 56 | return 0 57 | 58 | 59 | if __name__ == "__main__": 60 | sys.exit(main()) 61 | -------------------------------------------------------------------------------- /models/anydoor/dinov2/dinov2/run/eval/log_regression.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | import logging 8 | import os 9 | import sys 10 | 11 | from dinov2.eval.log_regression import get_args_parser as get_log_regression_args_parser 12 | from dinov2.logging import setup_logging 13 | from dinov2.run.submit import get_args_parser, submit_jobs 14 | 15 | 16 | logger = logging.getLogger("dinov2") 17 | 18 | 19 | class Evaluator: 20 | def __init__(self, args): 21 | self.args = args 22 | 23 | def __call__(self): 24 | from dinov2.eval.log_regression import main as log_regression_main 25 | 26 | self._setup_args() 27 | log_regression_main(self.args) 28 | 29 | def checkpoint(self): 30 | import submitit 31 | 32 | logger.info(f"Requeuing {self.args}") 33 | empty = type(self)(self.args) 34 | return submitit.helpers.DelayedSubmission(empty) 35 | 36 | def _setup_args(self): 37 | import submitit 38 | 39 | job_env = submitit.JobEnvironment() 40 | self.args.output_dir = self.args.output_dir.replace("%j", str(job_env.job_id)) 41 | logger.info(f"Process group: {job_env.num_tasks} tasks, rank: {job_env.global_rank}") 42 | logger.info(f"Args: {self.args}") 43 | 44 | 45 | def main(): 46 | description = "Submitit launcher for DINOv2 logistic evaluation" 47 | log_regression_args_parser = get_log_regression_args_parser(add_help=False) 48 | parents = [log_regression_args_parser] 49 | args_parser = get_args_parser(description=description, parents=parents) 50 | args = args_parser.parse_args() 51 | 52 | setup_logging() 53 | 54 | assert os.path.exists(args.config_file), "Configuration file does not exist!" 55 | submit_jobs(Evaluator, args, name="dinov2:logreg") 56 | return 0 57 | 58 | 59 | if __name__ == "__main__": 60 | sys.exit(main()) 61 | -------------------------------------------------------------------------------- /models/anydoor/dinov2/dinov2/run/train/train.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | 7 | import logging 8 | import os 9 | import sys 10 | 11 | from dinov2.logging import setup_logging 12 | from dinov2.train import get_args_parser as get_train_args_parser 13 | from dinov2.run.submit import get_args_parser, submit_jobs 14 | 15 | 16 | logger = logging.getLogger("dinov2") 17 | 18 | 19 | class Trainer(object): 20 | def __init__(self, args): 21 | self.args = args 22 | 23 | def __call__(self): 24 | from dinov2.train import main as train_main 25 | 26 | self._setup_args() 27 | train_main(self.args) 28 | 29 | def checkpoint(self): 30 | import submitit 31 | 32 | logger.info(f"Requeuing {self.args}") 33 | empty = type(self)(self.args) 34 | return submitit.helpers.DelayedSubmission(empty) 35 | 36 | def _setup_args(self): 37 | import submitit 38 | 39 | job_env = submitit.JobEnvironment() 40 | self.args.output_dir = self.args.output_dir.replace("%j", str(job_env.job_id)) 41 | logger.info(f"Process group: {job_env.num_tasks} tasks, rank: {job_env.global_rank}") 42 | logger.info(f"Args: {self.args}") 43 | 44 | 45 | def main(): 46 | description = "Submitit launcher for DINOv2 training" 47 | train_args_parser = get_train_args_parser(add_help=False) 48 | parents = [train_args_parser] 49 | args_parser = get_args_parser(description=description, parents=parents) 50 | args = args_parser.parse_args() 51 | 52 | setup_logging() 53 | 54 | assert os.path.exists(args.config_file), "Configuration file does not exist!" 55 | submit_jobs(Trainer, args, name="dinov2:train") 56 | return 0 57 | 58 | 59 | if __name__ == "__main__": 60 | sys.exit(main()) 61 | -------------------------------------------------------------------------------- /models/anydoor/dinov2/dinov2/train/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | from .train import get_args_parser, main 8 | from .ssl_meta_arch import SSLMetaArch 9 | -------------------------------------------------------------------------------- /models/anydoor/dinov2/dinov2/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | -------------------------------------------------------------------------------- /models/anydoor/dinov2/dinov2/utils/config.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
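# Worked example (illustrative) for the sqrt_wrt_1024 scaling rule applied below:
# with base_lr = 0.004, batch_size_per_gpu = 64 and 32 GPUs, the global batch is 2048,
# so optim.lr = 0.004 * sqrt(2048 / 1024) ~= 0.00566.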
6 | 7 | import math 8 | import logging 9 | import os 10 | 11 | from omegaconf import OmegaConf 12 | 13 | import dinov2.distributed as distributed 14 | from dinov2.logging import setup_logging 15 | from dinov2.utils import utils 16 | from dinov2.configs import dinov2_default_config 17 | 18 | 19 | logger = logging.getLogger("dinov2") 20 | 21 | 22 | def apply_scaling_rules_to_cfg(cfg): # to fix 23 | if cfg.optim.scaling_rule == "sqrt_wrt_1024": 24 | base_lr = cfg.optim.base_lr 25 | cfg.optim.lr = base_lr 26 | cfg.optim.lr *= math.sqrt(cfg.train.batch_size_per_gpu * distributed.get_global_size() / 1024.0) 27 | logger.info(f"sqrt scaling learning rate; base: {base_lr}, new: {cfg.optim.lr}") 28 | else: 29 | raise NotImplementedError 30 | return cfg 31 | 32 | 33 | def write_config(cfg, output_dir, name="config.yaml"): 34 | logger.info(OmegaConf.to_yaml(cfg)) 35 | saved_cfg_path = os.path.join(output_dir, name) 36 | with open(saved_cfg_path, "w") as f: 37 | OmegaConf.save(config=cfg, f=f) 38 | return saved_cfg_path 39 | 40 | 41 | def get_cfg_from_args(args): 42 | args.output_dir = os.path.abspath(args.output_dir) 43 | args.opts += [f"train.output_dir={args.output_dir}"] 44 | default_cfg = OmegaConf.create(dinov2_default_config) 45 | cfg = OmegaConf.load(args.config_file) 46 | cfg = OmegaConf.merge(default_cfg, cfg, OmegaConf.from_cli(args.opts)) 47 | return cfg 48 | 49 | 50 | def default_setup(args): 51 | distributed.enable(overwrite=True) 52 | seed = getattr(args, "seed", 0) 53 | rank = distributed.get_global_rank() 54 | 55 | global logger 56 | setup_logging(output=args.output_dir, level=logging.INFO) 57 | logger = logging.getLogger("dinov2") 58 | 59 | utils.fix_random_seeds(seed + rank) 60 | logger.info("git:\n {}\n".format(utils.get_sha())) 61 | logger.info("\n".join("%s: %s" % (k, str(v)) for k, v in sorted(dict(vars(args)).items()))) 62 | 63 | 64 | def setup(args): 65 | """ 66 | Create configs and perform basic setups. 67 | """ 68 | cfg = get_cfg_from_args(args) 69 | os.makedirs(args.output_dir, exist_ok=True) 70 | default_setup(args) 71 | apply_scaling_rules_to_cfg(cfg) 72 | write_config(cfg, args.output_dir) 73 | return cfg 74 | -------------------------------------------------------------------------------- /models/anydoor/dinov2/dinov2/utils/dtype.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | 7 | 8 | from typing import Dict, Union 9 | 10 | import numpy as np 11 | import torch 12 | 13 | 14 | TypeSpec = Union[str, np.dtype, torch.dtype] 15 | 16 | 17 | _NUMPY_TO_TORCH_DTYPE: Dict[np.dtype, torch.dtype] = { 18 | np.dtype("bool"): torch.bool, 19 | np.dtype("uint8"): torch.uint8, 20 | np.dtype("int8"): torch.int8, 21 | np.dtype("int16"): torch.int16, 22 | np.dtype("int32"): torch.int32, 23 | np.dtype("int64"): torch.int64, 24 | np.dtype("float16"): torch.float16, 25 | np.dtype("float32"): torch.float32, 26 | np.dtype("float64"): torch.float64, 27 | np.dtype("complex64"): torch.complex64, 28 | np.dtype("complex128"): torch.complex128, 29 | } 30 | 31 | 32 | def as_torch_dtype(dtype: TypeSpec) -> torch.dtype: 33 | if isinstance(dtype, torch.dtype): 34 | return dtype 35 | if isinstance(dtype, str): 36 | dtype = np.dtype(dtype) 37 | assert isinstance(dtype, np.dtype), f"Expected an instance of numpy dtype, got {type(dtype)}" 38 | return _NUMPY_TO_TORCH_DTYPE[dtype] 39 | -------------------------------------------------------------------------------- /models/anydoor/dinov2/pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.black] 2 | line-length = 120 3 | 4 | [tool.pylint.master] 5 | persistent = false 6 | score = false 7 | 8 | [tool.pylint.messages_control] 9 | disable = "all" 10 | enable = [ 11 | "miscellaneous", 12 | "similarities", 13 | ] 14 | 15 | [tool.pylint.similarities] 16 | ignore-comments = true 17 | ignore-docstrings = true 18 | ignore-imports = true 19 | min-similarity-lines = 8 20 | 21 | [tool.pylint.reports] 22 | reports = false 23 | 24 | [tool.pylint.miscellaneous] 25 | notes = [ 26 | "FIXME", 27 | "XXX", 28 | "TODO", 29 | ] 30 | -------------------------------------------------------------------------------- /models/anydoor/dinov2/requirements-dev.txt: -------------------------------------------------------------------------------- 1 | black==22.6.0 2 | flake8==5.0.4 3 | pylint==2.15.0 4 | -------------------------------------------------------------------------------- /models/anydoor/dinov2/requirements.txt: -------------------------------------------------------------------------------- 1 | --extra-index-url https://download.pytorch.org/whl/cu117 2 | torch==2.0.0 3 | torchvision==0.15.0 4 | omegaconf 5 | torchmetrics==0.10.3 6 | fvcore 7 | iopath 8 | xformers==0.0.18 9 | submitit 10 | --extra-index-url https://pypi.nvidia.com 11 | cuml-cu11 12 | -------------------------------------------------------------------------------- /models/anydoor/dinov2/scripts/lint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | if [ -n "$1" ]; then 4 | echo "linting \"$1\"" 5 | fi 6 | 7 | echo "running black" 8 | if [ -n "$1" ]; then 9 | black "$1" 10 | else 11 | black dinov2 12 | fi 13 | 14 | echo "running flake8" 15 | if [ -n "$1" ]; then 16 | flake8 "$1" 17 | else 18 | flake8 19 | fi 20 | 21 | echo "running pylint" 22 | if [ -n "$1" ]; then 23 | pylint "$1" 24 | else 25 | pylint dinov2 26 | fi 27 | 28 | exit 0 29 | -------------------------------------------------------------------------------- /models/anydoor/dinov2/setup.cfg: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 120 3 | ignore = E203,E501,W503 4 | per-file-ignores = 5 | __init__.py:F401 6 | -------------------------------------------------------------------------------- /models/anydoor/dinov2/setup.py:
-------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | from pathlib import Path 8 | import re 9 | from typing import List, Tuple 10 | 11 | from setuptools import setup, find_packages 12 | 13 | 14 | NAME = "dinov2" 15 | DESCRIPTION = "PyTorch code and models for the DINOv2 self-supervised learning method." 16 | 17 | URL = "https://github.com/facebookresearch/dinov2" 18 | AUTHOR = "FAIR" 19 | REQUIRES_PYTHON = ">=3.9.0" 20 | HERE = Path(__file__).parent 21 | 22 | 23 | try: 24 | with open(HERE / "README.md", encoding="utf-8") as f: 25 | long_description = "\n" + f.read() 26 | except FileNotFoundError: 27 | long_description = DESCRIPTION 28 | 29 | 30 | def get_requirements(path: str = HERE / "requirements.txt") -> Tuple[List[str], List[str]]: 31 | requirements = [] 32 | extra_indices = [] 33 | with open(path) as f: 34 | for line in f.readlines(): 35 | line = line.rstrip("\r\n") 36 | if line.startswith("--extra-index-url "): 37 | extra_indices.append(line[18:]) 38 | continue 39 | requirements.append(line) 40 | return requirements, extra_indices 41 | 42 | 43 | def get_package_version() -> str: 44 | with open(HERE / "dinov2/__init__.py") as f: 45 | result = re.search(r"^__version__ = ['\"]([^'\"]*)['\"]", f.read(), re.M) 46 | if result: 47 | return result.group(1) 48 | raise RuntimeError("Can't get package version") 49 | 50 | 51 | requirements, extra_indices = get_requirements() 52 | version = get_package_version() 53 | dev_requirements, _ = get_requirements(HERE / "requirements-dev.txt") 54 | 55 | 56 | setup( 57 | name=NAME, 58 | version=version, 59 | description=DESCRIPTION, 60 | long_description=long_description, 61 | long_description_content_type="text/markdown", 62 | author=AUTHOR, 63 | python_requires=REQUIRES_PYTHON, 64 | url=URL, 65 | packages=find_packages(), 66 | package_data={ 67 | "": ["*.yaml"], 68 | }, 69 | install_requires=requirements, 70 | dependency_links=extra_indices, 71 | extras_require={ 72 | "dev": dev_requirements, 73 | }, 74 | install_package_data=True, 75 | license="CC-BY-NC", 76 | license_files=("LICENSE",), 77 | classifiers=[ 78 | # Trove classifiers: https://github.com/pypa/trove-classifiers/blob/main/src/trove_classifiers/__init__.py 79 | "Development Status :: 3 - Alpha", 80 | "Intended Audience :: Developers", 81 | "Intended Audience :: Science/Research", 82 | "License :: Other/Proprietary License", 83 | "Programming Language :: Python :: 3.9", 84 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 85 | "Topic :: Software Development :: Libraries :: Python Modules", 86 | ], 87 | ) 88 | -------------------------------------------------------------------------------- /models/anydoor/examples/Gradio/BG/00.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/models/anydoor/examples/Gradio/BG/00.png -------------------------------------------------------------------------------- /models/anydoor/examples/Gradio/BG/01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/models/anydoor/examples/Gradio/BG/01.png 
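The get_requirements helper in the dinov2 setup.py above separates "--extra-index-url" lines from ordinary version pins so the URLs can be handed to setup() as dependency_links while everything else becomes install_requires. A minimal standalone sketch of that parsing, run against the requirements.txt listed above (REQUIREMENTS_TXT and parse_requirements are illustrative names, not part of the repository):

from typing import List, Tuple

# Mirrors the requirements.txt shown above; the names below are illustrative, not repository code.
REQUIREMENTS_TXT = """\
--extra-index-url https://download.pytorch.org/whl/cu117
torch==2.0.0
torchvision==0.15.0
omegaconf
torchmetrics==0.10.3
fvcore
iopath
xformers==0.0.18
submitit
--extra-index-url https://pypi.nvidia.com
cuml-cu11
"""


def parse_requirements(text: str) -> Tuple[List[str], List[str]]:
    requirements, extra_indices = [], []
    for line in text.splitlines():
        # "--extra-index-url <url>" lines become dependency_links; everything else is a normal pin.
        if line.startswith("--extra-index-url "):
            extra_indices.append(line[len("--extra-index-url "):])
        else:
            requirements.append(line)
    return requirements, extra_indices


if __name__ == "__main__":
    reqs, indices = parse_requirements(REQUIREMENTS_TXT)
    print(reqs)     # ['torch==2.0.0', 'torchvision==0.15.0', 'omegaconf', ...]
    print(indices)  # ['https://download.pytorch.org/whl/cu117', 'https://pypi.nvidia.com']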
-------------------------------------------------------------------------------- /models/anydoor/examples/Gradio/BG/02.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/models/anydoor/examples/Gradio/BG/02.png -------------------------------------------------------------------------------- /models/anydoor/examples/Gradio/BG/03.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/models/anydoor/examples/Gradio/BG/03.png -------------------------------------------------------------------------------- /models/anydoor/examples/Gradio/BG/04.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/models/anydoor/examples/Gradio/BG/04.jpg -------------------------------------------------------------------------------- /models/anydoor/examples/Gradio/BG/04.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/models/anydoor/examples/Gradio/BG/04.png -------------------------------------------------------------------------------- /models/anydoor/examples/Gradio/BG/06.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/models/anydoor/examples/Gradio/BG/06.png -------------------------------------------------------------------------------- /models/anydoor/examples/Gradio/BG/07.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/models/anydoor/examples/Gradio/BG/07.png -------------------------------------------------------------------------------- /models/anydoor/examples/Gradio/BG/08.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/models/anydoor/examples/Gradio/BG/08.jpg -------------------------------------------------------------------------------- /models/anydoor/examples/Gradio/BG/13.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/models/anydoor/examples/Gradio/BG/13.jpg -------------------------------------------------------------------------------- /models/anydoor/examples/Gradio/BG/17.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/models/anydoor/examples/Gradio/BG/17.jpg -------------------------------------------------------------------------------- /models/anydoor/examples/Gradio/BG/22.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/models/anydoor/examples/Gradio/BG/22.png -------------------------------------------------------------------------------- 
/models/anydoor/examples/Gradio/FG/00.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/models/anydoor/examples/Gradio/FG/00.jpg -------------------------------------------------------------------------------- /models/anydoor/examples/Gradio/FG/01.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/models/anydoor/examples/Gradio/FG/01.jpg -------------------------------------------------------------------------------- /models/anydoor/examples/Gradio/FG/04.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/models/anydoor/examples/Gradio/FG/04.jpg -------------------------------------------------------------------------------- /models/anydoor/examples/Gradio/FG/06.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/models/anydoor/examples/Gradio/FG/06.jpg -------------------------------------------------------------------------------- /models/anydoor/examples/Gradio/FG/07.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/models/anydoor/examples/Gradio/FG/07.png -------------------------------------------------------------------------------- /models/anydoor/examples/Gradio/FG/09.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/models/anydoor/examples/Gradio/FG/09.jpg -------------------------------------------------------------------------------- /models/anydoor/examples/Gradio/FG/18.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/models/anydoor/examples/Gradio/FG/18.png -------------------------------------------------------------------------------- /models/anydoor/examples/Gradio/FG/22.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/models/anydoor/examples/Gradio/FG/22.jpg -------------------------------------------------------------------------------- /models/anydoor/examples/Gradio/FG/25.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/models/anydoor/examples/Gradio/FG/25.png -------------------------------------------------------------------------------- /models/anydoor/examples/Gradio/FG/28.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/models/anydoor/examples/Gradio/FG/28.png -------------------------------------------------------------------------------- /models/anydoor/examples/Gradio/FG/33.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/models/anydoor/examples/Gradio/FG/33.png -------------------------------------------------------------------------------- /models/anydoor/examples/Gradio/FG/36.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/models/anydoor/examples/Gradio/FG/36.jpg -------------------------------------------------------------------------------- /models/anydoor/examples/Gradio/FG/39.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/models/anydoor/examples/Gradio/FG/39.jpg -------------------------------------------------------------------------------- /models/anydoor/examples/Gradio/FG/43.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/models/anydoor/examples/Gradio/FG/43.jpg -------------------------------------------------------------------------------- /models/anydoor/examples/Gradio/FG/44.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/models/anydoor/examples/Gradio/FG/44.jpg -------------------------------------------------------------------------------- /models/anydoor/examples/Gradio/FG/50.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/models/anydoor/examples/Gradio/FG/50.jpg -------------------------------------------------------------------------------- /models/anydoor/examples/TestDreamBooth/BG/000000047948_GT.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/models/anydoor/examples/TestDreamBooth/BG/000000047948_GT.png -------------------------------------------------------------------------------- /models/anydoor/examples/TestDreamBooth/BG/000000047948_mask.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/models/anydoor/examples/TestDreamBooth/BG/000000047948_mask.png -------------------------------------------------------------------------------- /models/anydoor/examples/TestDreamBooth/BG/000000309203_GT.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/models/anydoor/examples/TestDreamBooth/BG/000000309203_GT.png -------------------------------------------------------------------------------- /models/anydoor/examples/TestDreamBooth/BG/000000309203_mask.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/models/anydoor/examples/TestDreamBooth/BG/000000309203_mask.png 
-------------------------------------------------------------------------------- /models/anydoor/examples/TestDreamBooth/FG/00.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/models/anydoor/examples/TestDreamBooth/FG/00.png -------------------------------------------------------------------------------- /models/anydoor/examples/TestDreamBooth/FG/01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/models/anydoor/examples/TestDreamBooth/FG/01.png -------------------------------------------------------------------------------- /models/anydoor/examples/TestDreamBooth/FG/02.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/models/anydoor/examples/TestDreamBooth/FG/02.png -------------------------------------------------------------------------------- /models/anydoor/examples/TestDreamBooth/FG/03.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/models/anydoor/examples/TestDreamBooth/FG/03.png -------------------------------------------------------------------------------- /models/anydoor/examples/TestDreamBooth/GEN/gen_res.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/models/anydoor/examples/TestDreamBooth/GEN/gen_res.png -------------------------------------------------------------------------------- /models/anydoor/iseg/coarse_mask_refine.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/models/anydoor/iseg/coarse_mask_refine.pth -------------------------------------------------------------------------------- /models/anydoor/ldm/data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/models/anydoor/ldm/data/__init__.py -------------------------------------------------------------------------------- /models/anydoor/ldm/data/util.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from ldm.modules.midas.api import load_midas_transform 4 | 5 | 6 | class AddMiDaS(object): 7 | def __init__(self, model_type): 8 | super().__init__() 9 | self.transform = load_midas_transform(model_type) 10 | 11 | def pt2np(self, x): 12 | x = ((x + 1.0) * .5).detach().cpu().numpy() 13 | return x 14 | 15 | def np2pt(self, x): 16 | x = torch.from_numpy(x) * 2 - 1. 
17 | return x 18 | 19 | def __call__(self, sample): 20 | # sample['jpg'] is tensor hwc in [-1, 1] at this point 21 | x = self.pt2np(sample['jpg']) 22 | x = self.transform({"image": x})["image"] 23 | sample['midas_in'] = x 24 | return sample -------------------------------------------------------------------------------- /models/anydoor/ldm/models/diffusion/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/models/anydoor/ldm/models/diffusion/__init__.py -------------------------------------------------------------------------------- /models/anydoor/ldm/models/diffusion/dpm_solver/__init__.py: -------------------------------------------------------------------------------- 1 | from .sampler import DPMSolverSampler -------------------------------------------------------------------------------- /models/anydoor/ldm/models/diffusion/dpm_solver/sampler.py: -------------------------------------------------------------------------------- 1 | """SAMPLING ONLY.""" 2 | import torch 3 | 4 | from .dpm_solver import NoiseScheduleVP, model_wrapper, DPM_Solver 5 | 6 | 7 | MODEL_TYPES = { 8 | "eps": "noise", 9 | "v": "v" 10 | } 11 | 12 | 13 | class DPMSolverSampler(object): 14 | def __init__(self, model, **kwargs): 15 | super().__init__() 16 | self.model = model 17 | to_torch = lambda x: x.clone().detach().to(torch.float32).to(model.device) 18 | self.register_buffer('alphas_cumprod', to_torch(model.alphas_cumprod)) 19 | 20 | def register_buffer(self, name, attr): 21 | if type(attr) == torch.Tensor: 22 | if attr.device != torch.device("cuda"): 23 | attr = attr.to(torch.device("cuda")) 24 | setattr(self, name, attr) 25 | 26 | @torch.no_grad() 27 | def sample(self, 28 | S, 29 | batch_size, 30 | shape, 31 | conditioning=None, 32 | callback=None, 33 | normals_sequence=None, 34 | img_callback=None, 35 | quantize_x0=False, 36 | eta=0., 37 | mask=None, 38 | x0=None, 39 | temperature=1., 40 | noise_dropout=0., 41 | score_corrector=None, 42 | corrector_kwargs=None, 43 | verbose=True, 44 | x_T=None, 45 | log_every_t=100, 46 | unconditional_guidance_scale=1., 47 | unconditional_conditioning=None, 48 | # this has to come in the same format as the conditioning, # e.g. as encoded tokens, ... 
49 | **kwargs 50 | ): 51 | if conditioning is not None: 52 | if isinstance(conditioning, dict): 53 | cbs = conditioning[list(conditioning.keys())[0]].shape[0] 54 | if cbs != batch_size: 55 | print(f"Warning: Got {cbs} conditionings but batch-size is {batch_size}") 56 | else: 57 | if conditioning.shape[0] != batch_size: 58 | print(f"Warning: Got {conditioning.shape[0]} conditionings but batch-size is {batch_size}") 59 | 60 | # sampling 61 | C, H, W = shape 62 | size = (batch_size, C, H, W) 63 | 64 | print(f'Data shape for DPM-Solver sampling is {size}, sampling steps {S}') 65 | 66 | device = self.model.betas.device 67 | if x_T is None: 68 | img = torch.randn(size, device=device) 69 | else: 70 | img = x_T 71 | 72 | ns = NoiseScheduleVP('discrete', alphas_cumprod=self.alphas_cumprod) 73 | 74 | model_fn = model_wrapper( 75 | lambda x, t, c: self.model.apply_model(x, t, c), 76 | ns, 77 | model_type=MODEL_TYPES[self.model.parameterization], 78 | guidance_type="classifier-free", 79 | condition=conditioning, 80 | unconditional_condition=unconditional_conditioning, 81 | guidance_scale=unconditional_guidance_scale, 82 | ) 83 | 84 | dpm_solver = DPM_Solver(model_fn, ns, predict_x0=True, thresholding=False) 85 | x = dpm_solver.sample(img, steps=S, skip_type="time_uniform", method="multistep", order=2, lower_order_final=True) 86 | 87 | return x.to(device), None -------------------------------------------------------------------------------- /models/anydoor/ldm/models/diffusion/sampling_util.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | 4 | 5 | def append_dims(x, target_dims): 6 | """Appends dimensions to the end of a tensor until it has target_dims dimensions. 7 | From https://github.com/crowsonkb/k-diffusion/blob/master/k_diffusion/utils.py""" 8 | dims_to_append = target_dims - x.ndim 9 | if dims_to_append < 0: 10 | raise ValueError(f'input has {x.ndim} dims but target_dims is {target_dims}, which is less') 11 | return x[(...,) + (None,) * dims_to_append] 12 | 13 | 14 | def norm_thresholding(x0, value): 15 | s = append_dims(x0.pow(2).flatten(1).mean(1).sqrt().clamp(min=value), x0.ndim) 16 | return x0 * (value / s) 17 | 18 | 19 | def spatial_norm_thresholding(x0, value): 20 | # b c h w 21 | s = x0.pow(2).mean(1, keepdim=True).sqrt().clamp(min=value) 22 | return x0 * (value / s) -------------------------------------------------------------------------------- /models/anydoor/ldm/modules/diffusionmodules/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/models/anydoor/ldm/modules/diffusionmodules/__init__.py -------------------------------------------------------------------------------- /models/anydoor/ldm/modules/distributions/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/models/anydoor/ldm/modules/distributions/__init__.py -------------------------------------------------------------------------------- /models/anydoor/ldm/modules/encoders/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/models/anydoor/ldm/modules/encoders/__init__.py 
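append_dims in sampling_util.py above pads trailing singleton dimensions onto a tensor so that a per-sample scalar can broadcast against a (B, C, H, W) latent, which is how norm_thresholding rescales x0 by a per-sample norm. A small self-contained sketch of that broadcasting (it restates append_dims from the file above so the snippet runs with only torch installed):

import torch


def append_dims(x, target_dims):
    # Same logic as in sampling_util.py above: append trailing singleton dimensions.
    dims_to_append = target_dims - x.ndim
    if dims_to_append < 0:
        raise ValueError(f"input has {x.ndim} dims but target_dims is {target_dims}, which is less")
    return x[(...,) + (None,) * dims_to_append]


x0 = torch.randn(4, 4, 64, 64)                         # batch of latents, (B, C, H, W)
per_sample_scale = torch.tensor([0.5, 1.0, 1.5, 2.0])  # one scalar per batch element, shape (B,)

scale = append_dims(per_sample_scale, x0.ndim)
print(scale.shape)          # torch.Size([4, 1, 1, 1])
print((x0 * scale).shape)   # torch.Size([4, 4, 64, 64]); each sample is scaled by its own factor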
-------------------------------------------------------------------------------- /models/anydoor/ldm/modules/image_degradation/__init__.py: -------------------------------------------------------------------------------- 1 | from ldm.modules.image_degradation.bsrgan import degradation_bsrgan_variant as degradation_fn_bsr 2 | from ldm.modules.image_degradation.bsrgan_light import degradation_bsrgan_variant as degradation_fn_bsr_light 3 | -------------------------------------------------------------------------------- /models/anydoor/ldm/modules/image_degradation/utils/test.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/models/anydoor/ldm/modules/image_degradation/utils/test.png -------------------------------------------------------------------------------- /models/anydoor/ldm/modules/midas/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/models/anydoor/ldm/modules/midas/__init__.py -------------------------------------------------------------------------------- /models/anydoor/ldm/modules/midas/midas/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/models/anydoor/ldm/modules/midas/midas/__init__.py -------------------------------------------------------------------------------- /models/anydoor/ldm/modules/midas/midas/base_model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | class BaseModel(torch.nn.Module): 5 | def load(self, path): 6 | """Load model from file. 7 | 8 | Args: 9 | path (str): file path 10 | """ 11 | parameters = torch.load(path, map_location=torch.device('cpu')) 12 | 13 | if "optimizer" in parameters: 14 | parameters = parameters["model"] 15 | 16 | self.load_state_dict(parameters) 17 | -------------------------------------------------------------------------------- /models/anydoor/ldm/modules/midas/midas/midas_net.py: -------------------------------------------------------------------------------- 1 | """MidashNet: Network for monocular depth estimation trained by mixing several datasets. 2 | This file contains code that is adapted from 3 | https://github.com/thomasjpfan/pytorch_refinenet/blob/master/pytorch_refinenet/refinenet/refinenet_4cascade.py 4 | """ 5 | import torch 6 | import torch.nn as nn 7 | 8 | from .base_model import BaseModel 9 | from .blocks import FeatureFusionBlock, Interpolate, _make_encoder 10 | 11 | 12 | class MidasNet(BaseModel): 13 | """Network for monocular depth estimation. 14 | """ 15 | 16 | def __init__(self, path=None, features=256, non_negative=True): 17 | """Init. 18 | 19 | Args: 20 | path (str, optional): Path to saved model. Defaults to None. 21 | features (int, optional): Number of features. Defaults to 256. 22 | backbone (str, optional): Backbone network for encoder. 
Defaults to resnet50 23 | """ 24 | print("Loading weights: ", path) 25 | 26 | super(MidasNet, self).__init__() 27 | 28 | use_pretrained = False if path is None else True 29 | 30 | self.pretrained, self.scratch = _make_encoder(backbone="resnext101_wsl", features=features, use_pretrained=use_pretrained) 31 | 32 | self.scratch.refinenet4 = FeatureFusionBlock(features) 33 | self.scratch.refinenet3 = FeatureFusionBlock(features) 34 | self.scratch.refinenet2 = FeatureFusionBlock(features) 35 | self.scratch.refinenet1 = FeatureFusionBlock(features) 36 | 37 | self.scratch.output_conv = nn.Sequential( 38 | nn.Conv2d(features, 128, kernel_size=3, stride=1, padding=1), 39 | Interpolate(scale_factor=2, mode="bilinear"), 40 | nn.Conv2d(128, 32, kernel_size=3, stride=1, padding=1), 41 | nn.ReLU(True), 42 | nn.Conv2d(32, 1, kernel_size=1, stride=1, padding=0), 43 | nn.ReLU(True) if non_negative else nn.Identity(), 44 | ) 45 | 46 | if path: 47 | self.load(path) 48 | 49 | def forward(self, x): 50 | """Forward pass. 51 | 52 | Args: 53 | x (tensor): input data (image) 54 | 55 | Returns: 56 | tensor: depth 57 | """ 58 | 59 | layer_1 = self.pretrained.layer1(x) 60 | layer_2 = self.pretrained.layer2(layer_1) 61 | layer_3 = self.pretrained.layer3(layer_2) 62 | layer_4 = self.pretrained.layer4(layer_3) 63 | 64 | layer_1_rn = self.scratch.layer1_rn(layer_1) 65 | layer_2_rn = self.scratch.layer2_rn(layer_2) 66 | layer_3_rn = self.scratch.layer3_rn(layer_3) 67 | layer_4_rn = self.scratch.layer4_rn(layer_4) 68 | 69 | path_4 = self.scratch.refinenet4(layer_4_rn) 70 | path_3 = self.scratch.refinenet3(path_4, layer_3_rn) 71 | path_2 = self.scratch.refinenet2(path_3, layer_2_rn) 72 | path_1 = self.scratch.refinenet1(path_2, layer_1_rn) 73 | 74 | out = self.scratch.output_conv(path_1) 75 | 76 | return torch.squeeze(out, dim=1) 77 | -------------------------------------------------------------------------------- /models/anydoor/requirements.txt: -------------------------------------------------------------------------------- 1 | albumentations==1.3.0 2 | einops==0.3.0 3 | fvcore==0.1.5.post20221221 4 | gradio==3.39.0 5 | numpy==1.23.1 6 | omegaconf==2.1.1 7 | open_clip_torch==2.17.1 8 | opencv_contrib_python==4.3.0.36 9 | opencv_python==4.7.0.72 10 | opencv_python_headless==4.7.0.72 11 | Pillow==9.4.0 12 | pytorch_lightning==1.5.0 13 | safetensors==0.2.7 14 | scipy==1.9.1 15 | setuptools==66.0.0 16 | share==1.0.4 17 | submitit==1.5.1 18 | timm==0.6.12 19 | torch==2.0.0 20 | torchmetrics==0.6.0 21 | tqdm==4.65.0 22 | transformers==4.19.2 23 | xformers==0.0.18 24 | -------------------------------------------------------------------------------- /models/anydoor/run_dataset_debug.py: -------------------------------------------------------------------------------- 1 | from datasets.ytb_vos import YoutubeVOSDataset 2 | from datasets.ytb_vis import YoutubeVISDataset 3 | from datasets.saliency_modular import SaliencyDataset 4 | from datasets.vipseg import VIPSegDataset 5 | from datasets.mvimagenet import MVImageNetDataset 6 | from datasets.sam import SAMDataset 7 | from datasets.dreambooth import DreamBoothDataset 8 | from datasets.uvo import UVODataset 9 | from datasets.uvo_val import UVOValDataset 10 | from datasets.mose import MoseDataset 11 | from datasets.vitonhd import VitonHDDataset 12 | from datasets.fashiontryon import FashionTryonDataset 13 | from datasets.lvis import LvisDataset 14 | from torch.utils.data import ConcatDataset 15 | from torch.utils.data import DataLoader 16 | import numpy as np 17 | import cv2 18 | from 
omegaconf import OmegaConf 19 | 20 | # Datasets 21 | DConf = OmegaConf.load('./configs/datasets.yaml') 22 | dataset1 = YoutubeVOSDataset(**DConf.Train.YoutubeVOS) 23 | dataset2 = SaliencyDataset(**DConf.Train.Saliency) 24 | dataset3 = VIPSegDataset(**DConf.Train.VIPSeg) 25 | dataset4 = YoutubeVISDataset(**DConf.Train.YoutubeVIS) 26 | dataset5 = MVImageNetDataset(**DConf.Train.MVImageNet) 27 | dataset6 = SAMDataset(**DConf.Train.SAM) 28 | dataset7 = UVODataset(**DConf.Train.UVO.train) 29 | dataset8 = VitonHDDataset(**DConf.Train.VitonHD) 30 | dataset9 = UVOValDataset(**DConf.Train.UVO.val) 31 | dataset10 = MoseDataset(**DConf.Train.Mose) 32 | dataset11 = FashionTryonDataset(**DConf.Train.FashionTryon) 33 | dataset12 = LvisDataset(**DConf.Train.Lvis) 34 | 35 | dataset = dataset5 36 | 37 | 38 | def vis_sample(item): 39 | ref = item['ref']* 255 40 | tar = item['jpg'] * 127.5 + 127.5 41 | hint = item['hint'] * 127.5 + 127.5 42 | step = item['time_steps'] 43 | print(ref.shape, tar.shape, hint.shape, step.shape) 44 | 45 | ref = ref[0].numpy() 46 | tar = tar[0].numpy() 47 | hint_image = hint[0, :,:,:-1].numpy() 48 | hint_mask = hint[0, :,:,-1].numpy() 49 | hint_mask = np.stack([hint_mask,hint_mask,hint_mask],-1) 50 | ref = cv2.resize(ref.astype(np.uint8), (512,512)) 51 | vis = cv2.hconcat([ref.astype(np.float32), hint_image.astype(np.float32), hint_mask.astype(np.float32), tar.astype(np.float32) ]) 52 | cv2.imwrite('sample_vis.jpg',vis[:,:,::-1]) 53 | 54 | 55 | dataloader = DataLoader(dataset, num_workers=8, batch_size=4, shuffle=True) 56 | print('len dataloader: ', len(dataloader)) 57 | for data in dataloader: 58 | vis_sample(data) 59 | 60 | 61 | -------------------------------------------------------------------------------- /models/anydoor/run_train_anydoor.py: -------------------------------------------------------------------------------- 1 | import pytorch_lightning as pl 2 | from torch.utils.data import DataLoader 3 | from datasets.ytb_vos import YoutubeVOSDataset 4 | from datasets.ytb_vis import YoutubeVISDataset 5 | from datasets.saliency_modular import SaliencyDataset 6 | from datasets.vipseg import VIPSegDataset 7 | from datasets.mvimagenet import MVImageNetDataset 8 | from datasets.sam import SAMDataset 9 | from datasets.uvo import UVODataset 10 | from datasets.uvo_val import UVOValDataset 11 | from datasets.mose import MoseDataset 12 | from datasets.vitonhd import VitonHDDataset 13 | from datasets.fashiontryon import FashionTryonDataset 14 | from datasets.lvis import LvisDataset 15 | from cldm.logger import ImageLogger 16 | from cldm.model import create_model, load_state_dict 17 | from torch.utils.data import ConcatDataset 18 | from cldm.hack import disable_verbosity, enable_sliced_attention 19 | from omegaconf import OmegaConf 20 | 21 | save_memory = False 22 | disable_verbosity() 23 | if save_memory: 24 | enable_sliced_attention() 25 | 26 | # Configs 27 | resume_path = 'path/to/weight' 28 | batch_size = 16 29 | logger_freq = 1000 30 | learning_rate = 1e-5 31 | sd_locked = False 32 | only_mid_control = False 33 | n_gpus = 2 34 | accumulate_grad_batches=1 35 | 36 | # First use cpu to load models. Pytorch Lightning will automatically move it to GPUs. 
37 | model = create_model('./configs/anydoor.yaml').cpu() 38 | model.load_state_dict(load_state_dict(resume_path, location='cpu')) 39 | model.learning_rate = learning_rate 40 | model.sd_locked = sd_locked 41 | model.only_mid_control = only_mid_control 42 | 43 | # Datasets 44 | DConf = OmegaConf.load('./configs/datasets.yaml') 45 | dataset1 = YoutubeVOSDataset(**DConf.Train.YoutubeVOS) 46 | dataset2 = SaliencyDataset(**DConf.Train.Saliency) 47 | dataset3 = VIPSegDataset(**DConf.Train.VIPSeg) 48 | dataset4 = YoutubeVISDataset(**DConf.Train.YoutubeVIS) 49 | dataset5 = MVImageNetDataset(**DConf.Train.MVImageNet) 50 | dataset6 = SAMDataset(**DConf.Train.SAM) 51 | dataset7 = UVODataset(**DConf.Train.UVO.train) 52 | dataset8 = VitonHDDataset(**DConf.Train.VitonHD) 53 | dataset9 = UVOValDataset(**DConf.Train.UVO.val) 54 | dataset10 = MoseDataset(**DConf.Train.Mose) 55 | dataset11 = FashionTryonDataset(**DConf.Train.FashionTryon) 56 | dataset12 = LvisDataset(**DConf.Train.Lvis) 57 | 58 | image_data = [dataset2, dataset6, dataset12] 59 | video_data = [dataset1, dataset3, dataset4, dataset7, dataset9, dataset10 ] 60 | tryon_data = [dataset8, dataset11] 61 | threed_data = [dataset5] 62 | 63 | # The ratio of each dataset is adjusted by setting the __len__ 64 | dataset = ConcatDataset( image_data + video_data + tryon_data + threed_data + video_data + tryon_data + threed_data ) 65 | dataloader = DataLoader(dataset, num_workers=8, batch_size=batch_size, shuffle=True) 66 | logger = ImageLogger(batch_frequency=logger_freq) 67 | trainer = pl.Trainer(gpus=n_gpus, strategy="ddp", precision=16, accelerator="gpu", callbacks=[logger], progress_bar_refresh_rate=1, accumulate_grad_batches=accumulate_grad_batches) 68 | 69 | # Train! 70 | trainer.fit(model, dataloader) 71 | -------------------------------------------------------------------------------- /models/anydoor/scripts/convert_weight.sh: -------------------------------------------------------------------------------- 1 | python tool_add_control_sd21.py path/v2-1_512-ema-pruned.ckpt path/control_sd21_ini.ckpt -------------------------------------------------------------------------------- /models/anydoor/scripts/inference.sh: -------------------------------------------------------------------------------- 1 | unset WORLD_SIZE 2 | python run_inference.py -------------------------------------------------------------------------------- /models/anydoor/scripts/train.sh: -------------------------------------------------------------------------------- 1 | unset WORLD_SIZE 2 | python run_train_anydoor.py -------------------------------------------------------------------------------- /models/anydoor/tool_add_control_sd21.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | 4 | assert len(sys.argv) == 3, 'Args are wrong.' 5 | 6 | input_path = sys.argv[1] 7 | output_path = sys.argv[2] 8 | 9 | assert os.path.exists(input_path), 'Input model does not exist.' 10 | assert not os.path.exists(output_path), 'Output filename already exists.' 11 | assert os.path.exists(os.path.dirname(output_path)), 'Output path is not valid.' 
12 | 13 | import torch 14 | from share import * 15 | from cldm.model import create_model 16 | 17 | 18 | def get_node_name(name, parent_name): 19 | if len(name) <= len(parent_name): 20 | return False, '' 21 | p = name[:len(parent_name)] 22 | if p != parent_name: 23 | return False, '' 24 | return True, name[len(parent_name):] 25 | 26 | 27 | model = create_model(config_path='./models/anydoor.yaml') 28 | 29 | pretrained_weights = torch.load(input_path) 30 | if 'state_dict' in pretrained_weights: 31 | pretrained_weights = pretrained_weights['state_dict'] 32 | 33 | scratch_dict = model.state_dict() 34 | 35 | target_dict = {} 36 | for k in scratch_dict.keys(): 37 | 38 | is_control, name = get_node_name(k, 'control_') 39 | if 'control_model.input_blocks.0.0' in k: 40 | print('skipped key: ', k) 41 | continue 42 | 43 | if is_control: 44 | copy_k = 'model.diffusion_' + name 45 | else: 46 | copy_k = k 47 | if copy_k in pretrained_weights: 48 | target_dict[k] = pretrained_weights[copy_k].clone() 49 | else: 50 | target_dict[k] = scratch_dict[k].clone() 51 | print(f'These weights are newly added: {k}') 52 | 53 | model.load_state_dict(target_dict, strict=False) 54 | torch.save(model.state_dict(), output_path) 55 | print('Done.') 56 | -------------------------------------------------------------------------------- /models/canny/__pycache__/canny_filter.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/models/canny/__pycache__/canny_filter.cpython-310.pyc -------------------------------------------------------------------------------- /models/canny/__pycache__/filter.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/models/canny/__pycache__/filter.cpython-310.pyc -------------------------------------------------------------------------------- /models/canny/__pycache__/gaussian.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/models/canny/__pycache__/gaussian.cpython-310.pyc -------------------------------------------------------------------------------- /models/canny/__pycache__/kernels.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/models/canny/__pycache__/kernels.cpython-310.pyc -------------------------------------------------------------------------------- /models/canny/__pycache__/sobel.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/models/canny/__pycache__/sobel.cpython-310.pyc -------------------------------------------------------------------------------- /models/ctrl_adapter/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/models/ctrl_adapter/__init__.py -------------------------------------------------------------------------------- /models/depth_completion_net/deformconv.py: 
-------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torch.nn import init as init 4 | from torch.nn.modules.utils import _pair, _single 5 | import math 6 | 7 | class ModulatedDeformConv2d(nn.Module): 8 | def __init__(self, 9 | in_channels, 10 | out_channels, 11 | kernel_size, 12 | stride=1, 13 | padding=0, 14 | dilation=1, 15 | groups=1, 16 | deform_groups=1, 17 | bias=True): 18 | super(ModulatedDeformConv2d, self).__init__() 19 | 20 | self.in_channels = in_channels 21 | self.out_channels = out_channels 22 | self.kernel_size = _pair(kernel_size) 23 | self.stride = stride 24 | self.padding = padding 25 | self.dilation = dilation 26 | self.groups = groups 27 | self.deform_groups = deform_groups 28 | self.with_bias = bias 29 | # enable compatibility with nn.Conv2d 30 | self.transposed = False 31 | self.output_padding = _single(0) 32 | 33 | self.weight = nn.Parameter(torch.Tensor(out_channels, in_channels // groups, *self.kernel_size)) 34 | if bias: 35 | self.bias = nn.Parameter(torch.Tensor(out_channels)) 36 | else: 37 | self.register_parameter('bias', None) 38 | self.init_weights() 39 | 40 | def init_weights(self): 41 | n = self.in_channels 42 | for k in self.kernel_size: 43 | n *= k 44 | stdv = 1. / math.sqrt(n) 45 | self.weight.data.uniform_(-stdv, stdv) 46 | if self.bias is not None: 47 | self.bias.data.zero_() 48 | 49 | if hasattr(self, 'conv_offset'): 50 | self.conv_offset.weight.data.zero_() 51 | self.conv_offset.bias.data.zero_() 52 | 53 | def forward(self, x, offset, mask): 54 | pass -------------------------------------------------------------------------------- /models/midas/backbones/__pycache__/beit.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/models/midas/backbones/__pycache__/beit.cpython-310.pyc -------------------------------------------------------------------------------- /models/midas/backbones/__pycache__/levit.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/models/midas/backbones/__pycache__/levit.cpython-310.pyc -------------------------------------------------------------------------------- /models/midas/backbones/__pycache__/swin.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/models/midas/backbones/__pycache__/swin.cpython-310.pyc -------------------------------------------------------------------------------- /models/midas/backbones/__pycache__/swin2.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/models/midas/backbones/__pycache__/swin2.cpython-310.pyc -------------------------------------------------------------------------------- /models/midas/backbones/__pycache__/swin_common.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/models/midas/backbones/__pycache__/swin_common.cpython-310.pyc 
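ModulatedDeformConv2d in deformconv.py above only defines the weights and their initialization; forward(x, offset, mask) is left as a stub for subclasses in the depth-completion network to fill in. As a rough illustration of the tensor shapes such a subclass would have to produce (not the repository's actual forward), the offsets carry 2 * deform_groups * kH * kW channels and the modulation masks deform_groups * kH * kW channels, which is the layout expected by torchvision's deform_conv2d. A sketch assuming torchvision >= 0.10 (the version that added the mask argument):

import torch
import torchvision.ops as ops

# Illustrative shapes only; this is not the repository's forward implementation.
B, C_in, C_out, H, W = 1, 8, 16, 32, 32
kH = kW = 3
deform_groups = 1

x = torch.randn(B, C_in, H, W)
weight = torch.randn(C_out, C_in, kH, kW)                            # matches ModulatedDeformConv2d.weight
offset = torch.zeros(B, 2 * deform_groups * kH * kW, H, W)           # (dy, dx) per kernel tap and location
mask = torch.sigmoid(torch.zeros(B, deform_groups * kH * kW, H, W))  # modulation weights in (0, 1)

# With zero offsets and a constant 0.5 mask this reduces to a scaled ordinary 3x3 convolution.
out = ops.deform_conv2d(x, offset, weight, padding=1, mask=mask)
print(out.shape)  # torch.Size([1, 16, 32, 32])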
-------------------------------------------------------------------------------- /models/midas/backbones/__pycache__/utils.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/models/midas/backbones/__pycache__/utils.cpython-310.pyc -------------------------------------------------------------------------------- /models/midas/backbones/__pycache__/vit.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/models/midas/backbones/__pycache__/vit.cpython-310.pyc -------------------------------------------------------------------------------- /models/midas/backbones/next_vit.py: -------------------------------------------------------------------------------- 1 | import timm 2 | 3 | import torch.nn as nn 4 | 5 | from pathlib import Path 6 | from .utils import activations, forward_default, get_activation 7 | 8 | from ..external.next_vit.classification.nextvit import * 9 | 10 | 11 | def forward_next_vit(pretrained, x): 12 | return forward_default(pretrained, x, "forward") 13 | 14 | 15 | def _make_next_vit_backbone( 16 | model, 17 | hooks=[2, 6, 36, 39], 18 | ): 19 | pretrained = nn.Module() 20 | 21 | pretrained.model = model 22 | pretrained.model.features[hooks[0]].register_forward_hook(get_activation("1")) 23 | pretrained.model.features[hooks[1]].register_forward_hook(get_activation("2")) 24 | pretrained.model.features[hooks[2]].register_forward_hook(get_activation("3")) 25 | pretrained.model.features[hooks[3]].register_forward_hook(get_activation("4")) 26 | 27 | pretrained.activations = activations 28 | 29 | return pretrained 30 | 31 | 32 | def _make_pretrained_next_vit_large_6m(hooks=None): 33 | model = timm.create_model("nextvit_large") 34 | 35 | hooks = [2, 6, 36, 39] if hooks == None else hooks 36 | return _make_next_vit_backbone( 37 | model, 38 | hooks=hooks, 39 | ) 40 | -------------------------------------------------------------------------------- /models/midas/backbones/swin.py: -------------------------------------------------------------------------------- 1 | import timm 2 | 3 | from .swin_common import _make_swin_backbone 4 | 5 | 6 | def _make_pretrained_swinl12_384(pretrained, hooks=None): 7 | model = timm.create_model("swin_large_patch4_window12_384", pretrained=pretrained) 8 | 9 | hooks = [1, 1, 17, 1] if hooks == None else hooks 10 | return _make_swin_backbone( 11 | model, 12 | hooks=hooks 13 | ) 14 | -------------------------------------------------------------------------------- /models/midas/backbones/swin2.py: -------------------------------------------------------------------------------- 1 | import timm 2 | 3 | from .swin_common import _make_swin_backbone 4 | 5 | 6 | def _make_pretrained_swin2l24_384(pretrained, hooks=None): 7 | model = timm.create_model("swinv2_large_window12to24_192to384_22kft1k", pretrained=pretrained) 8 | 9 | hooks = [1, 1, 17, 1] if hooks == None else hooks 10 | return _make_swin_backbone( 11 | model, 12 | hooks=hooks 13 | ) 14 | 15 | 16 | def _make_pretrained_swin2b24_384(pretrained, hooks=None): 17 | model = timm.create_model("swinv2_base_window12to24_192to384_22kft1k", pretrained=pretrained) 18 | 19 | hooks = [1, 1, 17, 1] if hooks == None else hooks 20 | return _make_swin_backbone( 21 | model, 22 | hooks=hooks 23 | ) 24 | 25 | 26 | def 
_make_pretrained_swin2t16_256(pretrained, hooks=None): 27 | model = timm.create_model("swinv2_tiny_window16_256", pretrained=pretrained) 28 | 29 | hooks = [1, 1, 5, 1] if hooks == None else hooks 30 | return _make_swin_backbone( 31 | model, 32 | hooks=hooks, 33 | patch_grid=[64, 64] 34 | ) 35 | -------------------------------------------------------------------------------- /models/midas/backbones/swin_common.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | import torch.nn as nn 4 | import numpy as np 5 | 6 | from .utils import activations, forward_default, get_activation, Transpose 7 | 8 | 9 | def forward_swin(pretrained, x): 10 | return forward_default(pretrained, x) 11 | 12 | 13 | def _make_swin_backbone( 14 | model, 15 | hooks=[1, 1, 17, 1], 16 | patch_grid=[96, 96] 17 | ): 18 | pretrained = nn.Module() 19 | 20 | pretrained.model = model 21 | pretrained.model.layers[0].blocks[hooks[0]].register_forward_hook(get_activation("1")) 22 | pretrained.model.layers[1].blocks[hooks[1]].register_forward_hook(get_activation("2")) 23 | pretrained.model.layers[2].blocks[hooks[2]].register_forward_hook(get_activation("3")) 24 | pretrained.model.layers[3].blocks[hooks[3]].register_forward_hook(get_activation("4")) 25 | 26 | pretrained.activations = activations 27 | 28 | if hasattr(model, "patch_grid"): 29 | used_patch_grid = model.patch_grid 30 | else: 31 | used_patch_grid = patch_grid 32 | 33 | patch_grid_size = np.array(used_patch_grid, dtype=int) 34 | 35 | pretrained.act_postprocess1 = nn.Sequential( 36 | Transpose(1, 2), 37 | nn.Unflatten(2, torch.Size(patch_grid_size.tolist())) 38 | ) 39 | pretrained.act_postprocess2 = nn.Sequential( 40 | Transpose(1, 2), 41 | nn.Unflatten(2, torch.Size((patch_grid_size // 2).tolist())) 42 | ) 43 | pretrained.act_postprocess3 = nn.Sequential( 44 | Transpose(1, 2), 45 | nn.Unflatten(2, torch.Size((patch_grid_size // 4).tolist())) 46 | ) 47 | pretrained.act_postprocess4 = nn.Sequential( 48 | Transpose(1, 2), 49 | nn.Unflatten(2, torch.Size((patch_grid_size // 8).tolist())) 50 | ) 51 | 52 | return pretrained 53 | -------------------------------------------------------------------------------- /models/midas/base_model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | class BaseModel(torch.nn.Module): 5 | def load(self, path): 6 | """Load model from file. 7 | 8 | Args: 9 | path (str): file path 10 | """ 11 | parameters = torch.load(path, map_location=torch.device('cpu')) 12 | 13 | if "optimizer" in parameters: 14 | parameters = parameters["model"] 15 | 16 | self.load_state_dict(parameters) 17 | -------------------------------------------------------------------------------- /models/midas/midas_net.py: -------------------------------------------------------------------------------- 1 | """MidashNet: Network for monocular depth estimation trained by mixing several datasets. 2 | This file contains code that is adapted from 3 | https://github.com/thomasjpfan/pytorch_refinenet/blob/master/pytorch_refinenet/refinenet/refinenet_4cascade.py 4 | """ 5 | import torch 6 | import torch.nn as nn 7 | 8 | from .base_model import BaseModel 9 | from .blocks import FeatureFusionBlock, Interpolate, _make_encoder 10 | 11 | 12 | class MidasNet(BaseModel): 13 | """Network for monocular depth estimation. 14 | """ 15 | 16 | def __init__(self, path=None, features=256, non_negative=True): 17 | """Init. 18 | 19 | Args: 20 | path (str, optional): Path to saved model. 
Defaults to None. 21 | features (int, optional): Number of features. Defaults to 256. 22 | backbone (str, optional): Backbone network for encoder. Defaults to resnet50 23 | """ 24 | print("Loading weights: ", path) 25 | 26 | super(MidasNet, self).__init__() 27 | 28 | use_pretrained = False if path is None else True 29 | 30 | self.pretrained, self.scratch = _make_encoder(backbone="resnext101_wsl", features=features, use_pretrained=use_pretrained) 31 | 32 | self.scratch.refinenet4 = FeatureFusionBlock(features) 33 | self.scratch.refinenet3 = FeatureFusionBlock(features) 34 | self.scratch.refinenet2 = FeatureFusionBlock(features) 35 | self.scratch.refinenet1 = FeatureFusionBlock(features) 36 | 37 | self.scratch.output_conv = nn.Sequential( 38 | nn.Conv2d(features, 128, kernel_size=3, stride=1, padding=1), 39 | Interpolate(scale_factor=2, mode="bilinear"), 40 | nn.Conv2d(128, 32, kernel_size=3, stride=1, padding=1), 41 | nn.ReLU(True), 42 | nn.Conv2d(32, 1, kernel_size=1, stride=1, padding=0), 43 | nn.ReLU(True) if non_negative else nn.Identity(), 44 | ) 45 | 46 | if path: 47 | self.load(path) 48 | 49 | def forward(self, x): 50 | """Forward pass. 51 | 52 | Args: 53 | x (tensor): input data (image) 54 | 55 | Returns: 56 | tensor: depth 57 | """ 58 | 59 | layer_1 = self.pretrained.layer1(x) 60 | layer_2 = self.pretrained.layer2(layer_1) 61 | layer_3 = self.pretrained.layer3(layer_2) 62 | layer_4 = self.pretrained.layer4(layer_3) 63 | 64 | layer_1_rn = self.scratch.layer1_rn(layer_1) 65 | layer_2_rn = self.scratch.layer2_rn(layer_2) 66 | layer_3_rn = self.scratch.layer3_rn(layer_3) 67 | layer_4_rn = self.scratch.layer4_rn(layer_4) 68 | 69 | path_4 = self.scratch.refinenet4(layer_4_rn) 70 | path_3 = self.scratch.refinenet3(path_4, layer_3_rn) 71 | path_2 = self.scratch.refinenet2(path_3, layer_2_rn) 72 | path_1 = self.scratch.refinenet1(path_2, layer_1_rn) 73 | 74 | out = self.scratch.output_conv(path_1) 75 | 76 | return torch.squeeze(out, dim=1) 77 | -------------------------------------------------------------------------------- /models/raft/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/models/raft/__init__.py -------------------------------------------------------------------------------- /models/raft/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .flow_viz import flow_to_image 2 | from .frame_utils import writeFlow 3 | -------------------------------------------------------------------------------- /models/raft/utils/__pycache__/__init__.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/models/raft/utils/__pycache__/__init__.cpython-310.pyc -------------------------------------------------------------------------------- /models/raft/utils/__pycache__/flow_viz.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/models/raft/utils/__pycache__/flow_viz.cpython-310.pyc -------------------------------------------------------------------------------- /models/raft/utils/__pycache__/frame_utils.cpython-310.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/models/raft/utils/__pycache__/frame_utils.cpython-310.pyc -------------------------------------------------------------------------------- /models/raft/utils/__pycache__/utils.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/models/raft/utils/__pycache__/utils.cpython-310.pyc -------------------------------------------------------------------------------- /models/raft/utils/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | import numpy as np 4 | from scipy import interpolate 5 | 6 | 7 | class InputPadder: 8 | """ Pads images such that dimensions are divisible by 8 """ 9 | def __init__(self, dims, mode='sintel'): 10 | self.ht, self.wd = dims[-2:] 11 | pad_ht = (((self.ht // 8) + 1) * 8 - self.ht) % 8 12 | pad_wd = (((self.wd // 8) + 1) * 8 - self.wd) % 8 13 | if mode == 'sintel': 14 | self._pad = [pad_wd//2, pad_wd - pad_wd//2, pad_ht//2, pad_ht - pad_ht//2] 15 | else: 16 | self._pad = [pad_wd//2, pad_wd - pad_wd//2, 0, pad_ht] 17 | 18 | def pad(self, *inputs): 19 | return [F.pad(x, self._pad, mode='replicate') for x in inputs] 20 | 21 | def unpad(self,x): 22 | ht, wd = x.shape[-2:] 23 | c = [self._pad[2], ht-self._pad[3], self._pad[0], wd-self._pad[1]] 24 | return x[..., c[0]:c[1], c[2]:c[3]] 25 | 26 | def forward_interpolate(flow): 27 | flow = flow.detach().cpu().numpy() 28 | dx, dy = flow[0], flow[1] 29 | 30 | ht, wd = dx.shape 31 | x0, y0 = np.meshgrid(np.arange(wd), np.arange(ht)) 32 | 33 | x1 = x0 + dx 34 | y1 = y0 + dy 35 | 36 | x1 = x1.reshape(-1) 37 | y1 = y1.reshape(-1) 38 | dx = dx.reshape(-1) 39 | dy = dy.reshape(-1) 40 | 41 | valid = (x1 > 0) & (x1 < wd) & (y1 > 0) & (y1 < ht) 42 | x1 = x1[valid] 43 | y1 = y1[valid] 44 | dx = dx[valid] 45 | dy = dy[valid] 46 | 47 | flow_x = interpolate.griddata( 48 | (x1, y1), dx, (x0, y0), method='nearest', fill_value=0) 49 | 50 | flow_y = interpolate.griddata( 51 | (x1, y1), dy, (x0, y0), method='nearest', fill_value=0) 52 | 53 | flow = np.stack([flow_x, flow_y], axis=0) 54 | return torch.from_numpy(flow).float() 55 | 56 | 57 | def bilinear_sampler(img, coords, mode='bilinear', mask=False): 58 | """ Wrapper for grid_sample, uses pixel coordinates """ 59 | H, W = img.shape[-2:] 60 | xgrid, ygrid = coords.split([1,1], dim=-1) 61 | xgrid = 2*xgrid/(W-1) - 1 62 | ygrid = 2*ygrid/(H-1) - 1 63 | 64 | grid = torch.cat([xgrid, ygrid], dim=-1) 65 | img = F.grid_sample(img, grid, align_corners=True) 66 | 67 | if mask: 68 | mask = (xgrid > -1) & (ygrid > -1) & (xgrid < 1) & (ygrid < 1) 69 | return img, mask.float() 70 | 71 | return img 72 | 73 | 74 | def coords_grid(batch, ht, wd): 75 | coords = torch.meshgrid(torch.arange(ht), torch.arange(wd)) 76 | coords = torch.stack(coords[::-1], dim=0).float() 77 | return coords[None].repeat(batch, 1, 1, 1) 78 | 79 | 80 | def upflow8(flow, mode='bilinear'): 81 | new_size = (8 * flow.shape[2], 8 * flow.shape[3]) 82 | return 8 * F.interpolate(flow, size=new_size, mode=mode, align_corners=True) 83 | -------------------------------------------------------------------------------- /models/u2net/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/models/u2net/__init__.py -------------------------------------------------------------------------------- /runners/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/runners/__init__.py -------------------------------------------------------------------------------- /runners/instructpix2pix_inference_runner.py: -------------------------------------------------------------------------------- 1 | import os 2 | from PIL import Image 3 | import torch 4 | from diffusers import StableDiffusionInstructPix2PixPipeline, EulerAncestralDiscreteScheduler 5 | 6 | 7 | def instructpix2pix_inference_runner(args): 8 | device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu" 9 | 10 | # Weight dtype 11 | weight_dtype = torch.float32 12 | if args.mixed_precision == "fp16": 13 | weight_dtype = torch.float16 14 | elif args.mixed_precision == "bf16": 15 | weight_dtype = torch.bfloat16 16 | 17 | # Define pipeline 18 | pipeline = StableDiffusionInstructPix2PixPipeline.from_pretrained( 19 | args.instructpix2pix_checkpoint_path, 20 | torch_dtype=weight_dtype, 21 | safety_checker=None) 22 | pipeline.to(device) 23 | pipeline.scheduler = EulerAncestralDiscreteScheduler.from_config(pipeline.scheduler.config) 24 | 25 | # Load the first-frame input 26 | input_list = sorted(os.listdir(args.source_video_frames)) 27 | image = Image.open(os.path.join(args.source_video_frames, input_list[0])) 28 | image = image.resize((args.height, args.width)) 29 | 30 | # Forward 31 | images = pipeline(args.external_guidance, 32 | image=image, 33 | seed=args.seed, 34 | guidance_scale=args.guidance_scale, 35 | negative_prompt=args.negative_prompt, 36 | num_inference_steps=args.num_inference_steps, 37 | image_guidance_scale=args.image_guidance_scale).images[0] 38 | 39 | # Save image 40 | save_path = os.path.join(args.outdir, 'image_editing_results') 41 | os.makedirs(save_path, exist_ok=True) 42 | filename = args.prompt.lower().replace('.', '').replace(' ', '_') 43 | images.save(os.path.join(save_path, f'{filename}.png')) -------------------------------------------------------------------------------- /runners/iterative_warping/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/runners/iterative_warping/__init__.py -------------------------------------------------------------------------------- /runners/iterative_warping/__pycache__/__init__.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/runners/iterative_warping/__pycache__/__init__.cpython-310.pyc -------------------------------------------------------------------------------- /runners/iterative_warping/__pycache__/get_averaged_depths.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/runners/iterative_warping/__pycache__/get_averaged_depths.cpython-310.pyc -------------------------------------------------------------------------------- 
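For reference, `instructpix2pix_inference_runner` above is driven entirely by a flat argparse-style namespace. The sketch below is not part of the repository; it only lists the attributes that runner actually reads, with placeholder values (checkpoint location, frame directory, prompts, sampler settings) that you would substitute for your own setup.

```python
from types import SimpleNamespace

from runners.instructpix2pix_inference_runner import instructpix2pix_inference_runner

# Minimal sketch: every field below is an attribute the runner accesses.
# All concrete values are illustrative placeholders, not project defaults.
args = SimpleNamespace(
    instructpix2pix_checkpoint_path="path/or/hub-id/of/instruct-pix2pix",  # assumed checkpoint location
    mixed_precision="fp16",                      # "fp32" | "fp16" | "bf16"
    source_video_frames="path/to/video_frames",  # the first (sorted) frame is the one edited
    height=512,
    width=512,
    external_guidance="add a hat to the person",  # editing instruction passed to the pipeline
    prompt="add a hat to the person",             # only used to build the output filename
    negative_prompt="",
    seed=42,
    guidance_scale=7.5,
    image_guidance_scale=1.5,
    num_inference_steps=20,
    outdir="outputs",
)

instructpix2pix_inference_runner(args)  # writes outputs/image_editing_results/<prompt>.png
```

In the full pipeline these fields would normally come from the project's own argument parser; the namespace here only documents what this particular runner consumes.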
/runners/iterative_warping/__pycache__/run_flow_extraction.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/runners/iterative_warping/__pycache__/run_flow_extraction.cpython-310.pyc -------------------------------------------------------------------------------- /runners/iterative_warping/__pycache__/run_torch_average_flow_warping.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/runners/iterative_warping/__pycache__/run_torch_average_flow_warping.cpython-310.pyc -------------------------------------------------------------------------------- /runners/iterative_warping/__pycache__/run_warp_with_averaged_flow.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/runners/iterative_warping/__pycache__/run_warp_with_averaged_flow.cpython-310.pyc -------------------------------------------------------------------------------- /runners/iterative_warping/__pycache__/warp_utils.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/runners/iterative_warping/__pycache__/warp_utils.cpython-310.pyc -------------------------------------------------------------------------------- /runners/iterative_warping/get_averaged_depths.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import cv2 3 | import torch 4 | import os 5 | import numpy as np 6 | from tqdm import tqdm 7 | 8 | def load_image(path): 9 | img = cv2.imread(path, cv2.IMREAD_GRAYSCALE) 10 | return torch.from_numpy(img.astype(np.float32) / 255.).unsqueeze(0).unsqueeze(0) 11 | 12 | def get_averaged_depths_main_func(args): 13 | depth_dir = os.path.join(args.outdir, 'iterative_warping', 'depth_maps') 14 | output_dir = os.path.join(args.outdir, 'iterative_warping', 'averaged_depths') 15 | object_mask_dir = os.path.join(args.outdir, 'iterative_warping', 'object_masks') 16 | editing_mask_dir = os.path.join(args.outdir, 'iterative_warping', 'warped_masks') 17 | os.makedirs(output_dir, exist_ok=True) 18 | 19 | # Get sorted lists of all files in each input directory 20 | depth_files = sorted([f for f in os.listdir(depth_dir) if f.endswith('.png') or f.endswith('.jpg')])[:args.n_sample_frames] 21 | object_mask_files = sorted([f for f in os.listdir(object_mask_dir) if f.endswith('.png') or f.endswith('.jpg')])[:args.n_sample_frames] 22 | editing_mask_files = sorted([f for f in os.listdir(editing_mask_dir) if f.endswith('.png') or f.endswith('.jpg')])[:args.n_sample_frames] 23 | 24 | for i, (depth_file, object_mask_file, editing_mask_file) in enumerate(tqdm(zip(depth_files, object_mask_files, editing_mask_files), total=len(depth_files))): 25 | # 1. Load depth map 26 | depth_map = load_image(os.path.join(depth_dir, depth_file)) 27 | _, _, H, W = depth_map.shape 28 | 29 | # 2. 
Load object mask 30 | object_mask = load_image(os.path.join(object_mask_dir, object_mask_file)) 31 | object_mask = cv2.resize(object_mask.squeeze().numpy(), (W, H)) 32 | object_mask = torch.from_numpy(object_mask).unsqueeze(0).unsqueeze(0) 33 | object_mask[object_mask > 0.5] = 1 34 | object_mask[object_mask <= 0.5] = 0 35 | 36 | # 3. Load editing mask 37 | editing_mask = load_image(os.path.join(editing_mask_dir, editing_mask_file)) 38 | editing_mask = cv2.resize(editing_mask.squeeze().numpy(), (W, H)) 39 | editing_mask = torch.from_numpy(editing_mask).unsqueeze(0).unsqueeze(0) 40 | editing_mask[editing_mask > 0.5] = 1 41 | editing_mask[editing_mask <= 0.5] = 0 42 | 43 | # 4. Compute average depth within the object mask 44 | object_masked_depth = object_mask * depth_map 45 | total_depth = object_masked_depth.sum() 46 | num_pixels = object_mask.sum() 47 | average_depth = total_depth / num_pixels 48 | 49 | # 5. Apply average depth to depths within the editing mask 50 | averaged_depth_map = torch.where(editing_mask == 1, average_depth, depth_map) 51 | 52 | # 6. Save the result 53 | output_depth = (averaged_depth_map.squeeze().cpu().numpy() * 255).astype(np.uint8) 54 | output_filename = f'{i:05d}.png' 55 | cv2.imwrite(os.path.join(output_dir, output_filename), output_depth) 56 | -------------------------------------------------------------------------------- /runners/iterative_warping/get_editing_region.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import cv2 3 | import os 4 | 5 | def get_editing_region(src_mask, approximate_mask): 6 | src_mask = src_mask / 255. 7 | approximate_mask = approximate_mask / 255. 8 | editing_region = src_mask * (1 - approximate_mask) 9 | return (editing_region * 255.).astype('uint8') 10 | 11 | def main(args): 12 | # Create output directory if it doesn't exist 13 | os.makedirs(args.output_dir, exist_ok=True) 14 | 15 | # Get list of all files in the source mask directory 16 | src_mask_files = sorted([f for f in os.listdir(args.src_mask_dir) if f.endswith('.png')]) 17 | approx_mask_files = sorted([f for f in os.listdir(args.approx_mask_dir) if f.endswith('.png')]) 18 | 19 | for count, (src_filename, approx_filename) in enumerate(zip(src_mask_files, approx_mask_files)): 20 | # Load source mask 21 | src_mask_path = os.path.join(args.src_mask_dir, src_filename) 22 | src_mask = cv2.imread(src_mask_path) 23 | 24 | # Load approximate mask 25 | approx_mask_path = os.path.join(args.approx_mask_dir, approx_filename) 26 | approximate_mask = cv2.imread(approx_mask_path) 27 | 28 | # Resize masks to the same size 29 | if src_mask.shape != approximate_mask.shape: 30 | height, width = src_mask.shape[:2] 31 | approximate_mask = cv2.resize(approximate_mask, (width, height), interpolation=cv2.INTER_NEAREST) 32 | 33 | # Get editing region 34 | editing_region = get_editing_region(src_mask, approximate_mask) 35 | 36 | # Save editing region mask 37 | output_path = os.path.join(args.output_dir, f'{count:05d}.png') 38 | cv2.imwrite(output_path, editing_region) 39 | 40 | print(f"Progress: {count + 1}") 41 | 42 | if __name__ == "__main__": 43 | parser = argparse.ArgumentParser(description="Generate editing region masks for video frames") 44 | parser.add_argument("--src-mask-dir", type=str, required=True, help="Directory containing source mask frames") 45 | parser.add_argument("--approx-mask-dir", type=str, required=True, help="Directory containing approximate mask frames") 46 | parser.add_argument("--output-dir", type=str, 
default="editing_regions", help="Output directory for editing region masks") 47 | 48 | args = parser.parse_args() 49 | main(args) -------------------------------------------------------------------------------- /runners/iterative_warping/raft/LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2020, princeton-vl 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | * Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | * Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | -------------------------------------------------------------------------------- /runners/iterative_warping/raft/README.md: -------------------------------------------------------------------------------- 1 | # RAFT 2 | This repository contains the source code for our paper: 3 | 4 | [RAFT: Recurrent All Pairs Field Transforms for Optical Flow](https://arxiv.org/pdf/2003.12039.pdf)
5 | ECCV 2020
6 | Zachary Teed and Jia Deng
7 | 8 | 9 | 10 | ## Requirements 11 | The code has been tested with PyTorch 1.6 and Cuda 10.1. 12 | ```Shell 13 | conda create --name raft 14 | conda activate raft 15 | conda install pytorch=1.6.0 torchvision=0.7.0 cudatoolkit=10.1 matplotlib tensorboard scipy opencv -c pytorch 16 | ``` 17 | 18 | ## Demos 19 | Pretrained models can be downloaded by running 20 | ```Shell 21 | ./download_models.sh 22 | ``` 23 | or downloaded from [google drive](https://drive.google.com/drive/folders/1sWDsfuZ3Up38EUQt7-JDTT1HcGHuJgvT?usp=sharing) 24 | 25 | You can demo a trained model on a sequence of frames 26 | ```Shell 27 | python demo.py --model=models/raft-things.pth --path=demo-frames 28 | ``` 29 | 30 | ## Required Data 31 | To evaluate/train RAFT, you will need to download the required datasets. 32 | * [FlyingChairs](https://lmb.informatik.uni-freiburg.de/resources/datasets/FlyingChairs.en.html#flyingchairs) 33 | * [FlyingThings3D](https://lmb.informatik.uni-freiburg.de/resources/datasets/SceneFlowDatasets.en.html) 34 | * [Sintel](http://sintel.is.tue.mpg.de/) 35 | * [KITTI](http://www.cvlibs.net/datasets/kitti/eval_scene_flow.php?benchmark=flow) 36 | * [HD1K](http://hci-benchmark.iwr.uni-heidelberg.de/) (optional) 37 | 38 | 39 | By default `datasets.py` will search for the datasets in these locations. You can create symbolic links to wherever the datasets were downloaded in the `datasets` folder 40 | 41 | ```Shell 42 | ├── datasets 43 | ├── Sintel 44 | ├── test 45 | ├── training 46 | ├── KITTI 47 | ├── testing 48 | ├── training 49 | ├── devkit 50 | ├── FlyingChairs_release 51 | ├── data 52 | ├── FlyingThings3D 53 | ├── frames_cleanpass 54 | ├── frames_finalpass 55 | ├── optical_flow 56 | ``` 57 | 58 | ## Evaluation 59 | You can evaluate a trained model using `evaluate.py` 60 | ```Shell 61 | python evaluate.py --model=models/raft-things.pth --dataset=sintel --mixed_precision 62 | ``` 63 | 64 | ## Training 65 | We used the following training schedule in our paper (2 GPUs). Training logs will be written to the `runs` which can be visualized using tensorboard 66 | ```Shell 67 | ./train_standard.sh 68 | ``` 69 | 70 | If you have a RTX GPU, training can be accelerated using mixed precision. You can expect similiar results in this setting (1 GPU) 71 | ```Shell 72 | ./train_mixed.sh 73 | ``` 74 | 75 | ## (Optional) Efficent Implementation 76 | You can optionally use our alternate (efficent) implementation by compiling the provided cuda extension 77 | ```Shell 78 | cd alt_cuda_corr && python setup.py install && cd .. 79 | ``` 80 | and running `demo.py` and `evaluate.py` with the `--alternate_corr` flag Note, this implementation is somewhat slower than all-pairs, but uses significantly less GPU memory during the forward pass. 
81 | -------------------------------------------------------------------------------- /runners/iterative_warping/raft/alt_cuda_corr/correlation.cpp: -------------------------------------------------------------------------------- 1 | #include <torch/extension.h> 2 | #include <vector> 3 | 4 | // CUDA forward declarations 5 | std::vector<torch::Tensor> corr_cuda_forward( 6 | torch::Tensor fmap1, 7 | torch::Tensor fmap2, 8 | torch::Tensor coords, 9 | int radius); 10 | 11 | std::vector<torch::Tensor> corr_cuda_backward( 12 | torch::Tensor fmap1, 13 | torch::Tensor fmap2, 14 | torch::Tensor coords, 15 | torch::Tensor corr_grad, 16 | int radius); 17 | 18 | // C++ interface 19 | #define CHECK_CUDA(x) TORCH_CHECK(x.type().is_cuda(), #x " must be a CUDA tensor") 20 | #define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous") 21 | #define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x) 22 | 23 | std::vector<torch::Tensor> corr_forward( 24 | torch::Tensor fmap1, 25 | torch::Tensor fmap2, 26 | torch::Tensor coords, 27 | int radius) { 28 | CHECK_INPUT(fmap1); 29 | CHECK_INPUT(fmap2); 30 | CHECK_INPUT(coords); 31 | 32 | return corr_cuda_forward(fmap1, fmap2, coords, radius); 33 | } 34 | 35 | 36 | std::vector<torch::Tensor> corr_backward( 37 | torch::Tensor fmap1, 38 | torch::Tensor fmap2, 39 | torch::Tensor coords, 40 | torch::Tensor corr_grad, 41 | int radius) { 42 | CHECK_INPUT(fmap1); 43 | CHECK_INPUT(fmap2); 44 | CHECK_INPUT(coords); 45 | CHECK_INPUT(corr_grad); 46 | 47 | return corr_cuda_backward(fmap1, fmap2, coords, corr_grad, radius); 48 | } 49 | 50 | 51 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 52 | m.def("forward", &corr_forward, "CORR forward"); 53 | m.def("backward", &corr_backward, "CORR backward"); 54 | } -------------------------------------------------------------------------------- /runners/iterative_warping/raft/alt_cuda_corr/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | from torch.utils.cpp_extension import BuildExtension, CUDAExtension 3 | 4 | 5 | setup( 6 | name='correlation', 7 | ext_modules=[ 8 | CUDAExtension('alt_cuda_corr', 9 | sources=['correlation.cpp', 'correlation_kernel.cu'], 10 | extra_compile_args={'cxx': [], 'nvcc': ['-O3']}), 11 | ], 12 | cmdclass={ 13 | 'build_ext': BuildExtension 14 | }) 15 | 16 | -------------------------------------------------------------------------------- /runners/iterative_warping/raft/core/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/runners/iterative_warping/raft/core/__init__.py -------------------------------------------------------------------------------- /runners/iterative_warping/raft/core/__pycache__/__init__.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/runners/iterative_warping/raft/core/__pycache__/__init__.cpython-310.pyc -------------------------------------------------------------------------------- /runners/iterative_warping/raft/core/__pycache__/corr.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/runners/iterative_warping/raft/core/__pycache__/corr.cpython-310.pyc --------------------------------------------------------------------------------
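Once the extension above has been built with its `setup.py`, it is importable under the name given to `CUDAExtension('alt_cuda_corr', ...)`, and the two entry points bound in `PYBIND11_MODULE` become `alt_cuda_corr.forward` / `alt_cuda_corr.backward`. The sketch below only illustrates that calling convention; the channels-last shapes are an assumption about how RAFT's alternate correlation block lays out its feature maps and lookup coordinates, not something specified by the files above.

```python
import torch
import alt_cuda_corr  # module name comes from CUDAExtension('alt_cuda_corr', ...) in setup.py

# Assumed, illustrative layout: contiguous CUDA tensors, feature maps in
# (B, H, W, C) order and one (x, y) lookup coordinate per pixel.
B, H, W, C = 1, 46, 62, 256
fmap1 = torch.randn(B, H, W, C, device="cuda").contiguous()
fmap2 = torch.randn(B, H, W, C, device="cuda").contiguous()
coords = torch.zeros(B, H, W, 2, device="cuda").contiguous()
radius = 4

# The binding returns a std::vector<torch::Tensor>, which pybind11 exposes as a Python list.
outputs = alt_cuda_corr.forward(fmap1, fmap2, coords, radius)
print([o.shape for o in outputs])
```

The `CHECK_INPUT` macros in `correlation.cpp` are why the inputs must be CUDA tensors and contiguous; anything else fails the `TORCH_CHECK` assertions instead of silently mis-reading memory.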
/runners/iterative_warping/raft/core/__pycache__/extractor.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/runners/iterative_warping/raft/core/__pycache__/extractor.cpython-310.pyc -------------------------------------------------------------------------------- /runners/iterative_warping/raft/core/__pycache__/raft.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/runners/iterative_warping/raft/core/__pycache__/raft.cpython-310.pyc -------------------------------------------------------------------------------- /runners/iterative_warping/raft/core/__pycache__/update.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/runners/iterative_warping/raft/core/__pycache__/update.cpython-310.pyc -------------------------------------------------------------------------------- /runners/iterative_warping/raft/core/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/runners/iterative_warping/raft/core/utils/__init__.py -------------------------------------------------------------------------------- /runners/iterative_warping/raft/core/utils/__pycache__/__init__.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/runners/iterative_warping/raft/core/utils/__pycache__/__init__.cpython-310.pyc -------------------------------------------------------------------------------- /runners/iterative_warping/raft/core/utils/__pycache__/flow_viz.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/runners/iterative_warping/raft/core/utils/__pycache__/flow_viz.cpython-310.pyc -------------------------------------------------------------------------------- /runners/iterative_warping/raft/core/utils/__pycache__/utils.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/runners/iterative_warping/raft/core/utils/__pycache__/utils.cpython-310.pyc -------------------------------------------------------------------------------- /runners/iterative_warping/raft/core/utils/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | import numpy as np 4 | from scipy import interpolate 5 | 6 | 7 | class InputPadder: 8 | """ Pads images such that dimensions are divisible by 8 """ 9 | def __init__(self, dims, mode='sintel'): 10 | self.ht, self.wd = dims[-2:] 11 | pad_ht = (((self.ht // 8) + 1) * 8 - self.ht) % 8 12 | pad_wd = (((self.wd // 8) + 1) * 8 - self.wd) % 8 13 | if mode == 'sintel': 14 | self._pad = [pad_wd//2, pad_wd - pad_wd//2, pad_ht//2, pad_ht - pad_ht//2] 15 | else: 16 | self._pad = [pad_wd//2, pad_wd - pad_wd//2, 0, pad_ht] 17 | 
18 | def pad(self, *inputs): 19 | return [F.pad(x, self._pad, mode='replicate') for x in inputs] 20 | 21 | def unpad(self,x): 22 | ht, wd = x.shape[-2:] 23 | c = [self._pad[2], ht-self._pad[3], self._pad[0], wd-self._pad[1]] 24 | return x[..., c[0]:c[1], c[2]:c[3]] 25 | 26 | def forward_interpolate(flow): 27 | flow = flow.detach().cpu().numpy() 28 | dx, dy = flow[0], flow[1] 29 | 30 | ht, wd = dx.shape 31 | x0, y0 = np.meshgrid(np.arange(wd), np.arange(ht)) 32 | 33 | x1 = x0 + dx 34 | y1 = y0 + dy 35 | 36 | x1 = x1.reshape(-1) 37 | y1 = y1.reshape(-1) 38 | dx = dx.reshape(-1) 39 | dy = dy.reshape(-1) 40 | 41 | valid = (x1 > 0) & (x1 < wd) & (y1 > 0) & (y1 < ht) 42 | x1 = x1[valid] 43 | y1 = y1[valid] 44 | dx = dx[valid] 45 | dy = dy[valid] 46 | 47 | flow_x = interpolate.griddata( 48 | (x1, y1), dx, (x0, y0), method='nearest', fill_value=0) 49 | 50 | flow_y = interpolate.griddata( 51 | (x1, y1), dy, (x0, y0), method='nearest', fill_value=0) 52 | 53 | flow = np.stack([flow_x, flow_y], axis=0) 54 | return torch.from_numpy(flow).float() 55 | 56 | 57 | def bilinear_sampler(img, coords, mode='bilinear', mask=False): 58 | """ Wrapper for grid_sample, uses pixel coordinates """ 59 | H, W = img.shape[-2:] 60 | xgrid, ygrid = coords.split([1,1], dim=-1) 61 | xgrid = 2*xgrid/(W-1) - 1 62 | ygrid = 2*ygrid/(H-1) - 1 63 | 64 | grid = torch.cat([xgrid, ygrid], dim=-1) 65 | img = F.grid_sample(img, grid, align_corners=True) 66 | 67 | if mask: 68 | mask = (xgrid > -1) & (ygrid > -1) & (xgrid < 1) & (ygrid < 1) 69 | return img, mask.float() 70 | 71 | return img 72 | 73 | 74 | def coords_grid(batch, ht, wd, device): 75 | coords = torch.meshgrid(torch.arange(ht, device=device), torch.arange(wd, device=device)) 76 | coords = torch.stack(coords[::-1], dim=0).float() 77 | return coords[None].repeat(batch, 1, 1, 1) 78 | 79 | 80 | def upflow8(flow, mode='bilinear'): 81 | new_size = (8 * flow.shape[2], 8 * flow.shape[3]) 82 | return 8 * F.interpolate(flow, size=new_size, mode=mode, align_corners=True) 83 | -------------------------------------------------------------------------------- /runners/iterative_warping/raft/demo.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append('core') 3 | 4 | import argparse 5 | import os 6 | import cv2 7 | import glob 8 | import numpy as np 9 | import torch 10 | from PIL import Image 11 | 12 | from raft import RAFT 13 | from utils import flow_viz 14 | from utils.utils import InputPadder 15 | 16 | 17 | 18 | DEVICE = 'cpu' 19 | 20 | def load_image(imfile): 21 | img = np.array(Image.open(imfile)).astype(np.uint8) 22 | img = torch.from_numpy(img).permute(2, 0, 1).float() 23 | return img[None].to(DEVICE) 24 | 25 | 26 | def viz(img, flo, count): 27 | img = img[0].permute(1,2,0).cpu().numpy() 28 | flo = flo[0].permute(1,2,0).cpu().numpy() 29 | 30 | # map flow to rgb image 31 | flo = flow_viz.flow_to_image(flo) 32 | img_flo = np.concatenate([img, flo], axis=0) 33 | 34 | # import matplotlib.pyplot as plt 35 | # plt.imshow(img_flo / 255.0) 36 | # plt.show() 37 | 38 | cv2.imwrite(f'outputs/visualization/{count:05d}.png', img_flo[:, :, [2,1,0]]) # /255.0 39 | 40 | 41 | def demo(args): 42 | model = torch.nn.DataParallel(RAFT(args)) 43 | model.load_state_dict(torch.load(args.model, map_location='cpu')) 44 | 45 | model = model.module 46 | model.to(DEVICE) 47 | model.eval() 48 | 49 | with torch.no_grad(): 50 | images = glob.glob(os.path.join(args.W, '*.png')) + \ 51 | glob.glob(os.path.join(args.path, '*.jpg')) 52 | 53 | images = 
sorted(images) 54 | count = 0 55 | for imfile1, imfile2 in zip(images[:-1], images[1:]): 56 | image1 = load_image(imfile1) 57 | image2 = load_image(imfile2) 58 | 59 | padder = InputPadder(image1.shape) 60 | image1, image2 = padder.pad(image1, image2) 61 | 62 | flow_low, flow_up = model(image1, image2, iters=20, test_mode=True) 63 | np.save(os.path.join('outputs/optical-flow-up', f'{count:05d}'), flow_up.cpu()) 64 | np.save(os.path.join('outputs/optical-flow-low', f'{count:05d}'), flow_low.cpu()) 65 | viz(image1, flow_up, count) 66 | count += 1 67 | print(f'Progress: {count}/{len(images)}') 68 | 69 | 70 | if __name__ == '__main__': 71 | parser = argparse.ArgumentParser() 72 | parser.add_argument('--model', help="restore checkpoint") 73 | parser.add_argument('--path', help="dataset for evaluation") 74 | parser.add_argument('--small', action='store_true', help='use small model') 75 | parser.add_argument('--mixed_precision', action='store_true', help='use mixed precision') 76 | parser.add_argument('--alternate_corr', action='store_true', help='use efficent correlation implementation') 77 | args = parser.parse_args() 78 | 79 | os.makedirs('outputs/visualization', exist_ok=True) 80 | os.makedirs('outputs/optical-flow-up', exist_ok=True) 81 | os.makedirs('outputs/optical-flow-low', exist_ok=True) 82 | demo(args) 83 | -------------------------------------------------------------------------------- /runners/iterative_warping/raft/download_models.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | wget https://dl.dropboxusercontent.com/s/4j4z58wuv8o0mfz/models.zip 3 | unzip models.zip 4 | -------------------------------------------------------------------------------- /runners/iterative_warping/raft/extract-flow-from-frames.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append('core') 3 | 4 | import argparse 5 | import os 6 | import cv2 7 | import glob 8 | import numpy as np 9 | import torch 10 | from PIL import Image 11 | 12 | from raft import RAFT 13 | from utils import flow_viz 14 | from utils.utils import InputPadder 15 | 16 | 17 | 18 | DEVICE = 'cpu' 19 | 20 | def load_image(imfile): 21 | img = np.array(Image.open(imfile)).astype(np.uint8) 22 | img = torch.from_numpy(img).permute(2, 0, 1).float() 23 | return img[None].to(DEVICE) 24 | 25 | 26 | def viz(img, flo, outdir, index): 27 | img = img[0].permute(1,2,0).cpu().numpy() 28 | flo = flo[0].permute(1,2,0).cpu().numpy() 29 | 30 | # map flow to rgb image 31 | flo = flow_viz.flow_to_image(flo) 32 | img_flo = np.concatenate([img, flo], axis=0) 33 | 34 | # import matplotlib.pyplot as plt 35 | # plt.imshow(img_flo / 255.0) 36 | # plt.show() 37 | 38 | cv2.imwrite(os.path.join(outdir, 'visualization', f'{index:05d}.png'), img_flo[:, :, [2,1,0]]) # /255.0 39 | 40 | 41 | def demo(args): 42 | # 0. Define RAFT model 43 | model = torch.nn.DataParallel(RAFT(args)) 44 | model.load_state_dict(torch.load(args.model, map_location='cpu')) 45 | 46 | model = model.module 47 | model.to(DEVICE) 48 | model.eval() 49 | 50 | # 1. Load in frames path 51 | frames_path = [] 52 | for path in sorted(os.listdir(args.path)): 53 | frames_path.append(os.path.join(args.path, path)) 54 | 55 | 56 | # 2. 
Start extracting optical flows 57 | with torch.no_grad(): 58 | for index in range(len(frames_path)): 59 | if index + 1 < len(frames_path): 60 | image1 = load_image(frames_path[index + 1]) 61 | image2 = load_image(frames_path[index]) 62 | 63 | padder = InputPadder(image1.shape) 64 | image1, image2 = padder.pad(image1, image2) 65 | 66 | flow_low, flow_up = model(image1, image2, iters=20, test_mode=True) 67 | np.save(os.path.join(args.outdir, 'flow-up', f'{index:05d}'), flow_up.cpu()) 68 | np.save(os.path.join(args.outdir, 'flow-low', f'{index:05d}'), flow_low.cpu()) 69 | viz(image1, flow_up, args.outdir, index) 70 | 71 | 72 | if __name__ == '__main__': 73 | parser = argparse.ArgumentParser() 74 | parser.add_argument('--model', default='/Users/liuchang/Desktop/Workspaces/checkpoints/raft/raft-things.pth', help="restore checkpoint") 75 | parser.add_argument('--path', type=str, help='path of video frames') 76 | parser.add_argument('--outdir', type=str, default='outputs', help='output directory') 77 | parser.add_argument('--small', action='store_true', help='use small model') 78 | parser.add_argument('--mixed_precision', action='store_true', help='use mixed precision') 79 | parser.add_argument('--alternate_corr', action='store_true', help='use efficent correlation implementation') 80 | args = parser.parse_args() 81 | 82 | os.makedirs(args.outdir, exist_ok=True) 83 | os.makedirs(os.path.join(args.outdir, 'visualization'), exist_ok=True) 84 | os.makedirs(os.path.join(args.outdir, 'flow-up'), exist_ok=True) 85 | os.makedirs(os.path.join(args.outdir, 'flow-low'), exist_ok=True) 86 | demo(args) 87 | -------------------------------------------------------------------------------- /runners/iterative_warping/raft/extract-flow-from-two-images.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append('core') 3 | 4 | import argparse 5 | import os 6 | import cv2 7 | import glob 8 | import numpy as np 9 | import torch 10 | from PIL import Image 11 | 12 | from raft import RAFT 13 | from utils import flow_viz 14 | from utils.utils import InputPadder 15 | 16 | 17 | 18 | DEVICE = 'cpu' 19 | 20 | def load_image(imfile): 21 | img = np.array(Image.open(imfile)).astype(np.uint8) 22 | img = torch.from_numpy(img).permute(2, 0, 1).float() 23 | return img[None].to(DEVICE) 24 | 25 | 26 | def viz(img, flo): 27 | img = img[0].permute(1,2,0).cpu().numpy() 28 | flo = flo[0].permute(1,2,0).cpu().numpy() 29 | 30 | # map flow to rgb image 31 | flo = flow_viz.flow_to_image(flo) 32 | img_flo = np.concatenate([img, flo], axis=0) 33 | 34 | # import matplotlib.pyplot as plt 35 | # plt.imshow(img_flo / 255.0) 36 | # plt.show() 37 | 38 | cv2.imwrite(f'outputs/visualization.png', img_flo[:, :, [2,1,0]]) # /255.0 39 | 40 | 41 | def demo(args): 42 | model = torch.nn.DataParallel(RAFT(args)) 43 | model.load_state_dict(torch.load(args.model, map_location='cpu')) 44 | 45 | model = model.module 46 | model.to(DEVICE) 47 | model.eval() 48 | 49 | with torch.no_grad(): 50 | image1 = load_image(args.image1) 51 | image2 = load_image(args.image2) 52 | 53 | padder = InputPadder(image1.shape) 54 | image1, image2 = padder.pad(image1, image2) 55 | 56 | flow_low, flow_up = model(image1, image2, iters=20, test_mode=True) 57 | np.save(os.path.join('outputs', 'flow-up'), flow_up.cpu()) 58 | np.save(os.path.join('outputs', 'flow-low'), flow_low.cpu()) 59 | viz(image1, flow_up) 60 | 61 | 62 | if __name__ == '__main__': 63 | parser = argparse.ArgumentParser() 64 | parser.add_argument('--model', 
default='/Users/liuchang/Desktop/Workspaces/checkpoints/raft/raft-things.pth', help="restore checkpoint") 65 | parser.add_argument('--image1', type=str, help='the first image') 66 | parser.add_argument('--image2', type=str, help='the second image') 67 | parser.add_argument('--small', action='store_true', help='use small model') 68 | parser.add_argument('--mixed_precision', action='store_true', help='use mixed precision') 69 | parser.add_argument('--alternate_corr', action='store_true', help='use efficent correlation implementation') 70 | args = parser.parse_args() 71 | 72 | os.makedirs('outputs', exist_ok=True) 73 | demo(args) 74 | -------------------------------------------------------------------------------- /runners/iterative_warping/raft/train_mixed.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | mkdir -p checkpoints 3 | python -u train.py --name raft-chairs --stage chairs --validation chairs --gpus 0 --num_steps 120000 --batch_size 8 --lr 0.00025 --image_size 368 496 --wdecay 0.0001 --mixed_precision 4 | python -u train.py --name raft-things --stage things --validation sintel --restore_ckpt checkpoints/raft-chairs.pth --gpus 0 --num_steps 120000 --batch_size 5 --lr 0.0001 --image_size 400 720 --wdecay 0.0001 --mixed_precision 5 | python -u train.py --name raft-sintel --stage sintel --validation sintel --restore_ckpt checkpoints/raft-things.pth --gpus 0 --num_steps 120000 --batch_size 5 --lr 0.0001 --image_size 368 768 --wdecay 0.00001 --gamma=0.85 --mixed_precision 6 | python -u train.py --name raft-kitti --stage kitti --validation kitti --restore_ckpt checkpoints/raft-sintel.pth --gpus 0 --num_steps 50000 --batch_size 5 --lr 0.0001 --image_size 288 960 --wdecay 0.00001 --gamma=0.85 --mixed_precision 7 | -------------------------------------------------------------------------------- /runners/iterative_warping/raft/train_standard.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | mkdir -p checkpoints 3 | python -u train.py --name raft-chairs --stage chairs --validation chairs --gpus 0 1 --num_steps 100000 --batch_size 10 --lr 0.0004 --image_size 368 496 --wdecay 0.0001 4 | python -u train.py --name raft-things --stage things --validation sintel --restore_ckpt checkpoints/raft-chairs.pth --gpus 0 1 --num_steps 100000 --batch_size 6 --lr 0.000125 --image_size 400 720 --wdecay 0.0001 5 | python -u train.py --name raft-sintel --stage sintel --validation sintel --restore_ckpt checkpoints/raft-things.pth --gpus 0 1 --num_steps 100000 --batch_size 6 --lr 0.000125 --image_size 368 768 --wdecay 0.00001 --gamma=0.85 6 | python -u train.py --name raft-kitti --stage kitti --validation kitti --restore_ckpt checkpoints/raft-sintel.pth --gpus 0 1 --num_steps 50000 --batch_size 6 --lr 0.0001 --image_size 288 960 --wdecay 0.00001 --gamma=0.85 7 | -------------------------------------------------------------------------------- /runners/iterative_warping/run_extract_images_depths.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | from PIL import Image 4 | from tqdm import tqdm 5 | from models.midas.midas import DepthMidas 6 | import torch 7 | 8 | def parse_args(): 9 | parser = argparse.ArgumentParser(description="Extract depth maps from images") 10 | parser.add_argument("--device", default='cuda' if torch.cuda.is_available() else 'cpu', 11 | help="Device to use for computation") 12 | parser.add_argument("--midas_path", 
default='/Users/liuchang/Desktop/Workspaces/checkpoints/dpt_swin2_large_384.pt', 13 | help="Path to MiDaS model") 14 | parser.add_argument("--input_dir", default='inpainted_outputs', 15 | help="Directory containing input images") 16 | parser.add_argument("--output_dir", default='experimental_scripts/output_depth_examples', 17 | help="Directory to save output depth maps") 18 | return parser.parse_args() 19 | 20 | def main(args): 21 | os.makedirs(args.output_dir, exist_ok=True) 22 | 23 | depth_estimator = DepthMidas(model_path=args.midas_path, device=args.device) 24 | 25 | # Get all image files 26 | image_files = [f for f in os.listdir(args.input_dir) if f.lower().endswith(('.png', '.jpg', '.jpeg'))] 27 | 28 | progress_bar = tqdm(total=len(image_files)) 29 | for image_file in image_files: 30 | progress_bar.update(1) 31 | 32 | # Load image 33 | image_path = os.path.join(args.input_dir, image_file) 34 | image = Image.open(image_path).convert('RGB') 35 | 36 | # Estimate depth 37 | depth = depth_estimator.estimate([image])[0] 38 | 39 | # Save depth map 40 | output_path = os.path.join(args.output_dir, f"{image_file}") 41 | depth.save(output_path) 42 | 43 | progress_bar.close() 44 | print("All images processed.") 45 | 46 | if __name__ == "__main__": 47 | args = parse_args() 48 | main(args) -------------------------------------------------------------------------------- /runners/iterative_warping/run_flow_extraction.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append('core') 3 | 4 | import os 5 | import cv2 6 | import numpy as np 7 | import torch 8 | from PIL import Image 9 | 10 | from models.raft.raft import RAFT 11 | from runners.iterative_warping.raft.core.utils.flow_viz import flow_to_image 12 | from runners.iterative_warping.raft.core.utils.utils import InputPadder 13 | 14 | 15 | 16 | device = "cuda" if torch.cuda.is_available() else "cpu" 17 | 18 | def load_image(imfile): 19 | img = np.array(Image.open(imfile).convert('RGB')).astype(np.uint8) 20 | img = cv2.resize(img, (512, 512)) 21 | img = torch.from_numpy(img).permute(2, 0, 1).float() 22 | return img[None].to(device) 23 | 24 | 25 | def viz(img, flo, outdir, index): 26 | img = img[0].permute(1,2,0).cpu().numpy() 27 | flo = flo[0].permute(1,2,0).cpu().numpy() 28 | 29 | # map flow to rgb image 30 | flo = flow_to_image(flo) 31 | img_flo = np.concatenate([img, flo], axis=0) 32 | 33 | # import matplotlib.pyplot as plt 34 | # plt.imshow(img_flo / 255.0) 35 | # plt.show() 36 | 37 | cv2.imwrite(os.path.join(outdir, 'visualization', f'{index:05d}.png'), img_flo[:, :, [2,1,0]]) # /255.0 38 | 39 | 40 | def raft_flow_extraction_runner(args): 41 | # 0. Define RAFT model 42 | model = torch.nn.DataParallel(RAFT()) 43 | model.load_state_dict(torch.load(args.raft_checkpoint_path, map_location='cpu')) 44 | 45 | model = model.module 46 | model.to(device) 47 | model.eval() 48 | os.makedirs(os.path.join(args.outdir, 'iterative_warping', 'optical_flows'), exist_ok=True) 49 | 50 | # 1. Load in frames path 51 | frames_path = [] 52 | for path in sorted(os.listdir(args.source_video_frames)): 53 | frames_path.append(os.path.join(args.source_video_frames, path)) 54 | 55 | 56 | # 2. 
Start extracting optical flows 57 | with torch.no_grad(): 58 | for index in range(min(args.n_sample_frames, len(frames_path))): 59 | if index + 1 < len(frames_path): 60 | image1 = load_image(frames_path[index + 1]) 61 | image2 = load_image(frames_path[index]) 62 | 63 | padder = InputPadder(image1.shape) 64 | image1, image2 = padder.pad(image1, image2) 65 | 66 | flow_low, flow_up = model(image1, image2, iters=20, test_mode=True) 67 | np.save(os.path.join(args.outdir, 'iterative_warping', 'optical_flows', f'{index:05d}'), flow_up.cpu()) 68 | -------------------------------------------------------------------------------- /runners/iterative_warping/run_warp.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import cv2 3 | import torch 4 | import os 5 | import numpy as np 6 | from typing import List 7 | from io import BytesIO 8 | from runners.iterative_warping.warp_utils import optical_flow_warping 9 | 10 | 11 | def images_to_gif_bytes(images: List, duration: int = 1000) -> bytes: 12 | with BytesIO() as output_buffer: 13 | # Save the first image 14 | images[0].save(output_buffer, 15 | format='GIF', 16 | save_all=True, 17 | append_images=images[1:], 18 | duration=duration, 19 | loop=0) # 0 means the GIF will loop indefinitely 20 | 21 | # Get the byte array from the buffer 22 | gif_bytes = output_buffer.getvalue() 23 | 24 | return gif_bytes 25 | 26 | 27 | def save_as_gif(images: List, file_path: str, duration: int = 1000): 28 | with open(file_path, "wb") as f: 29 | f.write(images_to_gif_bytes(images, duration)) 30 | 31 | def warp(init_frame_path, flows): 32 | 33 | # 2. Load in initial frame 34 | init_frame = cv2.imread(init_frame_path) 35 | init_frame = cv2.resize(init_frame, (W, H)) # Resize to make sure that resolution is aligned 36 | init_frame = init_frame / 255.0 37 | init_frame = torch.from_numpy(init_frame).float() 38 | init_frame = init_frame.permute(2, 0, 1).unsqueeze(0) 39 | 40 | # 3. Warping 41 | warped_frames = [] 42 | for index in range(len(optical_flows)): 43 | current_frame = init_frame if index == 0 else warped_frame_tensor 44 | if len(current_frame.shape) == 3: 45 | current_frame = current_frame.unsqueeze(0) 46 | warped_frame_tensor = optical_flow_warping(current_frame, optical_flows[index])[0] 47 | warped_frame = warped_frame_tensor.permute(1, 2, 0).numpy() 48 | warped_frames.append(warped_frame * 255) 49 | cv2.imwrite(os.path.join(args.outdir, f'{index:05d}.png'), warped_frame * 255) 50 | 51 | # TODO: 4. Save gif output 52 | # pil_warped_frames = [] 53 | # for warped_frame in warped_frames: 54 | # pil_warped_frame = Image.fromarray(warped_frame) 55 | # pil_warped_frames.append(pil_warped_frame) 56 | # save_as_gif(pil_warped_frames, os.path.join(args.outdir, 'result.gif')) 57 | 58 | if __name__ == "__main__": 59 | parser = argparse.ArgumentParser() 60 | parser.add_argument('--init-frame', type=str, default='') 61 | parser.add_argument('--optical-flow', type=str, default='') 62 | parser.add_argument('--outdir', type=str, default='warped-outputs') 63 | args = parser.parse_args() 64 | 65 | # 0. Create directories 66 | os.makedirs(args.outdir, exist_ok=True) 67 | 68 | # 1. 
Load in pre-extracted optical flows 69 | optical_flow_paths = os.listdir(args.optical_flow) 70 | optical_flows = [] 71 | for optical_flow_path in optical_flow_paths: 72 | optical_flow = np.load(os.path.join(args.optical_flow, optical_flow_path)) 73 | # optical_flow = cv2.medianBlur(optical_flow, ksize=23) 74 | optical_flow = torch.from_numpy(optical_flow) 75 | optical_flows.append(optical_flow) 76 | _, C, H, W = optical_flows[0].shape 77 | 78 | 79 | warp(init_frame_path=args.init_frame, 80 | flows=optical_flows) -------------------------------------------------------------------------------- /runners/iterative_warping_runner.py: -------------------------------------------------------------------------------- 1 | from runners.iterative_warping.run_warp_with_averaged_flow import iterative_warp_with_averaged_flow 2 | from runners.iterative_warping.get_averaged_depths import get_averaged_depths_main_func 3 | 4 | def iterative_warping_runner(args): 5 | # 1. Get averaged flows 6 | iterative_warp_with_averaged_flow(args) 7 | 8 | # 2. Get averaged depths 9 | get_averaged_depths_main_func(args) 10 | 11 | -------------------------------------------------------------------------------- /runners/midas_depth_estimation_runner.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | from PIL import Image 4 | from tqdm import tqdm 5 | from models.midas.midas import DepthMidas 6 | 7 | device = 'cuda' if torch.cuda.is_available() else 'cpu' 8 | 9 | def midas_depth_estimation_runner(args): 10 | depth_dir = os.path.join(args.outdir, 'iterative_warping', 'depth_maps') 11 | os.makedirs(depth_dir, exist_ok=True) 12 | 13 | depth_estimator = DepthMidas(model_path=args.midas_checkpoint_path, device=device) 14 | 15 | # Get all image files 16 | image_files = [f for f in sorted(os.listdir(args.source_video_frames))[:args.n_sample_frames] if f.lower().endswith(('.png', '.jpg', '.jpeg'))] 17 | 18 | progress_bar = tqdm(total=len(image_files)) 19 | for image_file in image_files: 20 | progress_bar.update(1) 21 | 22 | # Load image 23 | image_path = os.path.join(args.source_video_frames, image_file) 24 | image = Image.open(image_path).convert('RGB') 25 | 26 | # Estimate depth 27 | depth = depth_estimator.estimate([image])[0] 28 | 29 | # Save depth map 30 | output_path = os.path.join(depth_dir, f"{image_file}") 31 | depth.save(output_path) 32 | 33 | progress_bar.close() 34 | 35 | -------------------------------------------------------------------------------- /runners/paint_by_example_inference_runner.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import os 3 | import PIL 4 | import torch 5 | import numpy as np 6 | from diffusers import PaintByExamplePipeline 7 | 8 | device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu" 9 | 10 | def paint_by_example_inference_runner(args): 11 | model_path = args.paint_by_example_checkpoint_path 12 | reference_image_path = args.external_guidance 13 | outdir = args.outdir 14 | height = args.height 15 | width = args.width 16 | 17 | # Create output directory if not existed 18 | os.makedirs(outdir, exist_ok=True) 19 | 20 | # Prepare inputs 21 | image_path = sorted(os.listdir(args.source_video_frames))[0] 22 | mask_path = sorted(os.listdir(args.input_masks))[0] 23 | init_image = PIL.Image.open(os.path.join(args.source_video_frames, image_path)).resize((height, width)) 24 | mask_image = PIL.Image.open(os.path.join(args.input_masks, 
mask_path)).resize((height, width)) 25 | reference_image = PIL.Image.open(reference_image_path).resize((height, width)) 26 | 27 | 28 | # Dilate the mask to ensure that it covers the original object 29 | mask_np = np.array(mask_image) 30 | kernel = np.ones((args.kernel_size, args.kernel_size), np.uint8) 31 | dilated_mask = cv2.dilate(mask_np, kernel, iterations=args.dilation_iteration) 32 | mask_image = PIL.Image.fromarray(dilated_mask) 33 | 34 | 35 | 36 | # Prepare pipeline 37 | torch_dtype = torch.float32 38 | if args.mixed_precision == "fp32": 39 | torch_dtype = torch.float32 40 | elif args.mixed_precision == "fp16": 41 | torch_dtype = torch.float16 42 | elif args.mixed_precision == "bf16": 43 | torch_dtype = torch.bfloat16 44 | pipe = PaintByExamplePipeline.from_pretrained( 45 | model_path, 46 | torch_dtype=torch_dtype, 47 | ) 48 | pipe = pipe.to(device) 49 | 50 | # Send inputs into the pipeline 51 | image = pipe(image=init_image, 52 | mask_image=mask_image, 53 | example_image=reference_image, 54 | guidance_scale=args.guidance_scale, 55 | negative_prompt=args.negative_prompt).images[0] 56 | 57 | # Save image 58 | save_path = os.path.join(outdir, 'image_editing_results') 59 | os.makedirs(save_path, exist_ok=True) 60 | filename = args.prompt.lower().replace('.', '').replace(' ', '_') 61 | image.save(os.path.join(save_path, f'{filename}.png')) 62 | -------------------------------------------------------------------------------- /runners/stable_diffusion_inpaint_inference_runner.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import os 3 | import cv2 4 | import numpy as np 5 | from PIL import Image 6 | from diffusers import StableDiffusionInpaintPipeline 7 | 8 | 9 | 10 | def stable_diffusion_inpaint_inference_runner(args): 11 | device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu" 12 | 13 | # Define weight dtype 14 | if args.mixed_precision == "fp16": 15 | weight_dtype = torch.float16 16 | elif args.mixed_precision == "bf16": 17 | weight_dtype = torch.bfloat16 18 | else: 19 | weight_dtype = torch.float32 20 | 21 | pipe = StableDiffusionInpaintPipeline.from_pretrained( 22 | args.stable_diffusion_inpaint_checkpoint_path, 23 | torch_dtype=weight_dtype, 24 | safety_checker=None, 25 | ) 26 | pipe = pipe.to(device, dtype=weight_dtype) 27 | 28 | # Load the first frame from the source video frames 29 | image = Image.open(os.path.join(args.source_video_frames, sorted(os.listdir(args.source_video_frames))[0])).convert("RGB") 30 | # Load the first mask from the input masks 31 | mask = Image.open(os.path.join(args.input_masks, sorted(os.listdir(args.input_masks))[0])).convert("RGB") 32 | 33 | # Convert mask to numpy array 34 | mask_np = np.array(mask) 35 | 36 | # Create a kernel for dilation 37 | kernel = np.ones((19, 19), np.uint8) 38 | 39 | # Dilate the mask 40 | dilated_mask = cv2.dilate(mask_np, kernel, iterations=9) 41 | 42 | # Convert back to PIL Image 43 | mask_image = Image.fromarray(dilated_mask) 44 | 45 | generator = torch.Generator().manual_seed(args.seed) 46 | 47 | output_image = pipe( 48 | prompt=args.prompt, 49 | image=image, 50 | mask_image=mask_image, 51 | negative_prompt=args.negative_prompt, 52 | num_inference_steps=args.num_inference_steps, 53 | guidance_scale=args.guidance_scale, 54 | height=args.height, 55 | width=args.width, 56 | generator=generator 57 | ).images[0] 58 | 59 | # Save image 60 | save_path = os.path.join(args.outdir, 'image_editing_results') 61 | 
os.makedirs(save_path, exist_ok=True) 62 | filename = args.prompt.replace('.', '').replace(' ', '_') 63 | output_image.save(os.path.join(save_path, f'{filename}.png')) -------------------------------------------------------------------------------- /scripts/extract_youtube_vos_depths.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import argparse 4 | from PIL import Image 5 | from tqdm import tqdm 6 | from models.midas.midas import DepthMidas 7 | 8 | device = 'cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu' 9 | 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument('--midas-path', type=str, default='', help='Path to MiDaS model weights') 12 | parser.add_argument('--dataset-path', type=str, default='input_image_examples', help='Path to input image dataset') 13 | parser.add_argument('--outdir', type=str, default='output_depth_examples', help='Output directory for depth maps') 14 | parser.add_argument('--device', type=str, default=device, help='Device to use for computation') 15 | args = parser.parse_args() 16 | 17 | device = args.device 18 | midas_path = args.midas_path 19 | dataset_path = args.dataset_path 20 | outdir = args.outdir 21 | os.makedirs(outdir, exist_ok=True) 22 | 23 | depth_estimator = DepthMidas(model_path=midas_path, 24 | device=device) 25 | video_paths = os.listdir(dataset_path) 26 | progress_bar = tqdm(total=len(video_paths)) 27 | for video_path in video_paths: 28 | progress_bar.update(1) 29 | os.makedirs(os.path.join(outdir, video_path), exist_ok=True) 30 | frame_paths = os.listdir(os.path.join(dataset_path, video_path)) 31 | frames_pil = [] 32 | frame_name_list = [] 33 | for frame_path in frame_paths: 34 | video_frame = Image.open(os.path.join(dataset_path, video_path, frame_path)) 35 | frames_pil.append(video_frame) 36 | frame_name_list.append(frame_path.split('.')[0]) 37 | depths_pil = depth_estimator.estimate(frames_pil) 38 | for depth, frame_name in zip(depths_pil, frame_name_list): 39 | depth.save(os.path.join(outdir, video_path, frame_name + '.png')) -------------------------------------------------------------------------------- /scripts/run_dilate_mask.py: -------------------------------------------------------------------------------- 1 | import os 2 | import cv2 3 | import numpy as np 4 | from tqdm import tqdm 5 | 6 | def dilate_mask(mask, kernel_size=11, iterations=1): 7 | kernel = np.ones((kernel_size, kernel_size), np.uint8) 8 | dilated_mask = cv2.dilate(mask, kernel, iterations=iterations) 9 | return dilated_mask 10 | 11 | def process_masks(input_folder, output_folder, kernel_size=5, iterations=1): 12 | # Create output folder if it doesn't exist 13 | os.makedirs(output_folder, exist_ok=True) 14 | 15 | # Get all files in the input folder 16 | if input_folder.endswith('.png') or input_folder.endswith('.jpg'): 17 | mask_files = [input_folder] 18 | else: 19 | mask_files = [f for f in os.listdir(input_folder) if f.endswith('.png') or f.endswith('.jpg')] 20 | 21 | for mask_file in tqdm(mask_files, desc="Processing masks"): 22 | # Read the mask 23 | mask_path = os.path.join(input_folder, mask_file) 24 | mask = cv2.imread(mask_path, cv2.IMREAD_GRAYSCALE) 25 | 26 | # Dilate the mask 27 | dilated_mask = dilate_mask(mask, kernel_size, iterations) 28 | 29 | 30 | # Save the dilated mask 31 | output_path = os.path.join(output_folder, mask_file) 32 | cv2.imwrite(output_path, dilated_mask) 33 | 34 | if __name__ == "__main__": 35 | import argparse 36 | 37 | parser = argparse.ArgumentParser(description="Dilate masks and save them in a new 
folder") 38 | parser.add_argument("--input-folder", type=str, required=True, help="Path to the folder containing input masks") 39 | parser.add_argument("--output-folder", type=str, required=True, help="Path to the folder to save dilated masks") 40 | parser.add_argument("--kernel-size", type=int, default=15, help="Kernel size for dilation (default: 15)") 41 | parser.add_argument("--iterations", type=int, default=1, help="Number of dilation iterations (default: 1)") 42 | 43 | args = parser.parse_args() 44 | 45 | process_masks(args.input_folder, args.output_folder, args.kernel_size, args.iterations) 46 | print(f"Dilated masks saved in {args.output_folder}.") 47 | -------------------------------------------------------------------------------- /utils/__pycache__/file_client.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/utils/__pycache__/file_client.cpython-310.pyc -------------------------------------------------------------------------------- /utils/__pycache__/flow_utils.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/utils/__pycache__/flow_utils.cpython-310.pyc -------------------------------------------------------------------------------- /utils/__pycache__/loss_utils.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/utils/__pycache__/loss_utils.cpython-310.pyc -------------------------------------------------------------------------------- /utils/__pycache__/lr_scheduler_utils.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/utils/__pycache__/lr_scheduler_utils.cpython-310.pyc -------------------------------------------------------------------------------- /utils/__pycache__/mask_utils.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/utils/__pycache__/mask_utils.cpython-310.pyc -------------------------------------------------------------------------------- /utils/__pycache__/utils.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlonzoLeeeooo/StableV2V/589106ae90e66b709d34fd61a03b869c52e5e114/utils/__pycache__/utils.cpython-310.pyc --------------------------------------------------------------------------------
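The mask-dilation helper in `scripts/run_dilate_mask.py` above can also be called directly from Python rather than through its command line. Below is a minimal sketch, assuming the repository root is on `PYTHONPATH` so the script is importable; the paths are placeholders, and the kernel size and iteration count simply mirror the script's own CLI defaults (15 and 1).

```python
import cv2

from scripts.run_dilate_mask import dilate_mask, process_masks

# Dilate every .png/.jpg mask in a folder and write the results to another folder.
process_masks(
    input_folder="path/to/masks",           # placeholder directory of binary masks
    output_folder="path/to/dilated_masks",  # placeholder output directory
    kernel_size=15,
    iterations=1,
)

# Or dilate a single mask array in memory.
mask = cv2.imread("path/to/masks/00000.png", cv2.IMREAD_GRAYSCALE)
dilated = dilate_mask(mask, kernel_size=15, iterations=1)
cv2.imwrite("path/to/dilated_masks/00000.png", dilated)
```

This is the same morphological dilation that several runners above apply inline (for example before Paint-by-Example and Stable Diffusion inpainting) so that the mask fully covers the object being replaced.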