├── .gitignore
├── demo.png
├── output.gif
├── .dockerignore
├── README.md
├── weights_downloader.py
├── cog.yaml
├── sizing_strategy.py
├── svd.yaml
├── svd_xt.yaml
├── LICENSE
└── predict.py


/.gitignore:
--------------------------------------------------------------------------------
1 | __pycache__
2 | .cog
3 | checkpoints
4 | output
5 | *.mp4
6 | 


--------------------------------------------------------------------------------
/demo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/replicate/cog-svd/HEAD/demo.png


--------------------------------------------------------------------------------
/output.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/replicate/cog-svd/HEAD/output.gif


--------------------------------------------------------------------------------
/.dockerignore:
--------------------------------------------------------------------------------
 1 | # The .dockerignore file excludes files from the container build process.
 2 | #
 3 | # https://docs.docker.com/engine/reference/builder/#dockerignore-file
 4 | 
 5 | checkpoints/
 6 | 
 7 | # Exclude Git files
 8 | .git
 9 | .github
10 | .gitignore
11 | 
12 | # Exclude Python cache files
13 | __pycache__
14 | .mypy_cache
15 | .pytest_cache
16 | .ruff_cache
17 | 
18 | # Exclude Python virtual environment
19 | /venv
20 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Cog-SDV
 2 | 
 3 | This is an implementation of Stability AI's [SDV](https://huggingface.co/stabilityai/stable-video-diffusion-img2vid) as a [Cog](https://github.com/replicate/cog) model.
 4 | 
 5 | ## Development
 6 | 
 7 | Follow the [model pushing guide](https://replicate.com/docs/guides/push-a-model) to push your own fork of SDXL to [Replicate](https://replicate.com).
 8 | 
 9 | ## Basic Usage
10 | 
11 | Run a prediction:
12 | 
13 |     cog predict -i input_image=@demo.png
14 | 
15 | ## Output
16 | 
17 | ![sample1](output.gif)


--------------------------------------------------------------------------------
/weights_downloader.py:
--------------------------------------------------------------------------------
 1 | import subprocess
 2 | import time
 3 | import os
 4 | 
 5 | 
 6 | class WeightsDownloader:
 7 |     @staticmethod
 8 |     def download_if_not_exists(url, dest):
 9 |         if not os.path.exists(dest):
10 |             WeightsDownloader.download(url, dest)
11 | 
12 |     @staticmethod
13 |     def download(url, dest):
14 |         start = time.time()
15 |         print("downloading url: ", url)
16 |         print("downloading to: ", dest)
17 |         subprocess.check_call(["pget", "-x", url, dest], close_fds=False)
18 |         print("downloading took: ", time.time() - start)
19 | 


--------------------------------------------------------------------------------
/cog.yaml:
--------------------------------------------------------------------------------
 1 | # Configuration for Cog ⚙️
 2 | # Reference: https://github.com/replicate/cog/blob/main/docs/yaml.md
 3 | build:
 4 |   gpu: true
 5 |   system_packages:
 6 |     - "libgl1-mesa-glx"
 7 |     - "libglib2.0-0"
 8 |     - "ffmpeg"
 9 |   python_version: "3.10"
10 |   python_packages:
11 |     - "black==23.7.0"
12 |     - "chardet==5.1.0"
13 |     - "clip @ git+https://github.com/openai/CLIP.git"
14 |     - "einops>=0.6.1"
15 |     - "fairscale>=0.4.13"
16 |     - "fsspec>=2023.6.0"
17 |     - "invisible-watermark>=0.2.0"
18 |     - "kornia==0.6.9"
19 |     - "matplotlib>=3.7.2"
20 |     - "natsort>=8.4.0"
21 |     - "ninja>=1.11.1"
22 |     - "numpy>=1.24.4"
23 |     - "omegaconf>=2.3.0"
24 |     - "open-clip-torch>=2.20.0"
25 |     - "opencv-python==4.6.0.66"
26 |     - "pandas>=2.0.3"
27 |     - "pillow>=9.5.0"
28 |     - "pudb>=2022.1.3"
29 |     - "pytorch-lightning==2.0.1"
30 |     - "pyyaml>=6.0.1"
31 |     - "scipy>=1.10.1"
32 |     - "streamlit>=0.73.1"
33 |     - "tensorboardx==2.6"
34 |     - "timm>=0.9.2"
35 |     - "tokenizers==0.12.1"
36 |     - "torch>=2.0.1"
37 |     - "torchaudio>=2.0.2"
38 |     - "torchdata==0.6.1"
39 |     - "torchmetrics>=1.0.1"
40 |     - "torchvision>=0.15.2"
41 |     - "tqdm>=4.65.0"
42 |     - "transformers==4.19.1"
43 |     - "triton==2.0.0"
44 |     - "urllib3<1.27,>=1.25.4"
45 |     - "wandb>=0.15.6"
46 |     - "webdataset>=0.2.33"
47 |     - "wheel>=0.41.0"
48 |     - "xformers>=0.0.20"
49 |     - "git+https://github.com/Stability-AI/generative-models.git"
50 | 
51 |   run:
52 |     - curl -o /usr/local/bin/pget -L "https://github.com/replicate/pget/releases/download/v0.0.6/pget" && chmod +x /usr/local/bin/pget
53 | 
54 | # predict.py defines how predictions are run on your model
55 | predict: "predict.py:Predictor"
56 | 


--------------------------------------------------------------------------------
/sizing_strategy.py:
--------------------------------------------------------------------------------
  1 | from PIL import Image
  2 | 
  3 | MAX_W_DIMENSION = 1024
  4 | MAX_H_DIMENSION = 576
  5 | 
  6 | 
  7 | class SizingStrategy:
  8 |     def __init__(self):
  9 |         pass
 10 | 
 11 |     def maintain_aspect_ratio(self, width, height):
 12 |         aspect_ratio = width / height
 13 | 
 14 |         if aspect_ratio >= 1:  # Width is the limiting factor
 15 |             new_width = min(width, MAX_W_DIMENSION)
 16 |             new_height = int(new_width / aspect_ratio)
 17 |         else:  # Height is the limiting factor
 18 |             new_height = min(height, MAX_H_DIMENSION)
 19 |             new_width = int(new_height * aspect_ratio)
 20 | 
 21 |         # Ensure neither dimension exceeds the maximum
 22 |         if new_height > MAX_H_DIMENSION:
 23 |             new_height = MAX_H_DIMENSION
 24 |             new_width = int(new_height * aspect_ratio)
 25 | 
 26 |         if new_width > MAX_W_DIMENSION:
 27 |             new_width = MAX_W_DIMENSION
 28 |             new_height = int(new_width / aspect_ratio)
 29 | 
 30 |         # Adjust to be divisible by 64
 31 |         new_width -= new_width % 64
 32 |         new_height -= new_height % 64
 33 | 
 34 |         return new_width, new_height
 35 | 
 36 |     def resize_and_crop(self, width, height, image):
 37 |         # Determine which dimension is less constraining
 38 |         scale_factor_w = MAX_W_DIMENSION / width
 39 |         scale_factor_h = MAX_H_DIMENSION / height
 40 | 
 41 |         print(f"Scale factor w: {scale_factor_w}, Scale factor h: {scale_factor_h}")
 42 | 
 43 |         # Scale up/down based on the less constraining dimension
 44 |         if scale_factor_w < scale_factor_h:
 45 |             # Height is less constraining
 46 |             new_height = MAX_H_DIMENSION
 47 |             new_width = int(width * scale_factor_h)
 48 |         else:
 49 |             # Width is less constraining
 50 |             new_width = MAX_W_DIMENSION
 51 |             new_height = int(height * scale_factor_w)
 52 | 
 53 |         print(f"New width: {new_width}, New height: {new_height}")
 54 | 
 55 |         # Resize the image
 56 |         resized_image = self.resize_image(image, new_width, new_height)
 57 | 
 58 |         # Calculate cropping dimensions
 59 |         left = max((new_width - MAX_W_DIMENSION) / 2, 0)
 60 |         top = max((new_height - MAX_H_DIMENSION) / 2, 0)
 61 |         right = left + MAX_W_DIMENSION
 62 |         bottom = top + MAX_H_DIMENSION
 63 | 
 64 |         print(f"Left: {left}, Top: {top}, Right: {right}, Bottom: {bottom}")
 65 | 
 66 |         # Crop the image to 1024x576
 67 |         cropped_image = resized_image.crop((left, top, right, bottom))
 68 | 
 69 |         print("Resized and cropped dimensions: 1024x576")
 70 |         return cropped_image
 71 | 
 72 |     def get_dimensions(self, image):
 73 |         return image.size
 74 | 
 75 |     def resize_image(self, image, width, height):
 76 |         return image.resize((width, height)) if image is not None else None
 77 | 
 78 |     def open_image(self, image_path):
 79 |         return Image.open(str(image_path)) if image_path is not None else None
 80 | 
 81 |     def divisible_by_64(self, image):
 82 |         width, height = image.size
 83 |         print(f"Original dimensions: {width}x{height}")
 84 |         if height % 64 != 0 or width % 64 != 0:
 85 |             width, height = map(lambda x: x - x % 64, (width, height))
 86 |             print(
 87 |                 f"WARNING: Your image is not divisible by 64 – resizing to {width}x{height}"
 88 |             )
 89 |         return width, height
 90 | 
 91 |     def apply(
 92 |         self,
 93 |         sizing_strategy,
 94 |         image=None,
 95 |     ):
 96 |         image = self.open_image(image)
 97 |         width, height = self.get_dimensions(image)
 98 | 
 99 |         if sizing_strategy == "crop_to_16_9":
100 |             print("Resizing and cropping to 16:9")
101 |             return self.resize_and_crop(width, height, image)
102 |         elif sizing_strategy == "maintain_aspect_ratio":
103 |             print("Resizing but keeping aspect ratio")
104 |             width, height = self.maintain_aspect_ratio(width, height)
105 |         else:
106 |             print("Using image dimensions")
107 |             width, height = self.divisible_by_64(image)
108 | 
109 |         resized_image = self.resize_image(
110 |             image,
111 |             width,
112 |             height,
113 |         )
114 | 
115 |         print(f"Using dimensions {width}x{height}")
116 |         return resized_image
117 | 


--------------------------------------------------------------------------------
/svd.yaml:
--------------------------------------------------------------------------------
  1 | model:
  2 |   target: sgm.models.diffusion.DiffusionEngine
  3 |   params:
  4 |     scale_factor: 0.18215
  5 |     disable_first_stage_autocast: True
  6 |     ckpt_path: checkpoints/svd.safetensors
  7 | 
  8 |     denoiser_config:
  9 |       target: sgm.modules.diffusionmodules.denoiser.Denoiser
 10 |       params:
 11 |         scaling_config:
 12 |           target: sgm.modules.diffusionmodules.denoiser_scaling.VScalingWithEDMcNoise
 13 | 
 14 |     network_config:
 15 |       target: sgm.modules.diffusionmodules.video_model.VideoUNet
 16 |       params:
 17 |         adm_in_channels: 768
 18 |         num_classes: sequential
 19 |         use_checkpoint: True
 20 |         in_channels: 8
 21 |         out_channels: 4
 22 |         model_channels: 320
 23 |         attention_resolutions: [4, 2, 1]
 24 |         num_res_blocks: 2
 25 |         channel_mult: [1, 2, 4, 4]
 26 |         num_head_channels: 64
 27 |         use_linear_in_transformer: True
 28 |         transformer_depth: 1
 29 |         context_dim: 1024
 30 |         spatial_transformer_attn_type: softmax-xformers
 31 |         extra_ff_mix_layer: True
 32 |         use_spatial_context: True
 33 |         merge_strategy: learned_with_images
 34 |         video_kernel_size: [3, 1, 1]
 35 | 
 36 |     conditioner_config:
 37 |       target: sgm.modules.GeneralConditioner
 38 |       params:
 39 |         emb_models:
 40 |         - is_trainable: False
 41 |           input_key: cond_frames_without_noise
 42 |           target: sgm.modules.encoders.modules.FrozenOpenCLIPImagePredictionEmbedder
 43 |           params:
 44 |             n_cond_frames: 1
 45 |             n_copies: 1
 46 |             open_clip_embedding_config:
 47 |               target: sgm.modules.encoders.modules.FrozenOpenCLIPImageEmbedder
 48 |               params:
 49 |                 freeze: True
 50 | 
 51 |         - input_key: fps_id
 52 |           is_trainable: False
 53 |           target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
 54 |           params:
 55 |             outdim: 256
 56 | 
 57 |         - input_key: motion_bucket_id
 58 |           is_trainable: False
 59 |           target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
 60 |           params:
 61 |             outdim: 256
 62 | 
 63 |         - input_key: cond_frames
 64 |           is_trainable: False
 65 |           target: sgm.modules.encoders.modules.VideoPredictionEmbedderWithEncoder
 66 |           params:
 67 |             disable_encoder_autocast: True
 68 |             n_cond_frames: 1
 69 |             n_copies: 1
 70 |             is_ae: True
 71 |             encoder_config:
 72 |               target: sgm.models.autoencoder.AutoencoderKLModeOnly
 73 |               params:
 74 |                 embed_dim: 4
 75 |                 monitor: val/rec_loss
 76 |                 ddconfig:
 77 |                   attn_type: vanilla-xformers
 78 |                   double_z: True
 79 |                   z_channels: 4
 80 |                   resolution: 256
 81 |                   in_channels: 3
 82 |                   out_ch: 3
 83 |                   ch: 128
 84 |                   ch_mult: [1, 2, 4, 4]
 85 |                   num_res_blocks: 2
 86 |                   attn_resolutions: []
 87 |                   dropout: 0.0
 88 |                 lossconfig:
 89 |                   target: torch.nn.Identity
 90 | 
 91 |         - input_key: cond_aug
 92 |           is_trainable: False
 93 |           target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
 94 |           params:
 95 |             outdim: 256
 96 | 
 97 |     first_stage_config:
 98 |       target: sgm.models.autoencoder.AutoencodingEngine
 99 |       params:
100 |         loss_config:
101 |           target: torch.nn.Identity
102 |         regularizer_config:
103 |           target: sgm.modules.autoencoding.regularizers.DiagonalGaussianRegularizer
104 |         encoder_config: 
105 |           target: sgm.modules.diffusionmodules.model.Encoder
106 |           params:
107 |             attn_type: vanilla
108 |             double_z: True
109 |             z_channels: 4
110 |             resolution: 256
111 |             in_channels: 3
112 |             out_ch: 3
113 |             ch: 128
114 |             ch_mult: [1, 2, 4, 4]
115 |             num_res_blocks: 2
116 |             attn_resolutions: []
117 |             dropout: 0.0
118 |         decoder_config:
119 |           target: sgm.modules.autoencoding.temporal_ae.VideoDecoder
120 |           params:
121 |             attn_type: vanilla
122 |             double_z: True
123 |             z_channels: 4
124 |             resolution: 256
125 |             in_channels: 3
126 |             out_ch: 3
127 |             ch: 128
128 |             ch_mult: [1, 2, 4, 4]
129 |             num_res_blocks: 2
130 |             attn_resolutions: []
131 |             dropout: 0.0
132 |             video_kernel_size: [3, 1, 1]
133 | 
134 |     sampler_config:
135 |       target: sgm.modules.diffusionmodules.sampling.EulerEDMSampler
136 |       params:
137 |         discretization_config:
138 |           target: sgm.modules.diffusionmodules.discretizer.EDMDiscretization
139 |           params:
140 |             sigma_max: 700.0
141 | 
142 |         guider_config:
143 |           target: sgm.modules.diffusionmodules.guiders.LinearPredictionGuider
144 |           params:
145 |             max_scale: 2.5
146 |             min_scale: 1.0


--------------------------------------------------------------------------------
/svd_xt.yaml:
--------------------------------------------------------------------------------
  1 | model:
  2 |   target: sgm.models.diffusion.DiffusionEngine
  3 |   params:
  4 |     scale_factor: 0.18215
  5 |     disable_first_stage_autocast: True
  6 |     ckpt_path: checkpoints/svd_xt.safetensors
  7 | 
  8 |     denoiser_config:
  9 |       target: sgm.modules.diffusionmodules.denoiser.Denoiser
 10 |       params:
 11 |         scaling_config:
 12 |           target: sgm.modules.diffusionmodules.denoiser_scaling.VScalingWithEDMcNoise
 13 | 
 14 |     network_config:
 15 |       target: sgm.modules.diffusionmodules.video_model.VideoUNet
 16 |       params:
 17 |         adm_in_channels: 768
 18 |         num_classes: sequential
 19 |         use_checkpoint: True
 20 |         in_channels: 8
 21 |         out_channels: 4
 22 |         model_channels: 320
 23 |         attention_resolutions: [4, 2, 1]
 24 |         num_res_blocks: 2
 25 |         channel_mult: [1, 2, 4, 4]
 26 |         num_head_channels: 64
 27 |         use_linear_in_transformer: True
 28 |         transformer_depth: 1
 29 |         context_dim: 1024
 30 |         spatial_transformer_attn_type: softmax-xformers
 31 |         extra_ff_mix_layer: True
 32 |         use_spatial_context: True
 33 |         merge_strategy: learned_with_images
 34 |         video_kernel_size: [3, 1, 1]
 35 | 
 36 |     conditioner_config:
 37 |       target: sgm.modules.GeneralConditioner
 38 |       params:
 39 |         emb_models:
 40 |         - is_trainable: False
 41 |           input_key: cond_frames_without_noise
 42 |           target: sgm.modules.encoders.modules.FrozenOpenCLIPImagePredictionEmbedder
 43 |           params:
 44 |             n_cond_frames: 1
 45 |             n_copies: 1
 46 |             open_clip_embedding_config:
 47 |               target: sgm.modules.encoders.modules.FrozenOpenCLIPImageEmbedder
 48 |               params:
 49 |                 freeze: True
 50 | 
 51 |         - input_key: fps_id
 52 |           is_trainable: False
 53 |           target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
 54 |           params:
 55 |             outdim: 256
 56 | 
 57 |         - input_key: motion_bucket_id
 58 |           is_trainable: False
 59 |           target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
 60 |           params:
 61 |             outdim: 256
 62 | 
 63 |         - input_key: cond_frames
 64 |           is_trainable: False
 65 |           target: sgm.modules.encoders.modules.VideoPredictionEmbedderWithEncoder
 66 |           params:
 67 |             disable_encoder_autocast: True
 68 |             n_cond_frames: 1
 69 |             n_copies: 1
 70 |             is_ae: True
 71 |             encoder_config:
 72 |               target: sgm.models.autoencoder.AutoencoderKLModeOnly
 73 |               params:
 74 |                 embed_dim: 4
 75 |                 monitor: val/rec_loss
 76 |                 ddconfig:
 77 |                   attn_type: vanilla-xformers
 78 |                   double_z: True
 79 |                   z_channels: 4
 80 |                   resolution: 256
 81 |                   in_channels: 3
 82 |                   out_ch: 3
 83 |                   ch: 128
 84 |                   ch_mult: [1, 2, 4, 4]
 85 |                   num_res_blocks: 2
 86 |                   attn_resolutions: []
 87 |                   dropout: 0.0
 88 |                 lossconfig:
 89 |                   target: torch.nn.Identity
 90 | 
 91 |         - input_key: cond_aug
 92 |           is_trainable: False
 93 |           target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
 94 |           params:
 95 |             outdim: 256
 96 | 
 97 |     first_stage_config:
 98 |       target: sgm.models.autoencoder.AutoencodingEngine
 99 |       params:
100 |         loss_config:
101 |           target: torch.nn.Identity
102 |         regularizer_config:
103 |           target: sgm.modules.autoencoding.regularizers.DiagonalGaussianRegularizer
104 |         encoder_config: 
105 |           target: sgm.modules.diffusionmodules.model.Encoder
106 |           params:
107 |             attn_type: vanilla
108 |             double_z: True
109 |             z_channels: 4
110 |             resolution: 256
111 |             in_channels: 3
112 |             out_ch: 3
113 |             ch: 128
114 |             ch_mult: [1, 2, 4, 4]
115 |             num_res_blocks: 2
116 |             attn_resolutions: []
117 |             dropout: 0.0
118 |         decoder_config:
119 |           target: sgm.modules.autoencoding.temporal_ae.VideoDecoder
120 |           params:
121 |             attn_type: vanilla
122 |             double_z: True
123 |             z_channels: 4
124 |             resolution: 256
125 |             in_channels: 3
126 |             out_ch: 3
127 |             ch: 128
128 |             ch_mult: [1, 2, 4, 4]
129 |             num_res_blocks: 2
130 |             attn_resolutions: []
131 |             dropout: 0.0
132 |             video_kernel_size: [3, 1, 1]
133 | 
134 |     sampler_config:
135 |       target: sgm.modules.diffusionmodules.sampling.EulerEDMSampler
136 |       params:
137 |         discretization_config:
138 |           target: sgm.modules.diffusionmodules.discretizer.EDMDiscretization
139 |           params:
140 |             sigma_max: 700.0
141 | 
142 |         guider_config:
143 |           target: sgm.modules.diffusionmodules.guiders.LinearPredictionGuider
144 |           params:
145 |             max_scale: 3.0
146 |             min_scale: 1.5


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 | 
  2 |                                  Apache License
  3 |                            Version 2.0, January 2004
  4 |                         http://www.apache.org/licenses/
  5 | 
  6 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  7 | 
  8 |    1. Definitions.
  9 | 
 10 |       "License" shall mean the terms and conditions for use, reproduction,
 11 |       and distribution as defined by Sections 1 through 9 of this document.
 12 | 
 13 |       "Licensor" shall mean the copyright owner or entity authorized by
 14 |       the copyright owner that is granting the License.
 15 | 
 16 |       "Legal Entity" shall mean the union of the acting entity and all
 17 |       other entities that control, are controlled by, or are under common
 18 |       control with that entity. For the purposes of this definition,
 19 |       "control" means (i) the power, direct or indirect, to cause the
 20 |       direction or management of such entity, whether by contract or
 21 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 22 |       outstanding shares, or (iii) beneficial ownership of such entity.
 23 | 
 24 |       "You" (or "Your") shall mean an individual or Legal Entity
 25 |       exercising permissions granted by this License.
 26 | 
 27 |       "Source" form shall mean the preferred form for making modifications,
 28 |       including but not limited to software source code, documentation
 29 |       source, and configuration files.
 30 | 
 31 |       "Object" form shall mean any form resulting from mechanical
 32 |       transformation or translation of a Source form, including but
 33 |       not limited to compiled object code, generated documentation,
 34 |       and conversions to other media types.
 35 | 
 36 |       "Work" shall mean the work of authorship, whether in Source or
 37 |       Object form, made available under the License, as indicated by a
 38 |       copyright notice that is included in or attached to the work
 39 |       (an example is provided in the Appendix below).
 40 | 
 41 |       "Derivative Works" shall mean any work, whether in Source or Object
 42 |       form, that is based on (or derived from) the Work and for which the
 43 |       editorial revisions, annotations, elaborations, or other modifications
 44 |       represent, as a whole, an original work of authorship. For the purposes
 45 |       of this License, Derivative Works shall not include works that remain
 46 |       separable from, or merely link (or bind by name) to the interfaces of,
 47 |       the Work and Derivative Works thereof.
 48 | 
 49 |       "Contribution" shall mean any work of authorship, including
 50 |       the original version of the Work and any modifications or additions
 51 |       to that Work or Derivative Works thereof, that is intentionally
 52 |       submitted to Licensor for inclusion in the Work by the copyright owner
 53 |       or by an individual or Legal Entity authorized to submit on behalf of
 54 |       the copyright owner. For the purposes of this definition, "submitted"
 55 |       means any form of electronic, verbal, or written communication sent
 56 |       to the Licensor or its representatives, including but not limited to
 57 |       communication on electronic mailing lists, source code control systems,
 58 |       and issue tracking systems that are managed by, or on behalf of, the
 59 |       Licensor for the purpose of discussing and improving the Work, but
 60 |       excluding communication that is conspicuously marked or otherwise
 61 |       designated in writing by the copyright owner as "Not a Contribution."
 62 | 
 63 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 64 |       on behalf of whom a Contribution has been received by Licensor and
 65 |       subsequently incorporated within the Work.
 66 | 
 67 |    2. Grant of Copyright License. Subject to the terms and conditions of
 68 |       this License, each Contributor hereby grants to You a perpetual,
 69 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 70 |       copyright license to reproduce, prepare Derivative Works of,
 71 |       publicly display, publicly perform, sublicense, and distribute the
 72 |       Work and such Derivative Works in Source or Object form.
 73 | 
 74 |    3. Grant of Patent License. Subject to the terms and conditions of
 75 |       this License, each Contributor hereby grants to You a perpetual,
 76 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 77 |       (except as stated in this section) patent license to make, have made,
 78 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 79 |       where such license applies only to those patent claims licensable
 80 |       by such Contributor that are necessarily infringed by their
 81 |       Contribution(s) alone or by combination of their Contribution(s)
 82 |       with the Work to which such Contribution(s) was submitted. If You
 83 |       institute patent litigation against any entity (including a
 84 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 85 |       or a Contribution incorporated within the Work constitutes direct
 86 |       or contributory patent infringement, then any patent licenses
 87 |       granted to You under this License for that Work shall terminate
 88 |       as of the date such litigation is filed.
 89 | 
 90 |    4. Redistribution. You may reproduce and distribute copies of the
 91 |       Work or Derivative Works thereof in any medium, with or without
 92 |       modifications, and in Source or Object form, provided that You
 93 |       meet the following conditions:
 94 | 
 95 |       (a) You must give any other recipients of the Work or
 96 |           Derivative Works a copy of this License; and
 97 | 
 98 |       (b) You must cause any modified files to carry prominent notices
 99 |           stating that You changed the files; and
100 | 
101 |       (c) You must retain, in the Source form of any Derivative Works
102 |           that You distribute, all copyright, patent, trademark, and
103 |           attribution notices from the Source form of the Work,
104 |           excluding those notices that do not pertain to any part of
105 |           the Derivative Works; and
106 | 
107 |       (d) If the Work includes a "NOTICE" text file as part of its
108 |           distribution, then any Derivative Works that You distribute must
109 |           include a readable copy of the attribution notices contained
110 |           within such NOTICE file, excluding those notices that do not
111 |           pertain to any part of the Derivative Works, in at least one
112 |           of the following places: within a NOTICE text file distributed
113 |           as part of the Derivative Works; within the Source form or
114 |           documentation, if provided along with the Derivative Works; or,
115 |           within a display generated by the Derivative Works, if and
116 |           wherever such third-party notices normally appear. The contents
117 |           of the NOTICE file are for informational purposes only and
118 |           do not modify the License. You may add Your own attribution
119 |           notices within Derivative Works that You distribute, alongside
120 |           or as an addendum to the NOTICE text from the Work, provided
121 |           that such additional attribution notices cannot be construed
122 |           as modifying the License.
123 | 
124 |       You may add Your own copyright statement to Your modifications and
125 |       may provide additional or different license terms and conditions
126 |       for use, reproduction, or distribution of Your modifications, or
127 |       for any such Derivative Works as a whole, provided Your use,
128 |       reproduction, and distribution of the Work otherwise complies with
129 |       the conditions stated in this License.
130 | 
131 |    5. Submission of Contributions. Unless You explicitly state otherwise,
132 |       any Contribution intentionally submitted for inclusion in the Work
133 |       by You to the Licensor shall be under the terms and conditions of
134 |       this License, without any additional terms or conditions.
135 |       Notwithstanding the above, nothing herein shall supersede or modify
136 |       the terms of any separate license agreement you may have executed
137 |       with Licensor regarding such Contributions.
138 | 
139 |    6. Trademarks. This License does not grant permission to use the trade
140 |       names, trademarks, service marks, or product names of the Licensor,
141 |       except as required for reasonable and customary use in describing the
142 |       origin of the Work and reproducing the content of the NOTICE file.
143 | 
144 |    7. Disclaimer of Warranty. Unless required by applicable law or
145 |       agreed to in writing, Licensor provides the Work (and each
146 |       Contributor provides its Contributions) on an "AS IS" BASIS,
147 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
148 |       implied, including, without limitation, any warranties or conditions
149 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
150 |       PARTICULAR PURPOSE. You are solely responsible for determining the
151 |       appropriateness of using or redistributing the Work and assume any
152 |       risks associated with Your exercise of permissions under this License.
153 | 
154 |    8. Limitation of Liability. In no event and under no legal theory,
155 |       whether in tort (including negligence), contract, or otherwise,
156 |       unless required by applicable law (such as deliberate and grossly
157 |       negligent acts) or agreed to in writing, shall any Contributor be
158 |       liable to You for damages, including any direct, indirect, special,
159 |       incidental, or consequential damages of any character arising as a
160 |       result of this License or out of the use or inability to use the
161 |       Work (including but not limited to damages for loss of goodwill,
162 |       work stoppage, computer failure or malfunction, or any and all
163 |       other commercial damages or losses), even if such Contributor
164 |       has been advised of the possibility of such damages.
165 | 
166 |    9. Accepting Warranty or Additional Liability. While redistributing
167 |       the Work or Derivative Works thereof, You may choose to offer,
168 |       and charge a fee for, acceptance of support, warranty, indemnity,
169 |       or other liability obligations and/or rights consistent with this
170 |       License. However, in accepting such obligations, You may act only
171 |       on Your own behalf and on Your sole responsibility, not on behalf
172 |       of any other Contributor, and only if You agree to indemnify,
173 |       defend, and hold each Contributor harmless for any liability
174 |       incurred by, or claims asserted against, such Contributor by reason
175 |       of your accepting any such warranty or additional liability.
176 | 
177 |    END OF TERMS AND CONDITIONS
178 | 
179 |    APPENDIX: How to apply the Apache License to your work.
180 | 
181 |       To apply the Apache License to your work, attach the following
182 |       boilerplate notice, with the fields enclosed by brackets "[]"
183 |       replaced with your own identifying information. (Don't include
184 |       the brackets!)  The text should be enclosed in the appropriate
185 |       comment syntax for the file format. We also recommend that a
186 |       file or class name and description of purpose be included on the
187 |       same "printed page" as the copyright notice for easier
188 |       identification within third-party archives.
189 | 
190 |    Copyright 2023, Replicate, Inc.
191 | 
192 |    Licensed under the Apache License, Version 2.0 (the "License");
193 |    you may not use this file except in compliance with the License.
194 |    You may obtain a copy of the License at
195 | 
196 |        http://www.apache.org/licenses/LICENSE-2.0
197 | 
198 |    Unless required by applicable law or agreed to in writing, software
199 |    distributed under the License is distributed on an "AS IS" BASIS,
200 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
201 |    See the License for the specific language governing permissions and
202 |    limitations under the License.
203 | 


--------------------------------------------------------------------------------
/predict.py:
--------------------------------------------------------------------------------
  1 | # Prediction interface for Cog ⚙️
  2 | # https://github.com/replicate/cog/blob/main/docs/python.md
  3 | 
  4 | from cog import BasePredictor, Input, Path
  5 | import os
  6 | import cv2
  7 | import math
  8 | import torch
  9 | import numpy as np
 10 | from PIL import Image
 11 | from glob import glob
 12 | from typing import Optional
 13 | from omegaconf import OmegaConf
 14 | from einops import rearrange, repeat
 15 | from torchvision.transforms import ToTensor
 16 | from sgm.inference.helpers import embed_watermark
 17 | from sgm.util import default, instantiate_from_config
 18 | from sizing_strategy import SizingStrategy
 19 | from weights_downloader import WeightsDownloader
 20 | 
 21 | """Exported from stability/ai generative-models """
 22 | 
 23 | 
 24 | def get_unique_embedder_keys_from_conditioner(conditioner):
 25 |     return list(set([x.input_key for x in conditioner.embedders]))
 26 | 
 27 | 
 28 | def get_batch(keys, value_dict, N, T, device, dtype=None):
 29 |     batch = {}
 30 |     batch_uc = {}
 31 |     for key in keys:
 32 |         if key == "fps_id":
 33 |             batch[key] = (
 34 |                 torch.tensor([value_dict["fps_id"]])
 35 |                 .to(device, dtype=dtype)
 36 |                 .repeat(int(math.prod(N)))
 37 |             )
 38 |         elif key == "motion_bucket_id":
 39 |             batch[key] = (
 40 |                 torch.tensor([value_dict["motion_bucket_id"]])
 41 |                 .to(device, dtype=dtype)
 42 |                 .repeat(int(math.prod(N)))
 43 |             )
 44 |         elif key == "cond_aug":
 45 |             batch[key] = repeat(
 46 |                 torch.tensor([value_dict["cond_aug"]]).to(device, dtype=dtype),
 47 |                 "1 -> b",
 48 |                 b=math.prod(N),
 49 |             )
 50 |         elif key == "cond_frames":
 51 |             batch[key] = repeat(value_dict["cond_frames"], "1 ... -> b ...", b=N[0])
 52 |         elif key == "cond_frames_without_noise":
 53 |             batch[key] = repeat(
 54 |                 value_dict["cond_frames_without_noise"], "1 ... -> b ...", b=N[0]
 55 |             )
 56 |         else:
 57 |             batch[key] = value_dict[key]
 58 | 
 59 |     if T is not None:
 60 |         batch["num_video_frames"] = T
 61 |     for key in batch.keys():
 62 |         if key not in batch_uc and isinstance(batch[key], torch.Tensor):
 63 |             batch_uc[key] = torch.clone(batch[key])
 64 |     return batch, batch_uc
 65 | 
 66 | 
 67 | def load_model(
 68 |     config: str,
 69 |     device: str,
 70 |     num_frames: int,
 71 |     num_steps: int,
 72 | ):
 73 |     config = OmegaConf.load(config)
 74 |     if device == "cuda":
 75 |         config.model.params.conditioner_config.params.emb_models[
 76 |             0
 77 |         ].params.open_clip_embedding_config.params.init_device = device
 78 | 
 79 |     config.model.params.sampler_config.params.num_steps = num_steps
 80 |     config.model.params.sampler_config.params.guider_config.params.num_frames = (
 81 |         num_frames
 82 |     )
 83 |     if device == "cuda":
 84 |         with torch.device(device):
 85 |             model = instantiate_from_config(config.model).to(device).eval().requires_grad_(False)
 86 |     else:
 87 |         model = instantiate_from_config(config.model).to(device).eval()
 88 | 
 89 |     # FP16
 90 |     model.conditioner.cpu()
 91 |     model.first_stage_model.cpu()
 92 |     model.model.to(dtype=torch.float16)
 93 |     torch.cuda.empty_cache()
 94 |     model = model.requires_grad_(False)
 95 |     return model
 96 | 
 97 | 
 98 | SVD_MODEL_CACHE = "./checkpoints"
 99 | SVD_URL = "https://weights.replicate.delivery/default/svd/svd_and_svd_xt.tar"
100 | 
101 | SVD_DEFAULT_FRAMES = 14
102 | SVD_DEFAULT_STEPS = 25
103 | 
104 | SVD_XT_DEFAULT_FRAMES = 25
105 | SVD_XT_DEFAULT_STEPS = 30
106 | 
107 | class Predictor(BasePredictor):
108 |     def setup(self) -> None:
109 |         """Load the model into memory to make running multiple predictions efficient"""
110 |         self.sizing_strategy = SizingStrategy()
111 |         WeightsDownloader.download_if_not_exists(SVD_URL, SVD_MODEL_CACHE)
112 | 
113 |         self.svd_model = load_model(
114 |             "svd.yaml",
115 |             "cuda",
116 |             SVD_DEFAULT_FRAMES,
117 |             SVD_DEFAULT_STEPS,
118 |         )
119 | 
120 |         self.svd_xt_model = load_model(
121 |             "svd_xt.yaml",
122 |             "cuda",
123 |             SVD_XT_DEFAULT_FRAMES,
124 |             SVD_XT_DEFAULT_STEPS,
125 |         )
126 |         # self.model = torch.load("./weights.pth")
127 |         # TODO: cache & download open_clip_pytorch_model.bin here
128 | 
129 |     def predict(
130 |         self,
131 |         input_image: Path = Input(description="Input image"),
132 |         video_length: str = Input(
133 |             description="Use svd to generate 14 frames or svd_xt for 25 frames",
134 |             choices=[
135 |                 "14_frames_with_svd",
136 |                 "25_frames_with_svd_xt",
137 |             ],
138 |             default="14_frames_with_svd",
139 |         ),
140 |         sizing_strategy: str = Input(
141 |             description="Decide how to resize the input image",
142 |             choices=[
143 |                 "maintain_aspect_ratio",
144 |                 "crop_to_16_9",
145 |                 "use_image_dimensions",
146 |             ],
147 |             default="maintain_aspect_ratio",
148 |         ),
149 |         frames_per_second: int = Input(description="Frames per second", default=6, ge=5, le=30),
150 |         motion_bucket_id: int = Input(
151 |             description="Increase overall motion in the generated video", default=127, ge=1, le=255
152 |         ),
153 |         cond_aug: float = Input(description="Amount of noise to add to input image", default=0.02),
154 |         decoding_t: int = Input(description="Number of frames to decode at a time", default=14),
155 |         seed: int = Input(
156 |             description="Random seed. Leave blank to randomize the seed", default=None
157 |         ),
158 |     ) -> Path:
159 |         """Run a single prediction on the model"""
160 | 
161 |         # Remove individual frame images
162 |         output_folder: Optional[str] = "output/"
163 |         for file_name in glob(os.path.join(output_folder, "*.png")):
164 |             os.remove(file_name)
165 | 
166 |         if seed is None:
167 |             seed = int.from_bytes(os.urandom(2), "big")
168 |         print(f"Using seed: {seed}")
169 |         torch.manual_seed(seed)
170 | 
171 |         image = self.sizing_strategy.apply(sizing_strategy, input_image)
172 | 
173 |         device = "cuda"
174 |         print("Set consts")
175 | 
176 |         if video_length == "14_frames_with_svd":
177 |             model = self.svd_model
178 |             num_frames = SVD_DEFAULT_FRAMES
179 |         else:
180 |             model = self.svd_xt_model
181 |             num_frames = SVD_XT_DEFAULT_FRAMES
182 | 
183 |         print("Loaded model")
184 | 
185 |         output_path = None
186 | 
187 |         if image.mode == "RGBA":
188 |             image = image.convert("RGB")
189 |         image = ToTensor()(image)
190 |         image = image * 2.0 - 1.0
191 | 
192 |         image = image.unsqueeze(0).to(device)
193 |         H, W = image.shape[2:]
194 |         assert image.shape[1] == 3
195 |         F = 8
196 |         C = 4
197 |         shape = (num_frames, C, H // F, W // F)
198 |         if (H, W) != (576, 1024):
199 |             print(
200 |                 "WARNING: The conditioning frame you provided is not 576x1024. This leads to suboptimal performance as model was only trained on 576x1024. Consider increasing `cond_aug`."
201 |             )
202 |         if motion_bucket_id > 255:
203 |             print(
204 |                 "WARNING: High motion bucket! This may lead to suboptimal performance."
205 |             )
206 | 
207 |         if frames_per_second < 5:
208 |             print("WARNING: Small fps value! This may lead to suboptimal performance.")
209 | 
210 |         if frames_per_second > 30:
211 |             print("WARNING: Large fps value! This may lead to suboptimal performance.")
212 | 
213 |         value_dict = {}
214 |         value_dict["motion_bucket_id"] = motion_bucket_id
215 |         value_dict["fps_id"] = frames_per_second
216 |         value_dict["cond_aug"] = cond_aug
217 |         value_dict["cond_frames_without_noise"] = image
218 |         value_dict["cond_frames"] = image + cond_aug * torch.randn_like(image)
219 |         value_dict["cond_aug"] = cond_aug
220 | 
221 |         # low vram mode
222 |         model.conditioner.cpu()
223 |         model.first_stage_model.cpu()
224 |         torch.cuda.empty_cache()
225 |         model.sampler.verbose = True
226 | 
227 |         with torch.no_grad():
228 |             with torch.autocast(device):
229 |                 model.conditioner.to(device)
230 |                 batch, batch_uc = get_batch(
231 |                     get_unique_embedder_keys_from_conditioner(model.conditioner),
232 |                     value_dict,
233 |                     [1, num_frames],
234 |                     T=num_frames,
235 |                     device=device,
236 |                 )
237 |                 c, uc = model.conditioner.get_unconditional_conditioning(
238 |                     batch,
239 |                     batch_uc=batch_uc,
240 |                     force_uc_zero_embeddings=[
241 |                         "cond_frames",
242 |                         "cond_frames_without_noise",
243 |                     ],
244 |                 )
245 |                 model.conditioner.cpu()
246 |                 torch.cuda.empty_cache()
247 | 
248 |                 # from here, dtype is fp16
249 |                 for k in ["crossattn", "concat"]:
250 |                     uc[k] = repeat(uc[k], "b ... -> b t ...", t=num_frames)
251 |                     uc[k] = rearrange(uc[k], "b t ... -> (b t) ...", t=num_frames)
252 |                     c[k] = repeat(c[k], "b ... -> b t ...", t=num_frames)
253 |                     c[k] = rearrange(c[k], "b t ... -> (b t) ...", t=num_frames)
254 |                 for k in uc.keys():
255 |                     uc[k] = uc[k].to(dtype=torch.float16)
256 |                     c[k] = c[k].to(dtype=torch.float16)
257 | 
258 |                 randn = torch.randn(shape, device=device, dtype=torch.float16)
259 |                 additional_model_inputs = {}
260 |                 additional_model_inputs["image_only_indicator"] = torch.zeros(2, num_frames).to(device)
261 |                 additional_model_inputs["num_video_frames"] = batch["num_video_frames"]
262 | 
263 |                 for k in additional_model_inputs:
264 |                     if isinstance(additional_model_inputs[k], torch.Tensor):
265 |                         additional_model_inputs[k] = additional_model_inputs[k].to(dtype=torch.float16)
266 | 
267 |                 def denoiser(input, sigma, c):
268 |                     return model.denoiser(
269 |                         model.model, input, sigma, c, **additional_model_inputs
270 |                     )
271 | 
272 |                 samples_z = model.sampler(denoiser, randn, cond=c, uc=uc)
273 |                 samples_z.to(dtype=model.first_stage_model.dtype)
274 |                 model.en_and_decode_n_samples_a_time = decoding_t
275 |                 model.first_stage_model.to(device)
276 |                 samples_x = model.decode_first_stage(samples_z)
277 |                 samples = torch.clamp((samples_x + 1.0) / 2.0, min=0.0, max=1.0)
278 |                 model.first_stage_model.cpu()
279 |                 torch.cuda.empty_cache()
280 | 
281 |                 os.makedirs(output_folder, exist_ok=True)
282 |                 base_count = len(glob(os.path.join(output_folder, "*.mp4")))
283 |                 video_path = os.path.join(output_folder, f"{base_count:06d}.mp4")
284 |                 output_path = video_path
285 | 
286 |                 samples = embed_watermark(samples)
287 |                 vid = (
288 |                     (rearrange(samples, "t c h w -> t h w c") * 255)
289 |                     .cpu()
290 |                     .numpy()
291 |                     .astype(np.uint8)
292 |                 )
293 |                 # Save frames as individual images
294 |                 for i, frame in enumerate(vid):
295 |                     frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
296 |                     cv2.imwrite(
297 |                         os.path.join(output_folder, f"frame_{i:06d}.png"), frame
298 |                     )
299 | 
300 |                 # Use ffmpeg to create video from images
301 |                 os.system(
302 |                     f"ffmpeg -r {frames_per_second + 1} -i {output_folder}/frame_%06d.png -c:v libx264 -vf 'fps={frames_per_second + 1},format=yuv420p' {video_path}"
303 |                 )
304 | 
305 |                 # Remove individual frame images
306 |                 for file_name in glob(os.path.join(output_folder, "*.png")):
307 |                     os.remove(file_name)
308 | 
309 |         return Path(output_path)
310 | 


--------------------------------------------------------------------------------