├── images ├── 3.png ├── 49.png ├── 78.png ├── 93.png ├── rose.png ├── flower.png ├── img2img_1.png ├── mask_rose.png └── inpaint_rose.png ├── requirements.txt ├── setup.py ├── animation_mode ├── config.py ├── setup.py ├── README.md ├── utility │ └── utils.py └── animation.py ├── LICENSE ├── utils ├── utility.py └── pipeline.py ├── run.py └── README.md /images/3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Logeswaran123/Stable-Diffusion-Playground/HEAD/images/3.png -------------------------------------------------------------------------------- /images/49.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Logeswaran123/Stable-Diffusion-Playground/HEAD/images/49.png -------------------------------------------------------------------------------- /images/78.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Logeswaran123/Stable-Diffusion-Playground/HEAD/images/78.png -------------------------------------------------------------------------------- /images/93.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Logeswaran123/Stable-Diffusion-Playground/HEAD/images/93.png -------------------------------------------------------------------------------- /images/rose.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Logeswaran123/Stable-Diffusion-Playground/HEAD/images/rose.png -------------------------------------------------------------------------------- /images/flower.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Logeswaran123/Stable-Diffusion-Playground/HEAD/images/flower.png -------------------------------------------------------------------------------- /images/img2img_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Logeswaran123/Stable-Diffusion-Playground/HEAD/images/img2img_1.png -------------------------------------------------------------------------------- /images/mask_rose.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Logeswaran123/Stable-Diffusion-Playground/HEAD/images/mask_rose.png -------------------------------------------------------------------------------- /images/inpaint_rose.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Logeswaran123/Stable-Diffusion-Playground/HEAD/images/inpaint_rose.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | argparse==1.1 2 | diffusers==0.3.0 3 | einops==0.4.1 4 | jsonmerge==1.8.0 5 | numpy==1.22.4 6 | opencv-python==4.6.0.66 7 | pandas==1.4.2 8 | pytorch-lightning==1.7.4 9 | scikit-image==0.19.3 10 | timm==0.6.7 11 | torchdiffeq==0.2.3 12 | transformers==4.21.2 13 | Pillow==9.0.1 14 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # Setup for torch, torchvision 2 | 3 | import subprocess 4 | import time 5 | 6 | 7 | print("Setting up environment...") 
8 | start_time = time.time() 9 | 10 | all_process = [ 11 | ['pip', 'install', 'torch==1.12.1+cu116', 'torchvision==0.13.1+cu116', '--extra-index-url', 'https://download.pytorch.org/whl/cu116'], 12 | ] 13 | 14 | for process in all_process: 15 | running = subprocess.run(process,stdout=subprocess.PIPE).stdout.decode('utf-8') 16 | 17 | end_time = time.time() 18 | print(f"Environment set up in {end_time-start_time:.0f} seconds") -------------------------------------------------------------------------------- /animation_mode/config.py: -------------------------------------------------------------------------------- 1 | FPS=30 2 | width=512 3 | height=512 4 | max_frames=300 5 | seed=1185529623 6 | seed_behavior="iter" 7 | animation_mode="3D" 8 | 9 | guidance_scale=2.5 10 | num_inference_steps=50 11 | diffusion_cadence=3 12 | border="wrap" 13 | angle="0:(0)" 14 | zoom="0:(0.0)" 15 | translation_x="0:(0)" 16 | translation_y="0:(0.0)" 17 | translation_z="0:(-2.5)" 18 | rotation_3d_x="0:(0)" 19 | rotation_3d_y="0:(0)" 20 | rotation_3d_z="0:(0)" 21 | strength_schedule="0:(0.55)" 22 | color_coherence="Match Frame 0 RGB" 23 | smooth='None' 24 | use_depth_warping=True 25 | midas_weight=0.3 26 | near_plane=200 27 | far_plane=10000 28 | fov=40 29 | padding_mode="border" 30 | sampling_mode="bicubic" 31 | save_depth_maps=False 32 | 33 | video_init_path="" 34 | video_same_size=True 35 | extract_nth_frame=1 36 | 37 | animation_prompts={ 38 | "0":"detailed scroll painting of plants, trees and ocean by hokusai, 8k, sharp!!!",} 39 | -------------------------------------------------------------------------------- /animation_mode/setup.py: -------------------------------------------------------------------------------- 1 | # Setup for animate mode 2 | 3 | import subprocess 4 | import time 5 | 6 | 7 | print("Setting up environment...") 8 | start_time = time.time() 9 | 10 | all_process = [ 11 | ['git', 'clone', 'https://github.com/deforum/stable-diffusion'], 12 | ['git', 'clone', 'https://github.com/shariqfarooq123/AdaBins.git'], 13 | ['git', 'clone', 'https://github.com/isl-org/MiDaS.git'], 14 | ['git', 'clone', 'https://github.com/MSFTserver/pytorch3d-lite.git'], 15 | ['pip', 'install', '-e', 'git+https://github.com/CompVis/taming-transformers.git@master#egg=taming-transformers'], 16 | ['pip', 'install', '-e', 'git+https://github.com/openai/CLIP.git@main#egg=clip'], 17 | ] 18 | 19 | for process in all_process: 20 | running = subprocess.run(process,stdout=subprocess.PIPE).stdout.decode('utf-8') 21 | 22 | print(subprocess.run(['git', 'clone', 'https://github.com/deforum/k-diffusion/'], stdout=subprocess.PIPE).stdout.decode('utf-8')) 23 | with open('k-diffusion/k_diffusion/__init__.py', 'w') as f: 24 | f.write('') 25 | 26 | end_time = time.time() 27 | print(f"Environment set up in {end_time-start_time:.0f} seconds") -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Logeswaran Sivakumar 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The 
above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /utils/utility.py: -------------------------------------------------------------------------------- 1 | import os 2 | import cv2 3 | import numpy as np 4 | import torch 5 | 6 | FPS = 24 7 | 8 | def slerp(t, v0, v1, DOT_THRESHOLD=0.9995): 9 | """ Helper function to spherically interpolate two arrays """ 10 | 11 | if not isinstance(v0, np.ndarray): 12 | inputs_are_torch = True 13 | input_device = v0.device 14 | v0 = v0.cpu().numpy() 15 | v1 = v1.cpu().numpy() 16 | 17 | dot = np.sum(v0 * v1 / (np.linalg.norm(v0) * np.linalg.norm(v1))) 18 | if np.abs(dot) > DOT_THRESHOLD: 19 | v2 = (1 - t) * v0 + t * v1 20 | else: 21 | theta_0 = np.arccos(dot) 22 | sin_theta_0 = np.sin(theta_0) 23 | theta_t = theta_0 * t 24 | sin_theta_t = np.sin(theta_t) 25 | s0 = np.sin(theta_0 - theta_t) / sin_theta_0 26 | s1 = sin_theta_t / sin_theta_0 27 | v2 = s0 * v0 + s1 * v1 28 | 29 | if inputs_are_torch: 30 | v2 = torch.from_numpy(v2).to(input_device) 31 | 32 | return v2 33 | 34 | def save_images(images): 35 | """ Helper function to save images """ 36 | save_dir = os.path.join(os.getcwd(), r'images') 37 | if not os.path.exists(save_dir): 38 | os.makedirs(save_dir) 39 | for i, image in enumerate(images): 40 | image.save("images/image_" + str(i) + ".png") 41 | 42 | def save_video(images, width, height): 43 | """ Helper function to create and save video """ 44 | save_dir = os.path.join(os.getcwd(), r'images') 45 | if not os.path.exists(save_dir): 46 | os.makedirs(save_dir) 47 | 48 | out = cv2.VideoWriter("images/output.avi", # video file name 49 | cv2.VideoWriter_fourcc(*'MJPG'), # fourcc format 50 | FPS, # video fps 51 | (width, height) # (frame width, frame height) 52 | ) 53 | for _, pil_image in enumerate(images): 54 | out.write(cv2.cvtColor(np.array(pil_image), cv2.COLOR_RGB2BGR)) 55 | out.release() 56 | -------------------------------------------------------------------------------- /run.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | from utils.pipeline import StableDiffusionPipe 4 | 5 | if __name__ == '__main__': 6 | parser = argparse.ArgumentParser() 7 | parser.add_argument('-n', "--num", required=False, default=1, 8 | help="Number of images to generate", type=int) 9 | parser.add_argument('-l', "--local", required=False, action='store_true', default=False, 10 | help="local model or download from huggingface") 11 | parser.add_argument('-s', "--save", required=False, action='store_true', default=False, 12 | help="Save generated image") 13 | parser.add_argument('-d', "--device", required=False, default="gpu", choices=["cpu", "gpu"], 14 | help="cpu or gpu device", type=str) 15 | parser.add_argument('-m', "--mode", required=True, default="txt2img", 16 | choices=["txt2img", "img2img", "inpaint", "dream", "animate"], 17 | help="Select the mode", 
type=str) 18 | parser.add_argument('-limit', "--limit", required=False, action='store_true', default=True, 19 | help="Limited memory usage") 20 | 21 | args = parser.parse_args() 22 | num_images = args.num 23 | is_local_model = args.local 24 | save = args.save 25 | device = args.device 26 | mode = args.mode 27 | limit = args.limit 28 | 29 | pipe = StableDiffusionPipe(is_local_model, device) 30 | 31 | if mode.lower() == "txt2img": 32 | pipe.TexttoImage(num_images, save, limit) 33 | elif mode.lower() == "img2img": 34 | pipe.ImagetoImage(num_images, save, limit) 35 | elif mode.lower() == "inpaint": 36 | pipe.Inpaint(num_images, save, limit) 37 | elif mode.lower() == "dream": 38 | pipe.Dream(num_images, save) 39 | elif mode.lower() == "animate": 40 | pipe.Animate(save) 41 | else: 42 | print(f"\n {mode} is an invalid mode. Select a valid mode.") 43 | -------------------------------------------------------------------------------- /animation_mode/README.md: -------------------------------------------------------------------------------- 1 | ## Animate mode 2 | Animate mode can generate "2D" or "3D" videos from input prompts. Also, it can perform Video-to-Video conversion of a "Video Input" based on input prompts. 3 | 4 | ### Run command 5 | Clone the repo, and run the following commands from the Stable-Diffusion-Playground directory. 6 | ```python 7 | pip install -r requirements.txt 8 | python setup.py 9 | mkdir models 10 | mkdir pretrained 11 | cd animation_mode 12 | python setup.py 13 | cd .. 14 | ``` 15 | ```python 16 | python run.py --mode animate --save 17 | ``` 18 |<br>
19 | Animate mode uses configurations specified in ./animation_mode/config.py. Specify the configurations for video generation in this file. 20 | 21 | ### Configurations 22 | | Argument | Description | Choices | 23 | | ---------------- |:-------------------------------:|:-------------------------:| 24 | | FPS | Frame rate of the output video | Integer number | 25 | | width | width of the frame | Integer number | 26 | | height | height of the frame | Integer number | 27 | | max_frames | Number of frames in the video | Integer number | 28 | | seed | Seed value for frame generation | Integer number | 29 | | seed_behavior | Seed mode | "iter", "fixed" | 30 | | animation_mode | Mode of animation | "2D", "3D", "Video Input" | 31 | | guidance_scale | Indicates how much output should be linked to prompt | Float number.
Allowed: guidance_scale > 1. | 32 | | num_inference_steps | Number of denoising steps | Integer number | 33 | | diffusion_cadence | number of frames to generate between frames | Integer number.
Allowed: > 1 for "2D", "3D" animation_mode | 34 | | border | Border mode used in image transformation | "wrap", "replicate" | 35 | | angle | Angle of rotation in degrees | String. Format: "frame_id:(value)".
frame_id - integer, value - integer | 36 | | zoom | Amount of zoom | String. Format: "frame_id:(value)".
frame_id - integer, value - float | 37 | | translation_x | Amount translation along X-axis | String. Format: "frame_id:(value)".
frame_id - integer, value - float | 38 | | translation_y | Amount translation along Y-axis | String. Format: "frame_id:(value)".
frame_id - integer, value - float | 39 | | translation_z | Amount translation along Z-axis | String. Format: "frame_id:(value)".
frame_id - integer, value - float | 40 | | rotation_3d_x | Amount rotation parallel to X-axis.
Used only for "3D" animation_mode | String. Format: "frame_id:(value)".
frame_id - integer, value - float | 41 | | rotation_3d_y | Amount rotation parallel to Y-axis.
Used only for "3D" animation_mode | String. Format: "frame_id:(value)".
frame_id - integer, value - float | 42 | | rotation_3d_z | Amount rotation parallel to Z-axis.
Used only for "3D" animation_mode | String. Format: "frame_id:(value)".
frame_id - integer, value - float | 43 | | strength_schedule | Indicates how much to transform the current frame from previous frame | String. Format: "frame_id:(value)".
frame_id - integer, value - float.
Allowed: values in range (0, 1] | 44 | | color_coherence | Match the color of generated frames to first frame | "None", "Match Frame 0 RGB", "Match Frame 0 HSV", "Match Frame 0 LAB" | 45 | | smooth | Smoothen image | "None", "Smooth", "Smooth_more" | 46 | | use_depth_warping | Warp image by depth prediction.
Used only for "3D" animation_mode | Bool.
Allowed: True, False | 47 | | midas_weight | If <1.0, loads AdaBins model, else loads midas model.
Used only for "3D" animation_mode | Float number.
Allowed: midas_weight > 0.0 | 48 | | near_plane | Used in image transformation. Used in py3d_tools.FoVPerspectiveCameras.
Used only for "3D" animation_mode | Integer number | 49 | | far_plane | Used in image transformation. Used in py3d_tools.FoVPerspectiveCameras.
Used only for "3D" animation_mode | Integer number | 50 | | fov | Used in image transformation. Used in py3d_tools.FoVPerspectiveCameras.
Used only for "3D" animation_mode | Integer number | 51 | | padding_mode | Padding mode in image transformation.
Used in torch.nn.functional.grid_sample.
Used only for "3D" animation_mode | "zeros", "border", "reflection" | 52 | | sampling_mode | Sampling mode in image transformation.
Used in torch.nn.functional.grid_sample.
Used only for "3D" animation_mode | "bilinear", "nearest", "bicubic" | 53 | | save_depth_maps | Save the predicted depth maps | Bool.
Allowed: True, False | 54 | | video_init_path | Path to video file.
Used only for "Video Input" animation_mode | String | 55 | | video_same_size | Indicates if output video should be same size as input video.
Used only for "Video Input" animation_mode | Bool.
Allowed: True, False | 56 | | extract_nth_frame | Extract every nth frame from video.
Used only for "Video Input" animation_mode | Integer number | 57 | | animation_prompts | Dictionary with key as frame id and value as prompt | Format: {"frame_id":prompt}.<br>
frame_id - Integer number, prompt - String | 58 | 59 |<br>
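As a quick illustration, a "3D" run that translates the camera along the Z-axis and ramps in a Y-axis rotation could use values like the following in ./animation_mode/config.py (an illustrative excerpt with example values only, not defaults; the schedule-string format is explained in the Note below):

```python
# Illustrative excerpt of animation_mode/config.py (example values, not defaults)
animation_mode = "3D"
max_frames = 120
seed_behavior = "iter"

translation_z = "0:(-2.5)"          # Z translation schedule, keyed at frame 0
rotation_3d_y = "0:(0),60:(1.0)"    # Y-axis rotation schedule, keyed at frames 0 and 60
strength_schedule = "0:(0.55)"      # how strongly each frame departs from the previous one

animation_prompts = {
    "0": "detailed scroll painting of plants, trees and ocean by hokusai, 8k, sharp!!!",
}
```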
60 | Note:
61 | angle, zoom, translation_x, translation_y, translation_z, rotation_3d_x, rotation_3d_y, rotation_3d_z, strength_schedule can take a series of values.
62 | It should be in format: "frame_id:(value),frame_id:(value),..."

63 | For example, angle="0:(0),10:(30),20:(-30)". This means that from frame 0 through frame 9 the frames have no angle change; from frame 10 through frame 19 the frames rotate clockwise by 30 degrees; and from frame 20 until the end of the video (or max_frames) the frames rotate anti-clockwise by 30 degrees. 64 | 65 | --- 66 | 67 | animation_prompts can take a series of prompts.<br>
68 | It should be in format: {"frame_id":prompt, "frame_id":prompt, ...}

69 | For example, animation_prompts = {"0":"White clouds in blue sky, realistic, 8k!!!", "100":"Aeroplane in blue sky, realistic!!"}. This means from frame 0 till frame 99, the generated frames will be based on prompt "White clouds in blue sky, realistic, 8k!!!". Then, from frame 100 till end of video or max_frames, the frames will be based on prompt "Aeroplane in blue sky, realistic!!". 70 | 71 | --- 72 | -------------------------------------------------------------------------------- /animation_mode/utility/utils.py: -------------------------------------------------------------------------------- 1 | import math 2 | import os 3 | import random 4 | import requests 5 | 6 | import sys 7 | import cv2 8 | from einops import rearrange 9 | import numpy as np 10 | import pandas as pd 11 | from skimage.exposure import match_histograms 12 | import torch 13 | from PIL import Image, ImageFilter 14 | 15 | sys.path.extend([ 16 | './animation_mode/pytorch3d-lite', 17 | ]) 18 | 19 | from ..config import * 20 | import py3d_tools as p3d 21 | 22 | 23 | def add_noise(sample: torch.Tensor, noise_amt: float) -> torch.Tensor: 24 | return sample + torch.randn(sample.shape, device=sample.device) * noise_amt 25 | 26 | def anim_frame_warp_2d(prev_img_cv2, 27 | W, 28 | H, 29 | angle_series, 30 | zoom_series, 31 | translation_x_series, 32 | translation_y_series, 33 | frame_idx): 34 | angle = angle_series[frame_idx] 35 | zoom = zoom_series[frame_idx] 36 | translation_x = translation_x_series[frame_idx] 37 | translation_y = translation_y_series[frame_idx] 38 | 39 | center = (W // 2, H // 2) 40 | trans_mat = np.float32([[1, 0, translation_x], [0, 1, translation_y]]) 41 | rot_mat = cv2.getRotationMatrix2D(center, angle, zoom) 42 | trans_mat = np.vstack([trans_mat, [0, 0, 1]]) 43 | rot_mat = np.vstack([rot_mat, [0, 0, 1]]) 44 | xform = np.matmul(rot_mat, trans_mat) 45 | 46 | return cv2.warpPerspective( 47 | prev_img_cv2, 48 | xform, 49 | (prev_img_cv2.shape[1], prev_img_cv2.shape[0]), 50 | borderMode=cv2.BORDER_WRAP if border == 'wrap' else cv2.BORDER_REPLICATE 51 | ) 52 | 53 | def anim_frame_warp_3d(prev_img_cv2, 54 | depth, 55 | translation_x_series, 56 | translation_y_series, 57 | translation_z_series, 58 | rotation_3d_x_series, 59 | rotation_3d_y_series, 60 | rotation_3d_z_series, 61 | near_plane, 62 | far_plane, 63 | fov, 64 | sampling_mode, 65 | padding_mode, 66 | frame_idx): 67 | device = "cuda" 68 | TRANSLATION_SCALE = 1.0/200.0 # matches Disco 69 | translate_xyz = [ 70 | -translation_x_series[frame_idx] * TRANSLATION_SCALE, 71 | translation_y_series[frame_idx] * TRANSLATION_SCALE, 72 | -translation_z_series[frame_idx] * TRANSLATION_SCALE 73 | ] 74 | rotate_xyz = [ 75 | math.radians(rotation_3d_x_series[frame_idx]), 76 | math.radians(rotation_3d_y_series[frame_idx]), 77 | math.radians(rotation_3d_z_series[frame_idx]) 78 | ] 79 | rot_mat = p3d.euler_angles_to_matrix(torch.tensor(rotate_xyz, device=device), "XYZ").unsqueeze(0) 80 | result = transform_image_3d(prev_img_cv2, depth, rot_mat, translate_xyz, \ 81 | near_plane, far_plane, fov, sampling_mode, padding_mode) 82 | torch.cuda.empty_cache() 83 | return result 84 | 85 | def get_inbetweens(key_frames, integer=False, interp_method='Linear'): 86 | key_frame_series = pd.Series([np.nan for a in range(max_frames)]) 87 | 88 | for i, value in key_frames.items(): 89 | key_frame_series[i] = value 90 | key_frame_series = key_frame_series.astype(float) 91 | 92 | if interp_method == 'Cubic' and len(key_frames.items()) <= 3: 93 | interp_method = 'Quadratic' 94 | if 
interp_method == 'Quadratic' and len(key_frames.items()) <= 2: 95 | interp_method = 'Linear' 96 | 97 | key_frame_series[0] = key_frame_series[key_frame_series.first_valid_index()] 98 | key_frame_series[max_frames-1] = key_frame_series[key_frame_series.last_valid_index()] 99 | key_frame_series = key_frame_series.interpolate(method=interp_method.lower(), limit_direction='both') 100 | if integer: 101 | return key_frame_series.astype(int) 102 | return key_frame_series 103 | 104 | def load_img(path, shape): 105 | if path.startswith('http://') or path.startswith('https://'): 106 | image = Image.open(requests.get(path, stream=True).raw) 107 | else: 108 | image = Image.open(path) 109 | 110 | image = image.resize(shape, resample=Image.LANCZOS) 111 | 112 | return image 113 | 114 | def maintain_colors(prev_img, color_match_sample, mode): 115 | if mode == 'Match Frame 0 RGB': 116 | return match_histograms(prev_img, color_match_sample, multichannel=True) 117 | elif mode == 'Match Frame 0 HSV': 118 | prev_img_hsv = cv2.cvtColor(prev_img, cv2.COLOR_RGB2HSV) 119 | color_match_hsv = cv2.cvtColor(color_match_sample, cv2.COLOR_RGB2HSV) 120 | matched_hsv = match_histograms(prev_img_hsv, color_match_hsv, multichannel=True) 121 | return cv2.cvtColor(matched_hsv, cv2.COLOR_HSV2RGB) 122 | else: # 'Match Frame 0 LAB' 123 | prev_img_lab = cv2.cvtColor(prev_img, cv2.COLOR_RGB2LAB) 124 | color_match_lab = cv2.cvtColor(color_match_sample, cv2.COLOR_RGB2LAB) 125 | matched_lab = match_histograms(prev_img_lab, color_match_lab, multichannel=True) 126 | return cv2.cvtColor(matched_lab, cv2.COLOR_LAB2RGB) 127 | 128 | def next_seed(seed, seed_behavior): 129 | if seed_behavior == 'iter': 130 | seed += 1 131 | elif seed_behavior == 'fixed': 132 | pass # always keep seed the same 133 | else: 134 | seed = random.randint(0, 2**32) 135 | return seed 136 | 137 | def parse_key_frames(string, prompt_parser=None): 138 | import re 139 | pattern = r'((?P[0-9]+):[\s]*[\(](?P[\S\s]*?)[\)])' 140 | frames = dict() 141 | for match_object in re.finditer(pattern, string): 142 | frame = int(match_object.groupdict()['frame']) 143 | param = match_object.groupdict()['param'] 144 | if prompt_parser: 145 | frames[frame] = prompt_parser(param) 146 | else: 147 | frames[frame] = param 148 | if frames == {} and len(string) != 0: 149 | raise RuntimeError('Key Frame string not correctly formatted') 150 | return frames 151 | 152 | def sample_from_cv2(sample: np.ndarray) -> torch.Tensor: 153 | sample = ((sample.astype(float) / 255.0) * 2) - 1 154 | sample = sample[None].transpose(0, 3, 1, 2).astype(np.float16) 155 | sample = torch.from_numpy(sample) 156 | return sample 157 | 158 | def sample_to_cv2(sample: torch.Tensor, type=np.uint8) -> np.ndarray: 159 | sample_f32 = rearrange(sample.squeeze().cpu().numpy(), "c h w -> h w c").astype(np.float32) 160 | sample_int8 = (sample_f32 * 255) 161 | return sample_int8.astype(type) 162 | 163 | def save_video(width, height): 164 | """ Helper function to create and save video """ 165 | frames_dir = os.path.join(os.getcwd(), r'images') 166 | if not os.path.exists(frames_dir): 167 | print(f"\nNo generated {frames_dir} dir found.") 168 | return 169 | 170 | save_dir = os.path.join(os.getcwd(), r'out_video') 171 | if not os.path.exists(save_dir): 172 | os.makedirs(save_dir) 173 | 174 | print("\nCreating video from generated frames...") 175 | out = cv2.VideoWriter("out_video/output.avi", # video file name 176 | cv2.VideoWriter_fourcc(*'MJPG'), # fourcc format 177 | FPS, # video fps 178 | (width, height) # (frame width, frame 
height) 179 | ) 180 | for count in range(0, max_frames): 181 | filename = "frame_" + str(count) + ".png" 182 | try: 183 | out.write(cv2.imread(os.path.join(frames_dir, filename))) 184 | except: 185 | pass 186 | out.release() 187 | print(f"\nVideo saved in {os.path.join(save_dir, 'out_video.avi')}") 188 | 189 | def smoothen_image(image, mode): 190 | if mode == 'Smooth': 191 | return image.filter(ImageFilter.SMOOTH) 192 | else: # 'SMOOTH_MORE' 193 | return image.filter(ImageFilter.SMOOTH_MORE) 194 | 195 | def transform_image_3d(prev_img_cv2, 196 | depth_tensor, 197 | rot_mat, 198 | translate, 199 | near_plane, 200 | far_plane, 201 | fov, 202 | sampling_mode, 203 | padding_mode): 204 | # adapted and optimized version of transform_image_3d 205 | # from Disco Diffusion https://github.com/alembics/disco-diffusion 206 | device = "cuda" 207 | w, h = prev_img_cv2.shape[1], prev_img_cv2.shape[0] 208 | 209 | aspect_ratio = float(w) / float(h) 210 | near, far, fov_deg = near_plane, far_plane, fov 211 | persp_cam_old = p3d.FoVPerspectiveCameras(near, far, aspect_ratio, fov=fov_deg, degrees=True, device=device) 212 | persp_cam_new = p3d.FoVPerspectiveCameras(near, far, aspect_ratio, fov=fov_deg, degrees=True, \ 213 | R=rot_mat, T=torch.tensor([translate]), device=device) 214 | 215 | # range of [-1,1] is important to torch grid_sample's padding handling 216 | y, x = torch.meshgrid(torch.linspace(-1., 1. , h, dtype=torch.float32, device=device), \ 217 | torch.linspace(-1., 1., w, dtype=torch.float32, device=device)) 218 | z = torch.as_tensor(depth_tensor, dtype=torch.float32, device=device) 219 | xyz_old_world = torch.stack((x.flatten(), y.flatten(), z.flatten()), dim=1) 220 | 221 | xyz_old_cam_xy = persp_cam_old.get_full_projection_transform().transform_points(xyz_old_world)[:,0:2] 222 | xyz_new_cam_xy = persp_cam_new.get_full_projection_transform().transform_points(xyz_old_world)[:,0:2] 223 | 224 | offset_xy = xyz_new_cam_xy - xyz_old_cam_xy 225 | # affine_grid theta param expects a batch of 2D mats. Each is 2x3 to do rotation+translation. 226 | identity_2d_batch = torch.tensor([[1.,0.,0.], [0.,1.,0.]], device=device).unsqueeze(0) 227 | # coords_2d will have shape (N,H,W,2).. which is also what grid_sample needs. 228 | coords_2d = torch.nn.functional.affine_grid(identity_2d_batch, [1, 1, h, w], align_corners=False) 229 | offset_coords_2d = coords_2d - torch.reshape(offset_xy, (h,w,2)).unsqueeze(0) 230 | 231 | image_tensor = rearrange(torch.from_numpy(prev_img_cv2.astype(np.float32)), 'h w c -> c h w').to(device) 232 | new_image = torch.nn.functional.grid_sample( 233 | image_tensor.add(1/512 - 0.0001).unsqueeze(0), 234 | offset_coords_2d, 235 | mode=sampling_mode, 236 | padding_mode=padding_mode, 237 | align_corners=False 238 | ) 239 | 240 | # convert back to cv2 style numpy array 241 | result = rearrange( 242 | new_image.squeeze().clamp(0,255), 243 | 'c h w -> h w c' 244 | ).cpu().numpy().astype(prev_img_cv2.dtype) 245 | return result 246 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ⛹️‍♀️:basketball: Stable-Diffusion-Playground :soccer:⛹️ 2 | [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://github.com/Logeswaran123/Stable-Diffusion-Playground/blob/main/LICENSE) 3 | 4 | An application that generates images or videos using Stable Diffusion models. 5 | 6 | ## Description :scroll: 7 | What is the term "diffusion"?
8 | 9 | From Wikipedia, "Diffusion is the net movement of anything (for example, atoms, ions, molecules, energy) generally from a region of higher concentration to a region of lower concentration."
10 | 11 | Similar to the definition, diffusion models apply noise to an image sequentially across multiple steps in the forward pass; this essentially diffuses the pixels. In the backward pass, the noisy image is denoised over the same number of steps. Since it is a sequential process, there is less chance of mode collapse (a common problem with GANs) occurring.<br>
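As a rough, purely illustrative sketch of the forward (noising) process described above (this is not the code used by this app):

```python
import torch

def forward_diffuse(image: torch.Tensor, steps: int = 1000) -> torch.Tensor:
    """Toy forward process: blend a little Gaussian noise into the image at every step."""
    x = image
    for _ in range(steps):
        noise = torch.randn_like(x)
        x = (0.99 ** 0.5) * x + (0.01 ** 0.5) * noise  # fixed toy schedule
    return x  # after enough steps, x is close to pure Gaussian noise
```

The reverse pass learns to undo these steps one at a time, which is what the denoising UNet does.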
12 | 13 | Most diffusion models use a UNet architecture to preserve the dimensionality of the image. Diffusion is usually applied in pixel space, but Stable Diffusion applies it in latent space, hence the term "latent diffusion model" (LDM). The conversion between pixel space and latent space is done with an encoder and a decoder. This approach is more memory efficient than earlier methods and also produces highly detailed images.<br>
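For a sense of scale, Stable Diffusion v1 models diffuse a 4-channel latent at 1/8 of the image resolution; this is also why Dream mode in utils/pipeline.py allocates its random latents with shape (1, unet.in_channels, height // 8, width // 8). A minimal sketch:

```python
import torch

height, width = 512, 512
# Pixel space: 3 x 512 x 512. Latent space: 4 channels at 1/8 the spatial resolution.
latent = torch.randn(1, 4, height // 8, width // 8)
print(latent.shape)  # torch.Size([1, 4, 64, 64])
```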
14 | 15 | Read through the [paper](https://arxiv.org/abs/2112.10752) for more details. Big-ups to the researchers/creators for the work and for open-sourcing it.
16 | 17 | ## General Requirements :mage_man: 18 | * At least 6GB of VRAM is required to generate a single 512x512 image. 19 | * For better image generation, use a descriptive and detailed prompt. 20 | 21 | ## Code Requirements :mage_woman: 22 | Use Python 3.8.13. Set up a conda environment, clone the repo, and run the commands below, 23 | ```python 24 | pip install -r requirements.txt 25 | python setup.py 26 | mkdir models 27 | mkdir pretrained 28 | cd animation_mode 29 | python setup.py 30 | cd .. 31 | ``` 32 | 33 | ## How to run :running_man: 34 | 35 | Command line arguments: 36 | | Argument | Requirement | Default | Choices | Description | 37 | | ---------------- |:-------------:|:-------:|:-----------------------------:| :------------| 38 | | --mode / -m | True | - | "txt2img", "img2img", "inpaint", "dream", "animate" | Mode of application. | 39 | | --local / -l | False | False | True / False | If argument is provided, use local model files. Otherwise, download from Hugging Face. | 40 | | --device / -d | False | "gpu" | "cpu", "gpu" | Run on target device. | 41 | | --num / -n | False | 1 | integer number | Number of images to generate. | 42 | | --save / -s | False | False | True / False | If argument is provided, save generated images. | 43 | | --limit / -limit | False | True | True / False | Limit memory usage (enabled by default). | 44 | 45 | There are five different modes of running the application,<br>
46 | * Text to Image (txt2img) 47 | * Image to Image (img2img) 48 | * Inpaint (inpaint) 49 | * Dream (dream) 50 | * Animate (animate) - sub-modes: 2D, 3D, Video Input 51 | 52 | Mode: Text to Image
53 | ```python 54 | python run.py --mode txt2img --device gpu --save 55 | ``` 56 | 57 | Mode: Image to Image
58 | ```python 59 | python run.py --mode img2img --device gpu --save 60 | ``` 61 | 62 | Mode: Inpaint
63 | ```python 64 | python run.py --mode inpaint --device gpu --save 65 | ``` 66 | 67 | Mode: Dream
68 | ```python 69 | python run.py --mode dream --device gpu --save --num <num_frames> 70 | ``` 71 | 72 | Mode: Animate<br>
73 | ```python 74 | python run.py --mode animate --device gpu --save 75 | ``` 76 | Note:
77 | * For each mode, run the command and follow the CLI prompts to provide the Hugging Face user access token, the prompt, and the image size (height, width).<br>
78 | * Generated images or videos will be saved to the $PWD/images dir. For animate mode, the video will be saved to the $PWD/out_video dir. 79 | * Generating a single 512x512 image takes ~12 seconds on an NVIDIA GeForce RTX 3060 with 6GB VRAM. 80 | * Dream mode will generate --num image frames based on the input prompt, and create a video.<br>
81 | * Image to Image mode will generate a new image from an initial image and an input prompt. Inpaint mode will generate the masked part of the image from an initial image, a mask image and an input prompt. The strength value entered in the CLI indicates the amount of change from the initial image; it is in the range [0, 1], with 0 indicating no change and 1 indicating a complete change from the original image. 82 | 83 | Hugging face Access Token:<br>
84 | * Create an account in [huggingface.co](https://huggingface.co/). Go to Settings -> Access Tokens. Create an access token with read permission.
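The token pasted at the CLI prompt is simply forwarded to the diffusers loader when the model weights are downloaded; this is roughly what utils/pipeline.py does:

```python
import torch
from diffusers import StableDiffusionPipeline

access_token = input("\nEnter Hugging face user access token: ")
pipe = StableDiffusionPipeline.from_pretrained(
    "CompVis/stable-diffusion-v1-4",
    use_auth_token=access_token,
    torch_dtype=torch.float16,
    revision="fp16",
)
```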
85 | 86 | ### How to use Animate mode :paintbrush: 87 | This implementation is an optimized version of [DeforumStableDiffusionLocal](https://github.com/HelixNGC7293/DeforumStableDiffusionLocal) and [Deforum_Stable_Diffusion.ipynb](https://colab.research.google.com/github/deforum/stable-diffusion/blob/main/Deforum_Stable_Diffusion.ipynb). Thanks for their work.<br>

88 | Animate mode is quite different from the other modes of the app. It can generate "2D" or "3D" videos from input prompts, and it can also perform Video-to-Video conversion of a "Video Input" based on input prompts.<br>
89 | 90 | To use this mode, follow the steps below.<br>
91 | 92 | #### Requirements 93 | Clone the repo, and run the following cmds, 94 | ```python 95 | pip install -r requirements.txt 96 | python setup.py 97 | mkdir models 98 | mkdir pretrained 99 | cd animation_mode 100 | python setup.py 101 | cd .. 102 | ``` 103 | 104 | Next, manually download the models, 105 | * Download [dpt_large-midas-2f21e586.pt](https://github.com/intel-isl/DPT/releases/download/1_0/dpt_large-midas-2f21e586.pt) and place it in ./models dir. 106 | * Download [AdaBins_nyu.pt](https://cloudflare-ipfs.com/ipfs/Qmd2mMnDLWePKmgfS8m6ntAg4nhV5VkUyAydYBp8cWWeB7/AdaBins_nyu.pt) and place it in ./pretrained dir. 107 | 108 | Animate mode uses configurations specified in ./animation_mode/config.py. Specify the configurations for video generation in this file. Refer [animation_mode/README.md](https://github.com/Logeswaran123/Stable-Diffusion-Playground/blob/main/animation_mode/README.md) for details on parameters usage in config.py. 109 | 110 | #### Run command 111 | ```python 112 | python run.py --mode animate --save 113 | ``` 114 | Generated video will be saved to ./out_video dir. 115 | 116 | ## Results :bar_chart: 117 |

:star: Text to Image :star:

118 | 119 | ```python 120 | python run.py --mode txt2img --device gpu --num 1 --limit --save 121 | ``` 122 | 123 | ||| 124 | |:-------------------------:|:-------------------------:| 125 | |![](https://github.com/Logeswaran123/Stable-Diffusion-Playground/blob/main/images/49.png)|![](https://github.com/Logeswaran123/Stable-Diffusion-Playground/blob/main/images/3.png)| 126 | |![](https://github.com/Logeswaran123/Stable-Diffusion-Playground/blob/main/images/78.png)|![](https://github.com/Logeswaran123/Stable-Diffusion-Playground/blob/main/images/93.png)| 127 | --- 128 |

:star: Image to Image :star:

129 | 130 | ```python 131 | python run.py --mode img2img --device gpu --num 1 --limit --save 132 | ``` 133 | CLI inputs:
134 | ```python 135 | Enter Hugging face user access token: 136 | 137 | Loading model... 138 | 139 | Model loaded successfully 140 | 141 | Enter initial image path: flower.png 142 | 143 | Enter prompt: beautiful red flower, vibrant, realistic, smooth, bokeh, highly detailed, 4k 144 | 145 | Enter strength in [0, 1] range: 0.8 146 | 147 | Running Image to Image generation... 148 | ``` 149 | ||| 150 | |:-------------------------:|:-------------------------:| 151 | |![](https://github.com/Logeswaran123/Stable-Diffusion-Playground/blob/main/images/flower.png)|![](https://github.com/Logeswaran123/Stable-Diffusion-Playground/blob/main/images/img2img_1.png)| 152 | 153 | --- 154 |

:star: Inpaint :star:

155 | 156 | ```python 157 | python run.py --mode inpaint --device gpu --num 1 --limit --save 158 | ``` 159 | CLI inputs:
160 | ```python 161 | Enter Hugging face user access token: 162 | 163 | Loading model... 164 | 165 | Model loaded successfully 166 | 167 | Enter initial image path: rose.png 168 | 169 | Enter mask image path: mask_rose.png 170 | 171 | Enter prompt: beautiful blue butterfly on a rose, glossy, detailed, sharp, 4k 172 | 173 | Enter strength in [0, 1] range: 0.8 174 | 175 | Running Inpaint... 176 | ``` 177 | 178 | | Initial image | Mask | Inpainted image | 179 | |:-------------------------:|:-------------------------:|:-------------------------:| 180 | |![](https://github.com/Logeswaran123/Stable-Diffusion-Playground/blob/main/images/rose.png)|![](https://github.com/Logeswaran123/Stable-Diffusion-Playground/blob/main/images/mask_rose.png)|![](https://github.com/Logeswaran123/Stable-Diffusion-Playground/blob/main/images/inpaint_rose.png)| 181 | 182 | --- 183 |

:star: Dream :star:
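Under the hood, Dream mode draws two random latents and spherically interpolates (slerp) between them, decoding one frame per interpolation step (see Dream() in utils/pipeline.py and slerp() in utils/utility.py). A simplified, torch-only sketch of the idea:

```python
import numpy as np
import torch

def slerp_sketch(t: float, v0: torch.Tensor, v1: torch.Tensor) -> torch.Tensor:
    """Simplified spherical interpolation between two latent tensors."""
    dot = (v0 * v1).sum() / (v0.norm() * v1.norm())
    if dot.abs() > 0.9995:                      # nearly parallel: plain lerp is fine
        return (1 - t) * v0 + t * v1
    theta = torch.arccos(dot)
    return (torch.sin((1 - t) * theta) * v0 + torch.sin(t * theta) * v1) / torch.sin(theta)

source = torch.randn(1, 4, 64, 64)              # latent for a 512x512 image
target = torch.randn(1, 4, 64, 64)
frames = [slerp_sketch(float(t), source, target) for t in np.linspace(0, 1, 10)]
# each latent in `frames` is passed to the pipeline via the `latents=` argument
```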

184 | 185 | ```python 186 | python run.py --mode dream --device gpu --num 780 --limit --save 187 | ``` 188 | CLI inputs:
189 | ```python 190 | Enter Hugging face user access token: 191 | 192 | Loading model... 193 | 194 | Model loaded successfully 195 | 196 | Enter prompt: highly detailed bowl of lucrative ramen, stephen bliss, unreal engine, fantasy art by greg rutkowski, loish, rhads and lois van baarle, ilya kuvshinov, rossdraws, tom bagshaw, alphonse mucha, global illumination, detailed and intricate environment 197 | 198 | Enter height and width of image: 512 512 199 | 200 | Dreaming... 201 | ``` 202 | 203 | https://user-images.githubusercontent.com/36563521/192521369-32673804-009f-44c6-918c-a7746cc94dba.mp4 204 | 205 | --- 206 |

:star: Animate :star:

207 | 208 | |2D|3D| 209 | |:-------------------------:|:-------------------------:| 210 | | **TODO** | ![boat_in_storm](https://user-images.githubusercontent.com/36563521/194770440-db663425-282c-4aba-8b1a-2fb8db8bd6d0.gif) | 211 | 212 | --- 213 | 214 | ## References :page_facing_up: 215 | * [stability.ai](https://stability.ai/blog/stable-diffusion-public-release) blog. 216 | * LDM [paper](https://arxiv.org/abs/2112.10752). 217 | * LDM [repo](https://github.com/CompVis/latent-diffusion). 218 | * [Hugging face diffuser](https://github.com/huggingface/diffusers/tree/main/src/diffusers/pipelines/stable_diffusion) for API usage. 219 | * [Gist](https://gist.github.com/karpathy/00103b0037c5aaea32fe1da1af553355) by Andrej Karpathy. 220 | * [lexica.art](https://lexica.art/) for cool prompts. 221 | 222 | Happy Learning! 😄 223 | -------------------------------------------------------------------------------- /utils/pipeline.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from PIL import Image 3 | import torch 4 | from torch import autocast 5 | from diffusers.schedulers import LMSDiscreteScheduler 6 | from diffusers import StableDiffusionPipeline, \ 7 | StableDiffusionImg2ImgPipeline, \ 8 | StableDiffusionInpaintPipeline 9 | 10 | from .utility import save_images, save_video, slerp 11 | from animation_mode.animation import animate 12 | 13 | class StableDiffusionPipe(): 14 | """ Pipline for Stable Diffusion model applications """ 15 | def __init__(self, use_local_model: bool = True, device: str = "cpu") -> None: 16 | self.use_local_model = use_local_model 17 | self.device = device if device == "cpu" else "cuda" 18 | 19 | def TexttoImage(self, num_images: int = 1, save: bool = True, use_limited_mem: bool = True): 20 | """ Text to Image function """ 21 | 22 | path = "./stable-diffusion-v1-4" if self.use_local_model else "CompVis/stable-diffusion-v1-4" 23 | local_files_only = self.use_local_model 24 | use_auth_token = not local_files_only 25 | 26 | # Get access token 27 | access_token = False 28 | if use_auth_token: 29 | access_token = input("\nEnter Hugging face user access token: ") 30 | 31 | # Load the model 32 | print("\nLoading model...") 33 | pipe = StableDiffusionPipeline.from_pretrained(path, use_auth_token=access_token, 34 | local_files_only=local_files_only, 35 | torch_dtype=torch.float16, revision='fp16') 36 | pipe = pipe.to(self.device) 37 | print("\nModel loaded successfully") 38 | 39 | # Get prompt 40 | prompt = input("\nEnter prompt: ") 41 | height, width = input("\nEnter height and width of image: ").split() 42 | height = int(height) 43 | width = int(width) 44 | 45 | # Convert height and width to multiple of 64 for model. 
46 | height = height - height % 64 47 | width = width - width % 64 48 | 49 | # Generate images 50 | images = [] 51 | if use_limited_mem: 52 | prompts = [prompt] 53 | for _ in range (1, num_images + 1): 54 | print("\nRunning Text to Image generation...") 55 | with autocast(self.device): 56 | images.append(pipe(prompt=prompts, height=height, width=width).images[0]) 57 | else: 58 | print("\nRunning Text to Image generation...") 59 | prompts = [prompt] * num_images 60 | images = pipe(prompt=prompts, height=height, width=width).images 61 | 62 | # Save images 63 | if save: 64 | print("Saving images...") 65 | save_images(images) 66 | 67 | def ImagetoImage(self, num_images: int = 1, save: bool = True, use_limited_mem: bool = True): 68 | """ Image to Image function """ 69 | 70 | path = "./stable-diffusion-v1-4" if self.use_local_model else "CompVis/stable-diffusion-v1-4" 71 | local_files_only = self.use_local_model 72 | use_auth_token = not local_files_only 73 | 74 | # Get access token 75 | access_token = False 76 | if use_auth_token: 77 | access_token = input("\nEnter Hugging face user access token: ") 78 | 79 | # Load the model 80 | print("\nLoading model...") 81 | pipe = StableDiffusionImg2ImgPipeline.from_pretrained(path, use_auth_token=access_token, 82 | local_files_only=local_files_only, 83 | torch_dtype=torch.float16, revision='fp16') 84 | pipe = pipe.to(self.device) 85 | print("\nModel loaded successfully") 86 | 87 | # Get prompt 88 | image_path = input("\nEnter initial image path: ") 89 | prompt = input("\nEnter prompt: ") 90 | strength = float(input("\nEnter strength in [0, 1] range: ")) 91 | if not 0 <= strength <= 1: 92 | raise ValueError("{} is an invalid strength value. Enter strength in [0, 1] range.".format(strength)) 93 | 94 | init_image = Image.open(image_path).convert("RGB") 95 | width, height = init_image.size 96 | 97 | # Convert height and width to multiple of 64 for model. 
98 | width = width - width % 64 99 | height = height - height % 64 100 | init_image = init_image.resize((width, height)) 101 | 102 | # Generate images 103 | images = [] 104 | if use_limited_mem: 105 | prompts = [prompt] 106 | for _ in range (1, num_images + 1): 107 | print("\nRunning Image to Image generation...") 108 | with autocast(self.device): 109 | images.append(pipe(prompt=prompts, 110 | init_image=init_image, 111 | strength=strength).images[0]) 112 | else: 113 | print("\nRunning Image to Image generation...") 114 | prompts = [prompt] * num_images 115 | images = pipe(prompt=prompts, 116 | init_image=init_image, 117 | strength=strength).images 118 | 119 | # Save images 120 | if save: 121 | print("Saving images...") 122 | save_images(images) 123 | 124 | def Inpaint(self, num_images: int = 1, save: bool = True, use_limited_mem: bool = True): 125 | """ Inpaint function """ 126 | 127 | path = "./stable-diffusion-v1-4" if self.use_local_model else "CompVis/stable-diffusion-v1-4" 128 | local_files_only = self.use_local_model 129 | use_auth_token = not local_files_only 130 | 131 | # Get access token 132 | access_token = False 133 | if use_auth_token: 134 | access_token = input("\nEnter Hugging face user access token: ") 135 | 136 | # Load the model 137 | print("\nLoading model...") 138 | pipe = StableDiffusionInpaintPipeline.from_pretrained(path, use_auth_token=access_token, 139 | local_files_only=local_files_only, 140 | torch_dtype=torch.float16, revision='fp16') 141 | pipe = pipe.to(self.device) 142 | print("\nModel loaded successfully") 143 | 144 | # Get prompt 145 | image_path = input("\nEnter initial image path: ") 146 | mask_path = input("\nEnter mask image path: ") 147 | prompt = input("\nEnter prompt: ") 148 | strength = float(input("\nEnter strength in [0, 1] range: ")) 149 | if not 0 <= strength <= 1: 150 | raise ValueError("{} is an invalid strength value. Enter strength in [0, 1] range.".format(strength)) 151 | 152 | init_image = Image.open(image_path).convert("RGB") 153 | mask_image = Image.open(mask_path).convert("RGB") 154 | image_width, image_height = init_image.size 155 | mask_width, mask_height = mask_image.size 156 | 157 | if (not image_width == mask_width) or (not image_height == mask_height): 158 | raise ValueError("Init image size must match mask image size.") 159 | 160 | # Convert height and width to multiple of 64 for model. 
161 | image_width = image_width - image_width % 64 162 | image_height = image_height - image_height % 64 163 | init_image = init_image.resize((image_width, image_height)) 164 | mask_image = mask_image.resize((image_width, image_height)) 165 | 166 | # Generate images 167 | images = [] 168 | if use_limited_mem: 169 | prompts = [prompt] 170 | for _ in range (1, num_images + 1): 171 | print("\nRunning Inpaint...") 172 | with autocast(self.device): 173 | images.append(pipe(prompt=prompts, 174 | init_image=init_image, 175 | mask_image=mask_image, 176 | strength=strength).images[0]) 177 | else: 178 | print("\nRunning Inpaint...") 179 | prompts = [prompt] * num_images 180 | images = pipe(prompt=prompts, 181 | init_image=init_image, 182 | mask_image=mask_image, 183 | strength=strength).images 184 | 185 | # Save images 186 | if save: 187 | print("Saving images...") 188 | save_images(images) 189 | 190 | def Dream(self, num_images: int = 1, save: bool = True): 191 | """ Dream function """ 192 | 193 | path = "./stable-diffusion-v1-4" if self.use_local_model else "CompVis/stable-diffusion-v1-4" 194 | local_files_only = self.use_local_model 195 | use_auth_token = not local_files_only 196 | 197 | # Get access token 198 | access_token = False 199 | if use_auth_token: 200 | access_token = input("\nEnter Hugging face user access token: ") 201 | 202 | # Load the model 203 | print("\nLoading model...") 204 | lms = LMSDiscreteScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear") 205 | pipe = StableDiffusionPipeline.from_pretrained(path, use_auth_token=access_token, 206 | local_files_only=local_files_only, scheduler=lms, 207 | torch_dtype=torch.float16, revision='fp16') 208 | pipe = pipe.to(self.device) 209 | print("\nModel loaded successfully") 210 | 211 | # Get prompt 212 | prompt = input("\nEnter prompt: ") 213 | height, width = input("\nEnter height and width of image: ").split() 214 | height = int(height) 215 | width = int(width) 216 | 217 | # Convert height and width to multiple of 64 for model. 
218 | height = height - height % 64 219 | width = width - width % 64 220 | 221 | source_latent = torch.randn((1, pipe.unet.in_channels, height // 8, width // 8), device=self.device) 222 | target_latent = torch.randn((1, pipe.unet.in_channels, height // 8, width // 8), device=self.device) 223 | 224 | images = [] 225 | print("\nDreaming...") 226 | for _, t in enumerate(np.linspace(0, 1, num_images)): 227 | init_latent = slerp(float(t), source_latent, target_latent) 228 | 229 | with autocast("cuda"): 230 | image = pipe(prompt, latents=init_latent).images[0] 231 | if not image.convert("L").getextrema() == (0, 0): # check for black image 232 | images.append(image) 233 | 234 | # Save images and video 235 | if save: 236 | print("Saving images...") 237 | save_images(images) 238 | print("Saving video...") 239 | save_video(images, width, height) 240 | 241 | def Animate(self, save: bool = True): 242 | """ Animate function """ 243 | print("\nUsing configurations from animation_mode/config.py") 244 | animate(self.use_local_model, save) 245 | -------------------------------------------------------------------------------- /animation_mode/animation.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pathlib 3 | import sys 4 | from types import SimpleNamespace 5 | import cv2 6 | import pandas as pd 7 | import numpy as np 8 | from pytorch_lightning import seed_everything 9 | import torch 10 | from torch import autocast 11 | from torchvision import transforms 12 | from diffusers import StableDiffusionPipeline, StableDiffusionImg2ImgPipeline 13 | from PIL import Image 14 | 15 | sys.path.extend([ 16 | './animation_mode/src/taming-transformers', 17 | './animation_mode/src/clip', 18 | './animation_mode/stable-diffusion/', 19 | './animation_mode/k-diffusion', 20 | './animation_mode/AdaBins', 21 | './animation_mode/MiDaS', 22 | './animation_mode', 23 | ]) 24 | 25 | import config 26 | from .utility.utils import * 27 | from helpers import DepthModel 28 | 29 | 30 | def generate(pipe, 31 | prompt, 32 | height, 33 | width, 34 | strength, 35 | seed, 36 | use_init, 37 | init_image, 38 | return_sample=False): 39 | """ Image generator """ 40 | seed_everything(seed) 41 | device = "cuda" 42 | convert_tensor = transforms.ToTensor() 43 | 44 | results = [] 45 | if use_init: 46 | with autocast(device): 47 | with torch.no_grad(): 48 | image = pipe(prompt=prompt, 49 | init_image=init_image, 50 | strength=strength, 51 | guidance_scale=config.guidance_scale, 52 | num_inference_steps=config.num_inference_steps).images[0] 53 | torch.cuda.empty_cache() 54 | else: 55 | with autocast(device): 56 | with torch.no_grad(): 57 | image = pipe(prompt=prompt, height=height, width=width).images[0] 58 | torch.cuda.empty_cache() 59 | 60 | if return_sample: 61 | samples = convert_tensor(image) 62 | results.append(samples) 63 | results.append(image) 64 | 65 | return results 66 | 67 | 68 | def render_input_video(pipe_txt2img, pipe_img2img): 69 | """ Function for animate video """ 70 | # create a folder for the video input frames to live in 71 | video_in_frame_path = os.path.join(os.getcwd(), 'inputframes') 72 | os.makedirs(video_in_frame_path, exist_ok=True) 73 | 74 | # save the video frames from input video 75 | print(f"Exporting Video Frames from (1 every {config.extract_nth_frame}) \ 76 | frames to {video_in_frame_path}...") 77 | try: 78 | for f in pathlib.Path(video_in_frame_path).glob('*.png'): 79 | f.unlink() 80 | except: 81 | pass 82 | cap = cv2.VideoCapture(config.video_init_path) 83 | 
success, image = cap.read() 84 | count = 0 85 | while success: 86 | file_name = "inputframes/frame_" + str(count) + ".png" 87 | cv2.imwrite(file_name, image) 88 | success,image = cap.read() 89 | count = count + 1 + (config.extract_nth_frame - 1) 90 | if config.max_frames is not None and count > config.max_frames: 91 | break 92 | cap.release() 93 | 94 | # determine max frames from length of input frames 95 | num_frames = len([f for f in pathlib.Path(video_in_frame_path).glob('*.png')]) 96 | 97 | print(f"Loading {num_frames} input frames from {video_in_frame_path} \ 98 | and saving video frames to {video_in_frame_path}") 99 | render_animation(pipe_txt2img, pipe_img2img) 100 | 101 | 102 | def render_animation(pipe_txt2img, pipe_img2img): 103 | """ Function for animate 2D, animate 3D """ 104 | device = "cuda" 105 | W, H = (config.width, config.height) 106 | depth_model = None 107 | models_path = "./models" 108 | init_image = None 109 | video_width, video_height = None, None 110 | 111 | angle_series = get_inbetweens(parse_key_frames(config.angle)) 112 | zoom_series = get_inbetweens(parse_key_frames(config.zoom)) 113 | translation_x_series = get_inbetweens(parse_key_frames(config.translation_x)) 114 | translation_y_series = get_inbetweens(parse_key_frames(config.translation_y)) 115 | translation_z_series = get_inbetweens(parse_key_frames(config.translation_z)) 116 | rotation_3d_x_series = get_inbetweens(parse_key_frames(config.rotation_3d_x)) 117 | rotation_3d_y_series = get_inbetweens(parse_key_frames(config.rotation_3d_y)) 118 | rotation_3d_z_series = get_inbetweens(parse_key_frames(config.rotation_3d_z)) 119 | strength_schedule_series = get_inbetweens(parse_key_frames(config.strength_schedule)) 120 | midas_weight_dict = {"midas_weight":config.midas_weight} 121 | anim_args = SimpleNamespace(**midas_weight_dict) 122 | 123 | start_frame = 0 124 | outdir = os.path.join(os.getcwd(), r'images') 125 | if not os.path.exists(outdir): 126 | os.makedirs(outdir) 127 | print(f"\nSaving animation frames to {outdir}") 128 | 129 | # check for video inits 130 | using_vid_init = config.animation_mode == 'Video Input' 131 | use_init = using_vid_init 132 | 133 | max_frames = config.max_frames 134 | if using_vid_init: 135 | max_frames = len([f for f in pathlib.Path(os.path.join(os.getcwd(), 'inputframes')).glob('*.png')]) 136 | cap = cv2.VideoCapture(config.video_init_path) 137 | video_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) 138 | video_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) 139 | cap.release() 140 | 141 | # expand prompts out to per-frame 142 | prompt_series = pd.Series([np.nan for a in range(max_frames)]) 143 | for i, prompt in config.animation_prompts.items(): 144 | prompt_series[int(i)] = prompt 145 | prompt_series = prompt_series.ffill().bfill() 146 | 147 | # load depth model for 3D 148 | predict_depths = (config.animation_mode == '3D' and config.use_depth_warping) or config.save_depth_maps 149 | if predict_depths: 150 | depth_model = DepthModel("cpu") 151 | depth_model.load_midas(models_path) 152 | if config.midas_weight < 1.0: 153 | depth_model.load_adabins() 154 | else: 155 | depth_model = None 156 | config.save_depth_maps = False 157 | 158 | turbo_steps = 1 if using_vid_init else int(config.diffusion_cadence) 159 | turbo_prev_image, turbo_prev_frame_idx = None, 0 160 | turbo_next_image, turbo_next_frame_idx = None, 0 161 | 162 | # resume animation 163 | prev_sample = None 164 | color_match_sample = None 165 | frame_idx = start_frame 166 | 167 | seed = config.seed 168 | while frame_idx < 
max_frames: 169 | print(f"\nRendering animation frame {frame_idx} of {max_frames}") 170 | strength = strength_schedule_series[frame_idx] 171 | strength = max(0.0, min(1.0, strength)) 172 | depth = None 173 | 174 | # emit in-between frames 175 | if turbo_steps > 1: 176 | tween_frame_start_idx = max(0, frame_idx-turbo_steps) 177 | for tween_frame_idx in range(tween_frame_start_idx, frame_idx): 178 | tween = float(tween_frame_idx - tween_frame_start_idx + 1) / float(frame_idx - tween_frame_start_idx) 179 | print(f"creating in between frame {tween_frame_idx} tween:{tween:0.2f}") 180 | 181 | advance_prev = turbo_prev_image is not None and tween_frame_idx > turbo_prev_frame_idx 182 | advance_next = tween_frame_idx > turbo_next_frame_idx 183 | 184 | if depth_model is not None: 185 | assert turbo_next_image is not None 186 | depth_model.midas_model = depth_model.midas_model.to(device) 187 | depth_model.device = device 188 | with torch.no_grad(): 189 | depth = depth_model.predict(turbo_next_image, anim_args).cpu() 190 | torch.cuda.empty_cache() 191 | depth_model.midas_model = depth_model.midas_model.to("cpu") 192 | depth_model.device = "cpu" 193 | 194 | if config.animation_mode == '2D': 195 | if advance_prev: 196 | turbo_prev_image = anim_frame_warp_2d(turbo_prev_image, W, H, angle_series, zoom_series, \ 197 | translation_x_series, translation_y_series, tween_frame_idx) 198 | if advance_next: 199 | turbo_next_image = anim_frame_warp_2d(turbo_next_image, W, H, angle_series, zoom_series, \ 200 | translation_x_series, translation_y_series, tween_frame_idx) 201 | else: # '3D' 202 | if advance_prev: 203 | turbo_prev_image = anim_frame_warp_3d(turbo_prev_image, 204 | depth, 205 | translation_x_series, 206 | translation_y_series, 207 | translation_z_series, 208 | rotation_3d_x_series, 209 | rotation_3d_y_series, 210 | rotation_3d_z_series, 211 | config.near_plane, 212 | config.far_plane, 213 | config.fov, 214 | config.sampling_mode, 215 | config.padding_mode, 216 | tween_frame_idx) 217 | if advance_next: 218 | turbo_next_image = anim_frame_warp_3d(turbo_next_image, 219 | depth, 220 | translation_x_series, 221 | translation_y_series, 222 | translation_z_series, 223 | rotation_3d_x_series, 224 | rotation_3d_y_series, 225 | rotation_3d_z_series, 226 | config.near_plane, 227 | config.far_plane, 228 | config.fov, 229 | config.sampling_mode, 230 | config.padding_mode, 231 | tween_frame_idx) 232 | 233 | turbo_prev_frame_idx = turbo_next_frame_idx = tween_frame_idx 234 | 235 | if turbo_prev_image is not None and tween < 1.0: 236 | img = turbo_prev_image*(1.0-tween) + turbo_next_image*tween 237 | else: 238 | img = turbo_next_image 239 | 240 | # apply color matching 241 | if config.color_coherence != 'None': 242 | if color_match_sample is not None: 243 | img = maintain_colors(img, color_match_sample, config.color_coherence) 244 | 245 | # smoothen image 246 | if config.smooth != 'None': 247 | img = smoothen_image(Image.fromarray(img.astype(np.uint8)), config.smooth) 248 | img = np.array(img) 249 | 250 | init_image = Image.fromarray(img.astype(np.uint8)) 251 | filename = f"frame_{tween_frame_idx}.png" 252 | cv2.imwrite(os.path.join(outdir, filename), cv2.cvtColor(img.astype(np.uint8), cv2.COLOR_RGB2BGR)) 253 | if config.save_depth_maps: 254 | depth_model.save(os.path.join(outdir, f"depth_{tween_frame_idx:05}.png"), depth) 255 | if turbo_next_image is not None: 256 | prev_sample = turbo_next_image 257 | 258 | # apply transforms to previous frame 259 | if prev_sample is not None: 260 | if config.animation_mode == '2D': 
261 | prev_img = anim_frame_warp_2d(prev_sample, W, H, angle_series, zoom_series, \ 262 | translation_x_series, translation_y_series, frame_idx) 263 | else: # '3D' 264 | prev_img_cv2 = prev_sample 265 | depth_model.midas_model = depth_model.midas_model.to(device) 266 | depth_model.device = device 267 | with torch.no_grad(): 268 | depth = depth_model.predict(prev_img_cv2, anim_args).cpu() if depth_model else None 269 | torch.cuda.empty_cache() 270 | depth_model.midas_model = depth_model.midas_model.to("cpu") 271 | depth_model.device = "cpu" 272 | prev_img = anim_frame_warp_3d(prev_img_cv2, 273 | depth, 274 | translation_x_series, 275 | translation_y_series, 276 | translation_z_series, 277 | rotation_3d_x_series, 278 | rotation_3d_y_series, 279 | rotation_3d_z_series, 280 | config.near_plane, 281 | config.far_plane, 282 | config.fov, 283 | config.sampling_mode, 284 | config.padding_mode, 285 | frame_idx) 286 | 287 | if config.color_coherence != 'None': 288 | if color_match_sample is None: 289 | color_match_sample = prev_img.copy() 290 | 291 | use_init = True 292 | 293 | # grab prompt for current frame 294 | prompt = prompt_series[frame_idx] 295 | print(f"\nSeed: {seed}\nPrompt: {prompt} \n") 296 | 297 | # grab init image for current frame 298 | if using_vid_init: 299 | init_frame = "./inputframes/" + "frame_" + str(frame_idx) + ".png" 300 | print(f"\nUsing video init frame {init_frame}") 301 | try: 302 | init_image = load_img(init_frame, (config.width, config.height)) 303 | except: 304 | frame_idx += 1 305 | continue 306 | 307 | # sample the diffusion model 308 | torch.cuda.empty_cache() 309 | if use_init: 310 | pipe_img2img = pipe_img2img.to(device) 311 | sample, image = generate(pipe_img2img, prompt, H, W, \ 312 | strength, seed, use_init, init_image, return_sample=True) 313 | pipe_img2img.to("cpu") 314 | else: 315 | pipe_txt2img = pipe_txt2img.to(device) 316 | sample, image = generate(pipe_txt2img, prompt, H, W, \ 317 | strength, seed, use_init, init_image, return_sample=True) 318 | pipe_txt2img.to("cpu") 319 | 320 | torch.cuda.empty_cache() 321 | if not using_vid_init: 322 | prev_sample = sample 323 | 324 | if turbo_steps > 1: 325 | turbo_prev_image, turbo_prev_frame_idx = turbo_next_image, turbo_next_frame_idx 326 | turbo_next_image, turbo_next_frame_idx = sample_to_cv2(sample, type=np.float32), frame_idx 327 | frame_idx += turbo_steps 328 | else: 329 | filename = f"frame_{frame_idx}.png" 330 | if using_vid_init and config.video_same_size: 331 | image = image.resize((video_width, video_height), resample=Image.LANCZOS) 332 | if not image.convert("L").getextrema() == (0, 0): # check for black image 333 | image.save(os.path.join(outdir, filename)) 334 | if config.save_depth_maps: 335 | if depth is None: 336 | depth = depth_model.predict(sample_to_cv2(sample), anim_args) 337 | depth_model.save(os.path.join(outdir, f"depth_{frame_idx:05}.png"), depth) 338 | frame_idx += 1 339 | 340 | seed = next_seed(seed, config.seed_behavior) 341 | 342 | 343 | def animate(use_local_model, save): 344 | """ Top level function for animate 2D, animate 3D, and animate video """ 345 | path = "./stable-diffusion-v1-4" if use_local_model else "CompVis/stable-diffusion-v1-4" 346 | local_files_only = use_local_model 347 | use_auth_token = not local_files_only 348 | 349 | # Get access token 350 | access_token = False 351 | if use_auth_token: 352 | access_token = input("\nEnter Hugging face user access token: ") 353 | 354 | print(f"\nMax cuda memory reserved before running the app: \ 355 | 
{torch.cuda.max_memory_reserved(torch.device('cuda'))} bytes\n") 356 | print("\nLoading Diffusion model...") 357 | pipe_txt2img = StableDiffusionPipeline.from_pretrained(path, 358 | use_auth_token=access_token, 359 | local_files_only=local_files_only, 360 | torch_dtype=torch.float16, 361 | revision='fp16') 362 | pipe_img2img = StableDiffusionImg2ImgPipeline.from_pretrained(path, 363 | use_auth_token=access_token, 364 | local_files_only=local_files_only, 365 | torch_dtype=torch.float16, 366 | revision='fp16') 367 | print("\nModel loaded successfully") 368 | 369 | if config.animation_mode == '2D' or config.animation_mode == '3D': 370 | render_animation(pipe_txt2img, pipe_img2img) 371 | elif config.animation_mode == 'Video Input': 372 | render_input_video(pipe_txt2img, pipe_img2img) 373 | else: 374 | print(f"\nInvalid animation mode {config.animation_mode}. \ 375 | Supported modes = [2D, 3D, Video Input].") 376 | 377 | if save: 378 | save_video(config.width, config.height) 379 | --------------------------------------------------------------------------------