## :star: Text to Image :star:
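
Under the hood this mode wraps Hugging Face's `StableDiffusionPipeline` (see `utils/pipeline.py`). Below is a minimal programmatic sketch; it assumes the hosted `CompVis/stable-diffusion-v1-4` weights, a CUDA device, and a placeholder prompt, so adapt it to your setup. The CLI entry point that follows drives the same pipeline:

```python
import torch
from diffusers import StableDiffusionPipeline

# Load the fp16 weights and move them to the GPU (roughly what utils/pipeline.py does).
# Pass use_auth_token=... here if you have not logged in with huggingface-cli.
pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4",
                                               torch_dtype=torch.float16,
                                               revision="fp16")
pipe = pipe.to("cuda")

# Height and width should be multiples of 64 (the model works on 1/8-scale latents).
image = pipe(prompt="a photograph of an astronaut riding a horse",
             height=512, width=512).images[0]
image.save("txt2img_sample.png")
```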

```bash
python run.py --mode txt2img --device gpu --num 1 --limit --save
```

---

## :star: Image to Image :star:
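
This mode is backed by `StableDiffusionImg2ImgPipeline`. The sketch below mirrors the call made in `utils/pipeline.py`; the `init.png` path, prompt, and `strength=0.75` are placeholders, and the `init_image` argument matches the older diffusers releases this repo targets (newer releases renamed it to `image`):

```python
import torch
from PIL import Image
from diffusers import StableDiffusionImg2ImgPipeline

pipe = StableDiffusionImg2ImgPipeline.from_pretrained("CompVis/stable-diffusion-v1-4",
                                                      torch_dtype=torch.float16,
                                                      revision="fp16").to("cuda")

# The init image guides the layout; strength in [0, 1] controls how much of it is repainted.
init_image = Image.open("init.png").convert("RGB").resize((512, 512))
image = pipe(prompt="a fantasy landscape, trending on artstation",
             init_image=init_image,
             strength=0.75).images[0]
image.save("img2img_sample.png")
```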

```bash
python run.py --mode img2img --device gpu --num 1 --limit --save
```

CLI inputs:

---

## :star: Inpaint :star:
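
Inpainting adds a mask to the image-to-image call via `StableDiffusionInpaintPipeline`. In the hedged sketch below the file names and prompt are placeholders; the mask must be the same size as the init image (as `utils/pipeline.py` enforces), with white pixels marking the region to repaint, and the `init_image`/`strength` arguments again follow the older diffusers API used by this repo:

```python
import torch
from PIL import Image
from diffusers import StableDiffusionInpaintPipeline

pipe = StableDiffusionInpaintPipeline.from_pretrained("CompVis/stable-diffusion-v1-4",
                                                      torch_dtype=torch.float16,
                                                      revision="fp16").to("cuda")

init_image = Image.open("init.png").convert("RGB")    # picture to edit
mask_image = Image.open("mask.png").convert("RGB")    # white = area to repaint

image = pipe(prompt="a white cat sitting on a park bench",
             init_image=init_image,
             mask_image=mask_image,
             strength=0.75).images[0]
image.save("inpaint_sample.png")
```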

```bash
python run.py --mode inpaint --device gpu --num 1 --limit --save
```

CLI inputs:

---

## :star: Dream :star:
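
Dream mode renders a short video by spherically interpolating (slerp) between two random latent tensors and decoding one frame per step via the pipeline's `latents` argument (see `Dream` in `utils/pipeline.py`). The repo keeps its `slerp` helper in `utils/utility.py`, which is not shown here; a typical implementation along the lines of the Karpathy gist in the references looks roughly like this:

```python
import torch

def slerp(t, v0, v1, dot_threshold=0.9995):
    """Spherical interpolation between two latent tensors for t in [0, 1]."""
    dot = torch.sum(v0 * v1) / (torch.norm(v0) * torch.norm(v1))
    if torch.abs(dot) > dot_threshold:
        # Vectors are nearly parallel: plain linear interpolation is numerically safer.
        return (1.0 - t) * v0 + t * v1
    theta_0 = torch.acos(dot)    # angle between the two latents
    theta_t = theta_0 * t        # angle for the interpolated point
    s0 = torch.sin(theta_0 - theta_t) / torch.sin(theta_0)
    s1 = torch.sin(theta_t) / torch.sin(theta_0)
    return s0 * v0 + s1 * v1
```

Slerp keeps each interpolated latent on (approximately) the same Gaussian shell as the endpoints, which tends to produce smoother, less washed-out frames than plain linear interpolation.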

```bash
python run.py --mode dream --device gpu --num 780 --limit --save
```

CLI inputs:

---

## :star: Animate :star:
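
Animate mode takes no interactive inputs; it is driven entirely by `animation_mode/config.py` (see `Animate` in `utils/pipeline.py` and `animation_mode/animation.py`). The actual file ships with the repo; the sketch below only lists the fields that `animation.py` reads, with illustrative values that you should replace with your own:

```python
# animation_mode/config.py -- illustrative values only, keep the field names
animation_mode = '2D'          # '2D', '3D', or 'Video Input'
max_frames = 100
width, height = 512, 512
seed = 42
seed_behavior = 'iter'         # passed to next_seed() in utility.utils
guidance_scale = 7.5
num_inference_steps = 50

# prompts keyed by the frame at which they take over
animation_prompts = {0: "a beautiful forest, trending on artstation",
                     50: "a beautiful city at night, trending on artstation"}

# per-frame motion schedules as key-frame strings
angle = "0:(0)"
zoom = "0:(1.04)"
translation_x = "0:(0)"
translation_y = "0:(0)"
translation_z = "0:(10)"
rotation_3d_x = "0:(0)"
rotation_3d_y = "0:(0)"
rotation_3d_z = "0:(0)"
strength_schedule = "0:(0.65)"

diffusion_cadence = 1          # >1 emits warped in-between frames
color_coherence = 'None'       # set to a non-'None' option from utility.utils to enable color matching
smooth = 'None'

# 3D depth warping
use_depth_warping = True
midas_weight = 0.3
save_depth_maps = False
near_plane, far_plane, fov = 200, 10000, 40
sampling_mode = 'bicubic'
padding_mode = 'border'

# Video Input mode
video_init_path = './input.mp4'
extract_nth_frame = 1
video_same_size = True
```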

|2D|3D|
|:-------------------------:|:-------------------------:|
| **TODO** | |

---

## References :page_facing_up:
* [stability.ai](https://stability.ai/blog/stable-diffusion-public-release) blog.
* LDM [paper](https://arxiv.org/abs/2112.10752).
* LDM [repo](https://github.com/CompVis/latent-diffusion).
* [Hugging Face diffusers](https://github.com/huggingface/diffusers/tree/main/src/diffusers/pipelines/stable_diffusion) for API usage.
* [Gist](https://gist.github.com/karpathy/00103b0037c5aaea32fe1da1af553355) by Andrej Karpathy.
* [lexica.art](https://lexica.art/) for cool prompts.

Happy Learning! 😄

--------------------------------------------------------------------------------
/utils/pipeline.py:
--------------------------------------------------------------------------------
import numpy as np
from PIL import Image
import torch
from torch import autocast
from diffusers.schedulers import LMSDiscreteScheduler
from diffusers import StableDiffusionPipeline, \
                      StableDiffusionImg2ImgPipeline, \
                      StableDiffusionInpaintPipeline

from .utility import save_images, save_video, slerp
from animation_mode.animation import animate


class StableDiffusionPipe():
    """ Pipeline for Stable Diffusion model applications """
    def __init__(self, use_local_model: bool = True, device: str = "cpu") -> None:
        self.use_local_model = use_local_model
        self.device = device if device == "cpu" else "cuda"

    def TexttoImage(self, num_images: int = 1, save: bool = True, use_limited_mem: bool = True):
        """ Text to Image function """

        path = "./stable-diffusion-v1-4" if self.use_local_model else "CompVis/stable-diffusion-v1-4"
        local_files_only = self.use_local_model
        use_auth_token = not local_files_only

        # Get access token
        access_token = False
        if use_auth_token:
            access_token = input("\nEnter Hugging Face user access token: ")

        # Load the model
        print("\nLoading model...")
        pipe = StableDiffusionPipeline.from_pretrained(path, use_auth_token=access_token,
                                                       local_files_only=local_files_only,
                                                       torch_dtype=torch.float16, revision='fp16')
        pipe = pipe.to(self.device)
        print("\nModel loaded successfully")

        # Get prompt
        prompt = input("\nEnter prompt: ")
        height, width = input("\nEnter height and width of image: ").split()
        height = int(height)
        width = int(width)

        # Convert height and width to multiple of 64 for model.
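        # (The autoencoder works on 1/8-scale latents and the UNet downsamples them
        # further, so sizes that are not multiples of 64 can cause shape mismatches.)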
        height = height - height % 64
        width = width - width % 64

        # Generate images
        images = []
        if use_limited_mem:
            prompts = [prompt]
            for _ in range(1, num_images + 1):
                print("\nRunning Text to Image generation...")
                with autocast(self.device):
                    images.append(pipe(prompt=prompts, height=height, width=width).images[0])
        else:
            print("\nRunning Text to Image generation...")
            prompts = [prompt] * num_images
            images = pipe(prompt=prompts, height=height, width=width).images

        # Save images
        if save:
            print("Saving images...")
            save_images(images)

    def ImagetoImage(self, num_images: int = 1, save: bool = True, use_limited_mem: bool = True):
        """ Image to Image function """

        path = "./stable-diffusion-v1-4" if self.use_local_model else "CompVis/stable-diffusion-v1-4"
        local_files_only = self.use_local_model
        use_auth_token = not local_files_only

        # Get access token
        access_token = False
        if use_auth_token:
            access_token = input("\nEnter Hugging Face user access token: ")

        # Load the model
        print("\nLoading model...")
        pipe = StableDiffusionImg2ImgPipeline.from_pretrained(path, use_auth_token=access_token,
                                                              local_files_only=local_files_only,
                                                              torch_dtype=torch.float16, revision='fp16')
        pipe = pipe.to(self.device)
        print("\nModel loaded successfully")

        # Get prompt
        image_path = input("\nEnter initial image path: ")
        prompt = input("\nEnter prompt: ")
        strength = float(input("\nEnter strength in [0, 1] range: "))
        if not 0 <= strength <= 1:
            raise ValueError("{} is an invalid strength value. Enter strength in [0, 1] range.".format(strength))

        init_image = Image.open(image_path).convert("RGB")
        width, height = init_image.size

        # Convert height and width to multiple of 64 for model.
        width = width - width % 64
        height = height - height % 64
        init_image = init_image.resize((width, height))

        # Generate images
        images = []
        if use_limited_mem:
            prompts = [prompt]
            for _ in range(1, num_images + 1):
                print("\nRunning Image to Image generation...")
                with autocast(self.device):
                    images.append(pipe(prompt=prompts,
                                       init_image=init_image,
                                       strength=strength).images[0])
        else:
            print("\nRunning Image to Image generation...")
            prompts = [prompt] * num_images
            images = pipe(prompt=prompts,
                          init_image=init_image,
                          strength=strength).images

        # Save images
        if save:
            print("Saving images...")
            save_images(images)

    def Inpaint(self, num_images: int = 1, save: bool = True, use_limited_mem: bool = True):
        """ Inpaint function """

        path = "./stable-diffusion-v1-4" if self.use_local_model else "CompVis/stable-diffusion-v1-4"
        local_files_only = self.use_local_model
        use_auth_token = not local_files_only

        # Get access token
        access_token = False
        if use_auth_token:
            access_token = input("\nEnter Hugging Face user access token: ")

        # Load the model
        print("\nLoading model...")
        pipe = StableDiffusionInpaintPipeline.from_pretrained(path, use_auth_token=access_token,
                                                              local_files_only=local_files_only,
                                                              torch_dtype=torch.float16, revision='fp16')
        pipe = pipe.to(self.device)
        print("\nModel loaded successfully")

        # Get prompt
        image_path = input("\nEnter initial image path: ")
        mask_path = input("\nEnter mask image path: ")
        prompt = input("\nEnter prompt: ")
        strength = float(input("\nEnter strength in [0, 1] range: "))
        if not 0 <= strength <= 1:
            raise ValueError("{} is an invalid strength value. Enter strength in [0, 1] range.".format(strength))

        init_image = Image.open(image_path).convert("RGB")
        mask_image = Image.open(mask_path).convert("RGB")
        image_width, image_height = init_image.size
        mask_width, mask_height = mask_image.size

        if image_width != mask_width or image_height != mask_height:
            raise ValueError("Init image size must match mask image size.")

        # Convert height and width to multiple of 64 for model.
        image_width = image_width - image_width % 64
        image_height = image_height - image_height % 64
        init_image = init_image.resize((image_width, image_height))
        mask_image = mask_image.resize((image_width, image_height))

        # Generate images
        images = []
        if use_limited_mem:
            prompts = [prompt]
            for _ in range(1, num_images + 1):
                print("\nRunning Inpaint...")
                with autocast(self.device):
                    images.append(pipe(prompt=prompts,
                                       init_image=init_image,
                                       mask_image=mask_image,
                                       strength=strength).images[0])
        else:
            print("\nRunning Inpaint...")
            prompts = [prompt] * num_images
            images = pipe(prompt=prompts,
                          init_image=init_image,
                          mask_image=mask_image,
                          strength=strength).images

        # Save images
        if save:
            print("Saving images...")
            save_images(images)

    def Dream(self, num_images: int = 1, save: bool = True):
        """ Dream function """

        path = "./stable-diffusion-v1-4" if self.use_local_model else "CompVis/stable-diffusion-v1-4"
        local_files_only = self.use_local_model
        use_auth_token = not local_files_only

        # Get access token
        access_token = False
        if use_auth_token:
            access_token = input("\nEnter Hugging Face user access token: ")

        # Load the model
        print("\nLoading model...")
        lms = LMSDiscreteScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear")
        pipe = StableDiffusionPipeline.from_pretrained(path, use_auth_token=access_token,
                                                       local_files_only=local_files_only, scheduler=lms,
                                                       torch_dtype=torch.float16, revision='fp16')
        pipe = pipe.to(self.device)
        print("\nModel loaded successfully")

        # Get prompt
        prompt = input("\nEnter prompt: ")
        height, width = input("\nEnter height and width of image: ").split()
        height = int(height)
        width = int(width)

        # Convert height and width to multiple of 64 for model.
        height = height - height % 64
        width = width - width % 64

        source_latent = torch.randn((1, pipe.unet.in_channels, height // 8, width // 8), device=self.device)
        target_latent = torch.randn((1, pipe.unet.in_channels, height // 8, width // 8), device=self.device)

        images = []
        print("\nDreaming...")
        for t in np.linspace(0, 1, num_images):
            init_latent = slerp(float(t), source_latent, target_latent)

            with autocast("cuda"):
                image = pipe(prompt, latents=init_latent).images[0]
            if image.convert("L").getextrema() != (0, 0):  # check for black image
                images.append(image)

        # Save images and video
        if save:
            print("Saving images...")
            save_images(images)
            print("Saving video...")
            save_video(images, width, height)

    def Animate(self, save: bool = True):
        """ Animate function """
        print("\nUsing configurations from animation_mode/config.py")
        animate(self.use_local_model, save)

--------------------------------------------------------------------------------
/animation_mode/animation.py:
--------------------------------------------------------------------------------
import os
import pathlib
import sys
from types import SimpleNamespace
import cv2
import pandas as pd
import numpy as np
from pytorch_lightning import seed_everything
import torch
from torch import autocast
from torchvision import transforms
from diffusers import StableDiffusionPipeline, StableDiffusionImg2ImgPipeline
from PIL import Image

sys.path.extend([
    './animation_mode/src/taming-transformers',
    './animation_mode/src/clip',
    './animation_mode/stable-diffusion/',
    './animation_mode/k-diffusion',
    './animation_mode/AdaBins',
    './animation_mode/MiDaS',
    './animation_mode',
])

import config
from .utility.utils import *
from helpers import DepthModel


def generate(pipe,
             prompt,
             height,
             width,
             strength,
             seed,
             use_init,
             init_image,
             return_sample=False):
    """ Image generator """
    seed_everything(seed)
    device = "cuda"
    convert_tensor = transforms.ToTensor()

    results = []
    if use_init:
        with autocast(device):
            with torch.no_grad():
                image = pipe(prompt=prompt,
                             init_image=init_image,
                             strength=strength,
                             guidance_scale=config.guidance_scale,
                             num_inference_steps=config.num_inference_steps).images[0]
        torch.cuda.empty_cache()
    else:
        with autocast(device):
            with torch.no_grad():
                image = pipe(prompt=prompt, height=height, width=width).images[0]
        torch.cuda.empty_cache()

    if return_sample:
        samples = convert_tensor(image)
        results.append(samples)
    results.append(image)

    return results


def render_input_video(pipe_txt2img, pipe_img2img):
    """ Render an animation from an input video """
    # create a folder for the video input frames to live in
    video_in_frame_path = os.path.join(os.getcwd(), 'inputframes')
    os.makedirs(video_in_frame_path, exist_ok=True)

    # save the video frames from input video
    print(f"Exporting video frames (1 every {config.extract_nth_frame}) to {video_in_frame_path}...")
    try:
        for f in pathlib.Path(video_in_frame_path).glob('*.png'):
            f.unlink()
    except OSError:
        pass
    cap = cv2.VideoCapture(config.video_init_path)
    success, image = cap.read()
    count = 0
    while success:
        file_name = "inputframes/frame_" + str(count) + ".png"
        cv2.imwrite(file_name, image)
        success, image = cap.read()
        count = count + 1 + (config.extract_nth_frame - 1)
        if config.max_frames is not None and count > config.max_frames:
            break
    cap.release()

    # determine max frames from length of input frames
    num_frames = len([f for f in pathlib.Path(video_in_frame_path).glob('*.png')])

    print(f"Loading {num_frames} input frames from {video_in_frame_path} "
          f"and saving video frames to {video_in_frame_path}")
    render_animation(pipe_txt2img, pipe_img2img)


def render_animation(pipe_txt2img, pipe_img2img):
    """ Render 2D and 3D animations """
    device = "cuda"
    W, H = (config.width, config.height)
    depth_model = None
    models_path = "./models"
    init_image = None
    video_width, video_height = None, None

    angle_series = get_inbetweens(parse_key_frames(config.angle))
    zoom_series = get_inbetweens(parse_key_frames(config.zoom))
    translation_x_series = get_inbetweens(parse_key_frames(config.translation_x))
    translation_y_series = get_inbetweens(parse_key_frames(config.translation_y))
    translation_z_series = get_inbetweens(parse_key_frames(config.translation_z))
    rotation_3d_x_series = get_inbetweens(parse_key_frames(config.rotation_3d_x))
    rotation_3d_y_series = get_inbetweens(parse_key_frames(config.rotation_3d_y))
    rotation_3d_z_series = get_inbetweens(parse_key_frames(config.rotation_3d_z))
    strength_schedule_series = get_inbetweens(parse_key_frames(config.strength_schedule))
    midas_weight_dict = {"midas_weight": config.midas_weight}
    anim_args = SimpleNamespace(**midas_weight_dict)

    start_frame = 0
    outdir = os.path.join(os.getcwd(), r'images')
    if not os.path.exists(outdir):
        os.makedirs(outdir)
    print(f"\nSaving animation frames to {outdir}")

    # check for video inits
    using_vid_init = config.animation_mode == 'Video Input'
    use_init = using_vid_init

    max_frames = config.max_frames
    if using_vid_init:
        max_frames = len([f for f in pathlib.Path(os.path.join(os.getcwd(), 'inputframes')).glob('*.png')])
        cap = cv2.VideoCapture(config.video_init_path)
        video_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        video_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        cap.release()

    # expand prompts out to per-frame
    prompt_series = pd.Series([np.nan for a in range(max_frames)])
    for i, prompt in config.animation_prompts.items():
        prompt_series[int(i)] = prompt
    prompt_series = prompt_series.ffill().bfill()

    # load depth model for 3D
    predict_depths = (config.animation_mode == '3D' and config.use_depth_warping) or config.save_depth_maps
    if predict_depths:
        depth_model = DepthModel("cpu")
        depth_model.load_midas(models_path)
        if config.midas_weight < 1.0:
            depth_model.load_adabins()
    else:
        depth_model = None
        config.save_depth_maps = False

    turbo_steps = 1 if using_vid_init else int(config.diffusion_cadence)
    turbo_prev_image, turbo_prev_frame_idx = None, 0
    turbo_next_image, turbo_next_frame_idx = None, 0

    # resume animation
    prev_sample = None
    color_match_sample = None
    frame_idx = start_frame

    seed = config.seed
    while frame_idx < max_frames:
        print(f"\nRendering animation frame {frame_idx} of {max_frames}")
        strength = strength_schedule_series[frame_idx]
        strength = max(0.0, min(1.0, strength))
        depth = None

        # emit in-between frames
        if turbo_steps > 1:
            tween_frame_start_idx = max(0, frame_idx - turbo_steps)
            for tween_frame_idx in range(tween_frame_start_idx, frame_idx):
                tween = float(tween_frame_idx - tween_frame_start_idx + 1) / float(frame_idx - tween_frame_start_idx)
                print(f"creating in between frame {tween_frame_idx} tween:{tween:0.2f}")

                advance_prev = turbo_prev_image is not None and tween_frame_idx > turbo_prev_frame_idx
                advance_next = tween_frame_idx > turbo_next_frame_idx

                if depth_model is not None:
                    assert turbo_next_image is not None
                    depth_model.midas_model = depth_model.midas_model.to(device)
                    depth_model.device = device
                    with torch.no_grad():
                        depth = depth_model.predict(turbo_next_image, anim_args).cpu()
                    torch.cuda.empty_cache()
                    depth_model.midas_model = depth_model.midas_model.to("cpu")
                    depth_model.device = "cpu"

                if config.animation_mode == '2D':
                    if advance_prev:
                        turbo_prev_image = anim_frame_warp_2d(turbo_prev_image, W, H, angle_series, zoom_series,
                                                              translation_x_series, translation_y_series, tween_frame_idx)
                    if advance_next:
                        turbo_next_image = anim_frame_warp_2d(turbo_next_image, W, H, angle_series, zoom_series,
                                                              translation_x_series, translation_y_series, tween_frame_idx)
                else:  # '3D'
                    if advance_prev:
                        turbo_prev_image = anim_frame_warp_3d(turbo_prev_image,
                                                              depth,
                                                              translation_x_series,
                                                              translation_y_series,
                                                              translation_z_series,
                                                              rotation_3d_x_series,
                                                              rotation_3d_y_series,
                                                              rotation_3d_z_series,
                                                              config.near_plane,
                                                              config.far_plane,
                                                              config.fov,
                                                              config.sampling_mode,
                                                              config.padding_mode,
                                                              tween_frame_idx)
                    if advance_next:
                        turbo_next_image = anim_frame_warp_3d(turbo_next_image,
                                                              depth,
                                                              translation_x_series,
                                                              translation_y_series,
                                                              translation_z_series,
                                                              rotation_3d_x_series,
                                                              rotation_3d_y_series,
                                                              rotation_3d_z_series,
                                                              config.near_plane,
                                                              config.far_plane,
                                                              config.fov,
                                                              config.sampling_mode,
                                                              config.padding_mode,
                                                              tween_frame_idx)

                turbo_prev_frame_idx = turbo_next_frame_idx = tween_frame_idx

                if turbo_prev_image is not None and tween < 1.0:
                    img = turbo_prev_image * (1.0 - tween) + turbo_next_image * tween
                else:
                    img = turbo_next_image

                # apply color matching
                if config.color_coherence != 'None':
                    if color_match_sample is not None:
                        img = maintain_colors(img, color_match_sample, config.color_coherence)

                # smoothen image
                if config.smooth != 'None':
                    img = smoothen_image(Image.fromarray(img.astype(np.uint8)), config.smooth)
                    img = np.array(img)

                init_image = Image.fromarray(img.astype(np.uint8))
                filename = f"frame_{tween_frame_idx}.png"
                cv2.imwrite(os.path.join(outdir, filename), cv2.cvtColor(img.astype(np.uint8), cv2.COLOR_RGB2BGR))
                if config.save_depth_maps:
                    depth_model.save(os.path.join(outdir, f"depth_{tween_frame_idx:05}.png"), depth)
            if turbo_next_image is not None:
                prev_sample = turbo_next_image

        # apply transforms to previous frame
        if prev_sample is not None:
            if config.animation_mode == '2D':
                prev_img = anim_frame_warp_2d(prev_sample, W, H, angle_series, zoom_series,
                                              translation_x_series, translation_y_series, frame_idx)
            else:  # '3D'
                prev_img_cv2 = prev_sample
                depth_model.midas_model = depth_model.midas_model.to(device)
                depth_model.device = device
                with torch.no_grad():
                    depth = depth_model.predict(prev_img_cv2, anim_args).cpu() if depth_model else None
                torch.cuda.empty_cache()
                depth_model.midas_model = depth_model.midas_model.to("cpu")
                depth_model.device = "cpu"
                prev_img = anim_frame_warp_3d(prev_img_cv2,
                                              depth,
                                              translation_x_series,
                                              translation_y_series,
                                              translation_z_series,
                                              rotation_3d_x_series,
                                              rotation_3d_y_series,
                                              rotation_3d_z_series,
                                              config.near_plane,
                                              config.far_plane,
                                              config.fov,
                                              config.sampling_mode,
                                              config.padding_mode,
                                              frame_idx)

            if config.color_coherence != 'None':
                if color_match_sample is None:
                    color_match_sample = prev_img.copy()

            use_init = True

        # grab prompt for current frame
        prompt = prompt_series[frame_idx]
        print(f"\nSeed: {seed}\nPrompt: {prompt} \n")

        # grab init image for current frame
        if using_vid_init:
            init_frame = "./inputframes/" + "frame_" + str(frame_idx) + ".png"
            print(f"\nUsing video init frame {init_frame}")
            try:
                init_image = load_img(init_frame, (config.width, config.height))
            except OSError:
                frame_idx += 1
                continue

        # sample the diffusion model
        torch.cuda.empty_cache()
        if use_init:
            pipe_img2img = pipe_img2img.to(device)
            sample, image = generate(pipe_img2img, prompt, H, W,
                                     strength, seed, use_init, init_image, return_sample=True)
            pipe_img2img.to("cpu")
        else:
            pipe_txt2img = pipe_txt2img.to(device)
            sample, image = generate(pipe_txt2img, prompt, H, W,
                                     strength, seed, use_init, init_image, return_sample=True)
            pipe_txt2img.to("cpu")

        torch.cuda.empty_cache()
        if not using_vid_init:
            prev_sample = sample

        if turbo_steps > 1:
            turbo_prev_image, turbo_prev_frame_idx = turbo_next_image, turbo_next_frame_idx
            turbo_next_image, turbo_next_frame_idx = sample_to_cv2(sample, type=np.float32), frame_idx
            frame_idx += turbo_steps
        else:
            filename = f"frame_{frame_idx}.png"
            if using_vid_init and config.video_same_size:
                image = image.resize((video_width, video_height), resample=Image.LANCZOS)
            if image.convert("L").getextrema() != (0, 0):  # check for black image
                image.save(os.path.join(outdir, filename))
            if config.save_depth_maps:
                if depth is None:
                    depth = depth_model.predict(sample_to_cv2(sample), anim_args)
                depth_model.save(os.path.join(outdir, f"depth_{frame_idx:05}.png"), depth)
            frame_idx += 1

        seed = next_seed(seed, config.seed_behavior)


def animate(use_local_model, save):
    """ Top level function for animate 2D, animate 3D, and animate video """
    path = "./stable-diffusion-v1-4" if use_local_model else "CompVis/stable-diffusion-v1-4"
    local_files_only = use_local_model
    use_auth_token = not local_files_only

    # Get access token
    access_token = False
    if use_auth_token:
        access_token = input("\nEnter Hugging Face user access token: ")

    print(f"\nMax cuda memory reserved before running the app: "
          f"{torch.cuda.max_memory_reserved(torch.device('cuda'))} bytes\n")
    print("\nLoading Diffusion model...")
    pipe_txt2img = StableDiffusionPipeline.from_pretrained(path,
                                                           use_auth_token=access_token,
                                                           local_files_only=local_files_only,
                                                           torch_dtype=torch.float16,
                                                           revision='fp16')
    pipe_img2img = StableDiffusionImg2ImgPipeline.from_pretrained(path,
                                                                  use_auth_token=access_token,
                                                                  local_files_only=local_files_only,
                                                                  torch_dtype=torch.float16,
                                                                  revision='fp16')
    print("\nModel loaded successfully")

    if config.animation_mode in ('2D', '3D'):
        render_animation(pipe_txt2img, pipe_img2img)
    elif config.animation_mode == 'Video Input':
        render_input_video(pipe_txt2img, pipe_img2img)
    else:
        print(f"\nInvalid animation mode {config.animation_mode}. "
              "Supported modes = [2D, 3D, Video Input].")

    if save:
        save_video(config.width, config.height)

--------------------------------------------------------------------------------