├── images ├── 3.png ├── 49.png ├── 78.png ├── 93.png ├── rose.png ├── flower.png ├── img2img_1.png ├── mask_rose.png └── inpaint_rose.png ├── requirements.txt ├── setup.py ├── animation_mode ├── config.py ├── setup.py ├── README.md ├── utility │ └── utils.py └── animation.py ├── LICENSE ├── utils ├── utility.py └── pipeline.py ├── run.py └── README.md /images/3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Logeswaran123/Stable-Diffusion-Playground/HEAD/images/3.png -------------------------------------------------------------------------------- /images/49.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Logeswaran123/Stable-Diffusion-Playground/HEAD/images/49.png -------------------------------------------------------------------------------- /images/78.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Logeswaran123/Stable-Diffusion-Playground/HEAD/images/78.png -------------------------------------------------------------------------------- /images/93.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Logeswaran123/Stable-Diffusion-Playground/HEAD/images/93.png -------------------------------------------------------------------------------- /images/rose.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Logeswaran123/Stable-Diffusion-Playground/HEAD/images/rose.png -------------------------------------------------------------------------------- /images/flower.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Logeswaran123/Stable-Diffusion-Playground/HEAD/images/flower.png -------------------------------------------------------------------------------- /images/img2img_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Logeswaran123/Stable-Diffusion-Playground/HEAD/images/img2img_1.png -------------------------------------------------------------------------------- /images/mask_rose.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Logeswaran123/Stable-Diffusion-Playground/HEAD/images/mask_rose.png -------------------------------------------------------------------------------- /images/inpaint_rose.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Logeswaran123/Stable-Diffusion-Playground/HEAD/images/inpaint_rose.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | argparse==1.1 2 | diffusers==0.3.0 3 | einops==0.4.1 4 | jsonmerge==1.8.0 5 | numpy==1.22.4 6 | opencv-python==4.6.0.66 7 | pandas==1.4.2 8 | pytorch-lightning==1.7.4 9 | scikit-image==0.19.3 10 | timm==0.6.7 11 | torchdiffeq==0.2.3 12 | transformers==4.21.2 13 | Pillow==9.0.1 14 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # Setup for torch, torchvision 2 | 3 | import subprocess 4 | import time 5 | 6 | 7 | print("Setting up environment...") 
8 | start_time = time.time() 9 | 10 | all_process = [ 11 | ['pip', 'install', 'torch==1.12.1+cu116', 'torchvision==0.13.1+cu116', '--extra-index-url', 'https://download.pytorch.org/whl/cu116'], 12 | ] 13 | 14 | for process in all_process: 15 | running = subprocess.run(process,stdout=subprocess.PIPE).stdout.decode('utf-8') 16 | 17 | end_time = time.time() 18 | print(f"Environment set up in {end_time-start_time:.0f} seconds") -------------------------------------------------------------------------------- /animation_mode/config.py: -------------------------------------------------------------------------------- 1 | FPS=30 2 | width=512 3 | height=512 4 | max_frames=300 5 | seed=1185529623 6 | seed_behavior="iter" 7 | animation_mode="3D" 8 | 9 | guidance_scale=2.5 10 | num_inference_steps=50 11 | diffusion_cadence=3 12 | border="wrap" 13 | angle="0:(0)" 14 | zoom="0:(0.0)" 15 | translation_x="0:(0)" 16 | translation_y="0:(0.0)" 17 | translation_z="0:(-2.5)" 18 | rotation_3d_x="0:(0)" 19 | rotation_3d_y="0:(0)" 20 | rotation_3d_z="0:(0)" 21 | strength_schedule="0:(0.55)" 22 | color_coherence="Match Frame 0 RGB" 23 | smooth='None' 24 | use_depth_warping=True 25 | midas_weight=0.3 26 | near_plane=200 27 | far_plane=10000 28 | fov=40 29 | padding_mode="border" 30 | sampling_mode="bicubic" 31 | save_depth_maps=False 32 | 33 | video_init_path="" 34 | video_same_size=True 35 | extract_nth_frame=1 36 | 37 | animation_prompts={ 38 | "0":"detailed scroll painting of plants, trees and ocean by hokusai, 8k, sharp!!!",} 39 | -------------------------------------------------------------------------------- /animation_mode/setup.py: -------------------------------------------------------------------------------- 1 | # Setup for animate mode 2 | 3 | import subprocess 4 | import time 5 | 6 | 7 | print("Setting up environment...") 8 | start_time = time.time() 9 | 10 | all_process = [ 11 | ['git', 'clone', 'https://github.com/deforum/stable-diffusion'], 12 | ['git', 'clone', 'https://github.com/shariqfarooq123/AdaBins.git'], 13 | ['git', 'clone', 'https://github.com/isl-org/MiDaS.git'], 14 | ['git', 'clone', 'https://github.com/MSFTserver/pytorch3d-lite.git'], 15 | ['pip', 'install', '-e', 'git+https://github.com/CompVis/taming-transformers.git@master#egg=taming-transformers'], 16 | ['pip', 'install', '-e', 'git+https://github.com/openai/CLIP.git@main#egg=clip'], 17 | ] 18 | 19 | for process in all_process: 20 | running = subprocess.run(process,stdout=subprocess.PIPE).stdout.decode('utf-8') 21 | 22 | print(subprocess.run(['git', 'clone', 'https://github.com/deforum/k-diffusion/'], stdout=subprocess.PIPE).stdout.decode('utf-8')) 23 | with open('k-diffusion/k_diffusion/__init__.py', 'w') as f: 24 | f.write('') 25 | 26 | end_time = time.time() 27 | print(f"Environment set up in {end_time-start_time:.0f} seconds") -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Logeswaran Sivakumar 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The 
above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /utils/utility.py: -------------------------------------------------------------------------------- 1 | import os 2 | import cv2 3 | import numpy as np 4 | import torch 5 | 6 | FPS = 24 7 | 8 | def slerp(t, v0, v1, DOT_THRESHOLD=0.9995): 9 | """ Helper function to spherically interpolate two arrays """ 10 | 11 | if not isinstance(v0, np.ndarray): 12 | inputs_are_torch = True 13 | input_device = v0.device 14 | v0 = v0.cpu().numpy() 15 | v1 = v1.cpu().numpy() 16 | 17 | dot = np.sum(v0 * v1 / (np.linalg.norm(v0) * np.linalg.norm(v1))) 18 | if np.abs(dot) > DOT_THRESHOLD: 19 | v2 = (1 - t) * v0 + t * v1 20 | else: 21 | theta_0 = np.arccos(dot) 22 | sin_theta_0 = np.sin(theta_0) 23 | theta_t = theta_0 * t 24 | sin_theta_t = np.sin(theta_t) 25 | s0 = np.sin(theta_0 - theta_t) / sin_theta_0 26 | s1 = sin_theta_t / sin_theta_0 27 | v2 = s0 * v0 + s1 * v1 28 | 29 | if inputs_are_torch: 30 | v2 = torch.from_numpy(v2).to(input_device) 31 | 32 | return v2 33 | 34 | def save_images(images): 35 | """ Helper function to save images """ 36 | save_dir = os.path.join(os.getcwd(), r'images') 37 | if not os.path.exists(save_dir): 38 | os.makedirs(save_dir) 39 | for i, image in enumerate(images): 40 | image.save("images/image_" + str(i) + ".png") 41 | 42 | def save_video(images, width, height): 43 | """ Helper function to create and save video """ 44 | save_dir = os.path.join(os.getcwd(), r'images') 45 | if not os.path.exists(save_dir): 46 | os.makedirs(save_dir) 47 | 48 | out = cv2.VideoWriter("images/output.avi", # video file name 49 | cv2.VideoWriter_fourcc(*'MJPG'), # fourcc format 50 | FPS, # video fps 51 | (width, height) # (frame width, frame height) 52 | ) 53 | for _, pil_image in enumerate(images): 54 | out.write(cv2.cvtColor(np.array(pil_image), cv2.COLOR_RGB2BGR)) 55 | out.release() 56 | -------------------------------------------------------------------------------- /run.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | from utils.pipeline import StableDiffusionPipe 4 | 5 | if __name__ == '__main__': 6 | parser = argparse.ArgumentParser() 7 | parser.add_argument('-n', "--num", required=False, default=1, 8 | help="Number of images to generate", type=int) 9 | parser.add_argument('-l', "--local", required=False, action='store_true', default=False, 10 | help="local model or download from huggingface") 11 | parser.add_argument('-s', "--save", required=False, action='store_true', default=False, 12 | help="Save generated image") 13 | parser.add_argument('-d', "--device", required=False, default="gpu", choices=["cpu", "gpu"], 14 | help="cpu or gpu device", type=str) 15 | parser.add_argument('-m', "--mode", required=True, default="txt2img", 16 | choices=["txt2img", "img2img", "inpaint", "dream", "animate"], 17 | help="Select the mode", 
type=str) 18 | parser.add_argument('-limit', "--limit", required=False, action='store_true', default=True, 19 | help="Limited memory usage") 20 | 21 | args = parser.parse_args() 22 | num_images = args.num 23 | is_local_model = args.local 24 | save = args.save 25 | device = args.device 26 | mode = args.mode 27 | limit = args.limit 28 | 29 | pipe = StableDiffusionPipe(is_local_model, device) 30 | 31 | if mode.lower() == "txt2img": 32 | pipe.TexttoImage(num_images, save, limit) 33 | elif mode.lower() == "img2img": 34 | pipe.ImagetoImage(num_images, save, limit) 35 | elif mode.lower() == "inpaint": 36 | pipe.Inpaint(num_images, save, limit) 37 | elif mode.lower() == "dream": 38 | pipe.Dream(num_images, save) 39 | elif mode.lower() == "animate": 40 | pipe.Animate(save) 41 | else: 42 | print(f"\n {mode} is an invalid mode. Select a valid mode.") 43 | -------------------------------------------------------------------------------- /animation_mode/README.md: -------------------------------------------------------------------------------- 1 | ## Animate mode 2 | Animate mode can generate "2D" or "3D" videos from input prompts. Also, it can perform Video-to-Video conversion of a "Video Input" based on input prompts. 3 | 4 | ### Run command 5 | Clone the repo, and run the following commands from the Stable-Diffusion-Playground directory. 6 | ```python 7 | pip install -r requirements.txt 8 | python setup.py 9 | mkdir models 10 | mkdir pretrained 11 | cd animation_mode 12 | python setup.py 13 | cd .. 14 | ``` 15 | ```python 16 | python run.py --mode animate --save 17 | ``` 18 |<br>
19 | Animate mode uses configurations specified in ./animation_mode/config.py. Specify the configurations for video generation in this file. 20 | 21 | ### Configurations 22 | | Argument | Description | Choices | 23 | | ---------------- |:-------------------------------:|:-------------------------:| 24 | | FPS | Frame rate of the output video | Integer number | 25 | | width | width of the frame | Integer number | 26 | | height | height of the frame | Integer number | 27 | | max_frames | Number of frames in the video | Integer number | 28 | | seed | Seed value for frame generation | Integer number | 29 | | seed_behavior | Seed mode | "iter", "fixed" | 30 | | animation_mode | Mode of animation | "2D", "3D", "Video Input" | 31 | | guidance_scale | Indicates how much output should be linked to prompt | Float number.
Allowed: guidance_scale > 1. | 32 | | num_inference_steps | Number of denoising steps | Integer number | 33 | | diffusion_cadence | number of frames to generate between frames | Integer number.
Allowed: > 1 for "2D", "3D" animation_mode | 34 | | border | Border mode used in image transformation | "wrap", "replicate" | 35 | | angle | Angle of rotation in degrees | String. Format: "frame_id:(value)".
frame_id - integer, value - integer | 36 | | zoom | Amount of zoom | String. Format: "frame_id:(value)".
frame_id - integer, value - float | 37 | | translation_x | Amount translation along X-axis | String. Format: "frame_id:(value)".
frame_id - integer, value - float | 38 | | translation_y | Amount translation along Y-axis | String. Format: "frame_id:(value)".
frame_id - integer, value - float | 39 | | translation_z | Amount translation along Z-axis | String. Format: "frame_id:(value)".
frame_id - integer, value - float | 40 | | rotation_3d_x | Amount rotation parallel to X-axis.
Used only for "3D" animation_mode | String. Format: "frame_id:(value)".
frame_id - integer, value - float | 41 | | rotation_3d_y | Amount rotation parallel to Y-axis.
Used only for "3D" animation_mode | String. Format: "frame_id:(value)".
frame_id - integer, value - float | 42 | | rotation_3d_z | Amount rotation parallel to Z-axis.
Used only for "3D" animation_mode | String. Format: "frame_id:(value)".
frame_id - integer, value - float | 43 | | strength_schedule | Indicates how much to transform the current frame from previous frame | String. Format: "frame_id:(value)".
frame_id - integer, value - float.
Allowed: values in range (0, 1] | 44 | | color_coherence | Match the color of generated frames to first frame | "None", "Match Frame 0 RGB", "Match Frame 0 HSV", "Match Frame 0 LAB" | 45 | | smooth | Smoothen image | "None", "Smooth", "Smooth_more" | 46 | | use_depth_warping | Warp image by depth prediction.
Used only for "3D" animation_mode | Bool.
Allowed: True, False | 47 | | midas_weight | If <1.0, loads AdaBins model, else loads midas model.
Used only for "3D" animation_mode | Float number.
Allowed: midas_weight > 0.0 | 48 | | near_plane | Used in image transformation. Used in py3d_tools.FoVPerspectiveCameras.
Used only for "3D" animation_mode | Integer number | 49 | | far_plane | Used in image transformation. Used in py3d_tools.FoVPerspectiveCameras.
Used only for "3D" animation_mode | Integer number | 50 | | fov | Used in image transformation. Used in py3d_tools.FoVPerspectiveCameras.
Used only for "3D" animation_mode | Integer number | 51 | | padding_mode | Padding mode in image transformation.
Used in torch.nn.functional.grid_sample.
Used only for "3D" animation_mode | "zeros", "border", "reflection" | 52 | | sampling_mode | Sampling mode in image transformation.
Used in torch.nn.functional.grid_sample.
Used only for "3D" animation_mode | "bilinear", "nearest", "bicubic" | 53 | | save_depth_maps | Save the predicted depth maps | Bool.
Allowed: True, False | 54 | | video_init_path | Path to video file.
Used only for "Video Input" animation_mode | String | 55 | | video_same_size | Indicates if output video should be same size as input video.
Used only for "Video Input" animation_mode | Bool.
Allowed: True, False | 56 | | extract_nth_frame | Extract every nth frame from video.
Used only for "Video Input" animation_mode | Integer number | 57 | | animation_prompts | Dictionary with key as frame id and value as prompt | Format: {"frame_id":prompt}.<br>
frame_id - Integer number, prompt - String | 58 | 59 |<br>
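As a quick illustration, a "3D" run that translates the camera along the Z-axis and ramps in a Y-axis rotation could use values like the following in ./animation_mode/config.py (an illustrative excerpt with example values only, not defaults; the schedule-string format is explained in the Note below):

```python
# Illustrative excerpt of animation_mode/config.py (example values, not defaults)
animation_mode = "3D"
max_frames = 120
seed_behavior = "iter"

translation_z = "0:(-2.5)"          # Z translation schedule, keyed at frame 0
rotation_3d_y = "0:(0),60:(1.0)"    # Y-axis rotation schedule, keyed at frames 0 and 60
strength_schedule = "0:(0.55)"      # how strongly each frame departs from the previous one

animation_prompts = {
    "0": "detailed scroll painting of plants, trees and ocean by hokusai, 8k, sharp!!!",
}
```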
60 | Note:
61 | angle, zoom, translation_x, translation_y, translation_z, rotation_3d_x, rotation_3d_y, rotation_3d_z, strength_schedule can take a series of values.
62 | It should be in format: "frame_id:(value),frame_id:(value),..."

63 | For example, angle="0:(0),10:(30),20:(-30)". This means that from frame 0 through frame 9 the frames have no angle change; from frame 10 through frame 19 the frames rotate clockwise by 30 degrees; and from frame 20 until the end of the video (or max_frames) the frames rotate anti-clockwise by 30 degrees. 64 | 65 | --- 66 | 67 | animation_prompts can take a series of prompts.<br>
68 | It should be in format: {"frame_id":prompt, "frame_id":prompt, ...}

69 | For example, animation_prompts = {"0":"White clouds in blue sky, realistic, 8k!!!", "100":"Aeroplane in blue sky, realistic!!"}. This means from frame 0 till frame 99, the generated frames will be based on prompt "White clouds in blue sky, realistic, 8k!!!". Then, from frame 100 till end of video or max_frames, the frames will be based on prompt "Aeroplane in blue sky, realistic!!". 70 | 71 | --- 72 | -------------------------------------------------------------------------------- /animation_mode/utility/utils.py: -------------------------------------------------------------------------------- 1 | import math 2 | import os 3 | import random 4 | import requests 5 | 6 | import sys 7 | import cv2 8 | from einops import rearrange 9 | import numpy as np 10 | import pandas as pd 11 | from skimage.exposure import match_histograms 12 | import torch 13 | from PIL import Image, ImageFilter 14 | 15 | sys.path.extend([ 16 | './animation_mode/pytorch3d-lite', 17 | ]) 18 | 19 | from ..config import * 20 | import py3d_tools as p3d 21 | 22 | 23 | def add_noise(sample: torch.Tensor, noise_amt: float) -> torch.Tensor: 24 | return sample + torch.randn(sample.shape, device=sample.device) * noise_amt 25 | 26 | def anim_frame_warp_2d(prev_img_cv2, 27 | W, 28 | H, 29 | angle_series, 30 | zoom_series, 31 | translation_x_series, 32 | translation_y_series, 33 | frame_idx): 34 | angle = angle_series[frame_idx] 35 | zoom = zoom_series[frame_idx] 36 | translation_x = translation_x_series[frame_idx] 37 | translation_y = translation_y_series[frame_idx] 38 | 39 | center = (W // 2, H // 2) 40 | trans_mat = np.float32([[1, 0, translation_x], [0, 1, translation_y]]) 41 | rot_mat = cv2.getRotationMatrix2D(center, angle, zoom) 42 | trans_mat = np.vstack([trans_mat, [0, 0, 1]]) 43 | rot_mat = np.vstack([rot_mat, [0, 0, 1]]) 44 | xform = np.matmul(rot_mat, trans_mat) 45 | 46 | return cv2.warpPerspective( 47 | prev_img_cv2, 48 | xform, 49 | (prev_img_cv2.shape[1], prev_img_cv2.shape[0]), 50 | borderMode=cv2.BORDER_WRAP if border == 'wrap' else cv2.BORDER_REPLICATE 51 | ) 52 | 53 | def anim_frame_warp_3d(prev_img_cv2, 54 | depth, 55 | translation_x_series, 56 | translation_y_series, 57 | translation_z_series, 58 | rotation_3d_x_series, 59 | rotation_3d_y_series, 60 | rotation_3d_z_series, 61 | near_plane, 62 | far_plane, 63 | fov, 64 | sampling_mode, 65 | padding_mode, 66 | frame_idx): 67 | device = "cuda" 68 | TRANSLATION_SCALE = 1.0/200.0 # matches Disco 69 | translate_xyz = [ 70 | -translation_x_series[frame_idx] * TRANSLATION_SCALE, 71 | translation_y_series[frame_idx] * TRANSLATION_SCALE, 72 | -translation_z_series[frame_idx] * TRANSLATION_SCALE 73 | ] 74 | rotate_xyz = [ 75 | math.radians(rotation_3d_x_series[frame_idx]), 76 | math.radians(rotation_3d_y_series[frame_idx]), 77 | math.radians(rotation_3d_z_series[frame_idx]) 78 | ] 79 | rot_mat = p3d.euler_angles_to_matrix(torch.tensor(rotate_xyz, device=device), "XYZ").unsqueeze(0) 80 | result = transform_image_3d(prev_img_cv2, depth, rot_mat, translate_xyz, \ 81 | near_plane, far_plane, fov, sampling_mode, padding_mode) 82 | torch.cuda.empty_cache() 83 | return result 84 | 85 | def get_inbetweens(key_frames, integer=False, interp_method='Linear'): 86 | key_frame_series = pd.Series([np.nan for a in range(max_frames)]) 87 | 88 | for i, value in key_frames.items(): 89 | key_frame_series[i] = value 90 | key_frame_series = key_frame_series.astype(float) 91 | 92 | if interp_method == 'Cubic' and len(key_frames.items()) <= 3: 93 | interp_method = 'Quadratic' 94 | if 
interp_method == 'Quadratic' and len(key_frames.items()) <= 2: 95 | interp_method = 'Linear' 96 | 97 | key_frame_series[0] = key_frame_series[key_frame_series.first_valid_index()] 98 | key_frame_series[max_frames-1] = key_frame_series[key_frame_series.last_valid_index()] 99 | key_frame_series = key_frame_series.interpolate(method=interp_method.lower(), limit_direction='both') 100 | if integer: 101 | return key_frame_series.astype(int) 102 | return key_frame_series 103 | 104 | def load_img(path, shape): 105 | if path.startswith('http://') or path.startswith('https://'): 106 | image = Image.open(requests.get(path, stream=True).raw) 107 | else: 108 | image = Image.open(path) 109 | 110 | image = image.resize(shape, resample=Image.LANCZOS) 111 | 112 | return image 113 | 114 | def maintain_colors(prev_img, color_match_sample, mode): 115 | if mode == 'Match Frame 0 RGB': 116 | return match_histograms(prev_img, color_match_sample, multichannel=True) 117 | elif mode == 'Match Frame 0 HSV': 118 | prev_img_hsv = cv2.cvtColor(prev_img, cv2.COLOR_RGB2HSV) 119 | color_match_hsv = cv2.cvtColor(color_match_sample, cv2.COLOR_RGB2HSV) 120 | matched_hsv = match_histograms(prev_img_hsv, color_match_hsv, multichannel=True) 121 | return cv2.cvtColor(matched_hsv, cv2.COLOR_HSV2RGB) 122 | else: # 'Match Frame 0 LAB' 123 | prev_img_lab = cv2.cvtColor(prev_img, cv2.COLOR_RGB2LAB) 124 | color_match_lab = cv2.cvtColor(color_match_sample, cv2.COLOR_RGB2LAB) 125 | matched_lab = match_histograms(prev_img_lab, color_match_lab, multichannel=True) 126 | return cv2.cvtColor(matched_lab, cv2.COLOR_LAB2RGB) 127 | 128 | def next_seed(seed, seed_behavior): 129 | if seed_behavior == 'iter': 130 | seed += 1 131 | elif seed_behavior == 'fixed': 132 | pass # always keep seed the same 133 | else: 134 | seed = random.randint(0, 2**32) 135 | return seed 136 | 137 | def parse_key_frames(string, prompt_parser=None): 138 | import re 139 | pattern = r'((?P[0-9]+):[\s]*[\(](?P[\S\s]*?)[\)])' 140 | frames = dict() 141 | for match_object in re.finditer(pattern, string): 142 | frame = int(match_object.groupdict()['frame']) 143 | param = match_object.groupdict()['param'] 144 | if prompt_parser: 145 | frames[frame] = prompt_parser(param) 146 | else: 147 | frames[frame] = param 148 | if frames == {} and len(string) != 0: 149 | raise RuntimeError('Key Frame string not correctly formatted') 150 | return frames 151 | 152 | def sample_from_cv2(sample: np.ndarray) -> torch.Tensor: 153 | sample = ((sample.astype(float) / 255.0) * 2) - 1 154 | sample = sample[None].transpose(0, 3, 1, 2).astype(np.float16) 155 | sample = torch.from_numpy(sample) 156 | return sample 157 | 158 | def sample_to_cv2(sample: torch.Tensor, type=np.uint8) -> np.ndarray: 159 | sample_f32 = rearrange(sample.squeeze().cpu().numpy(), "c h w -> h w c").astype(np.float32) 160 | sample_int8 = (sample_f32 * 255) 161 | return sample_int8.astype(type) 162 | 163 | def save_video(width, height): 164 | """ Helper function to create and save video """ 165 | frames_dir = os.path.join(os.getcwd(), r'images') 166 | if not os.path.exists(frames_dir): 167 | print(f"\nNo generated {frames_dir} dir found.") 168 | return 169 | 170 | save_dir = os.path.join(os.getcwd(), r'out_video') 171 | if not os.path.exists(save_dir): 172 | os.makedirs(save_dir) 173 | 174 | print("\nCreating video from generated frames...") 175 | out = cv2.VideoWriter("out_video/output.avi", # video file name 176 | cv2.VideoWriter_fourcc(*'MJPG'), # fourcc format 177 | FPS, # video fps 178 | (width, height) # (frame width, frame 
height) 179 | ) 180 | for count in range(0, max_frames): 181 | filename = "frame_" + str(count) + ".png" 182 | try: 183 | out.write(cv2.imread(os.path.join(frames_dir, filename))) 184 | except: 185 | pass 186 | out.release() 187 | print(f"\nVideo saved in {os.path.join(save_dir, 'out_video.avi')}") 188 | 189 | def smoothen_image(image, mode): 190 | if mode == 'Smooth': 191 | return image.filter(ImageFilter.SMOOTH) 192 | else: # 'SMOOTH_MORE' 193 | return image.filter(ImageFilter.SMOOTH_MORE) 194 | 195 | def transform_image_3d(prev_img_cv2, 196 | depth_tensor, 197 | rot_mat, 198 | translate, 199 | near_plane, 200 | far_plane, 201 | fov, 202 | sampling_mode, 203 | padding_mode): 204 | # adapted and optimized version of transform_image_3d 205 | # from Disco Diffusion https://github.com/alembics/disco-diffusion 206 | device = "cuda" 207 | w, h = prev_img_cv2.shape[1], prev_img_cv2.shape[0] 208 | 209 | aspect_ratio = float(w) / float(h) 210 | near, far, fov_deg = near_plane, far_plane, fov 211 | persp_cam_old = p3d.FoVPerspectiveCameras(near, far, aspect_ratio, fov=fov_deg, degrees=True, device=device) 212 | persp_cam_new = p3d.FoVPerspectiveCameras(near, far, aspect_ratio, fov=fov_deg, degrees=True, \ 213 | R=rot_mat, T=torch.tensor([translate]), device=device) 214 | 215 | # range of [-1,1] is important to torch grid_sample's padding handling 216 | y, x = torch.meshgrid(torch.linspace(-1., 1. , h, dtype=torch.float32, device=device), \ 217 | torch.linspace(-1., 1., w, dtype=torch.float32, device=device)) 218 | z = torch.as_tensor(depth_tensor, dtype=torch.float32, device=device) 219 | xyz_old_world = torch.stack((x.flatten(), y.flatten(), z.flatten()), dim=1) 220 | 221 | xyz_old_cam_xy = persp_cam_old.get_full_projection_transform().transform_points(xyz_old_world)[:,0:2] 222 | xyz_new_cam_xy = persp_cam_new.get_full_projection_transform().transform_points(xyz_old_world)[:,0:2] 223 | 224 | offset_xy = xyz_new_cam_xy - xyz_old_cam_xy 225 | # affine_grid theta param expects a batch of 2D mats. Each is 2x3 to do rotation+translation. 226 | identity_2d_batch = torch.tensor([[1.,0.,0.], [0.,1.,0.]], device=device).unsqueeze(0) 227 | # coords_2d will have shape (N,H,W,2).. which is also what grid_sample needs. 228 | coords_2d = torch.nn.functional.affine_grid(identity_2d_batch, [1, 1, h, w], align_corners=False) 229 | offset_coords_2d = coords_2d - torch.reshape(offset_xy, (h,w,2)).unsqueeze(0) 230 | 231 | image_tensor = rearrange(torch.from_numpy(prev_img_cv2.astype(np.float32)), 'h w c -> c h w').to(device) 232 | new_image = torch.nn.functional.grid_sample( 233 | image_tensor.add(1/512 - 0.0001).unsqueeze(0), 234 | offset_coords_2d, 235 | mode=sampling_mode, 236 | padding_mode=padding_mode, 237 | align_corners=False 238 | ) 239 | 240 | # convert back to cv2 style numpy array 241 | result = rearrange( 242 | new_image.squeeze().clamp(0,255), 243 | 'c h w -> h w c' 244 | ).cpu().numpy().astype(prev_img_cv2.dtype) 245 | return result 246 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ⛹️‍♀️:basketball: Stable-Diffusion-Playground :soccer:⛹️ 2 | [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://github.com/Logeswaran123/Stable-Diffusion-Playground/blob/main/LICENSE) 3 | 4 | An application that generates images or videos using Stable Diffusion models. 5 | 6 | ## Description :scroll: 7 | What is the term "diffusion"?
8 | 9 | From Wikipedia, "Diffusion is the net movement of anything (for example, atoms, ions, molecules, energy) generally from a region of higher concentration to a region of lower concentration."
10 | 11 | Similar to the definition, diffusion models apply noise to an image sequentially across multiple steps in the forward pass; this essentially diffuses the pixels. In the backward pass, the noisy image is denoised over the same number of steps. Since it is a sequential process, there is less chance of mode collapse (a common problem with GANs) occurring.<br>
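As a rough, purely illustrative sketch of the forward (noising) process described above (this is not the code used by this app):

```python
import torch

def forward_diffuse(image: torch.Tensor, steps: int = 1000) -> torch.Tensor:
    """Toy forward process: blend a little Gaussian noise into the image at every step."""
    x = image
    for _ in range(steps):
        noise = torch.randn_like(x)
        x = (0.99 ** 0.5) * x + (0.01 ** 0.5) * noise  # fixed toy schedule
    return x  # after enough steps, x is close to pure Gaussian noise
```

The reverse pass learns to undo these steps one at a time, which is what the denoising UNet does.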
12 | 13 | Most diffusion models use a UNet architecture to preserve the dimensionality of the image. Diffusion is usually applied in pixel space, but Stable Diffusion applies it in latent space, hence the term "latent diffusion model" (LDM). The conversion between pixel space and latent space is done with an encoder and a decoder. This approach is more memory efficient than earlier methods and also produces highly detailed images.<br>
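For a sense of scale, Stable Diffusion v1 models diffuse a 4-channel latent at 1/8 of the image resolution; this is also why Dream mode in utils/pipeline.py allocates its random latents with shape (1, unet.in_channels, height // 8, width // 8). A minimal sketch:

```python
import torch

height, width = 512, 512
# Pixel space: 3 x 512 x 512. Latent space: 4 channels at 1/8 the spatial resolution.
latent = torch.randn(1, 4, height // 8, width // 8)
print(latent.shape)  # torch.Size([1, 4, 64, 64])
```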
14 | 15 | Read through the [paper](https://arxiv.org/abs/2112.10752) for more details. Big-ups to the researchers/creators for the work and for open-sourcing it.
16 | 17 | ## General Requirements :mage_man: 18 | * At least 6GB of VRAM is required to generate a single 512x512 image. 19 | * For better image generation, use a descriptive and detailed prompt. 20 | 21 | ## Code Requirements :mage_woman: 22 | Use Python 3.8.13. Set up a conda environment, clone the repo, and run the commands below, 23 | ```python 24 | pip install -r requirements.txt 25 | python setup.py 26 | mkdir models 27 | mkdir pretrained 28 | cd animation_mode 29 | python setup.py 30 | cd .. 31 | ``` 32 | 33 | ## How to run :running_man: 34 | 35 | Command line arguments: 36 | | Argument | Requirement | Default | Choices | Description | 37 | | ---------------- |:-------------:|:-------:|:-----------------------------:| :------------| 38 | | --mode / -m | True | - | "txt2img", "img2img", "inpaint", "dream", "animate" | Mode of application. | 39 | | --local / -l | False | False | True / False | If argument is provided, use local model files. Otherwise, download from Hugging Face. | 40 | | --device / -d | False | "gpu" | "cpu", "gpu" | Run on target device. | 41 | | --num / -n | False | 1 | integer number | Number of images to generate. | 42 | | --save / -s | False | False | True / False | If argument is provided, save generated images. | 43 | | --limit / -limit | False | True | True / False | Limit memory usage (enabled by default). | 44 | 45 | There are five different modes of running the application,<br>
46 | * Text to Image (txt2img) 47 | * Image to Image (img2img) 48 | * Inpaint (inpaint) 49 | * Dream (dream) 50 | * Animate (animate) - sub-modes: 2D, 3D, Video Input 51 | 52 | Mode: Text to Image
53 | ```python 54 | python run.py --mode txt2img --device gpu --save 55 | ``` 56 | 57 | Mode: Image to Image
58 | ```python 59 | python run.py --mode img2img --device gpu --save 60 | ``` 61 | 62 | Mode: Inpaint
63 | ```python 64 | python run.py --mode inpaint --device gpu --save 65 | ``` 66 | 67 | Mode: Dream
68 | ```python 69 | python run.py --mode dream --device gpu --save --num <num_frames> 70 | ``` 71 | 72 | Mode: Animate<br>
73 | ```python 74 | python run.py --mode animate --device gpu --save 75 | ``` 76 | Note:
77 | * For each mode, run the command and follow the CLI prompts to provide the Hugging Face user access token, the prompt, and the image size (height, width).<br>
78 | * Generated images or videos will be saved to the $PWD/images dir. For animate mode, the video will be saved to the $PWD/out_video dir. 79 | * Generating a single 512x512 image takes ~12 seconds on an NVIDIA GeForce RTX 3060 with 6GB VRAM. 80 | * Dream mode will generate --num image frames based on the input prompt, and create a video.<br>
81 | * Image to Image mode will generate a new image from an initial image and an input prompt. Inpaint mode will generate the masked part of the image from an initial image, a mask image and an input prompt. The strength value entered in the CLI indicates the amount of change from the initial image; it is in the range [0, 1], with 0 indicating no change and 1 indicating a complete change from the original image. 82 | 83 | Hugging face Access Token:<br>
84 | * Create an account in [huggingface.co](https://huggingface.co/). Go to Settings -> Access Tokens. Create an access token with read permission.
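The token pasted at the CLI prompt is simply forwarded to the diffusers loader when the model weights are downloaded; this is roughly what utils/pipeline.py does:

```python
import torch
from diffusers import StableDiffusionPipeline

access_token = input("\nEnter Hugging face user access token: ")
pipe = StableDiffusionPipeline.from_pretrained(
    "CompVis/stable-diffusion-v1-4",
    use_auth_token=access_token,
    torch_dtype=torch.float16,
    revision="fp16",
)
```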
85 | 86 | ### How to use Animate mode :paintbrush: 87 | This implementation is an optimized version of [DeforumStableDiffusionLocal](https://github.com/HelixNGC7293/DeforumStableDiffusionLocal) and [Deforum_Stable_Diffusion.ipynb](https://colab.research.google.com/github/deforum/stable-diffusion/blob/main/Deforum_Stable_Diffusion.ipynb). Thanks for their work.<br>

88 | Animate mode is quite different from the other modes of the app. It can generate "2D" or "3D" videos from input prompts, and it can also perform Video-to-Video conversion of a "Video Input" based on input prompts.<br>
89 | 90 | To use this mode, follow the steps below.<br>
91 | 92 | #### Requirements 93 | Clone the repo, and run the following cmds, 94 | ```python 95 | pip install -r requirements.txt 96 | python setup.py 97 | mkdir models 98 | mkdir pretrained 99 | cd animation_mode 100 | python setup.py 101 | cd .. 102 | ``` 103 | 104 | Next, manually download the models, 105 | * Download [dpt_large-midas-2f21e586.pt](https://github.com/intel-isl/DPT/releases/download/1_0/dpt_large-midas-2f21e586.pt) and place it in ./models dir. 106 | * Download [AdaBins_nyu.pt](https://cloudflare-ipfs.com/ipfs/Qmd2mMnDLWePKmgfS8m6ntAg4nhV5VkUyAydYBp8cWWeB7/AdaBins_nyu.pt) and place it in ./pretrained dir. 107 | 108 | Animate mode uses configurations specified in ./animation_mode/config.py. Specify the configurations for video generation in this file. Refer [animation_mode/README.md](https://github.com/Logeswaran123/Stable-Diffusion-Playground/blob/main/animation_mode/README.md) for details on parameters usage in config.py. 109 | 110 | #### Run command 111 | ```python 112 | python run.py --mode animate --save 113 | ``` 114 | Generated video will be saved to ./out_video dir. 115 | 116 | ## Results :bar_chart: 117 |

:star: Text to Image :star:

118 | 119 | ```python 120 | python run.py --mode txt2img --device gpu --num 1 --limit --save 121 | ``` 122 | 123 | ||| 124 | |:-------------------------:|:-------------------------:| 125 | |![](https://github.com/Logeswaran123/Stable-Diffusion-Playground/blob/main/images/49.png)|![](https://github.com/Logeswaran123/Stable-Diffusion-Playground/blob/main/images/3.png)| 126 | |![](https://github.com/Logeswaran123/Stable-Diffusion-Playground/blob/main/images/78.png)|![](https://github.com/Logeswaran123/Stable-Diffusion-Playground/blob/main/images/93.png)| 127 | --- 128 |

:star: Image to Image :star:

129 | 130 | ```python 131 | python run.py --mode img2img --device gpu --num 1 --limit --save 132 | ``` 133 | CLI inputs:
134 | ```python 135 | Enter Hugging face user access token: 136 | 137 | Loading model... 138 | 139 | Model loaded successfully 140 | 141 | Enter initial image path: flower.png 142 | 143 | Enter prompt: beautiful red flower, vibrant, realistic, smooth, bokeh, highly detailed, 4k 144 | 145 | Enter strength in [0, 1] range: 0.8 146 | 147 | Running Image to Image generation... 148 | ``` 149 | ||| 150 | |:-------------------------:|:-------------------------:| 151 | |![](https://github.com/Logeswaran123/Stable-Diffusion-Playground/blob/main/images/flower.png)|![](https://github.com/Logeswaran123/Stable-Diffusion-Playground/blob/main/images/img2img_1.png)| 152 | 153 | --- 154 |

:star: Inpaint :star:

155 | 156 | ```python 157 | python run.py --mode inpaint --device gpu --num 1 --limit --save 158 | ``` 159 | CLI inputs:
160 | ```python 161 | Enter Hugging face user access token: 162 | 163 | Loading model... 164 | 165 | Model loaded successfully 166 | 167 | Enter initial image path: rose.png 168 | 169 | Enter mask image path: mask_rose.png 170 | 171 | Enter prompt: beautiful blue butterfly on a rose, glossy, detailed, sharp, 4k 172 | 173 | Enter strength in [0, 1] range: 0.8 174 | 175 | Running Inpaint... 176 | ``` 177 | 178 | | Initial image | Mask | Inpainted image | 179 | |:-------------------------:|:-------------------------:|:-------------------------:| 180 | |![](https://github.com/Logeswaran123/Stable-Diffusion-Playground/blob/main/images/rose.png)|![](https://github.com/Logeswaran123/Stable-Diffusion-Playground/blob/main/images/mask_rose.png)|![](https://github.com/Logeswaran123/Stable-Diffusion-Playground/blob/main/images/inpaint_rose.png)| 181 | 182 | --- 183 |

:star: Dream :star:
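Under the hood, Dream mode draws two random latents and spherically interpolates (slerp) between them, decoding one frame per interpolation step (see Dream() in utils/pipeline.py and slerp() in utils/utility.py). A simplified, torch-only sketch of the idea:

```python
import numpy as np
import torch

def slerp_sketch(t: float, v0: torch.Tensor, v1: torch.Tensor) -> torch.Tensor:
    """Simplified spherical interpolation between two latent tensors."""
    dot = (v0 * v1).sum() / (v0.norm() * v1.norm())
    if dot.abs() > 0.9995:                      # nearly parallel: plain lerp is fine
        return (1 - t) * v0 + t * v1
    theta = torch.arccos(dot)
    return (torch.sin((1 - t) * theta) * v0 + torch.sin(t * theta) * v1) / torch.sin(theta)

source = torch.randn(1, 4, 64, 64)              # latent for a 512x512 image
target = torch.randn(1, 4, 64, 64)
frames = [slerp_sketch(float(t), source, target) for t in np.linspace(0, 1, 10)]
# each latent in `frames` is passed to the pipeline via the `latents=` argument
```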

184 | 185 | ```python 186 | python run.py --mode dream --device gpu --num 780 --limit --save 187 | ``` 188 | CLI inputs:
189 | ```python 190 | Enter Hugging face user access token: 191 | 192 | Loading model... 193 | 194 | Model loaded successfully 195 | 196 | Enter prompt: highly detailed bowl of lucrative ramen, stephen bliss, unreal engine, fantasy art by greg rutkowski, loish, rhads and lois van baarle, ilya kuvshinov, rossdraws, tom bagshaw, alphonse mucha, global illumination, detailed and intricate environment 197 | 198 | Enter height and width of image: 512 512 199 | 200 | Dreaming... 201 | ``` 202 | 203 | https://user-images.githubusercontent.com/36563521/192521369-32673804-009f-44c6-918c-a7746cc94dba.mp4 204 | 205 | --- 206 |

:star: Animate :star:

207 | 208 | |2D|3D| 209 | |:-------------------------:|:-------------------------:| 210 | | **TODO** | ![boat_in_storm](https://user-images.githubusercontent.com/36563521/194770440-db663425-282c-4aba-8b1a-2fb8db8bd6d0.gif) | 211 | 212 | --- 213 | 214 | ## References :page_facing_up: 215 | * [stability.ai](https://stability.ai/blog/stable-diffusion-public-release) blog. 216 | * LDM [paper](https://arxiv.org/abs/2112.10752). 217 | * LDM [repo](https://github.com/CompVis/latent-diffusion). 218 | * [Hugging face diffuser](https://github.com/huggingface/diffusers/tree/main/src/diffusers/pipelines/stable_diffusion) for API usage. 219 | * [Gist](https://gist.github.com/karpathy/00103b0037c5aaea32fe1da1af553355) by Andrej Karpathy. 220 | * [lexica.art](https://lexica.art/) for cool prompts. 221 | 222 | Happy Learning! 😄 223 | -------------------------------------------------------------------------------- /utils/pipeline.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from PIL import Image 3 | import torch 4 | from torch import autocast 5 | from diffusers.schedulers import LMSDiscreteScheduler 6 | from diffusers import StableDiffusionPipeline, \ 7 | StableDiffusionImg2ImgPipeline, \ 8 | StableDiffusionInpaintPipeline 9 | 10 | from .utility import save_images, save_video, slerp 11 | from animation_mode.animation import animate 12 | 13 | class StableDiffusionPipe(): 14 | """ Pipline for Stable Diffusion model applications """ 15 | def __init__(self, use_local_model: bool = True, device: str = "cpu") -> None: 16 | self.use_local_model = use_local_model 17 | self.device = device if device == "cpu" else "cuda" 18 | 19 | def TexttoImage(self, num_images: int = 1, save: bool = True, use_limited_mem: bool = True): 20 | """ Text to Image function """ 21 | 22 | path = "./stable-diffusion-v1-4" if self.use_local_model else "CompVis/stable-diffusion-v1-4" 23 | local_files_only = self.use_local_model 24 | use_auth_token = not local_files_only 25 | 26 | # Get access token 27 | access_token = False 28 | if use_auth_token: 29 | access_token = input("\nEnter Hugging face user access token: ") 30 | 31 | # Load the model 32 | print("\nLoading model...") 33 | pipe = StableDiffusionPipeline.from_pretrained(path, use_auth_token=access_token, 34 | local_files_only=local_files_only, 35 | torch_dtype=torch.float16, revision='fp16') 36 | pipe = pipe.to(self.device) 37 | print("\nModel loaded successfully") 38 | 39 | # Get prompt 40 | prompt = input("\nEnter prompt: ") 41 | height, width = input("\nEnter height and width of image: ").split() 42 | height = int(height) 43 | width = int(width) 44 | 45 | # Convert height and width to multiple of 64 for model. 
46 | height = height - height % 64 47 | width = width - width % 64 48 | 49 | # Generate images 50 | images = [] 51 | if use_limited_mem: 52 | prompts = [prompt] 53 | for _ in range (1, num_images + 1): 54 | print("\nRunning Text to Image generation...") 55 | with autocast(self.device): 56 | images.append(pipe(prompt=prompts, height=height, width=width).images[0]) 57 | else: 58 | print("\nRunning Text to Image generation...") 59 | prompts = [prompt] * num_images 60 | images = pipe(prompt=prompts, height=height, width=width).images 61 | 62 | # Save images 63 | if save: 64 | print("Saving images...") 65 | save_images(images) 66 | 67 | def ImagetoImage(self, num_images: int = 1, save: bool = True, use_limited_mem: bool = True): 68 | """ Image to Image function """ 69 | 70 | path = "./stable-diffusion-v1-4" if self.use_local_model else "CompVis/stable-diffusion-v1-4" 71 | local_files_only = self.use_local_model 72 | use_auth_token = not local_files_only 73 | 74 | # Get access token 75 | access_token = False 76 | if use_auth_token: 77 | access_token = input("\nEnter Hugging face user access token: ") 78 | 79 | # Load the model 80 | print("\nLoading model...") 81 | pipe = StableDiffusionImg2ImgPipeline.from_pretrained(path, use_auth_token=access_token, 82 | local_files_only=local_files_only, 83 | torch_dtype=torch.float16, revision='fp16') 84 | pipe = pipe.to(self.device) 85 | print("\nModel loaded successfully") 86 | 87 | # Get prompt 88 | image_path = input("\nEnter initial image path: ") 89 | prompt = input("\nEnter prompt: ") 90 | strength = float(input("\nEnter strength in [0, 1] range: ")) 91 | if not 0 <= strength <= 1: 92 | raise ValueError("{} is an invalid strength value. Enter strength in [0, 1] range.".format(strength)) 93 | 94 | init_image = Image.open(image_path).convert("RGB") 95 | width, height = init_image.size 96 | 97 | # Convert height and width to multiple of 64 for model. 
98 | width = width - width % 64 99 | height = height - height % 64 100 | init_image = init_image.resize((width, height)) 101 | 102 | # Generate images 103 | images = [] 104 | if use_limited_mem: 105 | prompts = [prompt] 106 | for _ in range (1, num_images + 1): 107 | print("\nRunning Image to Image generation...") 108 | with autocast(self.device): 109 | images.append(pipe(prompt=prompts, 110 | init_image=init_image, 111 | strength=strength).images[0]) 112 | else: 113 | print("\nRunning Image to Image generation...") 114 | prompts = [prompt] * num_images 115 | images = pipe(prompt=prompts, 116 | init_image=init_image, 117 | strength=strength).images 118 | 119 | # Save images 120 | if save: 121 | print("Saving images...") 122 | save_images(images) 123 | 124 | def Inpaint(self, num_images: int = 1, save: bool = True, use_limited_mem: bool = True): 125 | """ Inpaint function """ 126 | 127 | path = "./stable-diffusion-v1-4" if self.use_local_model else "CompVis/stable-diffusion-v1-4" 128 | local_files_only = self.use_local_model 129 | use_auth_token = not local_files_only 130 | 131 | # Get access token 132 | access_token = False 133 | if use_auth_token: 134 | access_token = input("\nEnter Hugging face user access token: ") 135 | 136 | # Load the model 137 | print("\nLoading model...") 138 | pipe = StableDiffusionInpaintPipeline.from_pretrained(path, use_auth_token=access_token, 139 | local_files_only=local_files_only, 140 | torch_dtype=torch.float16, revision='fp16') 141 | pipe = pipe.to(self.device) 142 | print("\nModel loaded successfully") 143 | 144 | # Get prompt 145 | image_path = input("\nEnter initial image path: ") 146 | mask_path = input("\nEnter mask image path: ") 147 | prompt = input("\nEnter prompt: ") 148 | strength = float(input("\nEnter strength in [0, 1] range: ")) 149 | if not 0 <= strength <= 1: 150 | raise ValueError("{} is an invalid strength value. Enter strength in [0, 1] range.".format(strength)) 151 | 152 | init_image = Image.open(image_path).convert("RGB") 153 | mask_image = Image.open(mask_path).convert("RGB") 154 | image_width, image_height = init_image.size 155 | mask_width, mask_height = mask_image.size 156 | 157 | if (not image_width == mask_width) or (not image_height == mask_height): 158 | raise ValueError("Init image size must match mask image size.") 159 | 160 | # Convert height and width to multiple of 64 for model. 
161 | image_width = image_width - image_width % 64 162 | image_height = image_height - image_height % 64 163 | init_image = init_image.resize((image_width, image_height)) 164 | mask_image = mask_image.resize((image_width, image_height)) 165 | 166 | # Generate images 167 | images = [] 168 | if use_limited_mem: 169 | prompts = [prompt] 170 | for _ in range (1, num_images + 1): 171 | print("\nRunning Inpaint...") 172 | with autocast(self.device): 173 | images.append(pipe(prompt=prompts, 174 | init_image=init_image, 175 | mask_image=mask_image, 176 | strength=strength).images[0]) 177 | else: 178 | print("\nRunning Inpaint...") 179 | prompts = [prompt] * num_images 180 | images = pipe(prompt=prompts, 181 | init_image=init_image, 182 | mask_image=mask_image, 183 | strength=strength).images 184 | 185 | # Save images 186 | if save: 187 | print("Saving images...") 188 | save_images(images) 189 | 190 | def Dream(self, num_images: int = 1, save: bool = True): 191 | """ Dream function """ 192 | 193 | path = "./stable-diffusion-v1-4" if self.use_local_model else "CompVis/stable-diffusion-v1-4" 194 | local_files_only = self.use_local_model 195 | use_auth_token = not local_files_only 196 | 197 | # Get access token 198 | access_token = False 199 | if use_auth_token: 200 | access_token = input("\nEnter Hugging face user access token: ") 201 | 202 | # Load the model 203 | print("\nLoading model...") 204 | lms = LMSDiscreteScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear") 205 | pipe = StableDiffusionPipeline.from_pretrained(path, use_auth_token=access_token, 206 | local_files_only=local_files_only, scheduler=lms, 207 | torch_dtype=torch.float16, revision='fp16') 208 | pipe = pipe.to(self.device) 209 | print("\nModel loaded successfully") 210 | 211 | # Get prompt 212 | prompt = input("\nEnter prompt: ") 213 | height, width = input("\nEnter height and width of image: ").split() 214 | height = int(height) 215 | width = int(width) 216 | 217 | # Convert height and width to multiple of 64 for model. 
218 | height = height - height % 64 219 | width = width - width % 64 220 | 221 | source_latent = torch.randn((1, pipe.unet.in_channels, height // 8, width // 8), device=self.device) 222 | target_latent = torch.randn((1, pipe.unet.in_channels, height // 8, width // 8), device=self.device) 223 | 224 | images = [] 225 | print("\nDreaming...") 226 | for _, t in enumerate(np.linspace(0, 1, num_images)): 227 | init_latent = slerp(float(t), source_latent, target_latent) 228 | 229 | with autocast("cuda"): 230 | image = pipe(prompt, latents=init_latent).images[0] 231 | if not image.convert("L").getextrema() == (0, 0): # check for black image 232 | images.append(image) 233 | 234 | # Save images and video 235 | if save: 236 | print("Saving images...") 237 | save_images(images) 238 | print("Saving video...") 239 | save_video(images, width, height) 240 | 241 | def Animate(self, save: bool = True): 242 | """ Animate function """ 243 | print("\nUsing configurations from animation_mode/config.py") 244 | animate(self.use_local_model, save) 245 | -------------------------------------------------------------------------------- /animation_mode/animation.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pathlib 3 | import sys 4 | from types import SimpleNamespace 5 | import cv2 6 | import pandas as pd 7 | import numpy as np 8 | from pytorch_lightning import seed_everything 9 | import torch 10 | from torch import autocast 11 | from torchvision import transforms 12 | from diffusers import StableDiffusionPipeline, StableDiffusionImg2ImgPipeline 13 | from PIL import Image 14 | 15 | sys.path.extend([ 16 | './animation_mode/src/taming-transformers', 17 | './animation_mode/src/clip', 18 | './animation_mode/stable-diffusion/', 19 | './animation_mode/k-diffusion', 20 | './animation_mode/AdaBins', 21 | './animation_mode/MiDaS', 22 | './animation_mode', 23 | ]) 24 | 25 | import config 26 | from .utility.utils import * 27 | from helpers import DepthModel 28 | 29 | 30 | def generate(pipe, 31 | prompt, 32 | height, 33 | width, 34 | strength, 35 | seed, 36 | use_init, 37 | init_image, 38 | return_sample=False): 39 | """ Image generator """ 40 | seed_everything(seed) 41 | device = "cuda" 42 | convert_tensor = transforms.ToTensor() 43 | 44 | results = [] 45 | if use_init: 46 | with autocast(device): 47 | with torch.no_grad(): 48 | image = pipe(prompt=prompt, 49 | init_image=init_image, 50 | strength=strength, 51 | guidance_scale=config.guidance_scale, 52 | num_inference_steps=config.num_inference_steps).images[0] 53 | torch.cuda.empty_cache() 54 | else: 55 | with autocast(device): 56 | with torch.no_grad(): 57 | image = pipe(prompt=prompt, height=height, width=width).images[0] 58 | torch.cuda.empty_cache() 59 | 60 | if return_sample: 61 | samples = convert_tensor(image) 62 | results.append(samples) 63 | results.append(image) 64 | 65 | return results 66 | 67 | 68 | def render_input_video(pipe_txt2img, pipe_img2img): 69 | """ Function for animate video """ 70 | # create a folder for the video input frames to live in 71 | video_in_frame_path = os.path.join(os.getcwd(), 'inputframes') 72 | os.makedirs(video_in_frame_path, exist_ok=True) 73 | 74 | # save the video frames from input video 75 | print(f"Exporting Video Frames from (1 every {config.extract_nth_frame}) \ 76 | frames to {video_in_frame_path}...") 77 | try: 78 | for f in pathlib.Path(video_in_frame_path).glob('*.png'): 79 | f.unlink() 80 | except: 81 | pass 82 | cap = cv2.VideoCapture(config.video_init_path) 83 | 
success, image = cap.read() 84 | count = 0 85 | while success: 86 | file_name = "inputframes/frame_" + str(count) + ".png" 87 | cv2.imwrite(file_name, image) 88 | success,image = cap.read() 89 | count = count + 1 + (config.extract_nth_frame - 1) 90 | if config.max_frames is not None and count > config.max_frames: 91 | break 92 | cap.release() 93 | 94 | # determine max frames from length of input frames 95 | num_frames = len([f for f in pathlib.Path(video_in_frame_path).glob('*.png')]) 96 | 97 | print(f"Loading {num_frames} input frames from {video_in_frame_path} \ 98 | and saving video frames to {video_in_frame_path}") 99 | render_animation(pipe_txt2img, pipe_img2img) 100 | 101 | 102 | def render_animation(pipe_txt2img, pipe_img2img): 103 | """ Function for animate 2D, animate 3D """ 104 | device = "cuda" 105 | W, H = (config.width, config.height) 106 | depth_model = None 107 | models_path = "./models" 108 | init_image = None 109 | video_width, video_height = None, None 110 | 111 | angle_series = get_inbetweens(parse_key_frames(config.angle)) 112 | zoom_series = get_inbetweens(parse_key_frames(config.zoom)) 113 | translation_x_series = get_inbetweens(parse_key_frames(config.translation_x)) 114 | translation_y_series = get_inbetweens(parse_key_frames(config.translation_y)) 115 | translation_z_series = get_inbetweens(parse_key_frames(config.translation_z)) 116 | rotation_3d_x_series = get_inbetweens(parse_key_frames(config.rotation_3d_x)) 117 | rotation_3d_y_series = get_inbetweens(parse_key_frames(config.rotation_3d_y)) 118 | rotation_3d_z_series = get_inbetweens(parse_key_frames(config.rotation_3d_z)) 119 | strength_schedule_series = get_inbetweens(parse_key_frames(config.strength_schedule)) 120 | midas_weight_dict = {"midas_weight":config.midas_weight} 121 | anim_args = SimpleNamespace(**midas_weight_dict) 122 | 123 | start_frame = 0 124 | outdir = os.path.join(os.getcwd(), r'images') 125 | if not os.path.exists(outdir): 126 | os.makedirs(outdir) 127 | print(f"\nSaving animation frames to {outdir}") 128 | 129 | # check for video inits 130 | using_vid_init = config.animation_mode == 'Video Input' 131 | use_init = using_vid_init 132 | 133 | max_frames = config.max_frames 134 | if using_vid_init: 135 | max_frames = len([f for f in pathlib.Path(os.path.join(os.getcwd(), 'inputframes')).glob('*.png')]) 136 | cap = cv2.VideoCapture(config.video_init_path) 137 | video_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) 138 | video_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) 139 | cap.release() 140 | 141 | # expand prompts out to per-frame 142 | prompt_series = pd.Series([np.nan for a in range(max_frames)]) 143 | for i, prompt in config.animation_prompts.items(): 144 | prompt_series[int(i)] = prompt 145 | prompt_series = prompt_series.ffill().bfill() 146 | 147 | # load depth model for 3D 148 | predict_depths = (config.animation_mode == '3D' and config.use_depth_warping) or config.save_depth_maps 149 | if predict_depths: 150 | depth_model = DepthModel("cpu") 151 | depth_model.load_midas(models_path) 152 | if config.midas_weight < 1.0: 153 | depth_model.load_adabins() 154 | else: 155 | depth_model = None 156 | config.save_depth_maps = False 157 | 158 | turbo_steps = 1 if using_vid_init else int(config.diffusion_cadence) 159 | turbo_prev_image, turbo_prev_frame_idx = None, 0 160 | turbo_next_image, turbo_next_frame_idx = None, 0 161 | 162 | # resume animation 163 | prev_sample = None 164 | color_match_sample = None 165 | frame_idx = start_frame 166 | 167 | seed = config.seed 168 | while frame_idx < 
max_frames: 169 | print(f"\nRendering animation frame {frame_idx} of {max_frames}") 170 | strength = strength_schedule_series[frame_idx] 171 | strength = max(0.0, min(1.0, strength)) 172 | depth = None 173 | 174 | # emit in-between frames 175 | if turbo_steps > 1: 176 | tween_frame_start_idx = max(0, frame_idx-turbo_steps) 177 | for tween_frame_idx in range(tween_frame_start_idx, frame_idx): 178 | tween = float(tween_frame_idx - tween_frame_start_idx + 1) / float(frame_idx - tween_frame_start_idx) 179 | print(f"creating in between frame {tween_frame_idx} tween:{tween:0.2f}") 180 | 181 | advance_prev = turbo_prev_image is not None and tween_frame_idx > turbo_prev_frame_idx 182 | advance_next = tween_frame_idx > turbo_next_frame_idx 183 | 184 | if depth_model is not None: 185 | assert turbo_next_image is not None 186 | depth_model.midas_model = depth_model.midas_model.to(device) 187 | depth_model.device = device 188 | with torch.no_grad(): 189 | depth = depth_model.predict(turbo_next_image, anim_args).cpu() 190 | torch.cuda.empty_cache() 191 | depth_model.midas_model = depth_model.midas_model.to("cpu") 192 | depth_model.device = "cpu" 193 | 194 | if config.animation_mode == '2D': 195 | if advance_prev: 196 | turbo_prev_image = anim_frame_warp_2d(turbo_prev_image, W, H, angle_series, zoom_series, \ 197 | translation_x_series, translation_y_series, tween_frame_idx) 198 | if advance_next: 199 | turbo_next_image = anim_frame_warp_2d(turbo_next_image, W, H, angle_series, zoom_series, \ 200 | translation_x_series, translation_y_series, tween_frame_idx) 201 | else: # '3D' 202 | if advance_prev: 203 | turbo_prev_image = anim_frame_warp_3d(turbo_prev_image, 204 | depth, 205 | translation_x_series, 206 | translation_y_series, 207 | translation_z_series, 208 | rotation_3d_x_series, 209 | rotation_3d_y_series, 210 | rotation_3d_z_series, 211 | config.near_plane, 212 | config.far_plane, 213 | config.fov, 214 | config.sampling_mode, 215 | config.padding_mode, 216 | tween_frame_idx) 217 | if advance_next: 218 | turbo_next_image = anim_frame_warp_3d(turbo_next_image, 219 | depth, 220 | translation_x_series, 221 | translation_y_series, 222 | translation_z_series, 223 | rotation_3d_x_series, 224 | rotation_3d_y_series, 225 | rotation_3d_z_series, 226 | config.near_plane, 227 | config.far_plane, 228 | config.fov, 229 | config.sampling_mode, 230 | config.padding_mode, 231 | tween_frame_idx) 232 | 233 | turbo_prev_frame_idx = turbo_next_frame_idx = tween_frame_idx 234 | 235 | if turbo_prev_image is not None and tween < 1.0: 236 | img = turbo_prev_image*(1.0-tween) + turbo_next_image*tween 237 | else: 238 | img = turbo_next_image 239 | 240 | # apply color matching 241 | if config.color_coherence != 'None': 242 | if color_match_sample is not None: 243 | img = maintain_colors(img, color_match_sample, config.color_coherence) 244 | 245 | # smoothen image 246 | if config.smooth != 'None': 247 | img = smoothen_image(Image.fromarray(img.astype(np.uint8)), config.smooth) 248 | img = np.array(img) 249 | 250 | init_image = Image.fromarray(img.astype(np.uint8)) 251 | filename = f"frame_{tween_frame_idx}.png" 252 | cv2.imwrite(os.path.join(outdir, filename), cv2.cvtColor(img.astype(np.uint8), cv2.COLOR_RGB2BGR)) 253 | if config.save_depth_maps: 254 | depth_model.save(os.path.join(outdir, f"depth_{tween_frame_idx:05}.png"), depth) 255 | if turbo_next_image is not None: 256 | prev_sample = turbo_next_image 257 | 258 | # apply transforms to previous frame 259 | if prev_sample is not None: 260 | if config.animation_mode == '2D': 
261 | prev_img = anim_frame_warp_2d(prev_sample, W, H, angle_series, zoom_series, \ 262 | translation_x_series, translation_y_series, frame_idx) 263 | else: # '3D' 264 | prev_img_cv2 = prev_sample 265 | depth_model.midas_model = depth_model.midas_model.to(device) 266 | depth_model.device = device 267 | with torch.no_grad(): 268 | depth = depth_model.predict(prev_img_cv2, anim_args).cpu() if depth_model else None 269 | torch.cuda.empty_cache() 270 | depth_model.midas_model = depth_model.midas_model.to("cpu") 271 | depth_model.device = "cpu" 272 | prev_img = anim_frame_warp_3d(prev_img_cv2, 273 | depth, 274 | translation_x_series, 275 | translation_y_series, 276 | translation_z_series, 277 | rotation_3d_x_series, 278 | rotation_3d_y_series, 279 | rotation_3d_z_series, 280 | config.near_plane, 281 | config.far_plane, 282 | config.fov, 283 | config.sampling_mode, 284 | config.padding_mode, 285 | frame_idx) 286 | 287 | if config.color_coherence != 'None': 288 | if color_match_sample is None: 289 | color_match_sample = prev_img.copy() 290 | 291 | use_init = True 292 | 293 | # grab prompt for current frame 294 | prompt = prompt_series[frame_idx] 295 | print(f"\nSeed: {seed}\nPrompt: {prompt} \n") 296 | 297 | # grab init image for current frame 298 | if using_vid_init: 299 | init_frame = "./inputframes/" + "frame_" + str(frame_idx) + ".png" 300 | print(f"\nUsing video init frame {init_frame}") 301 | try: 302 | init_image = load_img(init_frame, (config.width, config.height)) 303 | except: 304 | frame_idx += 1 305 | continue 306 | 307 | # sample the diffusion model 308 | torch.cuda.empty_cache() 309 | if use_init: 310 | pipe_img2img = pipe_img2img.to(device) 311 | sample, image = generate(pipe_img2img, prompt, H, W, \ 312 | strength, seed, use_init, init_image, return_sample=True) 313 | pipe_img2img.to("cpu") 314 | else: 315 | pipe_txt2img = pipe_txt2img.to(device) 316 | sample, image = generate(pipe_txt2img, prompt, H, W, \ 317 | strength, seed, use_init, init_image, return_sample=True) 318 | pipe_txt2img.to("cpu") 319 | 320 | torch.cuda.empty_cache() 321 | if not using_vid_init: 322 | prev_sample = sample 323 | 324 | if turbo_steps > 1: 325 | turbo_prev_image, turbo_prev_frame_idx = turbo_next_image, turbo_next_frame_idx 326 | turbo_next_image, turbo_next_frame_idx = sample_to_cv2(sample, type=np.float32), frame_idx 327 | frame_idx += turbo_steps 328 | else: 329 | filename = f"frame_{frame_idx}.png" 330 | if using_vid_init and config.video_same_size: 331 | image = image.resize((video_width, video_height), resample=Image.LANCZOS) 332 | if not image.convert("L").getextrema() == (0, 0): # check for black image 333 | image.save(os.path.join(outdir, filename)) 334 | if config.save_depth_maps: 335 | if depth is None: 336 | depth = depth_model.predict(sample_to_cv2(sample), anim_args) 337 | depth_model.save(os.path.join(outdir, f"depth_{frame_idx:05}.png"), depth) 338 | frame_idx += 1 339 | 340 | seed = next_seed(seed, config.seed_behavior) 341 | 342 | 343 | def animate(use_local_model, save): 344 | """ Top level function for animate 2D, animate 3D, and animate video """ 345 | path = "./stable-diffusion-v1-4" if use_local_model else "CompVis/stable-diffusion-v1-4" 346 | local_files_only = use_local_model 347 | use_auth_token = not local_files_only 348 | 349 | # Get access token 350 | access_token = False 351 | if use_auth_token: 352 | access_token = input("\nEnter Hugging face user access token: ") 353 | 354 | print(f"\nMax cuda memory reserved before running the app: \ 355 | 
{torch.cuda.max_memory_reserved(torch.device('cuda'))} bytes\n") 356 | print("\nLoading Diffusion model...") 357 | pipe_txt2img = StableDiffusionPipeline.from_pretrained(path, 358 | use_auth_token=access_token, 359 | local_files_only=local_files_only, 360 | torch_dtype=torch.float16, 361 | revision='fp16') 362 | pipe_img2img = StableDiffusionImg2ImgPipeline.from_pretrained(path, 363 | use_auth_token=access_token, 364 | local_files_only=local_files_only, 365 | torch_dtype=torch.float16, 366 | revision='fp16') 367 | print("\nModel loaded successfully") 368 | 369 | if config.animation_mode == '2D' or config.animation_mode == '3D': 370 | render_animation(pipe_txt2img, pipe_img2img) 371 | elif config.animation_mode == 'Video Input': 372 | render_input_video(pipe_txt2img, pipe_img2img) 373 | else: 374 | print(f"\nInvalid animation mode {config.animation_mode}. \ 375 | Supported modes = [2D, 3D, Video Input].") 376 | 377 | if save: 378 | save_video(config.width, config.height) 379 | --------------------------------------------------------------------------------