├── .gitignore ├── README.md ├── __init__.py ├── demofusion.py ├── examples ├── Demofusion From Single File Example.json ├── Iterative KSampler Control.json └── Iterative KSampler Example.json ├── images ├── example.png └── example2.png ├── pipeline_demofusion_sdxl.py └── requirements.txt /.gitignore: -------------------------------------------------------------------------------- 1 | /__pycache__ 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ComfyUI Demofusion Custom Node 2 | 3 | ## Introduction 4 | 5 | The Demofusion Custom Node is a wrapper that adapts the DemoFusion technique, created and implemented by Ruoyi Du, to the ComfyUI environment. 6 | 7 | The original paper and official implementation by Ruoyi Du can be found here: 8 | - https://ruoyidu.github.io/demofusion/demofusion.html 9 | - https://github.com/PRIS-CV/DemoFusion 10 | 11 | My idea was to wrap DemoFusion in a node for ComfyUI, so we can use this amazing tool to experiment with the technique. 12 | 13 | As this is my first ComfyUI custom node, I'm not sure I'm following best practices, so any comments and/or suggestions are welcome. 14 | 15 | **Update:** Now you can also use local SDXL checkpoints! 16 | 17 | If anyone has ideas about how to improve it, again, thank you very much for your collaboration and tips. 18 | 19 | ## Installing 20 | Installing this node works just like any other custom node; no special procedures are needed: 21 | - Git clone the repository into the ComfyUI/custom_nodes folder 22 | - Restart ComfyUI 23 | 24 | Also keep in mind that this technique requires a lot of VRAM (more than 18 GB). 25 | 26 | ## How to use 27 | Here you can see an example of how to use the node: 28 | ![example](./images/example.png) 29 | 30 | And here is the generated image (which also works as a basic workflow if you drag and drop it into the ComfyUI browser window): 31 | 32 | ![example_2](./images/example2.png) 33 | 34 | So far I've only tried generating square images at 2048x2048 and 3072x3072. 35 | 36 | I've also only tried these Hugging Face models: 37 | - stabilityai/stable-diffusion-xl-base-1.0 38 | - stablediffusionapi/sdxl-unstable-diffusers-y 39 | 40 | The first one is the model used by the author of the technique in his paper and implementation. 41 | 42 | ## Under development 43 | This node is under development, so use it at your own risk. The interface will probably change a lot, which may break generated workflows in future versions. 
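For reference, under the hood the Demofusion node simply calls the bundled DemoFusion SDXL pipeline. The snippet below is a simplified sketch of what the node's `execute()` does; the prompt and output filename are placeholders, the argument values mirror the node defaults, and `pipeline_demofusion_sdxl.py` is assumed to be importable from the working directory:

```python
import torch
from pipeline_demofusion_sdxl import DemoFusionSDXLStableDiffusionPipeline

# Load an SDXL checkpoint into the DemoFusion pipeline (fp16 on CUDA).
pipe = DemoFusionSDXLStableDiffusionPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16
).to("cuda")

# Progressive high-resolution generation; these arguments match the node defaults.
images = pipe(
    "A photo of a man in a cafe.",  # positive prompt (placeholder)
    negative_prompt="",
    height=2048, width=2048,
    num_inference_steps=40, guidance_scale=7.5,
    view_batch_size=4, stride=64,
    cosine_scale_1=3, cosine_scale_2=1, cosine_scale_3=1, sigma=0.8,
    multi_decoder=True, show_image=False,
)

# The node uses the last image in the returned list as its output.
images[-1].convert("RGB").save("demofusion_example.png")
```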
44 | 45 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | import threading 4 | import sys 5 | import locale 6 | import traceback 7 | 8 | 9 | def handle_stream(stream, prefix): 10 | stream.reconfigure(encoding=locale.getpreferredencoding(), errors='replace') 11 | for msg in stream: 12 | if prefix == '[!]' and ('it/s]' in msg or 's/it]' in msg) and ('%|' in msg or 'it [' in msg): 13 | if msg.startswith('100%'): 14 | print('\r' + msg, end="", file=sys.stderr), 15 | else: 16 | print('\r' + msg[:-1], end="", file=sys.stderr), 17 | else: 18 | if prefix == '[!]': 19 | print(prefix, msg, end="", file=sys.stderr) 20 | else: 21 | print(prefix, msg, end="") 22 | 23 | def run_script(cmd, cwd='.'): 24 | if len(cmd) > 0 and cmd[0].startswith("#"): 25 | print(f"[ComfyUI-Manager] Unexpected behavior: `{cmd}`") 26 | return 0 27 | 28 | process = subprocess.Popen(cmd, cwd=cwd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, bufsize=1) 29 | 30 | stdout_thread = threading.Thread(target=handle_stream, args=(process.stdout, "")) 31 | stderr_thread = threading.Thread(target=handle_stream, args=(process.stderr, "[!]")) 32 | 33 | stdout_thread.start() 34 | stderr_thread.start() 35 | 36 | stdout_thread.join() 37 | stderr_thread.join() 38 | 39 | return process.wait() 40 | 41 | try: 42 | from .demofusion import NODE_CLASS_MAPPINGS 43 | except: 44 | my_path = os.path.dirname(__file__) 45 | requirements_path = os.path.join(my_path, "requirements.txt") 46 | 47 | print(f"## Demofusion: installing dependencies") 48 | 49 | run_script([sys.executable, '-s', '-m', 'pip', 'install', '-r', requirements_path]) 50 | 51 | try: 52 | from .demofusion import NODE_CLASS_MAPPINGS 53 | except: 54 | print(f"## [ERROR] Demofusion: Attempting to reinstall dependencies using an alternative method.") 55 | run_script([sys.executable, '-s', '-m', 'pip', 'install', '--user', '-r', requirements_path]) 56 | 57 | try: 58 | from .demofusion import NODE_CLASS_MAPPINGS 59 | except: 60 | print(f"## [ERROR] Demofusion: Failed to install the GitPython package in the correct Python environment. Please install it manually in the appropriate environment. 
(You can seek help at https://app.element.io/#/room/%23comfyui_space%3Amatrix.org)") 61 | traceback.print_exc() 62 | 63 | print(f"## Demofusion: installing dependencies done.") 64 | 65 | 66 | __all__ = ['NODE_CLASS_MAPPINGS'] 67 | 68 | -------------------------------------------------------------------------------- /demofusion.py: -------------------------------------------------------------------------------- 1 | import math 2 | import os 3 | import sys 4 | import torch 5 | from PIL import Image 6 | import matplotlib.pyplot as plt 7 | import numpy as np 8 | import comfy.model_management 9 | import comfy.sample 10 | import comfy.utils 11 | import comfy.samplers 12 | import logging as logger 13 | 14 | my_dir = os.path.dirname(os.path.abspath(__file__)) 15 | 16 | sys.path.append(my_dir) 17 | ## Have to change the original name to contain the word "StableDiffusion" because of: 18 | ## https://github.com/huggingface/diffusers/blob/2d94c7838e273c40920ffd6d24d724357add7f2d/src/diffusers/loaders/single_file.py#L207C15-L207C30 19 | from pipeline_demofusion_sdxl import DemoFusionSDXLStableDiffusionPipeline 20 | sys.path.remove(my_dir) 21 | 22 | custom_nodes_dir = os.path.abspath(os.path.join(my_dir, '..')) 23 | comfy_dir = os.path.abspath(os.path.join(my_dir, '..', '..')) 24 | sys.path.append(comfy_dir) 25 | import folder_paths 26 | sys.path.remove(comfy_dir) 27 | 28 | 29 | class Demofusion: 30 | def __init__(self): 31 | pass 32 | 33 | @classmethod 34 | def INPUT_TYPES(s): 35 | return { 36 | "required": { 37 | "ckpt_name": ("STRING", { 38 | "multiline": False, 39 | "default": "stabilityai/stable-diffusion-xl-base-1.0" 40 | }), 41 | "positive": ("STRING", { 42 | "multiline": True, 43 | "default": "" 44 | }), 45 | "negative": ("STRING", { 46 | "multiline": True, 47 | "default": "" 48 | }), 49 | "width": ("INT", { 50 | "default": 2048, 51 | "min": 2048, #Minimum value 52 | "max": 4096, #Maximum value 53 | "step": 64, #Slider's step 54 | "display": "number" 55 | }), 56 | "height": ("INT", { 57 | "default": 2048, 58 | "min": 2048, #Minimum value 59 | "max": 4096, #Maximum value 60 | "step": 64, #Slider's step 61 | "display": "number" 62 | }), 63 | "inference_steps": ("INT", { 64 | "default": 40, 65 | "min": 1, #Minimum value 66 | "max": 100, #Maximum value 67 | "step": 1, #Slider's step 68 | "display": "number" 69 | }), 70 | "cfg": ("FLOAT", { 71 | "default": 7.5, 72 | "min": 1.0, 73 | "max": 20.0, 74 | "step": 0.5, 75 | "round": 0.001, 76 | "display": "number"}), 77 | "seed": ("INT", { 78 | "default": 522, 79 | "display": "number" 80 | }), 81 | }, 82 | } 83 | 84 | RETURN_TYPES = ("IMAGE",) 85 | FUNCTION = "execute" 86 | CATEGORY = "tests" 87 | 88 | def execute(self, ckpt_name, positive, negative, width, height, inference_steps, cfg, seed): 89 | pipe = DemoFusionSDXLStableDiffusionPipeline.from_pretrained(ckpt_name, torch_dtype=torch.float16) 90 | pipe = pipe.to("cuda") 91 | 92 | generator = torch.Generator(device='cuda') 93 | generator = generator.manual_seed(seed) 94 | 95 | images = pipe(str(positive), negative_prompt=str(negative), 96 | height=height, width=width, view_batch_size=4, stride=64, 97 | num_inference_steps=inference_steps, guidance_scale=cfg, 98 | cosine_scale_1=3, cosine_scale_2=1, cosine_scale_3=1, sigma=0.8, 99 | multi_decoder=True, show_image=False 100 | ) 101 | image=images[len(images)-1] 102 | image = image.convert("RGB") 103 | image = np.array(image).astype(np.float32) / 255.0 104 | image = torch.from_numpy(image)[None,] 105 | 106 | return (image,) 107 | 108 | class 
DemofusionFromSingleFile: 109 | def __init__(self): 110 | pass 111 | 112 | @classmethod 113 | def INPUT_TYPES(s): 114 | return { 115 | "required": { 116 | "ckpt_name": (folder_paths.get_filename_list("checkpoints"), ), 117 | "positive": ("STRING", { 118 | "multiline": True, 119 | "default": "" 120 | }), 121 | "negative": ("STRING", { 122 | "multiline": True, 123 | "default": "" 124 | }), 125 | "width": ("INT", { 126 | "default": 2048, 127 | "min": 2048, #Minimum value 128 | "max": 4096, #Maximum value 129 | "step": 64, #Slider's step 130 | "display": "number" 131 | }), 132 | "height": ("INT", { 133 | "default": 2048, 134 | "min": 2048, #Minimum value 135 | "max": 4096, #Maximum value 136 | "step": 64, #Slider's step 137 | "display": "number" 138 | }), 139 | "inference_steps": ("INT", { 140 | "default": 40, 141 | "min": 1, #Minimum value 142 | "max": 100, #Maximum value 143 | "step": 1, #Slider's step 144 | "display": "number" 145 | }), 146 | "cfg": ("FLOAT", { 147 | "default": 7.5, 148 | "min": 1.0, 149 | "max": 20.0, 150 | "step": 0.5, 151 | "round": 0.001, 152 | "display": "number"}), 153 | "seed": ("INT", { 154 | "default": 522, 155 | "display": "number" 156 | }), 157 | }, 158 | } 159 | 160 | RETURN_TYPES = ("IMAGE",) 161 | FUNCTION = "execute" 162 | CATEGORY = "tests" 163 | 164 | def execute(self, ckpt_name, positive, negative, width, height, inference_steps, cfg, seed): 165 | ckpt_path = folder_paths.get_full_path("checkpoints", ckpt_name) 166 | pipe = DemoFusionSDXLStableDiffusionPipeline.from_single_file(ckpt_path, torch_dtype=torch.float16, use_safetensors=True) 167 | pipe = pipe.to("cuda") 168 | 169 | generator = torch.Generator(device='cuda') 170 | generator = generator.manual_seed(seed) 171 | 172 | images = pipe(str(positive), negative_prompt=str(negative), 173 | height=height, width=width, view_batch_size=4, stride=64, 174 | num_inference_steps=inference_steps, guidance_scale=cfg, 175 | cosine_scale_1=3, cosine_scale_2=1, cosine_scale_3=1, sigma=0.8, 176 | multi_decoder=True, show_image=False 177 | ) 178 | image=images[len(images)-1] 179 | image = image.convert("RGB") 180 | image = np.array(image).astype(np.float32) / 255.0 181 | image = torch.from_numpy(image)[None,] 182 | 183 | return (image,) 184 | 185 | 186 | def generate_noised_latents(x, sigmas): 187 | """ 188 | Generate all noised latents for a given initial latent image and sigmas in parallel. 189 | 190 | :param x: Original latent image as a PyTorch tensor. 191 | :param sigmas: Array of sigma values for each timestep as a PyTorch tensor. 192 | :return: A tensor containing all noised latents for each timestep. 
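Concretely, each output element is the original latent plus fresh standard-normal noise scaled by the corresponding sigma (the first sigma in the schedule is skipped), i.e. noised[i] = x + sigmas[i+1] * noise_i.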
193 | """ 194 | # Ensure that x and sigmas are on the same device (e.g., CPU or CUDA) 195 | device = x.device 196 | sigmas = sigmas[1:].to(device) # ignore the first sigma 197 | batch_size = x.shape[0] 198 | num_sigmas = len(sigmas) 199 | 200 | # Expand x and sigmas to match each other in the first dimension 201 | # x_expanded shape will be: 202 | # [batch_size * num_sigmas, channels, height, width] 203 | x_expanded = x.repeat(num_sigmas, 1, 1, 1) 204 | sigmas_expanded = sigmas.repeat_interleave(batch_size) 205 | 206 | logger.warning(f"sigmas: {sigmas.view(-1)}") 207 | 208 | # Create a noise tensor with the same shape as x_expanded 209 | noise = torch.randn_like(x_expanded) 210 | 211 | logger.warning(f"noise: {noise.shape}") 212 | logger.warning(f"x: {x.shape}") 213 | 214 | # Multiply noise by sigmas, reshaped for broadcasting 215 | noised_latents = x_expanded + noise * sigmas_expanded.view(-1, 1, 1, 1) 216 | 217 | logger.warning(f"noised_latents: {x.shape}") 218 | 219 | return noised_latents 220 | 221 | 222 | class BatchUnsampler: 223 | @classmethod 224 | def INPUT_TYPES(s): 225 | return {"required": 226 | {"model": ("MODEL",), 227 | "steps": ("INT", {"default": 20, "min": 1, "max": 10000}), 228 | "end_at_step": ("INT", {"default": 0, "min": 0, "max": 10000}), 229 | "step_increment": ("INT", {"default": 1, "min": 1, "max": 10000}), 230 | "cfg": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 100.0}), 231 | "sampler_name": (comfy.samplers.KSampler.SAMPLERS, ), 232 | "scheduler": (comfy.samplers.KSampler.SCHEDULERS, ), 233 | "normalize": (["disable", "enable"], ), 234 | "positive": ("CONDITIONING", ), 235 | "negative": ("CONDITIONING", ), 236 | "latent_image": ("LATENT", ), 237 | }} 238 | 239 | RETURN_TYPES = ("LATENT",) 240 | RETURN_NAMES = ("latent_batch",) 241 | FUNCTION = "unsampler" 242 | 243 | CATEGORY = "tests" 244 | 245 | 246 | def unsampler(self, model, cfg, sampler_name, steps, end_at_step, step_increment, scheduler, normalize, positive, negative, latent_image): 247 | """ 248 | Generate a batch of latents representing each z[i] in the 249 | progressively noised sequence of latents stemming from the 250 | source latent_image, using the model's noising schedule (sigma) 251 | in reverse and applying normal noise at each step in the manner 252 | prescribed by the original latent diffusion paper. 
253 | """ 254 | normalize = normalize == "enable" 255 | device = comfy.model_management.get_torch_device() 256 | latent = latent_image 257 | latent_image = latent["samples"] 258 | 259 | batch_of_latents = [] 260 | 261 | end_at_step = min(end_at_step, steps-1) 262 | end_at_step = steps - end_at_step 263 | 264 | noise = torch.zeros(latent_image.size(), dtype=latent_image.dtype, layout=latent_image.layout, device="cpu") 265 | noise_mask = None 266 | if "noise_mask" in latent: 267 | noise_mask = comfy.sample.prepare_mask(latent["noise_mask"], noise, device) 268 | 269 | real_model = model.model 270 | 271 | noise = noise.to(device) 272 | latent_image = latent_image.to(device) 273 | 274 | positive = comfy.sample.convert_cond(positive) 275 | negative = comfy.sample.convert_cond(negative) 276 | 277 | models, inference_memory = comfy.sample.get_additional_models(positive, negative, model.model_dtype()) 278 | 279 | comfy.model_management.load_models_gpu([model] + models, model.memory_required(noise.shape) + inference_memory) 280 | 281 | sampler = comfy.samplers.KSampler(real_model, steps=steps, device=device, sampler=sampler_name, scheduler=scheduler, denoise=1.0, model_options=model.model_options) 282 | 283 | # Flip the sigmas (the sampling schedule) in reverse so that the sampler 284 | # will instead "unsample" the latent, adding noise rather than 285 | # removing noise. I think we add a small epsilon value to prevent 286 | # division by zero, but I'm not really sure. I got this code from 287 | # BlenderNeko's noise node. 288 | sigmas = sampler.sigmas.flip(0) 289 | 290 | z = generate_noised_latents(latent_image, sigmas) 291 | 292 | logger.warning(f"latent_image.shape={latent_image.shape}") 293 | logger.warning(f"z.shape={z.shape}") 294 | 295 | out = {"samples": z} 296 | 297 | comfy.sample.cleanup_additional_models(models) 298 | 299 | return (out,) 300 | 301 | def get_blending_schedule(steps, alpha_1=3.0, step_size=1): 302 | """ 303 | Define a tensor representing the constant c1 from the DemoFusion paper. 304 | """ 305 | # Create a tensor for t ranging from steps to 0. 306 | # In the DemoFusion paper, they use the original stable diffusion 307 | # terminology where de-noising goes from T to 0, but in our case, 308 | # we go from 0 to steps so we have to reverse the time step 309 | # indices. 310 | t = torch.arange(0, steps + step_size, step_size) 311 | t = t.flip([0]) 312 | 313 | # Calculate c1 using the code borrowed from the author's repository. 314 | cosine_factor = 0.5 * (1 + torch.cos(torch.pi * (steps - t) / steps)) 315 | 316 | c1 = cosine_factor ** alpha_1 317 | return c1 318 | 319 | def batched_ksampler(model, seed, cfg, sampler_name, scheduler, step_increment, positive, negative, latent_image_batch, denoise=1.0, disable_noise=False, start_step=None, last_step=None, force_full_denoise=False, c1=None, alpha_1=3.0): 320 | z_primes = latent_image_batch["samples"] 321 | steps = z_primes.shape[0] # batch size 322 | 323 | # Get the blending parameter from the DemoFusion paper. 324 | if c1 is None: 325 | c1 = get_blending_schedule(steps, step_size=step_increment, alpha_1=alpha_1) 326 | 327 | # Move the blending schedule tensor to the same device as our 328 | # latents. 
329 | c1 = c1.to(z_primes.device) 330 | 331 | if disable_noise: 332 | noise = torch.zeros(z_primes.size(), dtype=z_primes.dtype, layout=z_primes.layout, device="cpu") 333 | else: 334 | batch_inds = latent_image_batch["batch_index"] if "batch_index" in latent_image_batch else None 335 | noise = comfy.sample.prepare_noise(z_primes, seed, batch_inds) 336 | 337 | noise_mask = None 338 | if "noise_mask" in latent_image_batch: 339 | noise_mask = latent_image_batch["noise_mask"] 340 | 341 | pbar = comfy.utils.ProgressBar(steps) 342 | def callback(step, x0, x, total_steps): 343 | pbar.update_absolute(step + 1, total_steps) 344 | 345 | disable_pbar = not comfy.utils.PROGRESS_BAR_ENABLED 346 | 347 | # Assume that the first image in the batch is the most noised, 348 | # representing z_prime[T] from the DemoFusion paper. You may 349 | # need to reverse the batch of latents to ensure that the first 350 | # latent in the batch is the most noised. 351 | 352 | # Set up a tensor to receive the samples from the de-noising process. 353 | # It has the same shape as z_primes and the first element is 354 | # the first element of z_primes. 355 | z_out = torch.zeros_like(z_primes) 356 | z_out = z_out.to(z_primes.device) 357 | z_out[0] = z_primes[0] 358 | z_i = z_primes[0].unsqueeze(0) 359 | 360 | # The paper suggests that we de-noise the image step by step, blending 361 | # in samples from the noised z_hat tensor along the way according to 362 | # a blending schedule given by the c1 tensor. Each successively de-noised 363 | # sample z[i] is given by this formula: 364 | # 365 | # z[i] = denoise(z[i-1]) * (1 - c1[i]) + z_prime[i-1] * c1 366 | # 367 | # Thus we have to do the following: 368 | # a) latent_image contains the z_primes for all steps. 369 | # b) Iterate through all the steps, denoising from z_prime[0] initially. 370 | # c) Blend the denoised z[i] with z_prime[i] to get a new z[i]. 371 | 372 | for i in range(1, steps, step_increment): 373 | # Grab the i-th z_prime and i-th noise tensor from their batches. 374 | # Unsqueezing replaces the batch dimension with 1, so it transforms 375 | # [i, channel, width, height] into [1, channel, width, height] 376 | z_prime_i = z_primes[i].unsqueeze(0) 377 | noise_i = noise[i].unsqueeze(0) 378 | 379 | # The paper tells us to de-noise z[i-1] from step 380 | # T to T-1; in ComfyUI lingo, that means going from 381 | # step i-1 to step i because we iterate in the reverse 382 | # direction. 383 | z_start_step = i - 1 384 | z_last_step = i 385 | z_i_minus_1 = z_out[i-1] 386 | 387 | logger.warning(f'Denoising z[{i}] from {i-1} to {i} (steps={steps})') 388 | 389 | # De-noise z[i-1] from step i-1 to step i. Recall that since we 390 | # start this loop from i=1, z[i-1] is initialized with z_prime[0]. 391 | # After we have the de-noised latent, we will mix it with z_prime[i] 392 | # according to the paper's cosine blending function. The blended 393 | # latent will then become z[i] and we will head to the next iteration. 394 | samples_i = comfy.sample.sample(model, noise_i, steps, cfg, sampler_name, scheduler, positive, negative, z_i_minus_1, 395 | denoise=denoise, disable_noise=disable_noise, start_step=z_start_step, last_step=z_last_step, 396 | force_full_denoise=force_full_denoise, noise_mask=noise_mask, callback=callback, disable_pbar=disable_pbar, seed=seed) 397 | 398 | # Move samples to the same device as z_prime_i so that we can 399 | # work with them both to mix below. 
400 | samples_i = samples_i.to(z_prime_i.device) 401 | 402 | # Find z_hat (as per the paper) by applying the c1 blending schedule 403 | # to the samples and the prior z_prime latent. The paper suggests 404 | # following this formula, which will mix in a declining fraction of 405 | # z_prime as de-noising continues: 406 | # 407 | # z[i] = denoise(z[i-1]) * (1 - c1[i]) + z_prime[i-1] * c1 408 | 409 | c1_i = c1[i] 410 | logger.warning(f'mixing in {c1_i} of the z_prime latent and {1 - c1_i} of the samples') 411 | z_i = c1_i * z_prime_i + (1 - c1_i) * samples_i 412 | 413 | # Append this new latent onto a list; we will concatenate later 414 | # to create a batch tensor for output from the node. 415 | logger.warning(f'z_i has shape {z_i.shape}') 416 | 417 | z_out[i] = z_i 418 | #z_out[i] = samples_i 419 | 420 | logger.warning(f'dimension at output is {z_out.shape}') 421 | out = latent_image_batch.copy() 422 | out["samples"] = z_out 423 | return (out, ) 424 | 425 | 426 | class IterativeMixingKSampler: 427 | """ 428 | Take a batch of latents, z_prime, and progressively de-noise them 429 | step by step from z_prime[0] to z_prime[steps], mixing in a weighted 430 | fraction of z_prime[i] at each step so that de-noising is guided by 431 | the z_prime latents. This batch sampler assumes that the number of steps 432 | is just the length of z_prime, so there is no steps parameter. The parameter 433 | latent_image_batch should come from the Batch Unsampler node. 434 | """ 435 | @classmethod 436 | def INPUT_TYPES(s): 437 | return {"required": 438 | {"model": ("MODEL",), 439 | "seed": ("INT", {"default": 0, "min": 0, "max": 0xffffffffffffffff}), 440 | "cfg": ("FLOAT", {"default": 8.0, "min": 0.0, "max": 100.0, "step":0.1, "round": 0.01}), 441 | "sampler_name": (comfy.samplers.KSampler.SAMPLERS, ), 442 | "scheduler": (comfy.samplers.KSampler.SCHEDULERS, ), 443 | "step_increment": ("INT", {"default": 1, "min": 1, "max": 10000}), 444 | "positive": ("CONDITIONING", ), 445 | "negative": ("CONDITIONING", ), 446 | "latent_image_batch": ("LATENT", ), 447 | "denoise": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 1.0, "step": 0.01}), 448 | "alpha_1": ("FLOAT", {"default": 3.0, "min": 0.1, "max": 10.0}), 449 | "reverse_batch": ("BOOLEAN", {"default": True}) 450 | } 451 | } 452 | 453 | RETURN_TYPES = ("LATENT",) 454 | FUNCTION = "sample" 455 | 456 | CATEGORY = "test" 457 | 458 | def sample(self, model, seed, cfg, sampler_name, scheduler, step_increment, positive, negative, latent_image_batch, denoise=1.0, alpha_1=3.0, reverse_batch=True): 459 | # If desired, reverse the latent batch before batch sampling. 460 | # This is important if the supplied batch is from least-to-most noisy, 461 | # as this node expects the reverse. 
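# (The Batch Unsampler above builds its batch from the flipped, ascending sigma schedule, so its output runs from least to most noised; when feeding that output in directly, the default reverse_batch=True is normally what you want.)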
462 | if reverse_batch: 463 | latent_image_batch["samples"] =\ 464 | torch.flip(latent_image_batch["samples"], [0]) 465 | return batched_ksampler(model, seed, cfg, sampler_name, scheduler, step_increment, positive, negative, latent_image_batch, denoise=denoise, alpha_1=alpha_1) 466 | 467 | 468 | 469 | # A dictionary that contains all nodes you want to export with their names 470 | # NOTE: names should be globally unique 471 | NODE_CLASS_MAPPINGS = { 472 | "Demofusion": Demofusion, 473 | "Demofusion From Single File" : DemofusionFromSingleFile, 474 | "Batch Unsampler": BatchUnsampler, 475 | "Iterative Mixing KSampler": IterativeMixingKSampler 476 | } 477 | 478 | # A dictionary that contains the friendly/humanly readable titles for the nodes 479 | NODE_DISPLAY_NAME_MAPPINGS = { 480 | "Demofusion": "Demofusion", 481 | "Demofusion From Single File": "Demofusion From Single File", 482 | "Batch Unsampler": "Batch Unsampler", 483 | "Iterative Mixing KSampler": "Iterative Mixing KSampler" 484 | } -------------------------------------------------------------------------------- /examples/Demofusion From Single File Example.json: -------------------------------------------------------------------------------- 1 | { 2 | "last_node_id": 4, 3 | "last_link_id": 3, 4 | "nodes": [ 5 | { 6 | "id": 2, 7 | "type": "PreviewImage", 8 | "pos": [ 9 | 1238, 10 | 289 11 | ], 12 | "size": [ 13 | 555.3333740234375, 14 | 579.888916015625 15 | ], 16 | "flags": {}, 17 | "order": 1, 18 | "mode": 0, 19 | "inputs": [ 20 | { 21 | "name": "images", 22 | "type": "IMAGE", 23 | "link": 3 24 | } 25 | ], 26 | "properties": { 27 | "Node name for S&R": "PreviewImage" 28 | } 29 | }, 30 | { 31 | "id": 4, 32 | "type": "Demofusion From Single File", 33 | "pos": [ 34 | 617, 35 | 282 36 | ], 37 | "size": { 38 | "0": 400, 39 | "1": 294 40 | }, 41 | "flags": {}, 42 | "order": 0, 43 | "mode": 0, 44 | "outputs": [ 45 | { 46 | "name": "IMAGE", 47 | "type": "IMAGE", 48 | "links": [ 49 | 3 50 | ], 51 | "shape": 3, 52 | "slot_index": 0 53 | } 54 | ], 55 | "properties": { 56 | "Node name for S&R": "Demofusion From Single File" 57 | }, 58 | "widgets_values": [ 59 | "SDXL_1.0_MOHAWK_v18.safetensors", 60 | "A photo of a man in a cafe.", 61 | "", 62 | 2048, 63 | 2048, 64 | 30, 65 | 7.5, 66 | 1537, 67 | "randomize" 68 | ] 69 | } 70 | ], 71 | "links": [ 72 | [ 73 | 3, 74 | 4, 75 | 0, 76 | 2, 77 | 0, 78 | "IMAGE" 79 | ] 80 | ], 81 | "groups": [], 82 | "config": {}, 83 | "extra": {}, 84 | "version": 0.4 85 | } -------------------------------------------------------------------------------- /examples/Iterative KSampler Example.json: -------------------------------------------------------------------------------- 1 | { 2 | "last_node_id": 84, 3 | "last_link_id": 235, 4 | "nodes": [ 5 | { 6 | "id": 23, 7 | "type": "EmptyLatentImage", 8 | "pos": [ 9 | 76.64568822265623, 10 | 295.661130855713 11 | ], 12 | "size": { 13 | "0": 315, 14 | "1": 106 15 | }, 16 | "flags": {}, 17 | "order": 0, 18 | "mode": 0, 19 | "outputs": [ 20 | { 21 | "name": "LATENT", 22 | "type": "LATENT", 23 | "links": [ 24 | 175 25 | ], 26 | "shape": 3 27 | } 28 | ], 29 | "properties": { 30 | "Node name for S&R": "EmptyLatentImage" 31 | }, 32 | "widgets_values": [ 33 | 512, 34 | 512, 35 | 1 36 | ] 37 | }, 38 | { 39 | "id": 2, 40 | "type": "CheckpointLoaderSimple", 41 | "pos": [ 42 | 76.64568822265623, 43 | 531.6611308557129 44 | ], 45 | "size": { 46 | "0": 315, 47 | "1": 98 48 | }, 49 | "flags": {}, 50 | "order": 1, 51 | "mode": 0, 52 | "outputs": [ 53 | { 54 | "name": "MODEL", 55 | "type": 
"MODEL", 56 | "links": [ 57 | 168 58 | ], 59 | "shape": 3, 60 | "slot_index": 0 61 | }, 62 | { 63 | "name": "CLIP", 64 | "type": "CLIP", 65 | "links": [ 66 | 169 67 | ], 68 | "shape": 3, 69 | "slot_index": 1 70 | }, 71 | { 72 | "name": "VAE", 73 | "type": "VAE", 74 | "links": [ 75 | 177 76 | ], 77 | "shape": 3 78 | } 79 | ], 80 | "properties": { 81 | "Node name for S&R": "CheckpointLoaderSimple" 82 | }, 83 | "widgets_values": [ 84 | "epic-realism.safetensors" 85 | ] 86 | }, 87 | { 88 | "id": 32, 89 | "type": "LoraLoader", 90 | "pos": [ 91 | 491.64568822265636, 92 | 119.66113085571297 93 | ], 94 | "size": { 95 | "0": 315, 96 | "1": 126 97 | }, 98 | "flags": {}, 99 | "order": 3, 100 | "mode": 0, 101 | "inputs": [ 102 | { 103 | "name": "model", 104 | "type": "MODEL", 105 | "link": 168 106 | }, 107 | { 108 | "name": "clip", 109 | "type": "CLIP", 110 | "link": 169 111 | } 112 | ], 113 | "outputs": [ 114 | { 115 | "name": "MODEL", 116 | "type": "MODEL", 117 | "links": [ 118 | 172, 119 | 178, 120 | 179 121 | ], 122 | "shape": 3, 123 | "slot_index": 0 124 | }, 125 | { 126 | "name": "CLIP", 127 | "type": "CLIP", 128 | "links": [ 129 | 170, 130 | 171, 131 | 180 132 | ], 133 | "shape": 3, 134 | "slot_index": 1 135 | } 136 | ], 137 | "properties": { 138 | "Node name for S&R": "LoraLoader" 139 | }, 140 | "widgets_values": [ 141 | "lcm_sd15.safetensors", 142 | 1, 143 | 1 144 | ] 145 | }, 146 | { 147 | "id": 4, 148 | "type": "CLIPTextEncode", 149 | "pos": [ 150 | 500, 151 | 442 152 | ], 153 | "size": [ 154 | 304.0624967578128, 155 | 109.64216953125026 156 | ], 157 | "flags": {}, 158 | "order": 5, 159 | "mode": 0, 160 | "inputs": [ 161 | { 162 | "name": "clip", 163 | "type": "CLIP", 164 | "link": 171 165 | } 166 | ], 167 | "outputs": [ 168 | { 169 | "name": "CONDITIONING", 170 | "type": "CONDITIONING", 171 | "links": [ 172 | 174, 173 | 182 174 | ], 175 | "shape": 3, 176 | "slot_index": 0 177 | } 178 | ], 179 | "properties": { 180 | "Node name for S&R": "CLIPTextEncode" 181 | }, 182 | "widgets_values": [ 183 | "" 184 | ], 185 | "color": "#322", 186 | "bgcolor": "#533" 187 | }, 188 | { 189 | "id": 35, 190 | "type": "ToBasicPipe", 191 | "pos": [ 192 | 974, 193 | 119 194 | ], 195 | "size": { 196 | "0": 241.79998779296875, 197 | "1": 106 198 | }, 199 | "flags": {}, 200 | "order": 7, 201 | "mode": 0, 202 | "inputs": [ 203 | { 204 | "name": "model", 205 | "type": "MODEL", 206 | "link": 179, 207 | "slot_index": 0 208 | }, 209 | { 210 | "name": "clip", 211 | "type": "CLIP", 212 | "link": 180, 213 | "slot_index": 1 214 | }, 215 | { 216 | "name": "vae", 217 | "type": "VAE", 218 | "link": 177, 219 | "slot_index": 2 220 | }, 221 | { 222 | "name": "positive", 223 | "type": "CONDITIONING", 224 | "link": 192, 225 | "slot_index": 3 226 | }, 227 | { 228 | "name": "negative", 229 | "type": "CONDITIONING", 230 | "link": 182, 231 | "slot_index": 4 232 | } 233 | ], 234 | "outputs": [ 235 | { 236 | "name": "basic_pipe", 237 | "type": "BASIC_PIPE", 238 | "links": [ 239 | 186 240 | ], 241 | "shape": 3, 242 | "slot_index": 0 243 | } 244 | ], 245 | "properties": { 246 | "Node name for S&R": "ToBasicPipe" 247 | } 248 | }, 249 | { 250 | "id": 3, 251 | "type": "CLIPTextEncode", 252 | "pos": [ 253 | 495, 254 | 292 255 | ], 256 | "size": [ 257 | 311.6174980468752, 258 | 107.05781835937523 259 | ], 260 | "flags": {}, 261 | "order": 4, 262 | "mode": 0, 263 | "inputs": [ 264 | { 265 | "name": "clip", 266 | "type": "CLIP", 267 | "link": 170 268 | } 269 | ], 270 | "outputs": [ 271 | { 272 | "name": "CONDITIONING", 273 | "type": 
"CONDITIONING", 274 | "links": [ 275 | 173, 276 | 192 277 | ], 278 | "shape": 3, 279 | "slot_index": 0 280 | } 281 | ], 282 | "properties": { 283 | "Node name for S&R": "CLIPTextEncode" 284 | }, 285 | "widgets_values": [ 286 | "a man in a cafe" 287 | ], 288 | "color": "#232", 289 | "bgcolor": "#353" 290 | }, 291 | { 292 | "id": 60, 293 | "type": "Reroute", 294 | "pos": [ 295 | 1572, 296 | -58 297 | ], 298 | "size": [ 299 | 75, 300 | 26 301 | ], 302 | "flags": {}, 303 | "order": 9, 304 | "mode": 0, 305 | "inputs": [ 306 | { 307 | "name": "", 308 | "type": "*", 309 | "link": 186 310 | } 311 | ], 312 | "outputs": [ 313 | { 314 | "name": "", 315 | "type": "BASIC_PIPE", 316 | "links": [ 317 | 189 318 | ], 319 | "slot_index": 0 320 | } 321 | ], 322 | "properties": { 323 | "showOutputText": false, 324 | "horizontal": false 325 | } 326 | }, 327 | { 328 | "id": 41, 329 | "type": "MiDaS-DepthMapPreprocessor", 330 | "pos": [ 331 | 1813, 332 | 528 333 | ], 334 | "size": { 335 | "0": 315, 336 | "1": 106 337 | }, 338 | "flags": {}, 339 | "order": 15, 340 | "mode": 0, 341 | "inputs": [ 342 | { 343 | "name": "image", 344 | "type": "IMAGE", 345 | "link": 97, 346 | "slot_index": 0 347 | } 348 | ], 349 | "outputs": [ 350 | { 351 | "name": "IMAGE", 352 | "type": "IMAGE", 353 | "links": [ 354 | 98 355 | ], 356 | "shape": 3, 357 | "slot_index": 0 358 | } 359 | ], 360 | "properties": { 361 | "Node name for S&R": "MiDaS-DepthMapPreprocessor" 362 | }, 363 | "widgets_values": [ 364 | 6.283185307179586, 365 | 0.1, 366 | 512 367 | ] 368 | }, 369 | { 370 | "id": 40, 371 | "type": "ControlNetApply", 372 | "pos": [ 373 | 2119, 374 | 331 375 | ], 376 | "size": { 377 | "0": 317.4000244140625, 378 | "1": 98 379 | }, 380 | "flags": {}, 381 | "order": 16, 382 | "mode": 0, 383 | "inputs": [ 384 | { 385 | "name": "conditioning", 386 | "type": "CONDITIONING", 387 | "link": 190 388 | }, 389 | { 390 | "name": "control_net", 391 | "type": "CONTROL_NET", 392 | "link": 99, 393 | "slot_index": 1 394 | }, 395 | { 396 | "name": "image", 397 | "type": "IMAGE", 398 | "link": 98 399 | } 400 | ], 401 | "outputs": [ 402 | { 403 | "name": "CONDITIONING", 404 | "type": "CONDITIONING", 405 | "links": [ 406 | 191 407 | ], 408 | "shape": 3, 409 | "slot_index": 0 410 | } 411 | ], 412 | "properties": { 413 | "Node name for S&R": "ControlNetApply" 414 | }, 415 | "widgets_values": [ 416 | 0.25 417 | ] 418 | }, 419 | { 420 | "id": 42, 421 | "type": "ControlNetLoader", 422 | "pos": [ 423 | 2130, 424 | 135 425 | ], 426 | "size": { 427 | "0": 315, 428 | "1": 58 429 | }, 430 | "flags": {}, 431 | "order": 2, 432 | "mode": 0, 433 | "outputs": [ 434 | { 435 | "name": "CONTROL_NET", 436 | "type": "CONTROL_NET", 437 | "links": [ 438 | 99 439 | ], 440 | "shape": 3 441 | } 442 | ], 443 | "properties": { 444 | "Node name for S&R": "ControlNetLoader" 445 | }, 446 | "widgets_values": [ 447 | "control_sd15_depth.pth" 448 | ] 449 | }, 450 | { 451 | "id": 22, 452 | "type": "KSampler", 453 | "pos": [ 454 | 1406.6456882226562, 455 | 119.66113085571297 456 | ], 457 | "size": { 458 | "0": 315, 459 | "1": 474 460 | }, 461 | "flags": {}, 462 | "order": 6, 463 | "mode": 0, 464 | "inputs": [ 465 | { 466 | "name": "model", 467 | "type": "MODEL", 468 | "link": 172, 469 | "slot_index": 0 470 | }, 471 | { 472 | "name": "positive", 473 | "type": "CONDITIONING", 474 | "link": 173, 475 | "slot_index": 1 476 | }, 477 | { 478 | "name": "negative", 479 | "type": "CONDITIONING", 480 | "link": 174, 481 | "slot_index": 2 482 | }, 483 | { 484 | "name": "latent_image", 485 | "type": 
"LATENT", 486 | "link": 175, 487 | "slot_index": 3 488 | } 489 | ], 490 | "outputs": [ 491 | { 492 | "name": "LATENT", 493 | "type": "LATENT", 494 | "links": [ 495 | 183, 496 | 193 497 | ], 498 | "shape": 3, 499 | "slot_index": 0 500 | } 501 | ], 502 | "properties": { 503 | "Node name for S&R": "KSampler" 504 | }, 505 | "widgets_values": [ 506 | 947679663182103, 507 | "fixed", 508 | 16, 509 | 1.5, 510 | "lcm", 511 | "sgm_uniform", 512 | 1 513 | ] 514 | }, 515 | { 516 | "id": 65, 517 | "type": "Reroute", 518 | "pos": [ 519 | 1777, 520 | -10 521 | ], 522 | "size": [ 523 | 75, 524 | 26 525 | ], 526 | "flags": {}, 527 | "order": 8, 528 | "mode": 0, 529 | "inputs": [ 530 | { 531 | "name": "", 532 | "type": "*", 533 | "link": 193 534 | } 535 | ], 536 | "outputs": [ 537 | { 538 | "name": "", 539 | "type": "LATENT", 540 | "links": [ 541 | 194 542 | ], 543 | "slot_index": 0 544 | } 545 | ], 546 | "properties": { 547 | "showOutputText": false, 548 | "horizontal": false 549 | } 550 | }, 551 | { 552 | "id": 25, 553 | "type": "VAEDecode", 554 | "pos": [ 555 | 1880, 556 | 372 557 | ], 558 | "size": { 559 | "0": 210, 560 | "1": 46 561 | }, 562 | "flags": { 563 | "collapsed": true 564 | }, 565 | "order": 13, 566 | "mode": 0, 567 | "inputs": [ 568 | { 569 | "name": "samples", 570 | "type": "LATENT", 571 | "link": 183 572 | }, 573 | { 574 | "name": "vae", 575 | "type": "VAE", 576 | "link": 196, 577 | "slot_index": 1 578 | } 579 | ], 580 | "outputs": [ 581 | { 582 | "name": "IMAGE", 583 | "type": "IMAGE", 584 | "links": [ 585 | 97 586 | ], 587 | "shape": 3, 588 | "slot_index": 0 589 | } 590 | ], 591 | "properties": { 592 | "Node name for S&R": "VAEDecode" 593 | } 594 | }, 595 | { 596 | "id": 64, 597 | "type": "EditBasicPipe", 598 | "pos": [ 599 | 2478, 600 | 129 601 | ], 602 | "size": { 603 | "0": 267, 604 | "1": 126 605 | }, 606 | "flags": {}, 607 | "order": 17, 608 | "mode": 0, 609 | "inputs": [ 610 | { 611 | "name": "basic_pipe", 612 | "type": "BASIC_PIPE", 613 | "link": 232 614 | }, 615 | { 616 | "name": "model", 617 | "type": "MODEL", 618 | "link": null 619 | }, 620 | { 621 | "name": "clip", 622 | "type": "CLIP", 623 | "link": null 624 | }, 625 | { 626 | "name": "vae", 627 | "type": "VAE", 628 | "link": null 629 | }, 630 | { 631 | "name": "positive", 632 | "type": "CONDITIONING", 633 | "link": 191 634 | }, 635 | { 636 | "name": "negative", 637 | "type": "CONDITIONING", 638 | "link": null 639 | } 640 | ], 641 | "outputs": [ 642 | { 643 | "name": "basic_pipe", 644 | "type": "BASIC_PIPE", 645 | "links": [ 646 | 198 647 | ], 648 | "shape": 3, 649 | "slot_index": 0 650 | } 651 | ], 652 | "properties": { 653 | "Node name for S&R": "EditBasicPipe" 654 | } 655 | }, 656 | { 657 | "id": 68, 658 | "type": "Reroute", 659 | "pos": [ 660 | 2759, 661 | -80 662 | ], 663 | "size": [ 664 | 75, 665 | 26 666 | ], 667 | "flags": {}, 668 | "order": 18, 669 | "mode": 0, 670 | "inputs": [ 671 | { 672 | "name": "", 673 | "type": "*", 674 | "link": 198 675 | } 676 | ], 677 | "outputs": [ 678 | { 679 | "name": "", 680 | "type": "BASIC_PIPE", 681 | "links": [ 682 | 199 683 | ], 684 | "slot_index": 0 685 | } 686 | ], 687 | "properties": { 688 | "showOutputText": false, 689 | "horizontal": false 690 | } 691 | }, 692 | { 693 | "id": 66, 694 | "type": "Reroute", 695 | "pos": [ 696 | 2793, 697 | -10 698 | ], 699 | "size": [ 700 | 75, 701 | 26 702 | ], 703 | "flags": {}, 704 | "order": 10, 705 | "mode": 0, 706 | "inputs": [ 707 | { 708 | "name": "", 709 | "type": "*", 710 | "link": 194 711 | } 712 | ], 713 | "outputs": [ 714 | { 715 | 
"name": "", 716 | "type": "LATENT", 717 | "links": [ 718 | 195 719 | ], 720 | "slot_index": 0 721 | } 722 | ], 723 | "properties": { 724 | "showOutputText": false, 725 | "horizontal": false 726 | } 727 | }, 728 | { 729 | "id": 46, 730 | "type": "Latent Upscale by Factor (WAS)", 731 | "pos": [ 732 | 2880, 733 | 141 734 | ], 735 | "size": { 736 | "0": 315, 737 | "1": 106 738 | }, 739 | "flags": {}, 740 | "order": 12, 741 | "mode": 0, 742 | "inputs": [ 743 | { 744 | "name": "samples", 745 | "type": "LATENT", 746 | "link": 195, 747 | "slot_index": 0 748 | } 749 | ], 750 | "outputs": [ 751 | { 752 | "name": "LATENT", 753 | "type": "LATENT", 754 | "links": [ 755 | 197 756 | ], 757 | "shape": 3, 758 | "slot_index": 0 759 | } 760 | ], 761 | "properties": { 762 | "Node name for S&R": "Latent Upscale by Factor (WAS)" 763 | }, 764 | "widgets_values": [ 765 | "bilinear", 766 | 2, 767 | "true" 768 | ] 769 | }, 770 | { 771 | "id": 67, 772 | "type": "Reroute", 773 | "pos": [ 774 | 3284, 775 | -12 776 | ], 777 | "size": [ 778 | 75, 779 | 26 780 | ], 781 | "flags": {}, 782 | "order": 14, 783 | "mode": 0, 784 | "inputs": [ 785 | { 786 | "name": "", 787 | "type": "*", 788 | "link": 197 789 | } 790 | ], 791 | "outputs": [ 792 | { 793 | "name": "", 794 | "type": "LATENT", 795 | "links": [ 796 | 200 797 | ], 798 | "slot_index": 0 799 | } 800 | ], 801 | "properties": { 802 | "showOutputText": false, 803 | "horizontal": false 804 | } 805 | }, 806 | { 807 | "id": 37, 808 | "type": "FromBasicPipe_v2", 809 | "pos": [ 810 | 3173, 811 | -51 812 | ], 813 | "size": { 814 | "0": 267, 815 | "1": 126 816 | }, 817 | "flags": { 818 | "collapsed": true 819 | }, 820 | "order": 19, 821 | "mode": 0, 822 | "inputs": [ 823 | { 824 | "name": "basic_pipe", 825 | "type": "BASIC_PIPE", 826 | "link": 199, 827 | "slot_index": 0 828 | } 829 | ], 830 | "outputs": [ 831 | { 832 | "name": "basic_pipe", 833 | "type": "BASIC_PIPE", 834 | "links": [ 835 | 125 836 | ], 837 | "shape": 3, 838 | "slot_index": 0 839 | }, 840 | { 841 | "name": "model", 842 | "type": "MODEL", 843 | "links": [ 844 | 104 845 | ], 846 | "shape": 3, 847 | "slot_index": 1 848 | }, 849 | { 850 | "name": "clip", 851 | "type": "CLIP", 852 | "links": null, 853 | "shape": 3 854 | }, 855 | { 856 | "name": "vae", 857 | "type": "VAE", 858 | "links": [], 859 | "shape": 3, 860 | "slot_index": 3 861 | }, 862 | { 863 | "name": "positive", 864 | "type": "CONDITIONING", 865 | "links": [ 866 | 102, 867 | 105 868 | ], 869 | "shape": 3, 870 | "slot_index": 4 871 | }, 872 | { 873 | "name": "negative", 874 | "type": "CONDITIONING", 875 | "links": [ 876 | 103, 877 | 106 878 | ], 879 | "shape": 3, 880 | "slot_index": 5 881 | } 882 | ], 883 | "properties": { 884 | "Node name for S&R": "FromBasicPipe_v2" 885 | } 886 | }, 887 | { 888 | "id": 1, 889 | "type": "Batch Unsampler", 890 | "pos": [ 891 | 3326, 892 | 128 893 | ], 894 | "size": [ 895 | 335.16957687500053, 896 | 321.12790622070327 897 | ], 898 | "flags": {}, 899 | "order": 21, 900 | "mode": 0, 901 | "inputs": [ 902 | { 903 | "name": "model", 904 | "type": "MODEL", 905 | "link": 178, 906 | "slot_index": 0 907 | }, 908 | { 909 | "name": "positive", 910 | "type": "CONDITIONING", 911 | "link": 102, 912 | "slot_index": 1 913 | }, 914 | { 915 | "name": "negative", 916 | "type": "CONDITIONING", 917 | "link": 103, 918 | "slot_index": 2 919 | }, 920 | { 921 | "name": "latent_image", 922 | "type": "LATENT", 923 | "link": 200, 924 | "slot_index": 3 925 | } 926 | ], 927 | "outputs": [ 928 | { 929 | "name": "latent_batch", 930 | "type": "LATENT", 931 
| "links": [ 932 | 107 933 | ], 934 | "shape": 3, 935 | "slot_index": 0 936 | } 937 | ], 938 | "properties": { 939 | "Node name for S&R": "Batch Unsampler" 940 | }, 941 | "widgets_values": [ 942 | 30, 943 | 0, 944 | 1, 945 | 1.5, 946 | "lcm", 947 | "sgm_uniform", 948 | "disable" 949 | ] 950 | }, 951 | { 952 | "id": 43, 953 | "type": "Iterative Mixing KSampler", 954 | "pos": [ 955 | 3691, 956 | 134 957 | ], 958 | "size": { 959 | "0": 317.4000244140625, 960 | "1": 310 961 | }, 962 | "flags": {}, 963 | "order": 23, 964 | "mode": 0, 965 | "inputs": [ 966 | { 967 | "name": "model", 968 | "type": "MODEL", 969 | "link": 104 970 | }, 971 | { 972 | "name": "positive", 973 | "type": "CONDITIONING", 974 | "link": 105 975 | }, 976 | { 977 | "name": "negative", 978 | "type": "CONDITIONING", 979 | "link": 106 980 | }, 981 | { 982 | "name": "latent_image_batch", 983 | "type": "LATENT", 984 | "link": 107 985 | } 986 | ], 987 | "outputs": [ 988 | { 989 | "name": "LATENT", 990 | "type": "LATENT", 991 | "links": [ 992 | 109 993 | ], 994 | "shape": 3, 995 | "slot_index": 0 996 | } 997 | ], 998 | "properties": { 999 | "Node name for S&R": "Iterative Mixing KSampler" 1000 | }, 1001 | "widgets_values": [ 1002 | 229873120776553, 1003 | "fixed", 1004 | 1.5, 1005 | "lcm", 1006 | "sgm_uniform", 1007 | 1, 1008 | 1, 1009 | 0.30000000000000004, 1010 | true 1011 | ] 1012 | }, 1013 | { 1014 | "id": 28, 1015 | "type": "LatentFromBatch", 1016 | "pos": [ 1017 | 4107, 1018 | 133 1019 | ], 1020 | "size": { 1021 | "0": 315, 1022 | "1": 82 1023 | }, 1024 | "flags": {}, 1025 | "order": 25, 1026 | "mode": 0, 1027 | "inputs": [ 1028 | { 1029 | "name": "samples", 1030 | "type": "LATENT", 1031 | "link": 109 1032 | } 1033 | ], 1034 | "outputs": [ 1035 | { 1036 | "name": "LATENT", 1037 | "type": "LATENT", 1038 | "links": [ 1039 | 61, 1040 | 202 1041 | ], 1042 | "shape": 3, 1043 | "slot_index": 0 1044 | } 1045 | ], 1046 | "title": "Get the final latent from mixing", 1047 | "properties": { 1048 | "Node name for S&R": "LatentFromBatch" 1049 | }, 1050 | "widgets_values": [ 1051 | 30, 1052 | 1 1053 | ] 1054 | }, 1055 | { 1056 | "id": 13, 1057 | "type": "PreviewImage", 1058 | "pos": [ 1059 | 4448, 1060 | 284 1061 | ], 1062 | "size": [ 1063 | 288.535003125, 1064 | 328.159656347656 1065 | ], 1066 | "flags": {}, 1067 | "order": 31, 1068 | "mode": 0, 1069 | "inputs": [ 1070 | { 1071 | "name": "images", 1072 | "type": "IMAGE", 1073 | "link": 24 1074 | } 1075 | ], 1076 | "title": "It looks a bit rough", 1077 | "properties": { 1078 | "Node name for S&R": "PreviewImage" 1079 | } 1080 | }, 1081 | { 1082 | "id": 71, 1083 | "type": "PreviewImage", 1084 | "pos": [ 1085 | 6820.105859374996, 1086 | 265.3437538146974 1087 | ], 1088 | "size": { 1089 | "0": 288.5350036621094, 1090 | "1": 328.15966796875 1091 | }, 1092 | "flags": {}, 1093 | "order": 39, 1094 | "mode": 0, 1095 | "inputs": [ 1096 | { 1097 | "name": "images", 1098 | "type": "IMAGE", 1099 | "link": 208 1100 | } 1101 | ], 1102 | "title": "It looks a bit rough", 1103 | "properties": { 1104 | "Node name for S&R": "PreviewImage" 1105 | } 1106 | }, 1107 | { 1108 | "id": 72, 1109 | "type": "LatentFromBatch", 1110 | "pos": [ 1111 | 6480.105859374996, 1112 | 115.34375381469727 1113 | ], 1114 | "size": { 1115 | "0": 315, 1116 | "1": 82 1117 | }, 1118 | "flags": {}, 1119 | "order": 36, 1120 | "mode": 0, 1121 | "inputs": [ 1122 | { 1123 | "name": "samples", 1124 | "type": "LATENT", 1125 | "link": 209 1126 | } 1127 | ], 1128 | "outputs": [ 1129 | { 1130 | "name": "LATENT", 1131 | "type": "LATENT", 1132 | 
"links": [ 1133 | 207, 1134 | 213 1135 | ], 1136 | "shape": 3, 1137 | "slot_index": 0 1138 | } 1139 | ], 1140 | "title": "Get the final latent from mixing", 1141 | "properties": { 1142 | "Node name for S&R": "LatentFromBatch" 1143 | }, 1144 | "widgets_values": [ 1145 | 30, 1146 | 1 1147 | ] 1148 | }, 1149 | { 1150 | "id": 75, 1151 | "type": "Iterative Mixing KSampler", 1152 | "pos": [ 1153 | 6060.105859374996, 1154 | 115.34375381469727 1155 | ], 1156 | "size": { 1157 | "0": 317.4000244140625, 1158 | "1": 310 1159 | }, 1160 | "flags": {}, 1161 | "order": 35, 1162 | "mode": 0, 1163 | "inputs": [ 1164 | { 1165 | "name": "model", 1166 | "type": "MODEL", 1167 | "link": 215 1168 | }, 1169 | { 1170 | "name": "positive", 1171 | "type": "CONDITIONING", 1172 | "link": 216 1173 | }, 1174 | { 1175 | "name": "negative", 1176 | "type": "CONDITIONING", 1177 | "link": 217 1178 | }, 1179 | { 1180 | "name": "latent_image_batch", 1181 | "type": "LATENT", 1182 | "link": 218 1183 | } 1184 | ], 1185 | "outputs": [ 1186 | { 1187 | "name": "LATENT", 1188 | "type": "LATENT", 1189 | "links": [ 1190 | 209 1191 | ], 1192 | "shape": 3, 1193 | "slot_index": 0 1194 | } 1195 | ], 1196 | "properties": { 1197 | "Node name for S&R": "Iterative Mixing KSampler" 1198 | }, 1199 | "widgets_values": [ 1200 | 229873120776553, 1201 | "fixed", 1202 | 1.5, 1203 | "lcm", 1204 | "sgm_uniform", 1205 | 1, 1206 | 1, 1207 | 0.30000000000000004, 1208 | true 1209 | ] 1210 | }, 1211 | { 1212 | "id": 77, 1213 | "type": "FromBasicPipe_v2", 1214 | "pos": [ 1215 | 6760, 1216 | -70 1217 | ], 1218 | "size": { 1219 | "0": 267, 1220 | "1": 126 1221 | }, 1222 | "flags": { 1223 | "collapsed": true 1224 | }, 1225 | "order": 26, 1226 | "mode": 0, 1227 | "inputs": [ 1228 | { 1229 | "name": "basic_pipe", 1230 | "type": "BASIC_PIPE", 1231 | "link": 220 1232 | } 1233 | ], 1234 | "outputs": [ 1235 | { 1236 | "name": "basic_pipe", 1237 | "type": "BASIC_PIPE", 1238 | "links": [], 1239 | "shape": 3, 1240 | "slot_index": 0 1241 | }, 1242 | { 1243 | "name": "model", 1244 | "type": "MODEL", 1245 | "links": [ 1246 | 210 1247 | ], 1248 | "shape": 3, 1249 | "slot_index": 1 1250 | }, 1251 | { 1252 | "name": "clip", 1253 | "type": "CLIP", 1254 | "links": null, 1255 | "shape": 3, 1256 | "slot_index": 2 1257 | }, 1258 | { 1259 | "name": "vae", 1260 | "type": "VAE", 1261 | "links": [ 1262 | 229, 1263 | 230 1264 | ], 1265 | "shape": 3, 1266 | "slot_index": 3 1267 | }, 1268 | { 1269 | "name": "positive", 1270 | "type": "CONDITIONING", 1271 | "links": [ 1272 | 211 1273 | ], 1274 | "shape": 3, 1275 | "slot_index": 4 1276 | }, 1277 | { 1278 | "name": "negative", 1279 | "type": "CONDITIONING", 1280 | "links": [ 1281 | 212 1282 | ], 1283 | "shape": 3, 1284 | "slot_index": 5 1285 | } 1286 | ], 1287 | "properties": { 1288 | "Node name for S&R": "FromBasicPipe_v2" 1289 | } 1290 | }, 1291 | { 1292 | "id": 79, 1293 | "type": "Reroute", 1294 | "pos": [ 1295 | 5670, 1296 | -30 1297 | ], 1298 | "size": [ 1299 | 75, 1300 | 26 1301 | ], 1302 | "flags": {}, 1303 | "order": 32, 1304 | "mode": 0, 1305 | "inputs": [ 1306 | { 1307 | "name": "", 1308 | "type": "*", 1309 | "link": 222 1310 | } 1311 | ], 1312 | "outputs": [ 1313 | { 1314 | "name": "", 1315 | "type": "LATENT", 1316 | "links": [ 1317 | 206 1318 | ] 1319 | } 1320 | ], 1321 | "properties": { 1322 | "showOutputText": false, 1323 | "horizontal": false 1324 | } 1325 | }, 1326 | { 1327 | "id": 51, 1328 | "type": "FromBasicPipe_v2", 1329 | "pos": [ 1330 | 4436, 1331 | -76 1332 | ], 1333 | "size": { 1334 | "0": 267, 1335 | "1": 126 1336 | 
}, 1337 | "flags": { 1338 | "collapsed": true 1339 | }, 1340 | "order": 20, 1341 | "mode": 0, 1342 | "inputs": [ 1343 | { 1344 | "name": "basic_pipe", 1345 | "type": "BASIC_PIPE", 1346 | "link": 125 1347 | } 1348 | ], 1349 | "outputs": [ 1350 | { 1351 | "name": "basic_pipe", 1352 | "type": "BASIC_PIPE", 1353 | "links": [ 1354 | 224 1355 | ], 1356 | "shape": 3, 1357 | "slot_index": 0 1358 | }, 1359 | { 1360 | "name": "model", 1361 | "type": "MODEL", 1362 | "links": [ 1363 | 126 1364 | ], 1365 | "shape": 3, 1366 | "slot_index": 1 1367 | }, 1368 | { 1369 | "name": "clip", 1370 | "type": "CLIP", 1371 | "links": null, 1372 | "shape": 3, 1373 | "slot_index": 2 1374 | }, 1375 | { 1376 | "name": "vae", 1377 | "type": "VAE", 1378 | "links": [ 1379 | 231 1380 | ], 1381 | "shape": 3, 1382 | "slot_index": 3 1383 | }, 1384 | { 1385 | "name": "positive", 1386 | "type": "CONDITIONING", 1387 | "links": [ 1388 | 128 1389 | ], 1390 | "shape": 3, 1391 | "slot_index": 4 1392 | }, 1393 | { 1394 | "name": "negative", 1395 | "type": "CONDITIONING", 1396 | "links": [ 1397 | 129 1398 | ], 1399 | "shape": 3, 1400 | "slot_index": 5 1401 | } 1402 | ], 1403 | "properties": { 1404 | "Node name for S&R": "FromBasicPipe_v2" 1405 | } 1406 | }, 1407 | { 1408 | "id": 80, 1409 | "type": "Reroute", 1410 | "pos": [ 1411 | 5150, 1412 | -110 1413 | ], 1414 | "size": [ 1415 | 75, 1416 | 26 1417 | ], 1418 | "flags": {}, 1419 | "order": 22, 1420 | "mode": 0, 1421 | "inputs": [ 1422 | { 1423 | "name": "", 1424 | "type": "*", 1425 | "link": 224, 1426 | "slot_index": 0 1427 | } 1428 | ], 1429 | "outputs": [ 1430 | { 1431 | "name": "", 1432 | "type": "BASIC_PIPE", 1433 | "links": [ 1434 | 214 1435 | ] 1436 | } 1437 | ], 1438 | "properties": { 1439 | "showOutputText": false, 1440 | "horizontal": false 1441 | } 1442 | }, 1443 | { 1444 | "id": 76, 1445 | "type": "Latent Upscale by Factor (WAS)", 1446 | "pos": [ 1447 | 5260.105859374996, 1448 | 125.34375381469727 1449 | ], 1450 | "size": { 1451 | "0": 315, 1452 | "1": 106 1453 | }, 1454 | "flags": {}, 1455 | "order": 29, 1456 | "mode": 0, 1457 | "inputs": [ 1458 | { 1459 | "name": "samples", 1460 | "type": "LATENT", 1461 | "link": 225, 1462 | "slot_index": 0 1463 | } 1464 | ], 1465 | "outputs": [ 1466 | { 1467 | "name": "LATENT", 1468 | "type": "LATENT", 1469 | "links": [ 1470 | 222 1471 | ], 1472 | "shape": 3, 1473 | "slot_index": 0 1474 | } 1475 | ], 1476 | "properties": { 1477 | "Node name for S&R": "Latent Upscale by Factor (WAS)" 1478 | }, 1479 | "widgets_values": [ 1480 | "bilinear", 1481 | 2, 1482 | "true" 1483 | ] 1484 | }, 1485 | { 1486 | "id": 74, 1487 | "type": "FromBasicPipe_v2", 1488 | "pos": [ 1489 | 5560, 1490 | -70 1491 | ], 1492 | "size": { 1493 | "0": 267, 1494 | "1": 126 1495 | }, 1496 | "flags": { 1497 | "collapsed": true 1498 | }, 1499 | "order": 24, 1500 | "mode": 0, 1501 | "inputs": [ 1502 | { 1503 | "name": "basic_pipe", 1504 | "type": "BASIC_PIPE", 1505 | "link": 214, 1506 | "slot_index": 0 1507 | } 1508 | ], 1509 | "outputs": [ 1510 | { 1511 | "name": "basic_pipe", 1512 | "type": "BASIC_PIPE", 1513 | "links": [ 1514 | 220 1515 | ], 1516 | "shape": 3, 1517 | "slot_index": 0 1518 | }, 1519 | { 1520 | "name": "model", 1521 | "type": "MODEL", 1522 | "links": [ 1523 | 215, 1524 | 226 1525 | ], 1526 | "shape": 3, 1527 | "slot_index": 1 1528 | }, 1529 | { 1530 | "name": "clip", 1531 | "type": "CLIP", 1532 | "links": null, 1533 | "shape": 3 1534 | }, 1535 | { 1536 | "name": "vae", 1537 | "type": "VAE", 1538 | "links": [ 1539 | 234 1540 | ], 1541 | "shape": 3, 1542 | 
"slot_index": 3 1543 | }, 1544 | { 1545 | "name": "positive", 1546 | "type": "CONDITIONING", 1547 | "links": [ 1548 | 204, 1549 | 216 1550 | ], 1551 | "shape": 3, 1552 | "slot_index": 4 1553 | }, 1554 | { 1555 | "name": "negative", 1556 | "type": "CONDITIONING", 1557 | "links": [ 1558 | 205, 1559 | 217 1560 | ], 1561 | "shape": 3, 1562 | "slot_index": 5 1563 | } 1564 | ], 1565 | "properties": { 1566 | "Node name for S&R": "FromBasicPipe_v2" 1567 | } 1568 | }, 1569 | { 1570 | "id": 73, 1571 | "type": "KSampler", 1572 | "pos": [ 1573 | 7150.105859374996, 1574 | 125.34375381469727 1575 | ], 1576 | "size": { 1577 | "0": 315, 1578 | "1": 474 1579 | }, 1580 | "flags": {}, 1581 | "order": 38, 1582 | "mode": 0, 1583 | "inputs": [ 1584 | { 1585 | "name": "model", 1586 | "type": "MODEL", 1587 | "link": 210 1588 | }, 1589 | { 1590 | "name": "positive", 1591 | "type": "CONDITIONING", 1592 | "link": 211 1593 | }, 1594 | { 1595 | "name": "negative", 1596 | "type": "CONDITIONING", 1597 | "link": 212, 1598 | "slot_index": 2 1599 | }, 1600 | { 1601 | "name": "latent_image", 1602 | "type": "LATENT", 1603 | "link": 213 1604 | } 1605 | ], 1606 | "outputs": [ 1607 | { 1608 | "name": "LATENT", 1609 | "type": "LATENT", 1610 | "links": [ 1611 | 227 1612 | ], 1613 | "shape": 3, 1614 | "slot_index": 0 1615 | } 1616 | ], 1617 | "title": "Even lighter additional sampling", 1618 | "properties": { 1619 | "Node name for S&R": "KSampler" 1620 | }, 1621 | "widgets_values": [ 1622 | 1062713514594647, 1623 | "fixed", 1624 | 8, 1625 | 1.5, 1626 | "lcm", 1627 | "sgm_uniform", 1628 | 0.15 1629 | ] 1630 | }, 1631 | { 1632 | "id": 70, 1633 | "type": "VAEDecode", 1634 | "pos": [ 1635 | 6850, 1636 | 205 1637 | ], 1638 | "size": { 1639 | "0": 210, 1640 | "1": 46 1641 | }, 1642 | "flags": { 1643 | "collapsed": true 1644 | }, 1645 | "order": 37, 1646 | "mode": 0, 1647 | "inputs": [ 1648 | { 1649 | "name": "samples", 1650 | "type": "LATENT", 1651 | "link": 207 1652 | }, 1653 | { 1654 | "name": "vae", 1655 | "type": "VAE", 1656 | "link": 230, 1657 | "slot_index": 1 1658 | } 1659 | ], 1660 | "outputs": [ 1661 | { 1662 | "name": "IMAGE", 1663 | "type": "IMAGE", 1664 | "links": [ 1665 | 208 1666 | ], 1667 | "shape": 3, 1668 | "slot_index": 0 1669 | } 1670 | ], 1671 | "properties": { 1672 | "Node name for S&R": "VAEDecode" 1673 | } 1674 | }, 1675 | { 1676 | "id": 12, 1677 | "type": "VAEDecode", 1678 | "pos": [ 1679 | 4484, 1680 | 220 1681 | ], 1682 | "size": { 1683 | "0": 210, 1684 | "1": 46 1685 | }, 1686 | "flags": { 1687 | "collapsed": true 1688 | }, 1689 | "order": 28, 1690 | "mode": 0, 1691 | "inputs": [ 1692 | { 1693 | "name": "samples", 1694 | "type": "LATENT", 1695 | "link": 202 1696 | }, 1697 | { 1698 | "name": "vae", 1699 | "type": "VAE", 1700 | "link": 231, 1701 | "slot_index": 1 1702 | } 1703 | ], 1704 | "outputs": [ 1705 | { 1706 | "name": "IMAGE", 1707 | "type": "IMAGE", 1708 | "links": [ 1709 | 24 1710 | ], 1711 | "shape": 3, 1712 | "slot_index": 0 1713 | } 1714 | ], 1715 | "properties": { 1716 | "Node name for S&R": "VAEDecode" 1717 | } 1718 | }, 1719 | { 1720 | "id": 62, 1721 | "type": "FromBasicPipe_v2", 1722 | "pos": [ 1723 | 1821, 1724 | 133 1725 | ], 1726 | "size": { 1727 | "0": 267, 1728 | "1": 126 1729 | }, 1730 | "flags": {}, 1731 | "order": 11, 1732 | "mode": 0, 1733 | "inputs": [ 1734 | { 1735 | "name": "basic_pipe", 1736 | "type": "BASIC_PIPE", 1737 | "link": 189 1738 | } 1739 | ], 1740 | "outputs": [ 1741 | { 1742 | "name": "basic_pipe", 1743 | "type": "BASIC_PIPE", 1744 | "links": [ 1745 | 232 1746 | ], 1747 | 
"shape": 3, 1748 | "slot_index": 0 1749 | }, 1750 | { 1751 | "name": "model", 1752 | "type": "MODEL", 1753 | "links": null, 1754 | "shape": 3 1755 | }, 1756 | { 1757 | "name": "clip", 1758 | "type": "CLIP", 1759 | "links": null, 1760 | "shape": 3 1761 | }, 1762 | { 1763 | "name": "vae", 1764 | "type": "VAE", 1765 | "links": [ 1766 | 196 1767 | ], 1768 | "shape": 3, 1769 | "slot_index": 3 1770 | }, 1771 | { 1772 | "name": "positive", 1773 | "type": "CONDITIONING", 1774 | "links": [ 1775 | 190 1776 | ], 1777 | "shape": 3, 1778 | "slot_index": 4 1779 | }, 1780 | { 1781 | "name": "negative", 1782 | "type": "CONDITIONING", 1783 | "links": null, 1784 | "shape": 3 1785 | } 1786 | ], 1787 | "properties": { 1788 | "Node name for S&R": "FromBasicPipe_v2" 1789 | } 1790 | }, 1791 | { 1792 | "id": 69, 1793 | "type": "Batch Unsampler", 1794 | "pos": [ 1795 | 5700, 1796 | 115 1797 | ], 1798 | "size": { 1799 | "0": 335.1695861816406, 1800 | "1": 321.1278991699219 1801 | }, 1802 | "flags": {}, 1803 | "order": 34, 1804 | "mode": 0, 1805 | "inputs": [ 1806 | { 1807 | "name": "model", 1808 | "type": "MODEL", 1809 | "link": 226, 1810 | "slot_index": 0 1811 | }, 1812 | { 1813 | "name": "positive", 1814 | "type": "CONDITIONING", 1815 | "link": 204, 1816 | "slot_index": 1 1817 | }, 1818 | { 1819 | "name": "negative", 1820 | "type": "CONDITIONING", 1821 | "link": 205, 1822 | "slot_index": 2 1823 | }, 1824 | { 1825 | "name": "latent_image", 1826 | "type": "LATENT", 1827 | "link": 206, 1828 | "slot_index": 3 1829 | } 1830 | ], 1831 | "outputs": [ 1832 | { 1833 | "name": "latent_batch", 1834 | "type": "LATENT", 1835 | "links": [ 1836 | 218 1837 | ], 1838 | "shape": 3, 1839 | "slot_index": 0 1840 | } 1841 | ], 1842 | "title": "Batch Unsampler - try half the steps", 1843 | "properties": { 1844 | "Node name for S&R": "Batch Unsampler" 1845 | }, 1846 | "widgets_values": [ 1847 | 15, 1848 | 0, 1849 | 1, 1850 | 1.5, 1851 | "lcm", 1852 | "sgm_uniform", 1853 | "disable" 1854 | ] 1855 | }, 1856 | { 1857 | "id": 29, 1858 | "type": "KSampler", 1859 | "pos": [ 1860 | 4779, 1861 | 137 1862 | ], 1863 | "size": { 1864 | "0": 315, 1865 | "1": 474 1866 | }, 1867 | "flags": {}, 1868 | "order": 27, 1869 | "mode": 0, 1870 | "inputs": [ 1871 | { 1872 | "name": "model", 1873 | "type": "MODEL", 1874 | "link": 126 1875 | }, 1876 | { 1877 | "name": "positive", 1878 | "type": "CONDITIONING", 1879 | "link": 128 1880 | }, 1881 | { 1882 | "name": "negative", 1883 | "type": "CONDITIONING", 1884 | "link": 129, 1885 | "slot_index": 2 1886 | }, 1887 | { 1888 | "name": "latent_image", 1889 | "type": "LATENT", 1890 | "link": 61 1891 | } 1892 | ], 1893 | "outputs": [ 1894 | { 1895 | "name": "LATENT", 1896 | "type": "LATENT", 1897 | "links": [ 1898 | 225, 1899 | 233 1900 | ], 1901 | "shape": 3, 1902 | "slot_index": 0 1903 | } 1904 | ], 1905 | "title": "Light additional sampling", 1906 | "properties": { 1907 | "Node name for S&R": "KSampler" 1908 | }, 1909 | "widgets_values": [ 1910 | 1062713514594647, 1911 | "fixed", 1912 | 8, 1913 | 1.5, 1914 | "lcm", 1915 | "sgm_uniform", 1916 | 0.5 1917 | ] 1918 | }, 1919 | { 1920 | "id": 83, 1921 | "type": "VAEDecode", 1922 | "pos": [ 1923 | 5265, 1924 | 285 1925 | ], 1926 | "size": { 1927 | "0": 210, 1928 | "1": 46 1929 | }, 1930 | "flags": {}, 1931 | "order": 30, 1932 | "mode": 0, 1933 | "inputs": [ 1934 | { 1935 | "name": "samples", 1936 | "type": "LATENT", 1937 | "link": 233 1938 | }, 1939 | { 1940 | "name": "vae", 1941 | "type": "VAE", 1942 | "link": 234, 1943 | "slot_index": 1 1944 | } 1945 | ], 1946 | 
"outputs": [ 1947 | { 1948 | "name": "IMAGE", 1949 | "type": "IMAGE", 1950 | "links": [ 1951 | 235 1952 | ], 1953 | "shape": 3, 1954 | "slot_index": 0 1955 | } 1956 | ], 1957 | "properties": { 1958 | "Node name for S&R": "VAEDecode" 1959 | } 1960 | }, 1961 | { 1962 | "id": 84, 1963 | "type": "PreviewImage", 1964 | "pos": [ 1965 | 5271, 1966 | 376 1967 | ], 1968 | "size": [ 1969 | 382.41650390625, 1970 | 277.7330627441406 1971 | ], 1972 | "flags": {}, 1973 | "order": 33, 1974 | "mode": 0, 1975 | "inputs": [ 1976 | { 1977 | "name": "images", 1978 | "type": "IMAGE", 1979 | "link": 235 1980 | } 1981 | ], 1982 | "title": "Result of last step", 1983 | "properties": { 1984 | "Node name for S&R": "PreviewImage" 1985 | } 1986 | }, 1987 | { 1988 | "id": 81, 1989 | "type": "VAEDecode", 1990 | "pos": [ 1991 | 7563, 1992 | 97 1993 | ], 1994 | "size": { 1995 | "0": 210, 1996 | "1": 46 1997 | }, 1998 | "flags": {}, 1999 | "order": 40, 2000 | "mode": 0, 2001 | "inputs": [ 2002 | { 2003 | "name": "samples", 2004 | "type": "LATENT", 2005 | "link": 227 2006 | }, 2007 | { 2008 | "name": "vae", 2009 | "type": "VAE", 2010 | "link": 229, 2011 | "slot_index": 1 2012 | } 2013 | ], 2014 | "outputs": [ 2015 | { 2016 | "name": "IMAGE", 2017 | "type": "IMAGE", 2018 | "links": [ 2019 | 228 2020 | ], 2021 | "shape": 3, 2022 | "slot_index": 0 2023 | } 2024 | ], 2025 | "properties": { 2026 | "Node name for S&R": "VAEDecode" 2027 | } 2028 | }, 2029 | { 2030 | "id": 82, 2031 | "type": "PreviewImage", 2032 | "pos": [ 2033 | 7822, 2034 | 72 2035 | ], 2036 | "size": [ 2037 | 742.0979687500012, 2038 | 634.796306152344 2039 | ], 2040 | "flags": {}, 2041 | "order": 41, 2042 | "mode": 0, 2043 | "inputs": [ 2044 | { 2045 | "name": "images", 2046 | "type": "IMAGE", 2047 | "link": 228 2048 | } 2049 | ], 2050 | "title": "Final 4x Output", 2051 | "properties": { 2052 | "Node name for S&R": "PreviewImage" 2053 | } 2054 | } 2055 | ], 2056 | "links": [ 2057 | [ 2058 | 24, 2059 | 12, 2060 | 0, 2061 | 13, 2062 | 0, 2063 | "IMAGE" 2064 | ], 2065 | [ 2066 | 61, 2067 | 28, 2068 | 0, 2069 | 29, 2070 | 3, 2071 | "LATENT" 2072 | ], 2073 | [ 2074 | 97, 2075 | 25, 2076 | 0, 2077 | 41, 2078 | 0, 2079 | "IMAGE" 2080 | ], 2081 | [ 2082 | 98, 2083 | 41, 2084 | 0, 2085 | 40, 2086 | 2, 2087 | "IMAGE" 2088 | ], 2089 | [ 2090 | 99, 2091 | 42, 2092 | 0, 2093 | 40, 2094 | 1, 2095 | "CONTROL_NET" 2096 | ], 2097 | [ 2098 | 102, 2099 | 37, 2100 | 4, 2101 | 1, 2102 | 1, 2103 | "CONDITIONING" 2104 | ], 2105 | [ 2106 | 103, 2107 | 37, 2108 | 5, 2109 | 1, 2110 | 2, 2111 | "CONDITIONING" 2112 | ], 2113 | [ 2114 | 104, 2115 | 37, 2116 | 1, 2117 | 43, 2118 | 0, 2119 | "MODEL" 2120 | ], 2121 | [ 2122 | 105, 2123 | 37, 2124 | 4, 2125 | 43, 2126 | 1, 2127 | "CONDITIONING" 2128 | ], 2129 | [ 2130 | 106, 2131 | 37, 2132 | 5, 2133 | 43, 2134 | 2, 2135 | "CONDITIONING" 2136 | ], 2137 | [ 2138 | 107, 2139 | 1, 2140 | 0, 2141 | 43, 2142 | 3, 2143 | "LATENT" 2144 | ], 2145 | [ 2146 | 109, 2147 | 43, 2148 | 0, 2149 | 28, 2150 | 0, 2151 | "LATENT" 2152 | ], 2153 | [ 2154 | 125, 2155 | 37, 2156 | 0, 2157 | 51, 2158 | 0, 2159 | "BASIC_PIPE" 2160 | ], 2161 | [ 2162 | 126, 2163 | 51, 2164 | 1, 2165 | 29, 2166 | 0, 2167 | "MODEL" 2168 | ], 2169 | [ 2170 | 128, 2171 | 51, 2172 | 4, 2173 | 29, 2174 | 1, 2175 | "CONDITIONING" 2176 | ], 2177 | [ 2178 | 129, 2179 | 51, 2180 | 5, 2181 | 29, 2182 | 2, 2183 | "CONDITIONING" 2184 | ], 2185 | [ 2186 | 168, 2187 | 2, 2188 | 0, 2189 | 32, 2190 | 0, 2191 | "MODEL" 2192 | ], 2193 | [ 2194 | 169, 2195 | 2, 2196 | 1, 2197 | 32, 2198 | 1, 2199 | 
"CLIP" 2200 | ], 2201 | [ 2202 | 170, 2203 | 32, 2204 | 1, 2205 | 3, 2206 | 0, 2207 | "CLIP" 2208 | ], 2209 | [ 2210 | 171, 2211 | 32, 2212 | 1, 2213 | 4, 2214 | 0, 2215 | "CLIP" 2216 | ], 2217 | [ 2218 | 172, 2219 | 32, 2220 | 0, 2221 | 22, 2222 | 0, 2223 | "MODEL" 2224 | ], 2225 | [ 2226 | 173, 2227 | 3, 2228 | 0, 2229 | 22, 2230 | 1, 2231 | "CONDITIONING" 2232 | ], 2233 | [ 2234 | 174, 2235 | 4, 2236 | 0, 2237 | 22, 2238 | 2, 2239 | "CONDITIONING" 2240 | ], 2241 | [ 2242 | 175, 2243 | 23, 2244 | 0, 2245 | 22, 2246 | 3, 2247 | "LATENT" 2248 | ], 2249 | [ 2250 | 177, 2251 | 2, 2252 | 2, 2253 | 35, 2254 | 2, 2255 | "VAE" 2256 | ], 2257 | [ 2258 | 178, 2259 | 32, 2260 | 0, 2261 | 1, 2262 | 0, 2263 | "MODEL" 2264 | ], 2265 | [ 2266 | 179, 2267 | 32, 2268 | 0, 2269 | 35, 2270 | 0, 2271 | "MODEL" 2272 | ], 2273 | [ 2274 | 180, 2275 | 32, 2276 | 1, 2277 | 35, 2278 | 1, 2279 | "CLIP" 2280 | ], 2281 | [ 2282 | 182, 2283 | 4, 2284 | 0, 2285 | 35, 2286 | 4, 2287 | "CONDITIONING" 2288 | ], 2289 | [ 2290 | 183, 2291 | 22, 2292 | 0, 2293 | 25, 2294 | 0, 2295 | "LATENT" 2296 | ], 2297 | [ 2298 | 186, 2299 | 35, 2300 | 0, 2301 | 60, 2302 | 0, 2303 | "*" 2304 | ], 2305 | [ 2306 | 189, 2307 | 60, 2308 | 0, 2309 | 62, 2310 | 0, 2311 | "BASIC_PIPE" 2312 | ], 2313 | [ 2314 | 190, 2315 | 62, 2316 | 4, 2317 | 40, 2318 | 0, 2319 | "CONDITIONING" 2320 | ], 2321 | [ 2322 | 191, 2323 | 40, 2324 | 0, 2325 | 64, 2326 | 4, 2327 | "CONDITIONING" 2328 | ], 2329 | [ 2330 | 192, 2331 | 3, 2332 | 0, 2333 | 35, 2334 | 3, 2335 | "CONDITIONING" 2336 | ], 2337 | [ 2338 | 193, 2339 | 22, 2340 | 0, 2341 | 65, 2342 | 0, 2343 | "*" 2344 | ], 2345 | [ 2346 | 194, 2347 | 65, 2348 | 0, 2349 | 66, 2350 | 0, 2351 | "*" 2352 | ], 2353 | [ 2354 | 195, 2355 | 66, 2356 | 0, 2357 | 46, 2358 | 0, 2359 | "LATENT" 2360 | ], 2361 | [ 2362 | 196, 2363 | 62, 2364 | 3, 2365 | 25, 2366 | 1, 2367 | "VAE" 2368 | ], 2369 | [ 2370 | 197, 2371 | 46, 2372 | 0, 2373 | 67, 2374 | 0, 2375 | "*" 2376 | ], 2377 | [ 2378 | 198, 2379 | 64, 2380 | 0, 2381 | 68, 2382 | 0, 2383 | "*" 2384 | ], 2385 | [ 2386 | 199, 2387 | 68, 2388 | 0, 2389 | 37, 2390 | 0, 2391 | "BASIC_PIPE" 2392 | ], 2393 | [ 2394 | 200, 2395 | 67, 2396 | 0, 2397 | 1, 2398 | 3, 2399 | "LATENT" 2400 | ], 2401 | [ 2402 | 202, 2403 | 28, 2404 | 0, 2405 | 12, 2406 | 0, 2407 | "LATENT" 2408 | ], 2409 | [ 2410 | 204, 2411 | 74, 2412 | 4, 2413 | 69, 2414 | 1, 2415 | "CONDITIONING" 2416 | ], 2417 | [ 2418 | 205, 2419 | 74, 2420 | 5, 2421 | 69, 2422 | 2, 2423 | "CONDITIONING" 2424 | ], 2425 | [ 2426 | 206, 2427 | 79, 2428 | 0, 2429 | 69, 2430 | 3, 2431 | "LATENT" 2432 | ], 2433 | [ 2434 | 207, 2435 | 72, 2436 | 0, 2437 | 70, 2438 | 0, 2439 | "LATENT" 2440 | ], 2441 | [ 2442 | 208, 2443 | 70, 2444 | 0, 2445 | 71, 2446 | 0, 2447 | "IMAGE" 2448 | ], 2449 | [ 2450 | 209, 2451 | 75, 2452 | 0, 2453 | 72, 2454 | 0, 2455 | "LATENT" 2456 | ], 2457 | [ 2458 | 210, 2459 | 77, 2460 | 1, 2461 | 73, 2462 | 0, 2463 | "MODEL" 2464 | ], 2465 | [ 2466 | 211, 2467 | 77, 2468 | 4, 2469 | 73, 2470 | 1, 2471 | "CONDITIONING" 2472 | ], 2473 | [ 2474 | 212, 2475 | 77, 2476 | 5, 2477 | 73, 2478 | 2, 2479 | "CONDITIONING" 2480 | ], 2481 | [ 2482 | 213, 2483 | 72, 2484 | 0, 2485 | 73, 2486 | 3, 2487 | "LATENT" 2488 | ], 2489 | [ 2490 | 214, 2491 | 80, 2492 | 0, 2493 | 74, 2494 | 0, 2495 | "BASIC_PIPE" 2496 | ], 2497 | [ 2498 | 215, 2499 | 74, 2500 | 1, 2501 | 75, 2502 | 0, 2503 | "MODEL" 2504 | ], 2505 | [ 2506 | 216, 2507 | 74, 2508 | 4, 2509 | 75, 2510 | 1, 2511 | "CONDITIONING" 2512 | ], 2513 | [ 2514 | 217, 2515 | 74, 2516 | 
5, 2517 | 75, 2518 | 2, 2519 | "CONDITIONING" 2520 | ], 2521 | [ 2522 | 218, 2523 | 69, 2524 | 0, 2525 | 75, 2526 | 3, 2527 | "LATENT" 2528 | ], 2529 | [ 2530 | 220, 2531 | 74, 2532 | 0, 2533 | 77, 2534 | 0, 2535 | "BASIC_PIPE" 2536 | ], 2537 | [ 2538 | 222, 2539 | 76, 2540 | 0, 2541 | 79, 2542 | 0, 2543 | "*" 2544 | ], 2545 | [ 2546 | 224, 2547 | 51, 2548 | 0, 2549 | 80, 2550 | 0, 2551 | "*" 2552 | ], 2553 | [ 2554 | 225, 2555 | 29, 2556 | 0, 2557 | 76, 2558 | 0, 2559 | "LATENT" 2560 | ], 2561 | [ 2562 | 226, 2563 | 74, 2564 | 1, 2565 | 69, 2566 | 0, 2567 | "MODEL" 2568 | ], 2569 | [ 2570 | 227, 2571 | 73, 2572 | 0, 2573 | 81, 2574 | 0, 2575 | "LATENT" 2576 | ], 2577 | [ 2578 | 228, 2579 | 81, 2580 | 0, 2581 | 82, 2582 | 0, 2583 | "IMAGE" 2584 | ], 2585 | [ 2586 | 229, 2587 | 77, 2588 | 3, 2589 | 81, 2590 | 1, 2591 | "VAE" 2592 | ], 2593 | [ 2594 | 230, 2595 | 77, 2596 | 3, 2597 | 70, 2598 | 1, 2599 | "VAE" 2600 | ], 2601 | [ 2602 | 231, 2603 | 51, 2604 | 3, 2605 | 12, 2606 | 1, 2607 | "VAE" 2608 | ], 2609 | [ 2610 | 232, 2611 | 62, 2612 | 0, 2613 | 64, 2614 | 0, 2615 | "BASIC_PIPE" 2616 | ], 2617 | [ 2618 | 233, 2619 | 29, 2620 | 0, 2621 | 83, 2622 | 0, 2623 | "LATENT" 2624 | ], 2625 | [ 2626 | 234, 2627 | 74, 2628 | 3, 2629 | 83, 2630 | 1, 2631 | "VAE" 2632 | ], 2633 | [ 2634 | 235, 2635 | 83, 2636 | 0, 2637 | 84, 2638 | 0, 2639 | "IMAGE" 2640 | ] 2641 | ], 2642 | "groups": [ 2643 | { 2644 | "title": "Create an image at low resolution using any sampler", 2645 | "bounding": [ 2646 | 67, 2647 | 46, 2648 | 1665, 2649 | 614 2650 | ], 2651 | "color": "#3f789e", 2652 | "font_size": 24, 2653 | "locked": false 2654 | }, 2655 | { 2656 | "title": "Apply one or more ControlNets", 2657 | "bounding": [ 2658 | 1769, 2659 | 45, 2660 | 998, 2661 | 617 2662 | ], 2663 | "color": "#3f789e", 2664 | "font_size": 24, 2665 | "locked": false 2666 | }, 2667 | { 2668 | "title": "Upscale latents by 2x", 2669 | "bounding": [ 2670 | 2791, 2671 | 42, 2672 | 478, 2673 | 619 2674 | ], 2675 | "color": "#3f789e", 2676 | "font_size": 24, 2677 | "locked": false 2678 | }, 2679 | { 2680 | "title": "Unsample the 2x upscale and then de-noise again", 2681 | "bounding": [ 2682 | 3290, 2683 | 42, 2684 | 774, 2685 | 620 2686 | ], 2687 | "color": "#3f789e", 2688 | "font_size": 24, 2689 | "locked": false 2690 | }, 2691 | { 2692 | "title": "De-noise lightly for a few more steps", 2693 | "bounding": [ 2694 | 4074, 2695 | 43, 2696 | 1136, 2697 | 620 2698 | ], 2699 | "color": "#3f789e", 2700 | "font_size": 24, 2701 | "locked": false 2702 | }, 2703 | { 2704 | "title": "Double scale and go through this all again", 2705 | "bounding": [ 2706 | 5240, 2707 | 41, 2708 | 2241, 2709 | 630 2710 | ], 2711 | "color": "#3f789e", 2712 | "font_size": 24, 2713 | "locked": false 2714 | } 2715 | ], 2716 | "config": {}, 2717 | "extra": { 2718 | "groupNodes": { 2719 | "Generate an image any way you want at low resolution": { 2720 | "nodes": [ 2721 | { 2722 | "type": "EmptyLatentImage", 2723 | "pos": [ 2724 | 100, 2725 | 306 2726 | ], 2727 | "size": { 2728 | "0": 315, 2729 | "1": 106 2730 | }, 2731 | "flags": {}, 2732 | "order": 0, 2733 | "mode": 0, 2734 | "outputs": [ 2735 | { 2736 | "name": "LATENT", 2737 | "type": "LATENT", 2738 | "links": [], 2739 | "shape": 3 2740 | } 2741 | ], 2742 | "properties": { 2743 | "Node name for S&R": "EmptyLatentImage" 2744 | }, 2745 | "widgets_values": [ 2746 | 512, 2747 | 512, 2748 | 1 2749 | ], 2750 | "index": 0 2751 | }, 2752 | { 2753 | "type": "CheckpointLoaderSimple", 2754 | "pos": [ 2755 | 100, 2756 | 542 2757 | ], 
2758 | "size": { 2759 | "0": 315, 2760 | "1": 98 2761 | }, 2762 | "flags": {}, 2763 | "order": 1, 2764 | "mode": 0, 2765 | "outputs": [ 2766 | { 2767 | "name": "MODEL", 2768 | "type": "MODEL", 2769 | "links": [], 2770 | "shape": 3, 2771 | "slot_index": 0 2772 | }, 2773 | { 2774 | "name": "CLIP", 2775 | "type": "CLIP", 2776 | "links": [], 2777 | "shape": 3, 2778 | "slot_index": 1 2779 | }, 2780 | { 2781 | "name": "VAE", 2782 | "type": "VAE", 2783 | "links": [], 2784 | "shape": 3 2785 | } 2786 | ], 2787 | "properties": { 2788 | "Node name for S&R": "CheckpointLoaderSimple" 2789 | }, 2790 | "widgets_values": [ 2791 | "epic-realism.safetensors" 2792 | ], 2793 | "index": 1 2794 | }, 2795 | { 2796 | "type": "LoraLoader", 2797 | "pos": [ 2798 | 515, 2799 | 130 2800 | ], 2801 | "size": { 2802 | "0": 315, 2803 | "1": 126 2804 | }, 2805 | "flags": {}, 2806 | "order": 3, 2807 | "mode": 0, 2808 | "inputs": [ 2809 | { 2810 | "name": "model", 2811 | "type": "MODEL", 2812 | "link": null 2813 | }, 2814 | { 2815 | "name": "clip", 2816 | "type": "CLIP", 2817 | "link": null 2818 | } 2819 | ], 2820 | "outputs": [ 2821 | { 2822 | "name": "MODEL", 2823 | "type": "MODEL", 2824 | "links": [], 2825 | "shape": 3, 2826 | "slot_index": 0 2827 | }, 2828 | { 2829 | "name": "CLIP", 2830 | "type": "CLIP", 2831 | "links": [], 2832 | "shape": 3, 2833 | "slot_index": 1 2834 | } 2835 | ], 2836 | "properties": { 2837 | "Node name for S&R": "LoraLoader" 2838 | }, 2839 | "widgets_values": [ 2840 | "lcm_sd15.safetensors", 2841 | 1, 2842 | 1 2843 | ], 2844 | "index": 2 2845 | }, 2846 | { 2847 | "type": "CLIPTextEncode", 2848 | "pos": [ 2849 | 930, 2850 | 130 2851 | ], 2852 | "size": { 2853 | "0": 400, 2854 | "1": 200 2855 | }, 2856 | "flags": {}, 2857 | "order": 4, 2858 | "mode": 0, 2859 | "inputs": [ 2860 | { 2861 | "name": "clip", 2862 | "type": "CLIP", 2863 | "link": null 2864 | } 2865 | ], 2866 | "outputs": [ 2867 | { 2868 | "name": "CONDITIONING", 2869 | "type": "CONDITIONING", 2870 | "links": [], 2871 | "shape": 3, 2872 | "slot_index": 0 2873 | } 2874 | ], 2875 | "properties": { 2876 | "Node name for S&R": "CLIPTextEncode" 2877 | }, 2878 | "widgets_values": [ 2879 | "a man in a cafe" 2880 | ], 2881 | "color": "#232", 2882 | "bgcolor": "#353", 2883 | "index": 3 2884 | }, 2885 | { 2886 | "type": "CLIPTextEncode", 2887 | "pos": [ 2888 | 930, 2889 | 460 2890 | ], 2891 | "size": { 2892 | "0": 400, 2893 | "1": 200 2894 | }, 2895 | "flags": {}, 2896 | "order": 5, 2897 | "mode": 0, 2898 | "inputs": [ 2899 | { 2900 | "name": "clip", 2901 | "type": "CLIP", 2902 | "link": null 2903 | } 2904 | ], 2905 | "outputs": [ 2906 | { 2907 | "name": "CONDITIONING", 2908 | "type": "CONDITIONING", 2909 | "links": [], 2910 | "shape": 3, 2911 | "slot_index": 0 2912 | } 2913 | ], 2914 | "properties": { 2915 | "Node name for S&R": "CLIPTextEncode" 2916 | }, 2917 | "widgets_values": [ 2918 | "" 2919 | ], 2920 | "color": "#322", 2921 | "bgcolor": "#533", 2922 | "index": 4 2923 | }, 2924 | { 2925 | "type": "KSampler", 2926 | "pos": [ 2927 | 1430, 2928 | 130 2929 | ], 2930 | "size": { 2931 | "0": 315, 2932 | "1": 474 2933 | }, 2934 | "flags": {}, 2935 | "order": 6, 2936 | "mode": 0, 2937 | "inputs": [ 2938 | { 2939 | "name": "model", 2940 | "type": "MODEL", 2941 | "link": null, 2942 | "slot_index": 0 2943 | }, 2944 | { 2945 | "name": "positive", 2946 | "type": "CONDITIONING", 2947 | "link": null, 2948 | "slot_index": 1 2949 | }, 2950 | { 2951 | "name": "negative", 2952 | "type": "CONDITIONING", 2953 | "link": null, 2954 | "slot_index": 2 2955 | }, 2956 | 
{ 2957 | "name": "latent_image", 2958 | "type": "LATENT", 2959 | "link": null, 2960 | "slot_index": 3 2961 | } 2962 | ], 2963 | "outputs": [ 2964 | { 2965 | "name": "LATENT", 2966 | "type": "LATENT", 2967 | "links": [], 2968 | "shape": 3, 2969 | "slot_index": 0 2970 | } 2971 | ], 2972 | "properties": { 2973 | "Node name for S&R": "KSampler" 2974 | }, 2975 | "widgets_values": [ 2976 | 947679663182103, 2977 | "fixed", 2978 | 16, 2979 | 1.5, 2980 | "lcm", 2981 | "sgm_uniform", 2982 | 1 2983 | ], 2984 | "index": 5 2985 | } 2986 | ], 2987 | "links": [ 2988 | [ 2989 | 1, 2990 | 0, 2991 | 2, 2992 | 0, 2993 | 2, 2994 | "MODEL" 2995 | ], 2996 | [ 2997 | 1, 2998 | 1, 2999 | 2, 3000 | 1, 3001 | 2, 3002 | "CLIP" 3003 | ], 3004 | [ 3005 | 2, 3006 | 1, 3007 | 3, 3008 | 0, 3009 | 32, 3010 | "CLIP" 3011 | ], 3012 | [ 3013 | 2, 3014 | 1, 3015 | 4, 3016 | 0, 3017 | 32, 3018 | "CLIP" 3019 | ], 3020 | [ 3021 | 2, 3022 | 0, 3023 | 5, 3024 | 0, 3025 | 32, 3026 | "MODEL" 3027 | ], 3028 | [ 3029 | 3, 3030 | 0, 3031 | 5, 3032 | 1, 3033 | 3, 3034 | "CONDITIONING" 3035 | ], 3036 | [ 3037 | 4, 3038 | 0, 3039 | 5, 3040 | 2, 3041 | 4, 3042 | "CONDITIONING" 3043 | ], 3044 | [ 3045 | 0, 3046 | 0, 3047 | 5, 3048 | 3, 3049 | 23, 3050 | "LATENT" 3051 | ] 3052 | ], 3053 | "external": [ 3054 | [ 3055 | 1, 3056 | 2, 3057 | "VAE" 3058 | ], 3059 | [ 3060 | 2, 3061 | 0, 3062 | "MODEL" 3063 | ], 3064 | [ 3065 | 2, 3066 | 1, 3067 | "CLIP" 3068 | ], 3069 | [ 3070 | 3, 3071 | 0, 3072 | "CONDITIONING" 3073 | ], 3074 | [ 3075 | 4, 3076 | 0, 3077 | "CONDITIONING" 3078 | ], 3079 | [ 3080 | 5, 3081 | 0, 3082 | "LATENT" 3083 | ] 3084 | ] 3085 | } 3086 | } 3087 | }, 3088 | "version": 0.4 3089 | } -------------------------------------------------------------------------------- /images/example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deroberon/demofusion-comfyui/ef9f9785cd241e6f41da4f2ea77a2b400e90d3ef/images/example.png -------------------------------------------------------------------------------- /images/example2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deroberon/demofusion-comfyui/ef9f9785cd241e6f41da4f2ea77a2b400e90d3ef/images/example2.png -------------------------------------------------------------------------------- /pipeline_demofusion_sdxl.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | import inspect 16 | import os 17 | from typing import Any, Callable, Dict, List, Optional, Tuple, Union 18 | import matplotlib.pyplot as plt 19 | 20 | import torch 21 | import torch.nn.functional as F 22 | import numpy as np 23 | import random 24 | import warnings 25 | from transformers import CLIPTextModel, CLIPTextModelWithProjection, CLIPTokenizer 26 | 27 | from diffusers.image_processor import VaeImageProcessor 28 | from diffusers.loaders import ( 29 | FromSingleFileMixin, 30 | LoraLoaderMixin, 31 | TextualInversionLoaderMixin, 32 | ) 33 | from diffusers.models import AutoencoderKL, UNet2DConditionModel 34 | from diffusers.models.attention_processor import ( 35 | AttnProcessor2_0, 36 | LoRAAttnProcessor2_0, 37 | LoRAXFormersAttnProcessor, 38 | XFormersAttnProcessor, 39 | ) 40 | from diffusers.models.lora import adjust_lora_scale_text_encoder 41 | from diffusers.schedulers import KarrasDiffusionSchedulers 42 | from diffusers.utils import ( 43 | is_accelerate_available, 44 | is_accelerate_version, 45 | is_invisible_watermark_available, 46 | logging, 47 | replace_example_docstring, 48 | ) 49 | from diffusers.utils.torch_utils import randn_tensor 50 | from diffusers.pipelines.pipeline_utils import DiffusionPipeline 51 | from diffusers.pipelines.stable_diffusion_xl import StableDiffusionXLPipelineOutput 52 | 53 | 54 | #if is_invisible_watermark_available(): 55 | # from .watermark import StableDiffusionXLWatermarker 56 | 57 | 58 | logger = logging.get_logger(__name__) # pylint: disable=invalid-name 59 | 60 | EXAMPLE_DOC_STRING = """ 61 | Examples: 62 | ```py 63 | >>> import torch 64 | >>> from diffusers import StableDiffusionXLPipeline 65 | 66 | >>> pipe = StableDiffusionXLPipeline.from_pretrained( 67 | ... "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16 68 | ... ) 69 | >>> pipe = pipe.to("cuda") 70 | 71 | >>> prompt = "a photo of an astronaut riding a horse on mars" 72 | >>> image = pipe(prompt).images[0] 73 | ``` 74 | """ 75 | 76 | def gaussian_kernel(kernel_size=3, sigma=1.0, channels=3): 77 | x_coord = torch.arange(kernel_size) 78 | gaussian_1d = torch.exp(-(x_coord - (kernel_size - 1) / 2) ** 2 / (2 * sigma ** 2)) 79 | gaussian_1d = gaussian_1d / gaussian_1d.sum() 80 | gaussian_2d = gaussian_1d[:, None] * gaussian_1d[None, :] 81 | kernel = gaussian_2d[None, None, :, :].repeat(channels, 1, 1, 1) 82 | 83 | return kernel 84 | 85 | def gaussian_filter(latents, kernel_size=3, sigma=1.0): 86 | channels = latents.shape[1] 87 | kernel = gaussian_kernel(kernel_size, sigma, channels).to(latents.device, latents.dtype) 88 | blurred_latents = F.conv2d(latents, kernel, padding=kernel_size//2, groups=channels) 89 | 90 | return blurred_latents 91 | 92 | # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.rescale_noise_cfg 93 | def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0): 94 | """ 95 | Rescale `noise_cfg` according to `guidance_rescale`. Based on findings of [Common Diffusion Noise Schedules and 96 | Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). 
See Section 3.4 97 | """ 98 | std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True) 99 | std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True) 100 | # rescale the results from guidance (fixes overexposure) 101 | noise_pred_rescaled = noise_cfg * (std_text / std_cfg) 102 | # mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images 103 | noise_cfg = guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg 104 | return noise_cfg 105 | 106 | ## Have to change the name to contain the word "StableDiffusion" because of: 107 | ## https://github.com/huggingface/diffusers/blob/2d94c7838e273c40920ffd6d24d724357add7f2d/src/diffusers/loaders/single_file.py#L207C15-L207C30 108 | class DemoFusionSDXLStableDiffusionPipeline(DiffusionPipeline, FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin): 109 | r""" 110 | Pipeline for text-to-image generation using Stable Diffusion XL. 111 | 112 | This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the 113 | library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) 114 | 115 | In addition the pipeline inherits the following loading methods: 116 | - *LoRA*: [`StableDiffusionXLPipeline.load_lora_weights`] 117 | - *Ckpt*: [`loaders.FromSingleFileMixin.from_single_file`] 118 | 119 | as well as the following saving methods: 120 | - *LoRA*: [`loaders.StableDiffusionXLPipeline.save_lora_weights`] 121 | 122 | Args: 123 | vae ([`AutoencoderKL`]): 124 | Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. 125 | text_encoder ([`CLIPTextModel`]): 126 | Frozen text-encoder. Stable Diffusion XL uses the text portion of 127 | [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically 128 | the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant. 129 | text_encoder_2 ([` CLIPTextModelWithProjection`]): 130 | Second frozen text-encoder. Stable Diffusion XL uses the text and pool portion of 131 | [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModelWithProjection), 132 | specifically the 133 | [laion/CLIP-ViT-bigG-14-laion2B-39B-b160k](https://huggingface.co/laion/CLIP-ViT-bigG-14-laion2B-39B-b160k) 134 | variant. 135 | tokenizer (`CLIPTokenizer`): 136 | Tokenizer of class 137 | [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). 138 | tokenizer_2 (`CLIPTokenizer`): 139 | Second Tokenizer of class 140 | [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). 141 | unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. 142 | scheduler ([`SchedulerMixin`]): 143 | A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of 144 | [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. 145 | force_zeros_for_empty_prompt (`bool`, *optional*, defaults to `"True"`): 146 | Whether the negative prompt embeddings shall be forced to always be set to 0. Also see the config of 147 | `stabilityai/stable-diffusion-xl-base-1-0`. 
148 | add_watermarker (`bool`, *optional*): 149 | Whether to use the [invisible_watermark library](https://github.com/ShieldMnt/invisible-watermark/) to 150 | watermark output images. If not defined, it will default to True if the package is installed, otherwise no 151 | watermarker will be used. 152 | """ 153 | model_cpu_offload_seq = "text_encoder->text_encoder_2->unet->vae" 154 | 155 | def __init__( 156 | self, 157 | vae: AutoencoderKL, 158 | text_encoder: CLIPTextModel, 159 | text_encoder_2: CLIPTextModelWithProjection, 160 | tokenizer: CLIPTokenizer, 161 | tokenizer_2: CLIPTokenizer, 162 | unet: UNet2DConditionModel, 163 | scheduler: KarrasDiffusionSchedulers, 164 | force_zeros_for_empty_prompt: bool = True, 165 | add_watermarker: Optional[bool] = None, 166 | ): 167 | super().__init__() 168 | 169 | self.register_modules( 170 | vae=vae, 171 | text_encoder=text_encoder, 172 | text_encoder_2=text_encoder_2, 173 | tokenizer=tokenizer, 174 | tokenizer_2=tokenizer_2, 175 | unet=unet, 176 | scheduler=scheduler, 177 | ) 178 | self.register_to_config(force_zeros_for_empty_prompt=force_zeros_for_empty_prompt) 179 | self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) 180 | self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) 181 | self.default_sample_size = self.unet.config.sample_size 182 | 183 | #add_watermarker = add_watermarker if add_watermarker is not None else is_invisible_watermark_available() 184 | add_watermarker = False 185 | self.watermark = None 186 | #if add_watermarker: 187 | # self.watermark = StableDiffusionXLWatermarker() 188 | #else: 189 | # self.watermark = None 190 | 191 | # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing 192 | def enable_vae_slicing(self): 193 | r""" 194 | Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to 195 | compute decoding in several steps. This is useful to save some memory and allow larger batch sizes. 196 | """ 197 | self.vae.enable_slicing() 198 | 199 | # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing 200 | def disable_vae_slicing(self): 201 | r""" 202 | Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to 203 | computing decoding in one step. 204 | """ 205 | self.vae.disable_slicing() 206 | 207 | # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_tiling 208 | def enable_vae_tiling(self): 209 | r""" 210 | Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to 211 | compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow 212 | processing larger images. 213 | """ 214 | self.vae.enable_tiling() 215 | 216 | # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_tiling 217 | def disable_vae_tiling(self): 218 | r""" 219 | Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to 220 | computing decoding in one step. 
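Taken together, these four toggles are the main levers for keeping the decode stage of a large DemoFusion render inside VRAM limits. Below is a minimal, non-authoritative sketch of enabling them, assuming the pipeline is loaded from the SDXL base checkpoint as in the example docstring earlier in this file; the local import path is an illustrative assumption.

```py
import torch

# Assumed import path: the class is defined in pipeline_demofusion_sdxl.py in this repository.
from pipeline_demofusion_sdxl import DemoFusionSDXLStableDiffusionPipeline

# Load the SDXL base checkpoint in fp16 (same checkpoint as in the example docstring).
pipe = DemoFusionSDXLStableDiffusionPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16
)
pipe = pipe.to("cuda")

# Optional memory savers defined above: slicing decodes the batch one sample at a time,
# tiling splits the spatial dimensions into tiles; both trade some speed for lower VRAM use.
pipe.enable_vae_slicing()
pipe.enable_vae_tiling()
```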
221 | """ 222 | self.vae.disable_tiling() 223 | 224 | def encode_prompt( 225 | self, 226 | prompt: str, 227 | prompt_2: Optional[str] = None, 228 | device: Optional[torch.device] = None, 229 | num_images_per_prompt: int = 1, 230 | do_classifier_free_guidance: bool = True, 231 | negative_prompt: Optional[str] = None, 232 | negative_prompt_2: Optional[str] = None, 233 | prompt_embeds: Optional[torch.FloatTensor] = None, 234 | negative_prompt_embeds: Optional[torch.FloatTensor] = None, 235 | pooled_prompt_embeds: Optional[torch.FloatTensor] = None, 236 | negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, 237 | lora_scale: Optional[float] = None, 238 | ): 239 | r""" 240 | Encodes the prompt into text encoder hidden states. 241 | 242 | Args: 243 | prompt (`str` or `List[str]`, *optional*): 244 | prompt to be encoded 245 | prompt_2 (`str` or `List[str]`, *optional*): 246 | The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is 247 | used in both text-encoders 248 | device: (`torch.device`): 249 | torch device 250 | num_images_per_prompt (`int`): 251 | number of images that should be generated per prompt 252 | do_classifier_free_guidance (`bool`): 253 | whether to use classifier free guidance or not 254 | negative_prompt (`str` or `List[str]`, *optional*): 255 | The prompt or prompts not to guide the image generation. If not defined, one has to pass 256 | `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is 257 | less than `1`). 258 | negative_prompt_2 (`str` or `List[str]`, *optional*): 259 | The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and 260 | `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders 261 | prompt_embeds (`torch.FloatTensor`, *optional*): 262 | Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not 263 | provided, text embeddings will be generated from `prompt` input argument. 264 | negative_prompt_embeds (`torch.FloatTensor`, *optional*): 265 | Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt 266 | weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input 267 | argument. 268 | pooled_prompt_embeds (`torch.FloatTensor`, *optional*): 269 | Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. 270 | If not provided, pooled text embeddings will be generated from `prompt` input argument. 271 | negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): 272 | Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt 273 | weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` 274 | input argument. 275 | lora_scale (`float`, *optional*): 276 | A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. 
277 | """ 278 | device = device or self._execution_device 279 | 280 | # set lora scale so that monkey patched LoRA 281 | # function of text encoder can correctly access it 282 | if lora_scale is not None and isinstance(self, LoraLoaderMixin): 283 | self._lora_scale = lora_scale 284 | 285 | # dynamically adjust the LoRA scale 286 | adjust_lora_scale_text_encoder(self.text_encoder, lora_scale) 287 | adjust_lora_scale_text_encoder(self.text_encoder_2, lora_scale) 288 | 289 | if prompt is not None and isinstance(prompt, str): 290 | batch_size = 1 291 | elif prompt is not None and isinstance(prompt, list): 292 | batch_size = len(prompt) 293 | else: 294 | batch_size = prompt_embeds.shape[0] 295 | 296 | # Define tokenizers and text encoders 297 | tokenizers = [self.tokenizer, self.tokenizer_2] if self.tokenizer is not None else [self.tokenizer_2] 298 | text_encoders = ( 299 | [self.text_encoder, self.text_encoder_2] if self.text_encoder is not None else [self.text_encoder_2] 300 | ) 301 | 302 | if prompt_embeds is None: 303 | prompt_2 = prompt_2 or prompt 304 | # textual inversion: procecss multi-vector tokens if necessary 305 | prompt_embeds_list = [] 306 | prompts = [prompt, prompt_2] 307 | for prompt, tokenizer, text_encoder in zip(prompts, tokenizers, text_encoders): 308 | if isinstance(self, TextualInversionLoaderMixin): 309 | prompt = self.maybe_convert_prompt(prompt, tokenizer) 310 | 311 | text_inputs = tokenizer( 312 | prompt, 313 | padding="max_length", 314 | max_length=tokenizer.model_max_length, 315 | truncation=True, 316 | return_tensors="pt", 317 | ) 318 | 319 | text_input_ids = text_inputs.input_ids 320 | untruncated_ids = tokenizer(prompt, padding="longest", return_tensors="pt").input_ids 321 | 322 | if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal( 323 | text_input_ids, untruncated_ids 324 | ): 325 | removed_text = tokenizer.batch_decode(untruncated_ids[:, tokenizer.model_max_length - 1 : -1]) 326 | logger.warning( 327 | "The following part of your input was truncated because CLIP can only handle sequences up to" 328 | f" {tokenizer.model_max_length} tokens: {removed_text}" 329 | ) 330 | 331 | prompt_embeds = text_encoder( 332 | text_input_ids.to(device), 333 | output_hidden_states=True, 334 | ) 335 | 336 | # We are only ALWAYS interested in the pooled output of the final text encoder 337 | pooled_prompt_embeds = prompt_embeds[0] 338 | prompt_embeds = prompt_embeds.hidden_states[-2] 339 | 340 | prompt_embeds_list.append(prompt_embeds) 341 | 342 | prompt_embeds = torch.concat(prompt_embeds_list, dim=-1) 343 | 344 | # get unconditional embeddings for classifier free guidance 345 | zero_out_negative_prompt = negative_prompt is None and self.config.force_zeros_for_empty_prompt 346 | if do_classifier_free_guidance and negative_prompt_embeds is None and zero_out_negative_prompt: 347 | negative_prompt_embeds = torch.zeros_like(prompt_embeds) 348 | negative_pooled_prompt_embeds = torch.zeros_like(pooled_prompt_embeds) 349 | elif do_classifier_free_guidance and negative_prompt_embeds is None: 350 | negative_prompt = negative_prompt or "" 351 | negative_prompt_2 = negative_prompt_2 or negative_prompt 352 | 353 | uncond_tokens: List[str] 354 | if prompt is not None and type(prompt) is not type(negative_prompt): 355 | raise TypeError( 356 | f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" 357 | f" {type(prompt)}." 
358 | ) 359 | elif isinstance(negative_prompt, str): 360 | uncond_tokens = [negative_prompt, negative_prompt_2] 361 | elif batch_size != len(negative_prompt): 362 | raise ValueError( 363 | f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" 364 | f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" 365 | " the batch size of `prompt`." 366 | ) 367 | else: 368 | uncond_tokens = [negative_prompt, negative_prompt_2] 369 | 370 | negative_prompt_embeds_list = [] 371 | for negative_prompt, tokenizer, text_encoder in zip(uncond_tokens, tokenizers, text_encoders): 372 | if isinstance(self, TextualInversionLoaderMixin): 373 | negative_prompt = self.maybe_convert_prompt(negative_prompt, tokenizer) 374 | 375 | max_length = prompt_embeds.shape[1] 376 | uncond_input = tokenizer( 377 | negative_prompt, 378 | padding="max_length", 379 | max_length=max_length, 380 | truncation=True, 381 | return_tensors="pt", 382 | ) 383 | 384 | negative_prompt_embeds = text_encoder( 385 | uncond_input.input_ids.to(device), 386 | output_hidden_states=True, 387 | ) 388 | # We are only ALWAYS interested in the pooled output of the final text encoder 389 | negative_pooled_prompt_embeds = negative_prompt_embeds[0] 390 | negative_prompt_embeds = negative_prompt_embeds.hidden_states[-2] 391 | 392 | negative_prompt_embeds_list.append(negative_prompt_embeds) 393 | 394 | negative_prompt_embeds = torch.concat(negative_prompt_embeds_list, dim=-1) 395 | 396 | prompt_embeds = prompt_embeds.to(dtype=self.text_encoder_2.dtype, device=device) 397 | bs_embed, seq_len, _ = prompt_embeds.shape 398 | # duplicate text embeddings for each generation per prompt, using mps friendly method 399 | prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) 400 | prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1) 401 | 402 | if do_classifier_free_guidance: 403 | # duplicate unconditional embeddings for each generation per prompt, using mps friendly method 404 | seq_len = negative_prompt_embeds.shape[1] 405 | negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.text_encoder_2.dtype, device=device) 406 | negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) 407 | negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) 408 | 409 | pooled_prompt_embeds = pooled_prompt_embeds.repeat(1, num_images_per_prompt).view( 410 | bs_embed * num_images_per_prompt, -1 411 | ) 412 | if do_classifier_free_guidance: 413 | negative_pooled_prompt_embeds = negative_pooled_prompt_embeds.repeat(1, num_images_per_prompt).view( 414 | bs_embed * num_images_per_prompt, -1 415 | ) 416 | 417 | return prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds 418 | 419 | # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs 420 | def prepare_extra_step_kwargs(self, generator, eta): 421 | # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature 422 | # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. 
423 | # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 424 | # and should be between [0, 1] 425 | 426 | accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) 427 | extra_step_kwargs = {} 428 | if accepts_eta: 429 | extra_step_kwargs["eta"] = eta 430 | 431 | # check if the scheduler accepts generator 432 | accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) 433 | if accepts_generator: 434 | extra_step_kwargs["generator"] = generator 435 | return extra_step_kwargs 436 | 437 | def check_inputs( 438 | self, 439 | prompt, 440 | prompt_2, 441 | height, 442 | width, 443 | callback_steps, 444 | negative_prompt=None, 445 | negative_prompt_2=None, 446 | prompt_embeds=None, 447 | negative_prompt_embeds=None, 448 | pooled_prompt_embeds=None, 449 | negative_pooled_prompt_embeds=None, 450 | num_images_per_prompt=None, 451 | ): 452 | if height % 8 != 0 or width % 8 != 0: 453 | raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") 454 | 455 | if (callback_steps is None) or ( 456 | callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) 457 | ): 458 | raise ValueError( 459 | f"`callback_steps` has to be a positive integer but is {callback_steps} of type" 460 | f" {type(callback_steps)}." 461 | ) 462 | 463 | if prompt is not None and prompt_embeds is not None: 464 | raise ValueError( 465 | f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" 466 | " only forward one of the two." 467 | ) 468 | elif prompt_2 is not None and prompt_embeds is not None: 469 | raise ValueError( 470 | f"Cannot forward both `prompt_2`: {prompt_2} and `prompt_embeds`: {prompt_embeds}. Please make sure to" 471 | " only forward one of the two." 472 | ) 473 | elif prompt is None and prompt_embeds is None: 474 | raise ValueError( 475 | "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." 476 | ) 477 | elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): 478 | raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") 479 | elif prompt_2 is not None and (not isinstance(prompt_2, str) and not isinstance(prompt_2, list)): 480 | raise ValueError(f"`prompt_2` has to be of type `str` or `list` but is {type(prompt_2)}") 481 | 482 | if negative_prompt is not None and negative_prompt_embeds is not None: 483 | raise ValueError( 484 | f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" 485 | f" {negative_prompt_embeds}. Please make sure to only forward one of the two." 486 | ) 487 | elif negative_prompt_2 is not None and negative_prompt_embeds is not None: 488 | raise ValueError( 489 | f"Cannot forward both `negative_prompt_2`: {negative_prompt_2} and `negative_prompt_embeds`:" 490 | f" {negative_prompt_embeds}. Please make sure to only forward one of the two." 491 | ) 492 | 493 | if prompt_embeds is not None and negative_prompt_embeds is not None: 494 | if prompt_embeds.shape != negative_prompt_embeds.shape: 495 | raise ValueError( 496 | "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" 497 | f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" 498 | f" {negative_prompt_embeds.shape}." 
499 | ) 500 | 501 | if prompt_embeds is not None and pooled_prompt_embeds is None: 502 | raise ValueError( 503 | "If `prompt_embeds` are provided, `pooled_prompt_embeds` also have to be passed. Make sure to generate `pooled_prompt_embeds` from the same text encoder that was used to generate `prompt_embeds`." 504 | ) 505 | 506 | if negative_prompt_embeds is not None and negative_pooled_prompt_embeds is None: 507 | raise ValueError( 508 | "If `negative_prompt_embeds` are provided, `negative_pooled_prompt_embeds` also have to be passed. Make sure to generate `negative_pooled_prompt_embeds` from the same text encoder that was used to generate `negative_prompt_embeds`." 509 | ) 510 | 511 | # DemoFusion specific checks 512 | if max(height, width) % 1024 != 0: 513 | raise ValueError(f"the larger one of `height` and `width` has to be divisible by 1024 but are {height} and {width}.") 514 | 515 | if num_images_per_prompt != 1: 516 | warnings.warn("num_images_per_prompt != 1 is not supported by DemoFusion and will be ignored.") 517 | num_images_per_prompt = 1 518 | 519 | # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents 520 | def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None): 521 | shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor) 522 | if isinstance(generator, list) and len(generator) != batch_size: 523 | raise ValueError( 524 | f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" 525 | f" size of {batch_size}. Make sure the batch size matches the length of the generators." 526 | ) 527 | 528 | if latents is None: 529 | latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) 530 | else: 531 | latents = latents.to(device) 532 | 533 | # scale the initial noise by the standard deviation required by the scheduler 534 | latents = latents * self.scheduler.init_noise_sigma 535 | return latents 536 | 537 | def _get_add_time_ids(self, original_size, crops_coords_top_left, target_size, dtype): 538 | add_time_ids = list(original_size + crops_coords_top_left + target_size) 539 | 540 | passed_add_embed_dim = ( 541 | self.unet.config.addition_time_embed_dim * len(add_time_ids) + self.text_encoder_2.config.projection_dim 542 | ) 543 | expected_add_embed_dim = self.unet.add_embedding.linear_1.in_features 544 | 545 | if expected_add_embed_dim != passed_add_embed_dim: 546 | raise ValueError( 547 | f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. The model has an incorrect config. Please check `unet.config.time_embedding_type` and `text_encoder_2.config.projection_dim`." 548 | ) 549 | 550 | add_time_ids = torch.tensor([add_time_ids], dtype=dtype) 551 | return add_time_ids 552 | 553 | def get_views(self, height, width, window_size=128, stride=64, random_jitter=False): 554 | # Here, we define the mappings F_i (see Eq. 
7 in the MultiDiffusion paper https://arxiv.org/abs/2302.08113) 555 | # if panorama's height/width < window_size, num_blocks of height/width should return 1 556 | height //= self.vae_scale_factor 557 | width //= self.vae_scale_factor 558 | num_blocks_height = int((height - window_size) / stride - 1e-6) + 2 if height > window_size else 1 559 | num_blocks_width = int((width - window_size) / stride - 1e-6) + 2 if width > window_size else 1 560 | total_num_blocks = int(num_blocks_height * num_blocks_width) 561 | views = [] 562 | for i in range(total_num_blocks): 563 | h_start = int((i // num_blocks_width) * stride) 564 | h_end = h_start + window_size 565 | w_start = int((i % num_blocks_width) * stride) 566 | w_end = w_start + window_size 567 | 568 | if h_end > height: 569 | h_start = int(h_start + height - h_end) 570 | h_end = int(height) 571 | if w_end > width: 572 | w_start = int(w_start + width - w_end) 573 | w_end = int(width) 574 | if h_start < 0: 575 | h_end = int(h_end - h_start) 576 | h_start = 0 577 | if w_start < 0: 578 | w_end = int(w_end - w_start) 579 | w_start = 0 580 | 581 | if random_jitter: 582 | jitter_range = (window_size - stride) // 4 583 | w_jitter = 0 584 | h_jitter = 0 585 | if (w_start != 0) and (w_end != width): 586 | w_jitter = random.randint(-jitter_range, jitter_range) 587 | elif (w_start == 0) and (w_end != width): 588 | w_jitter = random.randint(-jitter_range, 0) 589 | elif (w_start != 0) and (w_end == width): 590 | w_jitter = random.randint(0, jitter_range) 591 | if (h_start != 0) and (h_end != height): 592 | h_jitter = random.randint(-jitter_range, jitter_range) 593 | elif (h_start == 0) and (h_end != height): 594 | h_jitter = random.randint(-jitter_range, 0) 595 | elif (h_start != 0) and (h_end == height): 596 | h_jitter = random.randint(0, jitter_range) 597 | h_start += (h_jitter + jitter_range) 598 | h_end += (h_jitter + jitter_range) 599 | w_start += (w_jitter + jitter_range) 600 | w_end += (w_jitter + jitter_range) 601 | 602 | views.append((h_start, h_end, w_start, w_end)) 603 | return views 604 | 605 | def tiled_decode(self, latents, current_height, current_width): 606 | sample_size = self.unet.config.sample_size 607 | core_size = self.unet.config.sample_size // 4 608 | core_stride = core_size 609 | pad_size = self.unet.config.sample_size // 8 * 3 610 | decoder_view_batch_size = 1 611 | 612 | if self.lowvram: 613 | core_stride = core_size // 2 614 | pad_size = core_size 615 | 616 | views = self.get_views(current_height, current_width, stride=core_stride, window_size=core_size) 617 | views_batch = [views[i : i + decoder_view_batch_size] for i in range(0, len(views), decoder_view_batch_size)] 618 | latents_ = F.pad(latents, (pad_size, pad_size, pad_size, pad_size), 'constant', 0) 619 | image = torch.zeros(latents.size(0), 3, current_height, current_width).to(latents.device) 620 | count = torch.zeros_like(image).to(latents.device) 621 | # get the latents corresponding to the current view coordinates 622 | with self.progress_bar(total=len(views_batch)) as progress_bar: 623 | for j, batch_view in enumerate(views_batch): 624 | vb_size = len(batch_view) 625 | latents_for_view = torch.cat( 626 | [ 627 | latents_[:, :, h_start:h_end+pad_size*2, w_start:w_end+pad_size*2] 628 | for h_start, h_end, w_start, w_end in batch_view 629 | ] 630 | ).to(self.vae.device) 631 | image_patch = self.vae.decode(latents_for_view / self.vae.config.scaling_factor, return_dict=False)[0] 632 | h_start, h_end, w_start, w_end = views[j] 633 | h_start, h_end, w_start, w_end = h_start * 
self.vae_scale_factor, h_end * self.vae_scale_factor, w_start * self.vae_scale_factor, w_end * self.vae_scale_factor 634 | p_h_start, p_h_end, p_w_start, p_w_end = pad_size * self.vae_scale_factor, image_patch.size(2) - pad_size * self.vae_scale_factor, pad_size * self.vae_scale_factor, image_patch.size(3) - pad_size * self.vae_scale_factor 635 | image[:, :, h_start:h_end, w_start:w_end] += image_patch[:, :, p_h_start:p_h_end, p_w_start:p_w_end].to(latents.device) 636 | count[:, :, h_start:h_end, w_start:w_end] += 1 637 | progress_bar.update() 638 | image = image / count 639 | 640 | return image 641 | 642 | # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_upscale.StableDiffusionUpscalePipeline.upcast_vae 643 | def upcast_vae(self): 644 | dtype = self.vae.dtype 645 | self.vae.to(dtype=torch.float32) 646 | use_torch_2_0_or_xformers = isinstance( 647 | self.vae.decoder.mid_block.attentions[0].processor, 648 | ( 649 | AttnProcessor2_0, 650 | XFormersAttnProcessor, 651 | LoRAXFormersAttnProcessor, 652 | LoRAAttnProcessor2_0, 653 | ), 654 | ) 655 | # if xformers or torch_2_0 is used attention block does not need 656 | # to be in float32 which can save lots of memory 657 | if use_torch_2_0_or_xformers: 658 | self.vae.post_quant_conv.to(dtype) 659 | self.vae.decoder.conv_in.to(dtype) 660 | self.vae.decoder.mid_block.to(dtype) 661 | 662 | @torch.no_grad() 663 | @replace_example_docstring(EXAMPLE_DOC_STRING) 664 | def __call__( 665 | self, 666 | prompt: Union[str, List[str]] = None, 667 | prompt_2: Optional[Union[str, List[str]]] = None, 668 | height: Optional[int] = None, 669 | width: Optional[int] = None, 670 | num_inference_steps: int = 50, 671 | denoising_end: Optional[float] = None, 672 | guidance_scale: float = 5.0, 673 | negative_prompt: Optional[Union[str, List[str]]] = None, 674 | negative_prompt_2: Optional[Union[str, List[str]]] = None, 675 | num_images_per_prompt: Optional[int] = 1, 676 | eta: float = 0.0, 677 | generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, 678 | latents: Optional[torch.FloatTensor] = None, 679 | prompt_embeds: Optional[torch.FloatTensor] = None, 680 | negative_prompt_embeds: Optional[torch.FloatTensor] = None, 681 | pooled_prompt_embeds: Optional[torch.FloatTensor] = None, 682 | negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, 683 | output_type: Optional[str] = "pil", 684 | return_dict: bool = False, 685 | callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, 686 | callback_steps: int = 1, 687 | cross_attention_kwargs: Optional[Dict[str, Any]] = None, 688 | guidance_rescale: float = 0.0, 689 | original_size: Optional[Tuple[int, int]] = None, 690 | crops_coords_top_left: Tuple[int, int] = (0, 0), 691 | target_size: Optional[Tuple[int, int]] = None, 692 | negative_original_size: Optional[Tuple[int, int]] = None, 693 | negative_crops_coords_top_left: Tuple[int, int] = (0, 0), 694 | negative_target_size: Optional[Tuple[int, int]] = None, 695 | ################### DemoFusion specific parameters #################### 696 | image_lr: Optional[torch.FloatTensor] = None, 697 | view_batch_size: int = 16, 698 | multi_decoder: bool = True, 699 | stride: Optional[int] = 64, 700 | cosine_scale_1: Optional[float] = 3., 701 | cosine_scale_2: Optional[float] = 1., 702 | cosine_scale_3: Optional[float] = 1., 703 | sigma: Optional[float] = 1.0, 704 | show_image: bool = False, 705 | lowvram: bool = False, 706 | ): 707 | r""" 708 | Function invoked when calling the pipeline for generation. 
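A hedged usage sketch follows; the values are illustrative, `pipe` is assumed to be loaded as in the example docstring, the DemoFusion-specific arguments are documented in the Args section below, and the requested size must pass the divisibility checks in `check_inputs` (the larger side divisible by 1024).

```py
# Illustrative call: a 2x upscale of the 1024px base resolution to 2048x2048.
image_phases = pipe(
    prompt="a photo of an astronaut riding a horse on mars",
    height=2048,
    width=2048,
    num_inference_steps=50,
    guidance_scale=7.5,
    view_batch_size=4,
    stride=64,
    multi_decoder=True,
    cosine_scale_1=3.0,
    cosine_scale_2=1.0,
    cosine_scale_3=1.0,
    sigma=1.0,
    show_image=False,
    lowvram=False,
)
# Per the Returns section below, the call yields a list with the generated images
# at each phase; the final entry is the highest-resolution (2048x2048) result.
```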
709 | 710 | Args: 711 | prompt (`str` or `List[str]`, *optional*): 712 | The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. 713 | instead. 714 | prompt_2 (`str` or `List[str]`, *optional*): 715 | The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is 716 | used in both text-encoders 717 | height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): 718 | The height in pixels of the generated image. This is set to 1024 by default for the best results. 719 | Anything below 512 pixels won't work well for 720 | [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0) 721 | and checkpoints that are not specifically fine-tuned on low resolutions. 722 | width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): 723 | The width in pixels of the generated image. This is set to 1024 by default for the best results. 724 | Anything below 512 pixels won't work well for 725 | [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0) 726 | and checkpoints that are not specifically fine-tuned on low resolutions. 727 | num_inference_steps (`int`, *optional*, defaults to 50): 728 | The number of denoising steps. More denoising steps usually lead to a higher quality image at the 729 | expense of slower inference. 730 | denoising_end (`float`, *optional*): 731 | When specified, determines the fraction (between 0.0 and 1.0) of the total denoising process to be 732 | completed before it is intentionally prematurely terminated. As a result, the returned sample will 733 | still retain a substantial amount of noise as determined by the discrete timesteps selected by the 734 | scheduler. The denoising_end parameter should ideally be utilized when this pipeline forms a part of a 735 | "Mixture of Denoisers" multi-pipeline setup, as elaborated in [**Refining the Image 736 | Output**](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#refining-the-image-output) 737 | guidance_scale (`float`, *optional*, defaults to 5.0): 738 | Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). 739 | `guidance_scale` is defined as `w` of equation 2. of [Imagen 740 | Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > 741 | 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, 742 | usually at the expense of lower image quality. 743 | negative_prompt (`str` or `List[str]`, *optional*): 744 | The prompt or prompts not to guide the image generation. If not defined, one has to pass 745 | `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is 746 | less than `1`). 747 | negative_prompt_2 (`str` or `List[str]`, *optional*): 748 | The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and 749 | `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders 750 | num_images_per_prompt (`int`, *optional*, defaults to 1): 751 | The number of images to generate per prompt. 752 | eta (`float`, *optional*, defaults to 0.0): 753 | Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to 754 | [`schedulers.DDIMScheduler`], will be ignored for others. 
755 | generator (`torch.Generator` or `List[torch.Generator]`, *optional*): 756 | One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) 757 | to make generation deterministic. 758 | latents (`torch.FloatTensor`, *optional*): 759 | Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image 760 | generation. Can be used to tweak the same generation with different prompts. If not provided, a latents 761 | tensor will ge generated by sampling using the supplied random `generator`. 762 | prompt_embeds (`torch.FloatTensor`, *optional*): 763 | Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not 764 | provided, text embeddings will be generated from `prompt` input argument. 765 | negative_prompt_embeds (`torch.FloatTensor`, *optional*): 766 | Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt 767 | weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input 768 | argument. 769 | pooled_prompt_embeds (`torch.FloatTensor`, *optional*): 770 | Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. 771 | If not provided, pooled text embeddings will be generated from `prompt` input argument. 772 | negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): 773 | Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt 774 | weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` 775 | input argument. 776 | output_type (`str`, *optional*, defaults to `"pil"`): 777 | The output format of the generate image. Choose between 778 | [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. 779 | return_dict (`bool`, *optional*, defaults to `True`): 780 | Whether or not to return a [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] instead 781 | of a plain tuple. 782 | callback (`Callable`, *optional*): 783 | A function that will be called every `callback_steps` steps during inference. The function will be 784 | called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. 785 | callback_steps (`int`, *optional*, defaults to 1): 786 | The frequency at which the `callback` function will be called. If not specified, the callback will be 787 | called at every step. 788 | cross_attention_kwargs (`dict`, *optional*): 789 | A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under 790 | `self.processor` in 791 | [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). 792 | guidance_rescale (`float`, *optional*, defaults to 0.7): 793 | Guidance rescale factor proposed by [Common Diffusion Noise Schedules and Sample Steps are 794 | Flawed](https://arxiv.org/pdf/2305.08891.pdf) `guidance_scale` is defined as `φ` in equation 16. of 795 | [Common Diffusion Noise Schedules and Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). 796 | Guidance rescale factor should fix overexposure when using zero terminal SNR. 797 | original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)): 798 | If `original_size` is not the same as `target_size` the image will appear to be down- or upsampled. 799 | `original_size` defaults to `(width, height)` if not specified. 
Part of SDXL's micro-conditioning as 800 | explained in section 2.2 of 801 | [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). 802 | crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)): 803 | `crops_coords_top_left` can be used to generate an image that appears to be "cropped" from the position 804 | `crops_coords_top_left` downwards. Favorable, well-centered images are usually achieved by setting 805 | `crops_coords_top_left` to (0, 0). Part of SDXL's micro-conditioning as explained in section 2.2 of 806 | [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). 807 | target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)): 808 | For most cases, `target_size` should be set to the desired height and width of the generated image. If 809 | not specified it will default to `(width, height)`. Part of SDXL's micro-conditioning as explained in 810 | section 2.2 of [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). 811 | negative_original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)): 812 | To negatively condition the generation process based on a specific image resolution. Part of SDXL's 813 | micro-conditioning as explained in section 2.2 of 814 | [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more 815 | information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208. 816 | negative_crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)): 817 | To negatively condition the generation process based on specific crop coordinates. Part of SDXL's 818 | micro-conditioning as explained in section 2.2 of 819 | [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more 820 | information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208. 821 | negative_target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)): 822 | To negatively condition the generation process based on a target image resolution. It should be the same 823 | as the `target_size` in most cases. Part of SDXL's micro-conditioning as explained in section 2.2 of 824 | [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more 825 | information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208. 826 | ################### DemoFusion specific parameters #################### 827 | image_lr (`torch.FloatTensor`, *optional*, defaults to None): 828 | Low-resolution image input for upscaling. If provided, DemoFusion will encode it as the initial latent representation. 829 | view_batch_size (`int`, defaults to 16): 830 | The batch size for multiple denoising paths. Typically, a larger batch size can result in higher 831 | efficiency but comes with increased GPU memory requirements. 832 | multi_decoder (`bool`, defaults to True): 833 | Determines whether to use a tiled decoder. Generally, when the resolution exceeds 3072x3072, 834 | a tiled decoder becomes necessary. 835 | stride (`int`, defaults to 64): 836 | The stride of moving local patches. A smaller stride is better for alleviating seam issues, 837 | but it also introduces additional computational overhead and inference time. 838 | cosine_scale_1 (`float`, defaults to 3): 839 | Controls the strength of the skip-residual. For specific impacts, please refer to Appendix C 840 | in the DemoFusion paper.
841 | cosine_scale_2 (`float`, defaults to 1): 842 | Controls the strength of dilated sampling. For specific impacts, please refer to Appendix C 843 | in the DemoFusion paper. 844 | cosine_scale_3 (`float`, defaults to 1): 845 | Controls the strength of the Gaussian filter. For specific impacts, please refer to Appendix C 846 | in the DemoFusion paper. 847 | sigma (`float`, defaults to 1): 848 | The standard deviation of the Gaussian filter. 849 | show_image (`bool`, defaults to False): 850 | Determines whether to show intermediate results during generation. 851 | lowvram (`bool`, defaults to False): 852 | Try to fit in 8 GB of VRAM, with xformers installed. 853 | 854 | Examples: 855 | 856 | Returns: 857 | A `list` with the generated images at each phase. 858 | """ 859 | 860 | # 0. Default height and width to unet 861 | height = height or self.default_sample_size * self.vae_scale_factor 862 | width = width or self.default_sample_size * self.vae_scale_factor 863 | 864 | x1_size = self.default_sample_size * self.vae_scale_factor 865 | 866 | height_scale = height / x1_size 867 | width_scale = width / x1_size 868 | scale_num = int(max(height_scale, width_scale)) 869 | aspect_ratio = min(height_scale, width_scale) / max(height_scale, width_scale) 870 | 871 | original_size = original_size or (height, width) 872 | target_size = target_size or (height, width) 873 | 874 | # 1. Check inputs. Raise error if not correct 875 | self.check_inputs( 876 | prompt, 877 | prompt_2, 878 | height, 879 | width, 880 | callback_steps, 881 | negative_prompt, 882 | negative_prompt_2, 883 | prompt_embeds, 884 | negative_prompt_embeds, 885 | pooled_prompt_embeds, 886 | negative_pooled_prompt_embeds, 887 | num_images_per_prompt, 888 | ) 889 | 890 | # 2. Define call parameters 891 | if prompt is not None and isinstance(prompt, str): 892 | batch_size = 1 893 | elif prompt is not None and isinstance(prompt, list): 894 | batch_size = len(prompt) 895 | else: 896 | batch_size = prompt_embeds.shape[0] 897 | 898 | device = self._execution_device 899 | self.lowvram = lowvram 900 | if self.lowvram: 901 | self.vae.cpu() 902 | self.unet.cpu() 903 | self.text_encoder.to(device) 904 | self.text_encoder_2.to(device) 905 | if image_lr is not None: image_lr = image_lr.cpu()  # avoid a crash when no low-res image is given 906 | 907 | # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) 908 | # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` 909 | # corresponds to doing no classifier free guidance. 910 | do_classifier_free_guidance = guidance_scale > 1.0 911 | 912 | # 3. Encode input prompt 913 | text_encoder_lora_scale = ( 914 | cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None 915 | ) 916 | ( 917 | prompt_embeds, 918 | negative_prompt_embeds, 919 | pooled_prompt_embeds, 920 | negative_pooled_prompt_embeds, 921 | ) = self.encode_prompt( 922 | prompt=prompt, 923 | prompt_2=prompt_2, 924 | device=device, 925 | num_images_per_prompt=num_images_per_prompt, 926 | do_classifier_free_guidance=do_classifier_free_guidance, 927 | negative_prompt=negative_prompt, 928 | negative_prompt_2=negative_prompt_2, 929 | prompt_embeds=prompt_embeds, 930 | negative_prompt_embeds=negative_prompt_embeds, 931 | pooled_prompt_embeds=pooled_prompt_embeds, 932 | negative_pooled_prompt_embeds=negative_pooled_prompt_embeds, 933 | lora_scale=text_encoder_lora_scale, 934 | ) 935 | 936 | # 4. Prepare timesteps 937 | self.scheduler.set_timesteps(num_inference_steps, device=device) 938 | 939 | timesteps = self.scheduler.timesteps 940 | 941 | # 5.
Prepare latent variables 942 | num_channels_latents = self.unet.config.in_channels 943 | latents = self.prepare_latents( 944 | batch_size * num_images_per_prompt, 945 | num_channels_latents, 946 | height // scale_num, 947 | width // scale_num, 948 | prompt_embeds.dtype, 949 | device, 950 | generator, 951 | latents, 952 | ) 953 | 954 | # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline 955 | extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) 956 | 957 | # 7. Prepare added time ids & embeddings 958 | add_text_embeds = pooled_prompt_embeds 959 | add_time_ids = self._get_add_time_ids( 960 | original_size, crops_coords_top_left, target_size, dtype=prompt_embeds.dtype 961 | ) 962 | if negative_original_size is not None and negative_target_size is not None: 963 | negative_add_time_ids = self._get_add_time_ids( 964 | negative_original_size, 965 | negative_crops_coords_top_left, 966 | negative_target_size, 967 | dtype=prompt_embeds.dtype, 968 | ) 969 | else: 970 | negative_add_time_ids = add_time_ids 971 | 972 | if do_classifier_free_guidance: 973 | prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0) 974 | add_text_embeds = torch.cat([negative_pooled_prompt_embeds, add_text_embeds], dim=0) 975 | add_time_ids = torch.cat([negative_add_time_ids, add_time_ids], dim=0) 976 | del negative_prompt_embeds, negative_pooled_prompt_embeds, negative_add_time_ids 977 | 978 | prompt_embeds = prompt_embeds.to(device) 979 | add_text_embeds = add_text_embeds.to(device) 980 | add_time_ids = add_time_ids.to(device).repeat(batch_size * num_images_per_prompt, 1) 981 | 982 | # 8. Denoising loop 983 | num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0) 984 | 985 | # 7.1 Apply denoising_end 986 | if denoising_end is not None and isinstance(denoising_end, float) and denoising_end > 0 and denoising_end < 1: 987 | discrete_timestep_cutoff = int( 988 | round( 989 | self.scheduler.config.num_train_timesteps 990 | - (denoising_end * self.scheduler.config.num_train_timesteps) 991 | ) 992 | ) 993 | num_inference_steps = len(list(filter(lambda ts: ts >= discrete_timestep_cutoff, timesteps))) 994 | timesteps = timesteps[:num_inference_steps] 995 | 996 | output_images = [] 997 | 998 | ###################################################### Phase Initialization ######################################################## 999 | 1000 | if self.lowvram: 1001 | self.text_encoder.cpu() 1002 | self.text_encoder_2.cpu() 1003 | 1004 | if image_lr == None: 1005 | print("### Phase 1 Denoising ###") 1006 | with self.progress_bar(total=num_inference_steps) as progress_bar: 1007 | for i, t in enumerate(timesteps): 1008 | 1009 | if self.lowvram: 1010 | self.vae.cpu() 1011 | self.unet.to(device) 1012 | 1013 | latents_for_view = latents 1014 | 1015 | # expand the latents if we are doing classifier free guidance 1016 | latent_model_input = ( 1017 | latents.repeat_interleave(2, dim=0) 1018 | if do_classifier_free_guidance 1019 | else latents 1020 | ) 1021 | latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) 1022 | 1023 | # predict the noise residual 1024 | added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids} 1025 | noise_pred = self.unet( 1026 | latent_model_input, 1027 | t, 1028 | encoder_hidden_states=prompt_embeds, 1029 | cross_attention_kwargs=cross_attention_kwargs, 1030 | added_cond_kwargs=added_cond_kwargs, 1031 | return_dict=False, 1032 | )[0] 1033 | 1034 | # perform guidance 1035 | 
if do_classifier_free_guidance: 1036 | noise_pred_uncond, noise_pred_text = noise_pred[::2], noise_pred[1::2] 1037 | noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) 1038 | 1039 | if do_classifier_free_guidance and guidance_rescale > 0.0: 1040 | # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf 1041 | noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=guidance_rescale) 1042 | 1043 | # compute the previous noisy sample x_t -> x_t-1 1044 | latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0] 1045 | 1046 | # call the callback, if provided 1047 | if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): 1048 | progress_bar.update() 1049 | if callback is not None and i % callback_steps == 0: 1050 | step_idx = i // getattr(self.scheduler, "order", 1) 1051 | callback(step_idx, t, latents) 1052 | del latents_for_view, latent_model_input, noise_pred, noise_pred_text, noise_pred_uncond 1053 | else: 1054 | print("### Encoding Real Image ###") 1055 | latents = self.vae.encode(image_lr) 1056 | latents = latents.latent_dist.sample() * self.vae.config.scaling_factor 1057 | 1058 | anchor_mean = latents.mean() 1059 | anchor_std = latents.std() 1060 | if self.lowvram: 1061 | latents = latents.cpu() 1062 | torch.cuda.empty_cache() 1063 | if not output_type == "latent": 1064 | # make sure the VAE is in float32 mode, as it overflows in float16 1065 | needs_upcasting = self.vae.dtype == torch.float16 and self.vae.config.force_upcast 1066 | 1067 | if self.lowvram: 1068 | needs_upcasting = False # use madebyollin/sdxl-vae-fp16-fix in lowvram mode! 1069 | self.unet.cpu() 1070 | self.vae.to(device) 1071 | 1072 | if needs_upcasting: 1073 | self.upcast_vae() 1074 | latents = latents.to(next(iter(self.vae.post_quant_conv.parameters())).dtype) 1075 | if self.lowvram and multi_decoder: 1076 | current_width_height = self.unet.config.sample_size * self.vae_scale_factor 1077 | image = self.tiled_decode(latents, current_width_height, current_width_height) 1078 | else: 1079 | image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0] 1080 | # cast back to fp16 if needed 1081 | if needs_upcasting: 1082 | self.vae.to(dtype=torch.float16) 1083 | 1084 | image = self.image_processor.postprocess(image, output_type=output_type) 1085 | if show_image: 1086 | plt.figure(figsize=(10, 10)) 1087 | plt.imshow(image[0]) 1088 | plt.axis('off') # Turn off axis numbers and ticks 1089 | plt.show() 1090 | output_images.append(image[0]) 1091 | 1092 | ####################################################### Phase Upscaling ##################################################### 1093 | if image_lr == None: 1094 | starting_scale = 2 1095 | else: 1096 | starting_scale = 1 1097 | for current_scale_num in range(starting_scale, scale_num + 1): 1098 | if self.lowvram: 1099 | latents = latents.to(device) 1100 | self.unet.to(device) 1101 | torch.cuda.empty_cache() 1102 | print("### Phase {} Denoising ###".format(current_scale_num)) 1103 | current_height = self.unet.config.sample_size * self.vae_scale_factor * current_scale_num 1104 | current_width = self.unet.config.sample_size * self.vae_scale_factor * current_scale_num 1105 | if height > width: 1106 | current_width = int(current_width * aspect_ratio) 1107 | else: 1108 | current_height = int(current_height * aspect_ratio) 1109 | 1110 | latents = F.interpolate(latents.to(device), size=(int(current_height / 
self.vae_scale_factor), int(current_width / self.vae_scale_factor)), mode='bicubic') 1111 | 1112 | noise_latents = [] 1113 | noise = torch.randn_like(latents) 1114 | for timestep in timesteps: 1115 | noise_latent = self.scheduler.add_noise(latents, noise, timestep.unsqueeze(0)) 1116 | noise_latents.append(noise_latent) 1117 | latents = noise_latents[0] 1118 | 1119 | with self.progress_bar(total=num_inference_steps) as progress_bar: 1120 | for i, t in enumerate(timesteps): 1121 | count = torch.zeros_like(latents) 1122 | value = torch.zeros_like(latents) 1123 | cosine_factor = 0.5 * (1 + torch.cos(torch.pi * (self.scheduler.config.num_train_timesteps - t) / self.scheduler.config.num_train_timesteps)).cpu() 1124 | 1125 | c1 = cosine_factor ** cosine_scale_1 1126 | latents = latents * (1 - c1) + noise_latents[i] * c1 1127 | 1128 | ############################################# MultiDiffusion ############################################# 1129 | 1130 | views = self.get_views(current_height, current_width, stride=stride, window_size=self.unet.config.sample_size, random_jitter=True) 1131 | views_batch = [views[i : i + view_batch_size] for i in range(0, len(views), view_batch_size)] 1132 | 1133 | jitter_range = (self.unet.config.sample_size - stride) // 4 1134 | latents_ = F.pad(latents, (jitter_range, jitter_range, jitter_range, jitter_range), 'constant', 0) 1135 | 1136 | count_local = torch.zeros_like(latents_) 1137 | value_local = torch.zeros_like(latents_) 1138 | 1139 | for j, batch_view in enumerate(views_batch): 1140 | vb_size = len(batch_view) 1141 | 1142 | # get the latents corresponding to the current view coordinates 1143 | latents_for_view = torch.cat( 1144 | [ 1145 | latents_[:, :, h_start:h_end, w_start:w_end] 1146 | for h_start, h_end, w_start, w_end in batch_view 1147 | ] 1148 | ) 1149 | 1150 | # expand the latents if we are doing classifier free guidance 1151 | latent_model_input = latents_for_view 1152 | latent_model_input = ( 1153 | latent_model_input.repeat_interleave(2, dim=0) 1154 | if do_classifier_free_guidance 1155 | else latent_model_input 1156 | ) 1157 | latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) 1158 | 1159 | prompt_embeds_input = torch.cat([prompt_embeds] * vb_size) 1160 | add_text_embeds_input = torch.cat([add_text_embeds] * vb_size) 1161 | add_time_ids_input = [] 1162 | for h_start, h_end, w_start, w_end in batch_view: 1163 | add_time_ids_ = add_time_ids.clone() 1164 | add_time_ids_[:, 2] = h_start * self.vae_scale_factor 1165 | add_time_ids_[:, 3] = w_start * self.vae_scale_factor 1166 | add_time_ids_input.append(add_time_ids_) 1167 | add_time_ids_input = torch.cat(add_time_ids_input) 1168 | 1169 | # predict the noise residual 1170 | added_cond_kwargs = {"text_embeds": add_text_embeds_input, "time_ids": add_time_ids_input} 1171 | noise_pred = self.unet( 1172 | latent_model_input, 1173 | t, 1174 | encoder_hidden_states=prompt_embeds_input, 1175 | cross_attention_kwargs=cross_attention_kwargs, 1176 | added_cond_kwargs=added_cond_kwargs, 1177 | return_dict=False, 1178 | )[0] 1179 | 1180 | if do_classifier_free_guidance: 1181 | noise_pred_uncond, noise_pred_text = noise_pred[::2], noise_pred[1::2] 1182 | noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) 1183 | 1184 | if do_classifier_free_guidance and guidance_rescale > 0.0: 1185 | # Based on 3.4. 
in https://arxiv.org/pdf/2305.08891.pdf 1186 | noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=guidance_rescale) 1187 | 1188 | # compute the previous noisy sample x_t -> x_t-1 1189 | self.scheduler._init_step_index(t) 1190 | latents_denoised_batch = self.scheduler.step( 1191 | noise_pred, t, latents_for_view, **extra_step_kwargs, return_dict=False)[0] 1192 | 1193 | # extract value from batch 1194 | for latents_view_denoised, (h_start, h_end, w_start, w_end) in zip( 1195 | latents_denoised_batch.chunk(vb_size), batch_view 1196 | ): 1197 | value_local[:, :, h_start:h_end, w_start:w_end] += latents_view_denoised 1198 | count_local[:, :, h_start:h_end, w_start:w_end] += 1 1199 | 1200 | value_local = value_local[: ,:, jitter_range: jitter_range + current_height // self.vae_scale_factor, jitter_range: jitter_range + current_width // self.vae_scale_factor] 1201 | count_local = count_local[: ,:, jitter_range: jitter_range + current_height // self.vae_scale_factor, jitter_range: jitter_range + current_width // self.vae_scale_factor] 1202 | 1203 | c2 = cosine_factor ** cosine_scale_2 1204 | 1205 | value += value_local / count_local * (1 - c2) 1206 | count += torch.ones_like(value_local) * (1 - c2) 1207 | 1208 | ############################################# Dilated Sampling ############################################# 1209 | 1210 | views = [[h, w] for h in range(current_scale_num) for w in range(current_scale_num)] 1211 | views_batch = [views[i : i + view_batch_size] for i in range(0, len(views), view_batch_size)] 1212 | 1213 | h_pad = (current_scale_num - (latents.size(2) % current_scale_num)) % current_scale_num 1214 | w_pad = (current_scale_num - (latents.size(3) % current_scale_num)) % current_scale_num 1215 | latents_ = F.pad(latents, (w_pad, 0, h_pad, 0), 'constant', 0) 1216 | 1217 | count_global = torch.zeros_like(latents_) 1218 | value_global = torch.zeros_like(latents_) 1219 | 1220 | c3 = 0.99 * cosine_factor ** cosine_scale_3 + 1e-2 1221 | std_, mean_ = latents_.std(), latents_.mean() 1222 | latents_gaussian = gaussian_filter(latents_, kernel_size=(2*current_scale_num-1), sigma=sigma*c3) 1223 | latents_gaussian = (latents_gaussian - latents_gaussian.mean()) / latents_gaussian.std() * std_ + mean_ 1224 | 1225 | for j, batch_view in enumerate(views_batch): 1226 | latents_for_view = torch.cat( 1227 | [ 1228 | latents_[:, :, h::current_scale_num, w::current_scale_num] 1229 | for h, w in batch_view 1230 | ] 1231 | ) 1232 | latents_for_view_gaussian = torch.cat( 1233 | [ 1234 | latents_gaussian[:, :, h::current_scale_num, w::current_scale_num] 1235 | for h, w in batch_view 1236 | ] 1237 | ) 1238 | 1239 | vb_size = latents_for_view.size(0) 1240 | 1241 | # expand the latents if we are doing classifier free guidance 1242 | latent_model_input = latents_for_view_gaussian 1243 | latent_model_input = ( 1244 | latent_model_input.repeat_interleave(2, dim=0) 1245 | if do_classifier_free_guidance 1246 | else latent_model_input 1247 | ) 1248 | latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) 1249 | 1250 | prompt_embeds_input = torch.cat([prompt_embeds] * vb_size) 1251 | add_text_embeds_input = torch.cat([add_text_embeds] * vb_size) 1252 | add_time_ids_input = torch.cat([add_time_ids] * vb_size) 1253 | 1254 | # predict the noise residual 1255 | added_cond_kwargs = {"text_embeds": add_text_embeds_input, "time_ids": add_time_ids_input} 1256 | noise_pred = self.unet( 1257 | latent_model_input, 1258 | t, 1259 | encoder_hidden_states=prompt_embeds_input, 
1260 | cross_attention_kwargs=cross_attention_kwargs, 1261 | added_cond_kwargs=added_cond_kwargs, 1262 | return_dict=False, 1263 | )[0] 1264 | 1265 | if do_classifier_free_guidance: 1266 | noise_pred_uncond, noise_pred_text = noise_pred[::2], noise_pred[1::2] 1267 | noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) 1268 | 1269 | if do_classifier_free_guidance and guidance_rescale > 0.0: 1270 | # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf 1271 | noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=guidance_rescale) 1272 | 1273 | # compute the previous noisy sample x_t -> x_t-1 1274 | self.scheduler._init_step_index(t) 1275 | latents_denoised_batch = self.scheduler.step( 1276 | noise_pred, t, latents_for_view, **extra_step_kwargs, return_dict=False)[0] 1277 | 1278 | # extract value from batch 1279 | for latents_view_denoised, (h, w) in zip( 1280 | latents_denoised_batch.chunk(vb_size), batch_view 1281 | ): 1282 | value_global[:, :, h::current_scale_num, w::current_scale_num] += latents_view_denoised 1283 | count_global[:, :, h::current_scale_num, w::current_scale_num] += 1 1284 | 1285 | c2 = cosine_factor ** cosine_scale_2 1286 | 1287 | value_global = value_global[: ,:, h_pad:, w_pad:] 1288 | 1289 | value += value_global * c2 1290 | count += torch.ones_like(value_global) * c2 1291 | 1292 | ########################################################### 1293 | 1294 | latents = torch.where(count > 0, value / count, value) 1295 | 1296 | # call the callback, if provided 1297 | if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): 1298 | progress_bar.update() 1299 | if callback is not None and i % callback_steps == 0: 1300 | step_idx = i // getattr(self.scheduler, "order", 1) 1301 | callback(step_idx, t, latents) 1302 | 1303 | ######################################################################################################################################### 1304 | 1305 | latents = (latents - latents.mean()) / latents.std() * anchor_std + anchor_mean 1306 | if self.lowvram: 1307 | latents = latents.cpu() 1308 | torch.cuda.empty_cache() 1309 | if not output_type == "latent": 1310 | # make sure the VAE is in float32 mode, as it overflows in float16 1311 | needs_upcasting = self.vae.dtype == torch.float16 and self.vae.config.force_upcast 1312 | 1313 | if self.lowvram: 1314 | needs_upcasting = False # use madebyollin/sdxl-vae-fp16-fix in lowvram mode! 
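# Note: `needs_upcasting` normally forces the VAE into float32 before decoding because the
# stock SDXL VAE overflows in float16; in lowvram mode that upcast is skipped to save memory
# (hence the fp16-safe VAE recommendation above), and the UNet is moved back to the CPU below
# so that only the VAE occupies GPU memory while the latents of this phase are decoded.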
1315 | self.unet.cpu() 1316 | self.vae.to(device) 1317 | 1318 | if needs_upcasting: 1319 | self.upcast_vae() 1320 | latents = latents.to(next(iter(self.vae.post_quant_conv.parameters())).dtype) 1321 | 1322 | print("### Phase {} Decoding ###".format(current_scale_num)) 1323 | if multi_decoder: 1324 | image = self.tiled_decode(latents, current_height, current_width) 1325 | else: 1326 | image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0] 1327 | 1328 | # cast back to fp16 if needed 1329 | if needs_upcasting: 1330 | self.vae.to(dtype=torch.float16) 1331 | else: 1332 | image = latents 1333 | 1334 | if not output_type == "latent": 1335 | image = self.image_processor.postprocess(image, output_type=output_type) 1336 | if show_image: 1337 | plt.figure(figsize=(10, 10)) 1338 | plt.imshow(image[0]) 1339 | plt.axis('off') # Turn off axis numbers and ticks 1340 | plt.show() 1341 | output_images.append(image[0]) 1342 | 1343 | # Offload all models 1344 | self.maybe_free_model_hooks() 1345 | 1346 | return output_images 1347 | 1348 | # Overrride to properly handle the loading and unloading of the additional text encoder. 1349 | def load_lora_weights(self, pretrained_model_name_or_path_or_dict: Union[str, Dict[str, torch.Tensor]], **kwargs): 1350 | # We could have accessed the unet config from `lora_state_dict()` too. We pass 1351 | # it here explicitly to be able to tell that it's coming from an SDXL 1352 | # pipeline. 1353 | 1354 | # Remove any existing hooks. 1355 | if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"): 1356 | from accelerate.hooks import AlignDevicesHook, CpuOffload, remove_hook_from_module 1357 | else: 1358 | raise ImportError("Offloading requires `accelerate v0.17.0` or higher.") 1359 | 1360 | is_model_cpu_offload = False 1361 | is_sequential_cpu_offload = False 1362 | recursive = False 1363 | for _, component in self.components.items(): 1364 | if isinstance(component, torch.nn.Module): 1365 | if hasattr(component, "_hf_hook"): 1366 | is_model_cpu_offload = isinstance(getattr(component, "_hf_hook"), CpuOffload) 1367 | is_sequential_cpu_offload = isinstance(getattr(component, "_hf_hook"), AlignDevicesHook) 1368 | logger.info( 1369 | "Accelerate hooks detected. Since you have called `load_lora_weights()`, the previous hooks will be first removed. Then the LoRA parameters will be loaded and the hooks will be applied again." 1370 | ) 1371 | recursive = is_sequential_cpu_offload 1372 | remove_hook_from_module(component, recurse=recursive) 1373 | state_dict, network_alphas = self.lora_state_dict( 1374 | pretrained_model_name_or_path_or_dict, 1375 | unet_config=self.unet.config, 1376 | **kwargs, 1377 | ) 1378 | self.load_lora_into_unet(state_dict, network_alphas=network_alphas, unet=self.unet) 1379 | 1380 | text_encoder_state_dict = {k: v for k, v in state_dict.items() if "text_encoder." in k} 1381 | if len(text_encoder_state_dict) > 0: 1382 | self.load_lora_into_text_encoder( 1383 | text_encoder_state_dict, 1384 | network_alphas=network_alphas, 1385 | text_encoder=self.text_encoder, 1386 | prefix="text_encoder", 1387 | lora_scale=self.lora_scale, 1388 | ) 1389 | 1390 | text_encoder_2_state_dict = {k: v for k, v in state_dict.items() if "text_encoder_2." 
in k} 1391 | if len(text_encoder_2_state_dict) > 0: 1392 | self.load_lora_into_text_encoder( 1393 | text_encoder_2_state_dict, 1394 | network_alphas=network_alphas, 1395 | text_encoder=self.text_encoder_2, 1396 | prefix="text_encoder_2", 1397 | lora_scale=self.lora_scale, 1398 | ) 1399 | 1400 | # Offload back. 1401 | if is_model_cpu_offload: 1402 | self.enable_model_cpu_offload() 1403 | elif is_sequential_cpu_offload: 1404 | self.enable_sequential_cpu_offload() 1405 | 1406 | @classmethod 1407 | def save_lora_weights( 1408 | self, 1409 | save_directory: Union[str, os.PathLike], 1410 | unet_lora_layers: Dict[str, Union[torch.nn.Module, torch.Tensor]] = None, 1411 | text_encoder_lora_layers: Dict[str, Union[torch.nn.Module, torch.Tensor]] = None, 1412 | text_encoder_2_lora_layers: Dict[str, Union[torch.nn.Module, torch.Tensor]] = None, 1413 | is_main_process: bool = True, 1414 | weight_name: str = None, 1415 | save_function: Callable = None, 1416 | safe_serialization: bool = True, 1417 | ): 1418 | state_dict = {} 1419 | 1420 | def pack_weights(layers, prefix): 1421 | layers_weights = layers.state_dict() if isinstance(layers, torch.nn.Module) else layers 1422 | layers_state_dict = {f"{prefix}.{module_name}": param for module_name, param in layers_weights.items()} 1423 | return layers_state_dict 1424 | 1425 | if not (unet_lora_layers or text_encoder_lora_layers or text_encoder_2_lora_layers): 1426 | raise ValueError( 1427 | "You must pass at least one of `unet_lora_layers`, `text_encoder_lora_layers` or `text_encoder_2_lora_layers`." 1428 | ) 1429 | 1430 | if unet_lora_layers: 1431 | state_dict.update(pack_weights(unet_lora_layers, "unet")) 1432 | 1433 | if text_encoder_lora_layers and text_encoder_2_lora_layers: 1434 | state_dict.update(pack_weights(text_encoder_lora_layers, "text_encoder")) 1435 | state_dict.update(pack_weights(text_encoder_2_lora_layers, "text_encoder_2")) 1436 | 1437 | self.write_lora_layers( 1438 | state_dict=state_dict, 1439 | save_directory=save_directory, 1440 | is_main_process=is_main_process, 1441 | weight_name=weight_name, 1442 | save_function=save_function, 1443 | safe_serialization=safe_serialization, 1444 | ) 1445 | 1446 | def _remove_text_encoder_monkey_patch(self): 1447 | self._remove_text_encoder_monkey_patch_classmethod(self.text_encoder) 1448 | self._remove_text_encoder_monkey_patch_classmethod(self.text_encoder_2) 1449 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | diffusers~=0.21.4 2 | matplotlib 3 | --------------------------------------------------------------------------------
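For quick reference, below is a minimal usage sketch of the pipeline call whose parameters are documented in the docstring above. The import path and the class name DemoFusionSDXLPipeline are assumptions made for illustration (they follow the file name pipeline_demofusion_sdxl.py and are not confirmed elsewhere in this dump); the keyword arguments and their defaults mirror the docstring, and the prompt, resolution, and guidance values are just example choices.

import torch
from pipeline_demofusion_sdxl import DemoFusionSDXLPipeline  # assumed class name

# Load the SDXL base checkpoint referenced in the docstring above.
pipe = DemoFusionSDXLPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16
).to("cuda")

images = pipe(
    prompt="a highly detailed photo of a lighthouse on a cliff",
    negative_prompt="blurry, low quality",
    height=2048,                # multiples of the 1024 base resolution
    width=2048,
    num_inference_steps=50,
    guidance_scale=7.5,
    image_lr=None,              # or a low-resolution input image tensor to upscale
    view_batch_size=16,         # more views per UNet call -> faster, but more VRAM
    stride=64,                  # smaller stride reduces seams at extra cost
    multi_decoder=True,         # tiled VAE decoding, needed above roughly 3072x3072
    cosine_scale_1=3,           # skip-residual strength
    cosine_scale_2=1,           # dilated-sampling strength
    cosine_scale_3=1,           # Gaussian-filter strength
    sigma=1.0,
    show_image=False,
    lowvram=False,
)

# The call returns one image per phase; the last entry is the full-resolution result.
images[-1].save("demofusion_2048.png")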