├── Hidiffusion_node.py ├── LICENSE ├── README.md ├── __init__.py ├── example ├── controlnet_img2img1.png ├── img2img_lora1.png ├── lightingUnet1.png ├── new.json ├── new.png └── sd15ipstyle1.png ├── guided_filter.py ├── hidiffusion ├── __init__.py ├── hidiffusion.py ├── sd_module_key │ ├── sd15_module_key.txt │ └── sdxl_module_key.txt └── utils.py ├── ip_adapter ├── __init__.py ├── attention_processor.py ├── ip_adapter.py ├── resampler.py └── utils.py ├── model.yaml ├── pyproject.toml ├── sd15_config ├── feature_extractor │ └── preprocessor_config.json ├── model_index.json ├── safety_checker │ └── config.json ├── scheduler │ └── scheduler_config.json ├── text_encoder │ └── config.json ├── tokenizer │ ├── merges.txt │ ├── special_tokens_map.json │ ├── tokenizer_config.json │ └── vocab.json ├── unet │ └── config.json └── vae │ └── config.json ├── sdxl_config ├── model_index.json ├── scheduler │ └── scheduler_config.json ├── text_encoder │ └── config.json ├── text_encoder_2 │ └── config.json ├── tokenizer │ ├── merges.txt │ ├── special_tokens_map.json │ ├── tokenizer_config.json │ └── vocab.json ├── tokenizer_2 │ ├── merges.txt │ ├── special_tokens_map.json │ ├── tokenizer_config.json │ └── vocab.json ├── unet │ └── config.json ├── vae │ └── config.json ├── vae_1_0 │ └── config.json ├── vae_decoder │ └── config.json └── vae_encoder │ └── config.json └── weights ├── playground └── config.json ├── sd15 └── config.json ├── sd_xl_base.yaml └── sdxl └── config.json /Hidiffusion_node.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | # -*- coding: UTF-8 -*- 3 | import cv2 4 | import torch 5 | import os 6 | from PIL import Image 7 | import numpy as np 8 | from diffusers import (StableDiffusionXLPipeline, DiffusionPipeline, DDIMScheduler, ControlNetModel, 9 | KDPM2AncestralDiscreteScheduler, LMSDiscreteScheduler, 10 | AutoPipelineForInpainting, DPMSolverMultistepScheduler, DPMSolverSinglestepScheduler, 11 | EulerDiscreteScheduler, HeunDiscreteScheduler, UNet2DConditionModel, 12 | StableDiffusionXLImg2ImgPipeline, AutoPipelineForImage2Image, 13 | AutoPipelineForText2Image, StableDiffusionXLControlNetImg2ImgPipeline, KDPM2DiscreteScheduler, 14 | EulerAncestralDiscreteScheduler, UniPCMultistepScheduler, AutoencoderKL, 15 | StableDiffusionXLControlNetPipeline, DDPMScheduler, TCDScheduler, LCMScheduler, 16 | StableDiffusionPipeline, StableDiffusionControlNetPipeline, StableDiffusionXLInpaintPipeline) 17 | try: 18 | from diffusers.loaders.single_file_utils import load_single_file_checkpoint,infer_diffusers_model_type 19 | except: 20 | from diffusers.loaders.single_file_utils import load_single_file_model_checkpoint as load_single_file_checkpoint,infer_model_type as infer_diffusers_model_type 21 | from .hidiffusion.hidiffusion import apply_hidiffusion,remove_hidiffusion 22 | import folder_paths 23 | from safetensors.torch import load_file 24 | import yaml 25 | import diffusers 26 | import random 27 | from omegaconf import OmegaConf 28 | from comfy.model_management import cleanup_models 29 | from comfy.clip_vision import load as load_clip 30 | 31 | dif_version = str(diffusers.__version__) 32 | dif_version_int = int(dif_version.split(".")[1]) 33 | if dif_version_int >= 28: 34 | from diffusers.models.unets.unet_2d_condition import UNet2DConditionModel 35 | else: 36 | from diffusers.models.unet_2d_condition import UNet2DConditionModel 37 | from comfy.utils import common_upscale 38 | from .guided_filter import FastGuidedFilter 39 | from 
.ip_adapter import IPAdapterXL,IPAdapter 40 | 41 | dir_path = os.path.dirname(os.path.abspath(__file__)) 42 | path_dir = os.path.dirname(dir_path) 43 | file_path = os.path.dirname(path_dir) 44 | 45 | 46 | scheduler_list = ["DDIM", 47 | "Euler", 48 | "Euler a", 49 | "DDPM", 50 | "DPM++ 2M", 51 | "DPM++ 2M Karras", 52 | "DPM++ 2M SDE", 53 | "DPM++ 2M SDE Karras", 54 | "DPM++ SDE", 55 | "DPM++ SDE Karras", 56 | "DPM2", 57 | "DPM2 Karras", 58 | "DPM2 a", 59 | "DPM2 a Karras", 60 | "Heun", 61 | "LCM", 62 | "LMS", 63 | "LMS Karras", 64 | "UniPC", 65 | ] 66 | 67 | fs = open(os.path.join(dir_path, "model.yaml"), encoding="UTF-8") 68 | datas = yaml.load(fs, Loader=yaml.FullLoader) 69 | 70 | normal_model_list = datas["surport_model"] 71 | sdxl_lightning_list = datas["lightning_unet"] 72 | controlnet_suport = datas["surport_controlnet"] 73 | xl_model_support = datas["sdxl_model"] 74 | lightning_lora=datas["lightning_lora"] 75 | lightning_xl_lora=datas["lightning_xl_lora"] 76 | 77 | lcm_unet = ["dmd2_sdxl_4step_unet_fp16.bin", "dmd2_sdxl_1step_unet_fp16.bin", "lcm-sdxl-base-1.0.safetensors", 78 | "Hyper-SDXL-1step-Unet.safetensors"] 79 | 80 | def tensor_to_image(tensor): 81 | image_np = tensor.squeeze().mul(255).clamp(0, 255).byte().numpy() 82 | image = Image.fromarray(image_np, mode='RGB') 83 | return image 84 | 85 | def nomarl_upscale(img_tensor, width, height): 86 | samples = img_tensor.movedim(-1, 1) 87 | img = common_upscale(samples, width, height, "nearest-exact", "center") 88 | samples = img.movedim(1, -1) 89 | img_pil = tensor_to_image(samples) 90 | return img_pil 91 | 92 | def resize_image_control(control_image, resolution): 93 | HH, WW, _ = control_image.shape 94 | crop_h = random.randint(0, HH - resolution[1]) 95 | crop_w = random.randint(0, WW - resolution[0]) 96 | crop_image = control_image[crop_h:crop_h+resolution[1], crop_w:crop_w+resolution[0], :] 97 | return crop_image, crop_w, crop_h 98 | 99 | def apply_gaussian_blur(image_np, ksize=5, sigmaX=1.0): 100 | if ksize % 2 == 0: 101 | ksize += 1 # ksize must be odd 102 | blurred_image = cv2.GaussianBlur(image_np, (ksize, ksize), sigmaX=sigmaX) 103 | return blurred_image 104 | 105 | def apply_guided_filter(image_np, radius, eps, scale): 106 | filter = FastGuidedFilter(image_np, radius, eps, scale) 107 | return filter.filter(image_np) 108 | 109 | def input_size_adaptation_output(img_tensor,base_in, width, height): 110 | #basein=1024 111 | if width == height: 112 | img_pil = nomarl_upscale(img_tensor, base_in, base_in) # 2pil 113 | else: 114 | if min(1,width/ height)==1: #高 115 | r=height/base_in 116 | img_pil = nomarl_upscale(img_tensor, round(width/r), base_in) # 2pil 117 | else: #宽 118 | r=width/base_in 119 | img_pil = nomarl_upscale(img_tensor, base_in, round(height/r)) # 2pil 120 | return img_pil 121 | 122 | def get_sheduler(name): 123 | scheduler = False 124 | if name == "Euler": 125 | scheduler = EulerDiscreteScheduler() 126 | elif name == "Euler a": 127 | scheduler = EulerAncestralDiscreteScheduler() 128 | elif name == "DDIM": 129 | scheduler = DDIMScheduler() 130 | elif name == "DDPM": 131 | scheduler = DDPMScheduler() 132 | elif name == "DPM++ 2M": 133 | scheduler = DPMSolverMultistepScheduler() 134 | elif name == "DPM++ 2M Karras": 135 | scheduler = DPMSolverMultistepScheduler(use_karras_sigmas=True) 136 | elif name == "DPM++ 2M SDE": 137 | scheduler = DPMSolverMultistepScheduler(algorithm_type="sde-dpmsolver++") 138 | elif name == "DPM++ 2M SDE Karras": 139 | scheduler = DPMSolverMultistepScheduler(use_karras_sigmas=True, 
algorithm_type="sde-dpmsolver++") 140 | elif name == "DPM++ SDE": 141 | scheduler = DPMSolverSinglestepScheduler() 142 | elif name == "DPM++ SDE Karras": 143 | scheduler = DPMSolverSinglestepScheduler(use_karras_sigmas=True) 144 | elif name == "DPM2": 145 | scheduler = KDPM2DiscreteScheduler() 146 | elif name == "DPM2 Karras": 147 | scheduler = KDPM2DiscreteScheduler(use_karras_sigmas=True) 148 | elif name == "DPM2 a": 149 | scheduler = KDPM2AncestralDiscreteScheduler() 150 | elif name == "DPM2 a Karras": 151 | scheduler = KDPM2AncestralDiscreteScheduler(use_karras_sigmas=True) 152 | elif name == "Heun": 153 | scheduler = HeunDiscreteScheduler() 154 | elif name == "LCM": 155 | scheduler = LCMScheduler() 156 | elif name == "LMS": 157 | scheduler = LMSDiscreteScheduler() 158 | elif name == "LMS Karras": 159 | scheduler = LMSDiscreteScheduler(use_karras_sigmas=True) 160 | elif name == "UniPC": 161 | scheduler = UniPCMultistepScheduler() 162 | return scheduler 163 | 164 | 165 | class HI_Diffusers_Model_Loader: 166 | def __init__(self): 167 | pass 168 | 169 | @classmethod 170 | def INPUT_TYPES(cls): 171 | return { 172 | "required": { 173 | "function_choice": (["txt2img", "img2img", ],), 174 | "ckpt_name": (folder_paths.get_filename_list("checkpoints"),), 175 | "vae_id": (["none"] + folder_paths.get_filename_list("vae"),), 176 | "unet_model": (["none"] + folder_paths.get_filename_list("unet"),), 177 | "controlnet_model": (["none"] + folder_paths.get_filename_list("controlnet"),), 178 | "lora": (["none"] + folder_paths.get_filename_list("loras"),), 179 | "lora_scale": ("FLOAT", {"default": 0.8, "min": 0.1, "max": 1.0, "step": 0.1}), 180 | "trigger_words": ("STRING", {"default": "best quality"}), 181 | "scheduler": (scheduler_list,), 182 | "apply_window_attn":("BOOLEAN", {"default": False},), 183 | "ip_ckpt": (["none"] + folder_paths.get_filename_list("photomaker"),), 184 | "clip_vision": (["none"] + folder_paths.get_filename_list("clip_vision"),), 185 | 186 | } 187 | } 188 | 189 | 190 | RETURN_TYPES = ("HIDIF_MODEL", ) 191 | RETURN_NAMES = ("pipe", ) 192 | FUNCTION = "loader_models" 193 | CATEGORY = "Hidiffusion_Pro" 194 | 195 | def loader_models(self,function_choice, ckpt_name,vae_id,unet_model, controlnet_model, 196 | lora,lora_scale,trigger_words,scheduler,apply_window_attn,ip_ckpt,clip_vision): 197 | 198 | ckpt_path = folder_paths.get_full_path("checkpoints", ckpt_name) if ckpt_name!="none" else None 199 | sd_type="" 200 | if ckpt_path: 201 | sd = load_single_file_checkpoint(ckpt_path) 202 | try: 203 | sd_type = infer_diffusers_model_type(sd) 204 | del sd 205 | except: 206 | raise "diffuser need >0.27.2" 207 | 208 | 209 | vae_id=vae_id if vae_id!="none" else None 210 | controlnet_path=folder_paths.get_full_path("controlnet", controlnet_model) if controlnet_model!="none" else None 211 | unet_ckpt = folder_paths.get_full_path("unet", unet_model) if unet_model!="none" else None 212 | ip_ckpt = folder_paths.get_full_path("photomaker", ip_ckpt) if ip_ckpt != "none" else None 213 | clip_vision = folder_paths.get_full_path("clip_vision", clip_vision) if clip_vision != "none" else None 214 | 215 | scheduler_used = get_sheduler(scheduler) 216 | 217 | if sd_type == "v1" or sd_type == "v2": 218 | model_type="stable-diffusion-v1-5" 219 | model_config=os.path.join(dir_path,"sd15_config") 220 | original_config_file = os.path.join(folder_paths.models_dir, "configs", "v1-inference.yaml") 221 | if dif_version_int >= 28: 222 | model = StableDiffusionPipeline.from_single_file( 223 | ckpt_path, 
config=model_config,original_config=original_config_file, torch_dtype=torch.float16).to("cuda") 224 | else: 225 | model = StableDiffusionPipeline.from_single_file( 226 | ckpt_path,config=model_config, original_config_file=original_config_file, torch_dtype=torch.float16).to("cuda") 227 | 228 | elif sd_type =="playground-v2-5": 229 | model_type = "playground-v2-1024px-aesthetic" 230 | model_config ="playgroundai/playground-v2.5-1024px-aesthetic" 231 | model = StableDiffusionXLPipeline.from_single_file(ckpt_path, config=model_config,torch_dtype=torch.float16).to("cuda") 232 | 233 | elif sd_type == "xl_inpaint": 234 | model_type ="stable-diffusion-xl-1.0-inpainting-0.1" 235 | model_config = "diffusers/stable-diffusion-xl-1.0-inpainting-0.1" 236 | original_config_file = os.path.join(dir_path, "weights", "sd_xl_base.yaml") 237 | if dif_version_int >= 28: 238 | model = StableDiffusionXLInpaintPipeline.from_single_file(ckpt_path,config=model_config, 239 | original_config=original_config_file, 240 | torch_dtype=torch.float16, 241 | ) 242 | else: 243 | model = StableDiffusionXLInpaintPipeline.from_single_file(ckpt_path,config=model_config, 244 | original_config_file=original_config_file, 245 | torch_dtype=torch.float16, 246 | ) 247 | if unet_model in sdxl_lightning_list: 248 | if unet_model.rsplit('.', 1)[-1] == "bin": 249 | model.unet.load_state_dict(torch.load(unet_ckpt),strict=False,) 250 | else: 251 | model.unet.load_state_dict(load_file(unet_ckpt), strict=False, ) 252 | elif sd_type == "xl_base": 253 | model_type = "stable-diffusion-xl-base-1.0" 254 | model_config=os.path.join(dir_path,"sdxl_config") 255 | original_config_file = os.path.join(dir_path, "weights", "sd_xl_base.yaml") 256 | 257 | if dif_version_int >= 28: 258 | model = StableDiffusionXLPipeline.from_single_file( 259 | ckpt_path, config=model_config,original_config=original_config_file, torch_dtype=torch.float16) 260 | else: 261 | model = StableDiffusionXLPipeline.from_single_file( 262 | ckpt_path,config=model_config, original_config_file=original_config_file, torch_dtype=torch.float16) 263 | 264 | if controlnet_path: 265 | controlnet = ControlNetModel.from_unet(model.unet) 266 | cn_state_dict = load_file(controlnet_path) 267 | controlnet.load_state_dict(cn_state_dict, strict=False) 268 | controlnet.to(torch.float16) 269 | if function_choice == "img2img": 270 | model = StableDiffusionXLControlNetImg2ImgPipeline.from_pipe(model,controlnet=controlnet) 271 | else: 272 | model = StableDiffusionXLControlNetPipeline.from_pipe(model,controlnet=controlnet) 273 | 274 | if unet_model in sdxl_lightning_list: 275 | if unet_model.rsplit('.', 1)[-1] == "bin": 276 | model.unet.load_state_dict(torch.load(unet_ckpt), strict=False,) 277 | else: 278 | model.unet.load_state_dict(load_file(unet_ckpt), strict=False, ) 279 | else: 280 | raise ValueError("Unsupported model type; expected a SD1.5/SD2.x, SDXL, SDXL-inpaint or Playground v2.5 checkpoint")
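# For reference, the sd_type branching above is driven by diffusers' single-file helpers
# imported at the top of this file. A minimal standalone sketch of that detection step,
# for recent diffusers releases (older ones expose the same helpers under different names,
# as handled by the try/except at the top); the checkpoint path is a placeholder:
#
#   from diffusers.loaders.single_file_utils import (
#       load_single_file_checkpoint, infer_diffusers_model_type)
#
#   state_dict = load_single_file_checkpoint("/path/to/checkpoint.safetensors")
#   sd_type = infer_diffusers_model_type(state_dict)
#   # values handled by this loader: "v1", "v2", "xl_base", "xl_inpaint", "playground-v2-5";
#   # anything else ends up in the ValueError branch above.
#   del state_dict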
281 | if vae_id: 282 | vae_id = folder_paths.get_full_path("vae", vae_id) 283 | if sd_type == "xl_base" or sd_type == "xl_inpaint": 284 | vae_config=os.path.join(dir_path,"sdxl_config","vae") 285 | elif sd_type == "v1" or sd_type == "v2" : 286 | vae_config=os.path.join(dir_path, "sd15_config","vae") 287 | elif sd_type == "playground-v2-5" : 288 | vae_config=os.path.join(dir_path,"weights/playground") 289 | else: 290 | raise ValueError("No VAE config available for this model type") 291 | model.vae = AutoencoderKL.from_single_file(vae_id,config=vae_config, torch_dtype=torch.float16).to("cuda") 292 | if sd_type == "xl_inpaint": 293 | model.scheduler =scheduler_used.from_pretrained(os.path.join(dir_path,"sdxl_config"), subfolder="scheduler") 294 | else: 295 | model.scheduler = scheduler_used.from_config(model.scheduler.config, timestep_spacing="trailing") 296 | 297 | if lora!="none": 298 | lora_path = folder_paths.get_full_path("loras", lora) 299 | model.load_lora_weights(lora_path, adapter_name=trigger_words) 300 | model.fuse_lora(lora_scale=lora_scale, adapter_names=[trigger_words,]) 301 | 302 | model.enable_xformers_memory_efficient_attention() 303 | model.enable_vae_tiling() 304 | apply_hidiffusion(model,apply_window_attn=apply_window_attn,model_type_str=model_type) 305 | model.enable_model_cpu_offload() # must be called after apply_hidiffusion(model) 306 | ip_adapter = False 307 | if ip_ckpt is not None and clip_vision is not None: 308 | model.enable_freeu(s1=0.6, s2=0.4, b1=1.1, b2=1.2) 309 | device = "cuda" 310 | remove_hidiffusion(model) 311 | image_encoder = load_clip(clip_vision) 312 | if sd_type == "xl_base": 313 | config_path=os.path.join(dir_path,"weights","sdxl","config.json") 314 | image_encoder_config = OmegaConf.load(config_path) 315 | model = IPAdapterXL(model, image_encoder, ip_ckpt, device,image_encoder_config, 316 | target_blocks=["up_blocks.0.attentions.1"]) 317 | elif sd_type == "v1": 318 | config_path = os.path.join(dir_path, "weights", "sd15","config.json") 319 | image_encoder_config = OmegaConf.load(config_path) 320 | model = IPAdapter(model, image_encoder, ip_ckpt, device,image_encoder_config, target_blocks=["block"]) 321 | else: 322 | raise ValueError("Unsupported model for IP-Adapter: only SDXL and SD1.5 are supported") 323 | torch.cuda.empty_cache() 324 | ip_adapter=True 325 | 326 | torch.cuda.empty_cache() 327 | pipe={"model":model,"controlnet_path":controlnet_path,"sd_type":sd_type,"lora":lora,"trigger_words":trigger_words,"ip_adapter":ip_adapter,"function_choice":function_choice} 328 | torch.cuda.empty_cache() 329 | return (pipe,) 330 | 331 | 332 | class Hi_Sampler: 333 | def __init__(self): 334 | pass 335 | 336 | @classmethod 337 | def INPUT_TYPES(cls): 338 | return { 339 | "required": { 340 | "pipe": ("HIDIF_MODEL",), 341 | "prompt": ("STRING", {"multiline": True, 342 | "default": "a girl,8k,smile,best quality"}), 343 | "negative_prompt": ("STRING", {"multiline": True, 344 | "default": "text, watermark, lowres, low quality, worst quality, deformed, glitch, low contrast, noisy, saturation, blurry"}), 345 | "controlnet_scale": ("FLOAT", {"default": 0.5, "min": 0.1, "max": 1.0, "step": 0.1}), 346 | "clip_skip": ("INT", {"default": 1, "min": -5, "max": 100,"step": 1}), 347 | "pre_input": ("INT", {"default": 512, "min": 256, "max": 1024, "step": 64}), 348 | "seed": ("INT", {"default": 0, "min": 0, "max": 0xffffffffffffffff}), 349 | "steps": ("INT", {"default": 30, "min": 1, "max": 10000}), 350 | "cfg": ("FLOAT", {"default": 7.5, "min": 0.0, "max": 100.0, "step": 0.1, "round": 0.01}), 351 | "width": ("INT", {"default": 2048, "min": 64, "max": 8192,
"step": 64, "display": "number"}), 352 | "height": ("INT", {"default": 2048, "min": 64, "max": 8192, "step": 64, "display": "number"}), 353 | "adapter_scale": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 1.0, "step": 0.1,}), 354 | }, 355 | "optional": {"image": ("IMAGE",), 356 | "control_image": ("IMAGE",), 357 | "ip_image": ("IMAGE",)} 358 | } 359 | 360 | RETURN_TYPES = ("IMAGE",) 361 | RETURN_NAMES = ("image",) 362 | FUNCTION = "hi_sampler" 363 | CATEGORY = "Hidiffusion_Pro" 364 | 365 | 366 | def hi_sampler(self, pipe, prompt, negative_prompt,controlnet_scale,clip_skip,pre_input, 367 | seed,steps, cfg, width,height,adapter_scale,**kwargs): 368 | model=pipe.get("model",None) 369 | controlnet_path = pipe["controlnet_path"] 370 | sd_type = pipe["sd_type"] 371 | lora = pipe["lora"] 372 | trigger_words = pipe["trigger_words"] 373 | ip_adapter = pipe["ip_adapter"] 374 | function_choice =pipe["function_choice"] 375 | 376 | if ip_adapter: 377 | ip_image = kwargs.get("ip_image") 378 | #ip_image = input_size_adaptation_output(ip_image, pre_input, width, height) 379 | if lora != "none": 380 | prompt = prompt + " " + trigger_words 381 | if controlnet_path is None: 382 | if function_choice == "img2img": 383 | image = kwargs.get("image") 384 | image = input_size_adaptation_output(image, pre_input, width, height) 385 | images = \ 386 | model.generate(prompt=prompt, negative_prompt=negative_prompt,pil_image=ip_image, image=image, scale=adapter_scale,num_inference_steps=steps, 387 | guidance_scale=cfg, clip_skip=clip_skip, 388 | height=height, width=width, seed=seed, ) 389 | else: 390 | images = \ 391 | model.generate(pil_image=ip_image,prompt=prompt, negative_prompt=negative_prompt,scale=adapter_scale, num_inference_steps=steps, 392 | guidance_scale=cfg, clip_skip=clip_skip, 393 | height=height, width=width, seed=seed, ) 394 | 395 | else: 396 | control_image = kwargs.get("control_image") 397 | if "tile" in controlnet_path: 398 | control_image = input_size_adaptation_output(control_image, pre_input, width, height) 399 | controlnet_img = cv2.cvtColor(np.asarray(control_image), cv2.COLOR_RGB2BGR) 400 | new_height, new_width, _ = controlnet_img.shape 401 | ratio = np.sqrt(1024. * 1024. / (new_width * new_height)) 402 | W, H = int(new_width * ratio), int(new_height * ratio) 403 | 404 | crop_w, crop_h = 0, 0 405 | controlnet_img = cv2.resize(controlnet_img, (W, H)) 406 | 407 | blur_strength = random.sample([i / 10. for i in range(10, 201, 2)], k=1)[0] 408 | radius = random.sample([i for i in range(1, 40, 2)], k=1)[0] 409 | eps = random.sample([i / 1000. for i in range(1, 101, 2)], k=1)[0] 410 | scale_factor = random.sample([i / 10. 
for i in range(10, 181, 5)], k=1)[0] 411 | 412 | if random.random() > 0.5: 413 | controlnet_img = apply_gaussian_blur(controlnet_img, ksize=int(blur_strength), 414 | sigmaX=blur_strength / 2) 415 | 416 | if random.random() > 0.5: 417 | # Apply Guided Filter 418 | controlnet_img = apply_guided_filter(controlnet_img, radius, eps, scale_factor) 419 | 420 | # Resize image 421 | controlnet_img = cv2.resize(controlnet_img, (int(W / scale_factor), int(H / scale_factor)), 422 | interpolation=cv2.INTER_AREA) 423 | controlnet_img = cv2.resize(controlnet_img, (W, H), interpolation=cv2.INTER_CUBIC) 424 | 425 | controlnet_img = cv2.cvtColor(controlnet_img, cv2.COLOR_BGR2RGB) 426 | control_image = Image.fromarray(controlnet_img) 427 | else: 428 | control_image = input_size_adaptation_output(control_image, pre_input, width, height) 429 | if function_choice == "img2img": 430 | image = kwargs["image"] 431 | image = input_size_adaptation_output(image, pre_input, width, height) 432 | if sd_type == "xl_inpaint": 433 | images = model.generate(prompt=prompt, negative_prompt=negative_prompt, image=image,pil_image=ip_image, scale=adapter_scale,mask_image=control_image, 434 | num_inference_steps=steps, guidance_scale=cfg, height=height, clip_skip=clip_skip, 435 | width=width, controlnet_conditioning_scale=controlnet_scale, 436 | seed=seed, ) 437 | else: 438 | images = model.generate(prompt=prompt, negative_prompt=negative_prompt, image=image, pil_image=ip_image,scale=adapter_scale,control_image=control_image, 439 | num_inference_steps=steps, guidance_scale=cfg, height=height, width=width, 440 | clip_skip=clip_skip, 441 | controlnet_conditioning_scale=controlnet_scale, 442 | seed=seed, ) 443 | else: 444 | images = model.generate(prompt=prompt, negative_prompt=negative_prompt, pil_image=ip_image,scale=adapter_scale,control_image=control_image, 445 | num_inference_steps=steps, guidance_scale=cfg, height=height, width=width, 446 | clip_skip=clip_skip, 447 | controlnet_conditioning_scale=controlnet_scale, 448 | seed=seed, ) 449 | images = images[0] 450 | else: 451 | if lora != "none": 452 | prompt = prompt + " " + trigger_words 453 | # print(model_type, unet_model, control_net, function_choice) 454 | if controlnet_path is None: 455 | if function_choice == "img2img": 456 | image = kwargs["image"] 457 | image = input_size_adaptation_output(image, pre_input, width, height) 458 | images = \ 459 | model(prompt, negative_prompt=negative_prompt, image=image, num_inference_steps=steps, 460 | guidance_scale=cfg, clip_skip=clip_skip, 461 | height=height, width=width, seed=seed, ).images[0] 462 | else: 463 | images = \ 464 | model(prompt, negative_prompt=negative_prompt, num_inference_steps=steps, 465 | guidance_scale=cfg, clip_skip=clip_skip, 466 | height=height, width=width, seed=seed, ).images[0] 467 | else: 468 | control_image = kwargs["control_image"] 469 | if "tile" in controlnet_path: 470 | control_image = input_size_adaptation_output(control_image, pre_input, width, height) 471 | controlnet_img = cv2.cvtColor(np.asarray(control_image), cv2.COLOR_RGB2BGR) 472 | new_height, new_width, _ = controlnet_img.shape 473 | ratio = np.sqrt(1024. * 1024. / (new_width * new_height)) 474 | W, H = int(new_width * ratio), int(new_height * ratio) 475 | 476 | crop_w, crop_h = 0, 0 477 | controlnet_img = cv2.resize(controlnet_img, (W, H)) 478 | 479 | blur_strength = random.sample([i / 10. for i in range(10, 201, 2)], k=1)[0] 480 | radius = random.sample([i for i in range(1, 40, 2)], k=1)[0] 481 | eps = random.sample([i / 1000. 
for i in range(1, 101, 2)], k=1)[0] 482 | scale_factor = random.sample([i / 10. for i in range(10, 181, 5)], k=1)[0] 483 | 484 | if random.random() > 0.5: 485 | controlnet_img = apply_gaussian_blur(controlnet_img, ksize=int(blur_strength), 486 | sigmaX=blur_strength / 2) 487 | 488 | if random.random() > 0.5: 489 | # Apply Guided Filter 490 | controlnet_img = apply_guided_filter(controlnet_img, radius, eps, scale_factor) 491 | 492 | # Resize image 493 | controlnet_img = cv2.resize(controlnet_img, (int(W / scale_factor), int(H / scale_factor)), 494 | interpolation=cv2.INTER_AREA) 495 | controlnet_img = cv2.resize(controlnet_img, (W, H), interpolation=cv2.INTER_CUBIC) 496 | 497 | controlnet_img = cv2.cvtColor(controlnet_img, cv2.COLOR_BGR2RGB) 498 | control_image = Image.fromarray(controlnet_img) 499 | else: 500 | control_image = input_size_adaptation_output(control_image, pre_input, width, height) 501 | 502 | if function_choice == "img2img": 503 | image = kwargs["image"] 504 | image = input_size_adaptation_output(image, pre_input, width, height) 505 | if sd_type == "xl_inpaint": 506 | print("controlnet inpainting") 507 | images = \ 508 | model(prompt, negative_prompt=negative_prompt, image=image, mask_image=control_image, 509 | num_inference_steps=steps, guidance_scale=cfg, height=height, clip_skip=clip_skip, 510 | width=width, controlnet_conditioning_scale=controlnet_scale, 511 | seed=seed, ).images[0] 512 | else: 513 | print("controlnet img2img") 514 | images = model(prompt, negative_prompt=negative_prompt, image=image, control_image=control_image, 515 | num_inference_steps=steps, guidance_scale=cfg, height=height, width=width, 516 | clip_skip=clip_skip, 517 | controlnet_conditioning_scale=controlnet_scale, 518 | seed=seed, ).images[0] 519 | else: 520 | print("controlnet txt2img") 521 | images = model(prompt,control_image=control_image, negative_prompt=negative_prompt, 522 | num_inference_steps=steps, guidance_scale=cfg, height=height, width=width, 523 | clip_skip=clip_skip, 524 | controlnet_conditioning_scale=controlnet_scale, 525 | seed=seed, ).images[0] 526 | 527 | 528 | output_image = torch.from_numpy(np.array(images).astype(np.float32) / 255.0).unsqueeze(0) 529 | if lora != "none": 530 | if not ip_adapter: # only unfuse the LoRA when the pipeline is not wrapped by an IP-Adapter 531 | model.unfuse_lora() 532 | torch.cuda.empty_cache() 533 | return (output_image,) 534 | 535 | 536 | NODE_CLASS_MAPPINGS = { 537 | "HI_Diffusers_Model_Loader": HI_Diffusers_Model_Loader, 538 | "Hi_Sampler": Hi_Sampler 539 | } 540 | 541 | NODE_DISPLAY_NAME_MAPPINGS = { 542 | "HI_Diffusers_Model_Loader": "HI_Diffusers_Model_Loader", 543 | "Hi_Sampler": "Hi_Sampler" 544 | } 545 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity.
For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 
134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 
193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ComfyUI_HiDiffusion_Pro 2 | A HiDiffusion node for ComfyUI 3 | 4 | HiDiffusion From: [link](https://github.com/megvii-research/HiDiffusion) 5 | ---- 6 | 7 | Update 8 | ---- 9 | 10 | **09/08** 11 | * adapter style now uses a single model file. 12 | * adapter style改成单体文件模式 13 | 14 | 15 | **Previous updates** 16 | *修复runwaybug / 去掉repo加载模型的方式 /自动选择模型的类别 17 | * 增加adapter_style支持，SDXL需求的显存较大，虽然能跑CPU，但是不推荐，会爆显存，SD1.5测试没问题。 18 | * 增加 manne加速Lora 19 | * 加入controlnet-tile-sdxl的支持，内置图片预处理，默认512尺寸，新增apply_window_attn 条件控制。 20 | * 修复节点连接逻辑，现在文生图模式，无需接入image，无controlnet也无需接入control_image 21 | * 支持SDXL-lighting\Hyper\LCM\DMD2\的加速Unet，建议适当提高步数； 22 | * 基于官方的更新，加入lora支持，需要填关键词； 23 | * 加入skip，去掉意义不大的其他参数； 24 | 25 | * Fixed the runway bug / removed repo-based model loading / the model type is now detected automatically 26 | * Added adapter_style support. SDXL needs a lot of VRAM; it can run on CPU, but this is not recommended because it easily runs out of memory. SD1.5 works fine in testing. 27 | * Added the manne acceleration LoRA 28 | * Added controlnet-tile-sdxl support with built-in image preprocessing (default size 512) and a new apply_window_attn option. 29 | * Fixed the node connection logic: in txt2img mode there is no need to connect an image, and without a controlnet there is no need to connect a control_image. 30 | * Support accelerated UNets such as SDXL-Lightning, Hyper, LCM and DMD2;
it is recommended to increase the number of steps appropriately; 31 | * Following the official updates, LoRA support was added; the trigger words need to be filled in; 32 | * Added clip_skip and removed other parameters of little significance; 33 | 34 | 35 | 1.Installation 36 | ----- 37 | 1.1 In the ./ComfyUI/custom_nodes directory, run the following: 38 | 39 | ``` 40 | git clone https://github.com/smthemex/ComfyUI_HiDiffusion_Pro.git 41 | ``` 42 | 1.2 Use it. 43 | 44 | 2.requirements 45 | ---- 46 | diffusers >= 0.28.0 # 0.28.0 or newer works best 47 | pyyaml, omegaconf 48 | 49 | 3 About models 50 | ---- 51 | 3.1 base ckpt 52 | ``` 53 | ├──comfyUI/models/checkpoints/ 54 | | ├──sd1.5 or sd2.1 or sdxl or playground 55 | ├──comfyUI/models/vae/ 56 | | ├──any VAE that fits the ckpt 57 | ``` 58 | 3.2 if using an SDXL controlnet 59 | ``` 60 | ├──comfyUI/models/controlnet/ 61 | | ├──any SDXL controlnet 62 | ``` 63 | 3.3 if using a Lightning UNet 64 | ``` 65 | ├──comfyUI/models/unet/ 66 | | ├──any SDXL Lightning UNet 67 | ``` 68 | 3.4 if using adapter style 69 | 70 | ``` 71 | ├── ComfyUI/models/photomaker 72 | | ├── ip-adapter_sd15.bin 73 | | ├── ip-adapter_sdxl.bin 74 | ├── ComfyUI/models/clip_vision 75 | | ├── sdxl_model.safetensors # renamed from sdxl/encoder/model.safetensors 76 | | ├── sd15_model.safetensors # renamed from sd15/encoder/model.safetensors 77 | 78 | ``` 79 | 80 | 4 example 81 | ----- 82 | 83 | new workflow example 84 | ![](https://github.com/smthemex/ComfyUI_HiDiffusion_Pro/blob/main/example/new.png) 85 | 86 | sd1.5 using ip_adapter_style 使用ip_adapter_style 87 | ![](https://github.com/smthemex/ComfyUI_HiDiffusion_Pro/blob/main/example/sd15ipstyle1.png) 88 | 89 | img2img use lora 图生图和lora 90 | ![](https://github.com/smthemex/ComfyUI_HiDiffusion_Pro/blob/main/example/img2img_lora1.png) 91 | 92 | 93 | img2img + controlnet 图生图加controlnet 94 | ![](https://github.com/smthemex/ComfyUI_HiDiffusion_Pro/blob/main/example/controlnet_img2img1.png) 95 | 96 | img2img use Hyper unet 图生图加加速unet 97 | ![](https://github.com/smthemex/ComfyUI_HiDiffusion_Pro/blob/main/example/lightingUnet1.png) 98 | 99 | 5 Citation 100 | ------ 101 | 102 | ``` 103 | @article{zhang2023hidiffusion, 104 | title={HiDiffusion: Unlocking Higher-Resolution Creativity and Efficiency in Pretrained Diffusion Models}, 105 | author={Zhang, Shen and Chen, Zhaowei and Zhao, Zhenyu and Chen, Yuhao and Tang, Yao and Liang, Jiajun}, 106 | journal={arXiv preprint arXiv:2311.17528}, 107 | year={2023} 108 | } 109 | ``` -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | python = sys.executable 4 | 5 | from .Hidiffusion_node import NODE_CLASS_MAPPINGS, NODE_DISPLAY_NAME_MAPPINGS 6 | 7 | 8 | __all__ = ['NODE_CLASS_MAPPINGS', 'NODE_DISPLAY_NAME_MAPPINGS'] 9 | -------------------------------------------------------------------------------- /example/controlnet_img2img1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/smthemex/ComfyUI_HiDiffusion_Pro/ef7e9ed7594d79d64f9031b9fe132908f512133a/example/controlnet_img2img1.png -------------------------------------------------------------------------------- /example/img2img_lora1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/smthemex/ComfyUI_HiDiffusion_Pro/ef7e9ed7594d79d64f9031b9fe132908f512133a/example/img2img_lora1.png
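For readers who want a rough idea of what the loader and sampler nodes in the README above boil down to outside ComfyUI, here is a diffusers-only sketch of a plain SDXL text-to-image run. It mirrors the calls made in Hidiffusion_node.py (apply_hidiffusion with apply_window_attn and model_type_str, "trailing" timestep spacing, VAE tiling, CPU offload after patching); the checkpoint path is a placeholder, the import path assumes the bundled hidiffusion package is importable, and the whole snippet is an approximation rather than a drop-in replacement for the node.

```python
import torch
from diffusers import StableDiffusionXLPipeline, EulerDiscreteScheduler
from hidiffusion.hidiffusion import apply_hidiffusion, remove_hidiffusion

# Any SDXL single-file checkpoint (placeholder path).
pipe = StableDiffusionXLPipeline.from_single_file(
    "/path/to/sdxl_checkpoint.safetensors", torch_dtype=torch.float16)
pipe.scheduler = EulerDiscreteScheduler.from_config(
    pipe.scheduler.config, timestep_spacing="trailing")
pipe.enable_vae_tiling()

# Patch the UNet/attention for high-resolution generation, then offload.
apply_hidiffusion(pipe, apply_window_attn=False,
                  model_type_str="stable-diffusion-xl-base-1.0")
pipe.enable_model_cpu_offload()  # must come after apply_hidiffusion

image = pipe("a girl,8k,smile,best quality",
             negative_prompt="text, watermark, lowres, low quality, worst quality",
             num_inference_steps=30, guidance_scale=7.5,
             height=2048, width=2048).images[0]
image.save("hidiffusion_2048.png")

remove_hidiffusion(pipe)  # restore the vanilla pipeline afterwards if needed
```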
-------------------------------------------------------------------------------- /example/lightingUnet1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/smthemex/ComfyUI_HiDiffusion_Pro/ef7e9ed7594d79d64f9031b9fe132908f512133a/example/lightingUnet1.png -------------------------------------------------------------------------------- /example/new.json: -------------------------------------------------------------------------------- 1 | { 2 | "last_node_id": 9, 3 | "last_link_id": 10, 4 | "nodes": [ 5 | { 6 | "id": 3, 7 | "type": "SaveImage", 8 | "pos": [ 9 | 4311, 10 | -264 11 | ], 12 | "size": { 13 | "0": 367.2918701171875, 14 | "1": 381.46820068359375 15 | }, 16 | "flags": {}, 17 | "order": 3, 18 | "mode": 0, 19 | "inputs": [ 20 | { 21 | "name": "images", 22 | "type": "IMAGE", 23 | "link": 9, 24 | "label": "images" 25 | } 26 | ], 27 | "properties": {}, 28 | "widgets_values": [ 29 | "ComfyUI" 30 | ] 31 | }, 32 | { 33 | "id": 6, 34 | "type": "LoadImage", 35 | "pos": [ 36 | 3473, 37 | 10 38 | ], 39 | "size": { 40 | "0": 315, 41 | "1": 314 42 | }, 43 | "flags": {}, 44 | "order": 0, 45 | "mode": 0, 46 | "outputs": [ 47 | { 48 | "name": "IMAGE", 49 | "type": "IMAGE", 50 | "links": [ 51 | 10 52 | ], 53 | "shape": 3, 54 | "label": "IMAGE", 55 | "slot_index": 0 56 | }, 57 | { 58 | "name": "MASK", 59 | "type": "MASK", 60 | "links": null, 61 | "shape": 3, 62 | "label": "MASK" 63 | } 64 | ], 65 | "properties": { 66 | "Node name for S&R": "LoadImage" 67 | }, 68 | "widgets_values": [ 69 | "4.jpg", 70 | "image" 71 | ] 72 | }, 73 | { 74 | "id": 8, 75 | "type": "HI_Diffusers_Model_Loader", 76 | "pos": [ 77 | 3472, 78 | -365 79 | ], 80 | "size": { 81 | "0": 315, 82 | "1": 322 83 | }, 84 | "flags": {}, 85 | "order": 1, 86 | "mode": 0, 87 | "outputs": [ 88 | { 89 | "name": "pipe", 90 | "type": "HIDIF_MODEL", 91 | "links": [ 92 | 7 93 | ], 94 | "slot_index": 0, 95 | "shape": 3, 96 | "label": "pipe" 97 | } 98 | ], 99 | "properties": { 100 | "Node name for S&R": "HI_Diffusers_Model_Loader" 101 | }, 102 | "widgets_values": [ 103 | "img2img", 104 | "0SDXL\\juggernautXL_v8Rundiffusion.safetensors", 105 | "none", 106 | "sdxl_lightning_4step_unet.safetensors", 107 | "none", 108 | "none", 109 | 0.8, 110 | "best quality", 111 | "Euler", 112 | false, 113 | "none", 114 | "none" 115 | ] 116 | }, 117 | { 118 | "id": 9, 119 | "type": "Hi_Sampler", 120 | "pos": [ 121 | 3875, 122 | -292 123 | ], 124 | "size": { 125 | "0": 400, 126 | "1": 426 127 | }, 128 | "flags": {}, 129 | "order": 2, 130 | "mode": 0, 131 | "inputs": [ 132 | { 133 | "name": "pipe", 134 | "type": "HIDIF_MODEL", 135 | "link": 7, 136 | "label": "pipe" 137 | }, 138 | { 139 | "name": "image", 140 | "type": "IMAGE", 141 | "link": 10, 142 | "label": "image" 143 | }, 144 | { 145 | "name": "control_image", 146 | "type": "IMAGE", 147 | "link": null, 148 | "label": "control_image" 149 | }, 150 | { 151 | "name": "ip_image", 152 | "type": "IMAGE", 153 | "link": null, 154 | "label": "ip_image" 155 | } 156 | ], 157 | "outputs": [ 158 | { 159 | "name": "image", 160 | "type": "IMAGE", 161 | "links": [ 162 | 9 163 | ], 164 | "slot_index": 0, 165 | "shape": 3, 166 | "label": "image" 167 | } 168 | ], 169 | "properties": { 170 | "Node name for S&R": "Hi_Sampler" 171 | }, 172 | "widgets_values": [ 173 | "a girl,8k,smile,best quality", 174 | "text, watermark, lowres, low quality, worst quality, deformed, glitch, low contrast, noisy, saturation, blurry", 175 | 0.5, 176 | 1, 177 | 512, 178 | 1108821181920656, 
179 | "randomize", 180 | 12, 181 | 3, 182 | 2048, 183 | 2048, 184 | 1 185 | ] 186 | } 187 | ], 188 | "links": [ 189 | [ 190 | 7, 191 | 8, 192 | 0, 193 | 9, 194 | 0, 195 | "HIDIF_MODEL" 196 | ], 197 | [ 198 | 9, 199 | 9, 200 | 0, 201 | 3, 202 | 0, 203 | "IMAGE" 204 | ], 205 | [ 206 | 10, 207 | 6, 208 | 0, 209 | 9, 210 | 1, 211 | "IMAGE" 212 | ] 213 | ], 214 | "groups": [], 215 | "config": {}, 216 | "extra": { 217 | "ds": { 218 | "scale": 1.0610764609500176, 219 | "offset": [ 220 | -3283.5070365830798, 221 | 461.2881609587324 222 | ] 223 | } 224 | }, 225 | "version": 0.4 226 | } -------------------------------------------------------------------------------- /example/new.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/smthemex/ComfyUI_HiDiffusion_Pro/ef7e9ed7594d79d64f9031b9fe132908f512133a/example/new.png -------------------------------------------------------------------------------- /example/sd15ipstyle1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/smthemex/ComfyUI_HiDiffusion_Pro/ef7e9ed7594d79d64f9031b9fe132908f512133a/example/sd15ipstyle1.png -------------------------------------------------------------------------------- /guided_filter.py: -------------------------------------------------------------------------------- 1 | 2 | # -*- coding: utf-8 -*- 3 | ## @package guided_filter.core.filters 4 | # 5 | # Implementation of guided filter. 6 | # * GuidedFilter: Original guided filter. 7 | # * FastGuidedFilter: Fast version of the guided filter. 8 | # @author tody 9 | # @date 2015/08/26 10 | 11 | import numpy as np 12 | import cv2 13 | 14 | ## Convert image into float32 type. 15 | def to32F(img): 16 | if img.dtype == np.float32: 17 | return img 18 | return (1.0 / 255.0) * np.float32(img) 19 | 20 | ## Convert image into uint8 type. 21 | def to8U(img): 22 | if img.dtype == np.uint8: 23 | return img 24 | return np.clip(np.uint8(255.0 * img), 0, 255) 25 | 26 | ## Return if the input image is gray or not. 27 | def _isGray(I): 28 | return len(I.shape) == 2 29 | 30 | 31 | ## Return down sampled image. 32 | # @param scale (w/s, h/s) image will be created. 33 | # @param shape I.shape[:2]=(h, w). numpy friendly size parameter. 34 | def _downSample(I, scale=4, shape=None): 35 | if shape is not None: 36 | h, w = shape 37 | return cv2.resize(I, (w, h), interpolation=cv2.INTER_NEAREST) 38 | 39 | h, w = I.shape[:2] 40 | return cv2.resize(I, (int(w / scale), int(h / scale)), interpolation=cv2.INTER_NEAREST) 41 | 42 | 43 | ## Return up sampled image. 44 | # @param scale (w*s, h*s) image will be created. 45 | # @param shape I.shape[:2]=(h, w). numpy friendly size parameter. 46 | def _upSample(I, scale=2, shape=None): 47 | if shape is not None: 48 | h, w = shape 49 | return cv2.resize(I, (w, h), interpolation=cv2.INTER_LINEAR) 50 | 51 | h, w = I.shape[:2] 52 | return cv2.resize(I, (int(w * scale), int(h * scale)), interpolation=cv2.INTER_LINEAR) 53 | 54 | ## Fast guide filter. 55 | class FastGuidedFilter: 56 | ## Constructor. 57 | # @param I Input guidance image. Color or gray. 58 | # @param radius Radius of Guided Filter. 59 | # @param epsilon Regularization term of Guided Filter. 60 | # @param scale Down sampled scale. 
61 | def __init__(self, I, radius=5, epsilon=0.4, scale=4): 62 | I_32F = to32F(I) 63 | self._I = I_32F 64 | h, w = I.shape[:2] 65 | 66 | I_sub = _downSample(I_32F, scale) 67 | 68 | self._I_sub = I_sub 69 | radius = int(radius / scale) 70 | 71 | if _isGray(I): 72 | self._guided_filter = GuidedFilterGray(I_sub, radius, epsilon) 73 | else: 74 | self._guided_filter = GuidedFilterColor(I_sub, radius, epsilon) 75 | 76 | ## Apply filter for the input image. 77 | # @param p Input image for the filtering. 78 | def filter(self, p): 79 | p_32F = to32F(p) 80 | shape_original = p.shape[:2] 81 | 82 | p_sub = _downSample(p_32F, shape=self._I_sub.shape[:2]) 83 | 84 | if _isGray(p_sub): 85 | return self._filterGray(p_sub, shape_original) 86 | 87 | cs = p.shape[2] 88 | q = np.array(p_32F) 89 | 90 | for ci in range(cs): 91 | q[:, :, ci] = self._filterGray(p_sub[:, :, ci], shape_original) 92 | return to8U(q) 93 | 94 | def _filterGray(self, p_sub, shape_original): 95 | ab_sub = self._guided_filter._computeCoefficients(p_sub) 96 | ab = [_upSample(abi, shape=shape_original) for abi in ab_sub] 97 | return self._guided_filter._computeOutput(ab, self._I) 98 | 99 | 100 | ## Guide filter. 101 | class GuidedFilter: 102 | ## Constructor. 103 | # @param I Input guidance image. Color or gray. 104 | # @param radius Radius of Guided Filter. 105 | # @param epsilon Regularization term of Guided Filter. 106 | def __init__(self, I, radius=5, epsilon=0.4): 107 | I_32F = to32F(I) 108 | 109 | if _isGray(I): 110 | self._guided_filter = GuidedFilterGray(I_32F, radius, epsilon) 111 | else: 112 | self._guided_filter = GuidedFilterColor(I_32F, radius, epsilon) 113 | 114 | ## Apply filter for the input image. 115 | # @param p Input image for the filtering. 116 | def filter(self, p): 117 | return to8U(self._guided_filter.filter(p)) 118 | 119 | 120 | ## Common parts of guided filter. 121 | # 122 | # This class is used by guided_filter class. GuidedFilterGray and GuidedFilterColor. 123 | # Based on guided_filter._computeCoefficients, guided_filter._computeOutput, 124 | # GuidedFilterCommon.filter computes filtered image for color and gray. 125 | class GuidedFilterCommon: 126 | def __init__(self, guided_filter): 127 | self._guided_filter = guided_filter 128 | 129 | ## Apply filter for the input image. 130 | # @param p Input image for the filtering. 131 | def filter(self, p): 132 | p_32F = to32F(p) 133 | if _isGray(p_32F): 134 | return self._filterGray(p_32F) 135 | 136 | cs = p.shape[2] 137 | q = np.array(p_32F) 138 | 139 | for ci in range(cs): 140 | q[:, :, ci] = self._filterGray(p_32F[:, :, ci]) 141 | return q 142 | 143 | def _filterGray(self, p): 144 | ab = self._guided_filter._computeCoefficients(p) 145 | return self._guided_filter._computeOutput(ab, self._guided_filter._I) 146 | 147 | 148 | ## Guided filter for gray guidance image. 149 | class GuidedFilterGray: 150 | # @param I Input gray guidance image. 151 | # @param radius Radius of Guided Filter. 152 | # @param epsilon Regularization term of Guided Filter. 153 | def __init__(self, I, radius=5, epsilon=0.4): 154 | self._radius = 2 * radius + 1 155 | self._epsilon = epsilon 156 | self._I = to32F(I) 157 | self._initFilter() 158 | self._filter_common = GuidedFilterCommon(self) 159 | 160 | ## Apply filter for the input image. 161 | # @param p Input image for the filtering. 
162 | def filter(self, p): 163 | return self._filter_common.filter(p) 164 | 165 | def _initFilter(self): 166 | I = self._I 167 | r = self._radius 168 | self._I_mean = cv2.blur(I, (r, r)) 169 | I_mean_sq = cv2.blur(I ** 2, (r, r)) 170 | self._I_var = I_mean_sq - self._I_mean ** 2 171 | 172 | def _computeCoefficients(self, p): 173 | r = self._radius 174 | p_mean = cv2.blur(p, (r, r)) 175 | p_cov = p_mean - self._I_mean * p_mean 176 | a = p_cov / (self._I_var + self._epsilon) 177 | b = p_mean - a * self._I_mean 178 | a_mean = cv2.blur(a, (r, r)) 179 | b_mean = cv2.blur(b, (r, r)) 180 | return a_mean, b_mean 181 | 182 | def _computeOutput(self, ab, I): 183 | a_mean, b_mean = ab 184 | return a_mean * I + b_mean 185 | 186 | 187 | ## Guided filter for color guidance image. 188 | class GuidedFilterColor: 189 | # @param I Input color guidance image. 190 | # @param radius Radius of Guided Filter. 191 | # @param epsilon Regularization term of Guided Filter. 192 | def __init__(self, I, radius=5, epsilon=0.2): 193 | self._radius = 2 * radius + 1 194 | self._epsilon = epsilon 195 | self._I = to32F(I) 196 | self._initFilter() 197 | self._filter_common = GuidedFilterCommon(self) 198 | 199 | ## Apply filter for the input image. 200 | # @param p Input image for the filtering. 201 | def filter(self, p): 202 | return self._filter_common.filter(p) 203 | 204 | def _initFilter(self): 205 | I = self._I 206 | r = self._radius 207 | eps = self._epsilon 208 | 209 | Ir, Ig, Ib = I[:, :, 0], I[:, :, 1], I[:, :, 2] 210 | 211 | self._Ir_mean = cv2.blur(Ir, (r, r)) 212 | self._Ig_mean = cv2.blur(Ig, (r, r)) 213 | self._Ib_mean = cv2.blur(Ib, (r, r)) 214 | 215 | Irr_var = cv2.blur(Ir ** 2, (r, r)) - self._Ir_mean ** 2 + eps 216 | Irg_var = cv2.blur(Ir * Ig, (r, r)) - self._Ir_mean * self._Ig_mean 217 | Irb_var = cv2.blur(Ir * Ib, (r, r)) - self._Ir_mean * self._Ib_mean 218 | Igg_var = cv2.blur(Ig * Ig, (r, r)) - self._Ig_mean * self._Ig_mean + eps 219 | Igb_var = cv2.blur(Ig * Ib, (r, r)) - self._Ig_mean * self._Ib_mean 220 | Ibb_var = cv2.blur(Ib * Ib, (r, r)) - self._Ib_mean * self._Ib_mean + eps 221 | 222 | Irr_inv = Igg_var * Ibb_var - Igb_var * Igb_var 223 | Irg_inv = Igb_var * Irb_var - Irg_var * Ibb_var 224 | Irb_inv = Irg_var * Igb_var - Igg_var * Irb_var 225 | Igg_inv = Irr_var * Ibb_var - Irb_var * Irb_var 226 | Igb_inv = Irb_var * Irg_var - Irr_var * Igb_var 227 | Ibb_inv = Irr_var * Igg_var - Irg_var * Irg_var 228 | 229 | I_cov = Irr_inv * Irr_var + Irg_inv * Irg_var + Irb_inv * Irb_var 230 | Irr_inv /= I_cov 231 | Irg_inv /= I_cov 232 | Irb_inv /= I_cov 233 | Igg_inv /= I_cov 234 | Igb_inv /= I_cov 235 | Ibb_inv /= I_cov 236 | 237 | self._Irr_inv = Irr_inv 238 | self._Irg_inv = Irg_inv 239 | self._Irb_inv = Irb_inv 240 | self._Igg_inv = Igg_inv 241 | self._Igb_inv = Igb_inv 242 | self._Ibb_inv = Ibb_inv 243 | 244 | def _computeCoefficients(self, p): 245 | r = self._radius 246 | I = self._I 247 | Ir, Ig, Ib = I[:, :, 0], I[:, :, 1], I[:, :, 2] 248 | 249 | p_mean = cv2.blur(p, (r, r)) 250 | 251 | Ipr_mean = cv2.blur(Ir * p, (r, r)) 252 | Ipg_mean = cv2.blur(Ig * p, (r, r)) 253 | Ipb_mean = cv2.blur(Ib * p, (r, r)) 254 | 255 | Ipr_cov = Ipr_mean - self._Ir_mean * p_mean 256 | Ipg_cov = Ipg_mean - self._Ig_mean * p_mean 257 | Ipb_cov = Ipb_mean - self._Ib_mean * p_mean 258 | 259 | ar = self._Irr_inv * Ipr_cov + self._Irg_inv * Ipg_cov + self._Irb_inv * Ipb_cov 260 | ag = self._Irg_inv * Ipr_cov + self._Igg_inv * Ipg_cov + self._Igb_inv * Ipb_cov 261 | ab = self._Irb_inv * Ipr_cov + self._Igb_inv * Ipg_cov + 
self._Ibb_inv * Ipb_cov 262 | b = p_mean - ar * self._Ir_mean - ag * self._Ig_mean - ab * self._Ib_mean 263 | 264 | ar_mean = cv2.blur(ar, (r, r)) 265 | ag_mean = cv2.blur(ag, (r, r)) 266 | ab_mean = cv2.blur(ab, (r, r)) 267 | b_mean = cv2.blur(b, (r, r)) 268 | 269 | return ar_mean, ag_mean, ab_mean, b_mean 270 | 271 | def _computeOutput(self, ab, I): 272 | ar_mean, ag_mean, ab_mean, b_mean = ab 273 | 274 | Ir, Ig, Ib = I[:, :, 0], I[:, :, 1], I[:, :, 2] 275 | 276 | q = (ar_mean * Ir + 277 | ag_mean * Ig + 278 | ab_mean * Ib + 279 | b_mean) 280 | 281 | return q -------------------------------------------------------------------------------- /hidiffusion/__init__.py: -------------------------------------------------------------------------------- 1 | from .hidiffusion import apply_hidiffusion, remove_hidiffusion 2 | 3 | __all__ = ["apply_hidiffusion", "remove_hidiffusion"] 4 | -------------------------------------------------------------------------------- /hidiffusion/sd_module_key/sd15_module_key.txt: -------------------------------------------------------------------------------- 1 | conv_in 2 | time_proj 3 | time_embedding 4 | time_embedding.linear_1 5 | time_embedding.act 6 | time_embedding.linear_2 7 | down_blocks 8 | down_blocks.0 9 | down_blocks.0.attentions 10 | down_blocks.0.attentions.0 11 | down_blocks.0.attentions.0.norm 12 | down_blocks.0.attentions.0.proj_in 13 | down_blocks.0.attentions.0.transformer_blocks 14 | down_blocks.0.attentions.0.transformer_blocks.0 15 | down_blocks.0.attentions.0.transformer_blocks.0.norm1 16 | down_blocks.0.attentions.0.transformer_blocks.0.attn1 17 | down_blocks.0.attentions.0.transformer_blocks.0.attn1.to_q 18 | down_blocks.0.attentions.0.transformer_blocks.0.attn1.to_k 19 | down_blocks.0.attentions.0.transformer_blocks.0.attn1.to_v 20 | down_blocks.0.attentions.0.transformer_blocks.0.attn1.to_out 21 | down_blocks.0.attentions.0.transformer_blocks.0.attn1.to_out.0 22 | down_blocks.0.attentions.0.transformer_blocks.0.attn1.to_out.1 23 | down_blocks.0.attentions.0.transformer_blocks.0.norm2 24 | down_blocks.0.attentions.0.transformer_blocks.0.attn2 25 | down_blocks.0.attentions.0.transformer_blocks.0.attn2.to_q 26 | down_blocks.0.attentions.0.transformer_blocks.0.attn2.to_k 27 | down_blocks.0.attentions.0.transformer_blocks.0.attn2.to_v 28 | down_blocks.0.attentions.0.transformer_blocks.0.attn2.to_out 29 | down_blocks.0.attentions.0.transformer_blocks.0.attn2.to_out.0 30 | down_blocks.0.attentions.0.transformer_blocks.0.attn2.to_out.1 31 | down_blocks.0.attentions.0.transformer_blocks.0.norm3 32 | down_blocks.0.attentions.0.transformer_blocks.0.ff 33 | down_blocks.0.attentions.0.transformer_blocks.0.ff.net 34 | down_blocks.0.attentions.0.transformer_blocks.0.ff.net.0 35 | down_blocks.0.attentions.0.transformer_blocks.0.ff.net.0.proj 36 | down_blocks.0.attentions.0.transformer_blocks.0.ff.net.1 37 | down_blocks.0.attentions.0.transformer_blocks.0.ff.net.2 38 | down_blocks.0.attentions.0.proj_out 39 | down_blocks.0.attentions.1 40 | down_blocks.0.attentions.1.norm 41 | down_blocks.0.attentions.1.proj_in 42 | down_blocks.0.attentions.1.transformer_blocks 43 | down_blocks.0.attentions.1.transformer_blocks.0 44 | down_blocks.0.attentions.1.transformer_blocks.0.norm1 45 | down_blocks.0.attentions.1.transformer_blocks.0.attn1 46 | down_blocks.0.attentions.1.transformer_blocks.0.attn1.to_q 47 | down_blocks.0.attentions.1.transformer_blocks.0.attn1.to_k 48 | down_blocks.0.attentions.1.transformer_blocks.0.attn1.to_v 49 | 
down_blocks.0.attentions.1.transformer_blocks.0.attn1.to_out 50 | down_blocks.0.attentions.1.transformer_blocks.0.attn1.to_out.0 51 | down_blocks.0.attentions.1.transformer_blocks.0.attn1.to_out.1 52 | down_blocks.0.attentions.1.transformer_blocks.0.norm2 53 | down_blocks.0.attentions.1.transformer_blocks.0.attn2 54 | down_blocks.0.attentions.1.transformer_blocks.0.attn2.to_q 55 | down_blocks.0.attentions.1.transformer_blocks.0.attn2.to_k 56 | down_blocks.0.attentions.1.transformer_blocks.0.attn2.to_v 57 | down_blocks.0.attentions.1.transformer_blocks.0.attn2.to_out 58 | down_blocks.0.attentions.1.transformer_blocks.0.attn2.to_out.0 59 | down_blocks.0.attentions.1.transformer_blocks.0.attn2.to_out.1 60 | down_blocks.0.attentions.1.transformer_blocks.0.norm3 61 | down_blocks.0.attentions.1.transformer_blocks.0.ff 62 | down_blocks.0.attentions.1.transformer_blocks.0.ff.net 63 | down_blocks.0.attentions.1.transformer_blocks.0.ff.net.0 64 | down_blocks.0.attentions.1.transformer_blocks.0.ff.net.0.proj 65 | down_blocks.0.attentions.1.transformer_blocks.0.ff.net.1 66 | down_blocks.0.attentions.1.transformer_blocks.0.ff.net.2 67 | down_blocks.0.attentions.1.proj_out 68 | down_blocks.0.resnets 69 | down_blocks.0.resnets.0 70 | down_blocks.0.resnets.0.norm1 71 | down_blocks.0.resnets.0.conv1 72 | down_blocks.0.resnets.0.time_emb_proj 73 | down_blocks.0.resnets.0.norm2 74 | down_blocks.0.resnets.0.dropout 75 | down_blocks.0.resnets.0.conv2 76 | down_blocks.0.resnets.1 77 | down_blocks.0.resnets.1.norm1 78 | down_blocks.0.resnets.1.conv1 79 | down_blocks.0.resnets.1.time_emb_proj 80 | down_blocks.0.resnets.1.norm2 81 | down_blocks.0.resnets.1.dropout 82 | down_blocks.0.resnets.1.conv2 83 | down_blocks.0.downsamplers 84 | down_blocks.0.downsamplers.0 85 | down_blocks.0.downsamplers.0.conv 86 | down_blocks.1 87 | down_blocks.1.attentions 88 | down_blocks.1.attentions.0 89 | down_blocks.1.attentions.0.norm 90 | down_blocks.1.attentions.0.proj_in 91 | down_blocks.1.attentions.0.transformer_blocks 92 | down_blocks.1.attentions.0.transformer_blocks.0 93 | down_blocks.1.attentions.0.transformer_blocks.0.norm1 94 | down_blocks.1.attentions.0.transformer_blocks.0.attn1 95 | down_blocks.1.attentions.0.transformer_blocks.0.attn1.to_q 96 | down_blocks.1.attentions.0.transformer_blocks.0.attn1.to_k 97 | down_blocks.1.attentions.0.transformer_blocks.0.attn1.to_v 98 | down_blocks.1.attentions.0.transformer_blocks.0.attn1.to_out 99 | down_blocks.1.attentions.0.transformer_blocks.0.attn1.to_out.0 100 | down_blocks.1.attentions.0.transformer_blocks.0.attn1.to_out.1 101 | down_blocks.1.attentions.0.transformer_blocks.0.norm2 102 | down_blocks.1.attentions.0.transformer_blocks.0.attn2 103 | down_blocks.1.attentions.0.transformer_blocks.0.attn2.to_q 104 | down_blocks.1.attentions.0.transformer_blocks.0.attn2.to_k 105 | down_blocks.1.attentions.0.transformer_blocks.0.attn2.to_v 106 | down_blocks.1.attentions.0.transformer_blocks.0.attn2.to_out 107 | down_blocks.1.attentions.0.transformer_blocks.0.attn2.to_out.0 108 | down_blocks.1.attentions.0.transformer_blocks.0.attn2.to_out.1 109 | down_blocks.1.attentions.0.transformer_blocks.0.norm3 110 | down_blocks.1.attentions.0.transformer_blocks.0.ff 111 | down_blocks.1.attentions.0.transformer_blocks.0.ff.net 112 | down_blocks.1.attentions.0.transformer_blocks.0.ff.net.0 113 | down_blocks.1.attentions.0.transformer_blocks.0.ff.net.0.proj 114 | down_blocks.1.attentions.0.transformer_blocks.0.ff.net.1 115 | down_blocks.1.attentions.0.transformer_blocks.0.ff.net.2 116 | 
down_blocks.1.attentions.0.proj_out 117 | down_blocks.1.attentions.1 118 | down_blocks.1.attentions.1.norm 119 | down_blocks.1.attentions.1.proj_in 120 | down_blocks.1.attentions.1.transformer_blocks 121 | down_blocks.1.attentions.1.transformer_blocks.0 122 | down_blocks.1.attentions.1.transformer_blocks.0.norm1 123 | down_blocks.1.attentions.1.transformer_blocks.0.attn1 124 | down_blocks.1.attentions.1.transformer_blocks.0.attn1.to_q 125 | down_blocks.1.attentions.1.transformer_blocks.0.attn1.to_k 126 | down_blocks.1.attentions.1.transformer_blocks.0.attn1.to_v 127 | down_blocks.1.attentions.1.transformer_blocks.0.attn1.to_out 128 | down_blocks.1.attentions.1.transformer_blocks.0.attn1.to_out.0 129 | down_blocks.1.attentions.1.transformer_blocks.0.attn1.to_out.1 130 | down_blocks.1.attentions.1.transformer_blocks.0.norm2 131 | down_blocks.1.attentions.1.transformer_blocks.0.attn2 132 | down_blocks.1.attentions.1.transformer_blocks.0.attn2.to_q 133 | down_blocks.1.attentions.1.transformer_blocks.0.attn2.to_k 134 | down_blocks.1.attentions.1.transformer_blocks.0.attn2.to_v 135 | down_blocks.1.attentions.1.transformer_blocks.0.attn2.to_out 136 | down_blocks.1.attentions.1.transformer_blocks.0.attn2.to_out.0 137 | down_blocks.1.attentions.1.transformer_blocks.0.attn2.to_out.1 138 | down_blocks.1.attentions.1.transformer_blocks.0.norm3 139 | down_blocks.1.attentions.1.transformer_blocks.0.ff 140 | down_blocks.1.attentions.1.transformer_blocks.0.ff.net 141 | down_blocks.1.attentions.1.transformer_blocks.0.ff.net.0 142 | down_blocks.1.attentions.1.transformer_blocks.0.ff.net.0.proj 143 | down_blocks.1.attentions.1.transformer_blocks.0.ff.net.1 144 | down_blocks.1.attentions.1.transformer_blocks.0.ff.net.2 145 | down_blocks.1.attentions.1.proj_out 146 | down_blocks.1.resnets 147 | down_blocks.1.resnets.0 148 | down_blocks.1.resnets.0.norm1 149 | down_blocks.1.resnets.0.conv1 150 | down_blocks.1.resnets.0.time_emb_proj 151 | down_blocks.1.resnets.0.norm2 152 | down_blocks.1.resnets.0.dropout 153 | down_blocks.1.resnets.0.conv2 154 | down_blocks.1.resnets.0.conv_shortcut 155 | down_blocks.1.resnets.1 156 | down_blocks.1.resnets.1.norm1 157 | down_blocks.1.resnets.1.conv1 158 | down_blocks.1.resnets.1.time_emb_proj 159 | down_blocks.1.resnets.1.norm2 160 | down_blocks.1.resnets.1.dropout 161 | down_blocks.1.resnets.1.conv2 162 | down_blocks.1.downsamplers 163 | down_blocks.1.downsamplers.0 164 | down_blocks.1.downsamplers.0.conv 165 | down_blocks.2 166 | down_blocks.2.attentions 167 | down_blocks.2.attentions.0 168 | down_blocks.2.attentions.0.norm 169 | down_blocks.2.attentions.0.proj_in 170 | down_blocks.2.attentions.0.transformer_blocks 171 | down_blocks.2.attentions.0.transformer_blocks.0 172 | down_blocks.2.attentions.0.transformer_blocks.0.norm1 173 | down_blocks.2.attentions.0.transformer_blocks.0.attn1 174 | down_blocks.2.attentions.0.transformer_blocks.0.attn1.to_q 175 | down_blocks.2.attentions.0.transformer_blocks.0.attn1.to_k 176 | down_blocks.2.attentions.0.transformer_blocks.0.attn1.to_v 177 | down_blocks.2.attentions.0.transformer_blocks.0.attn1.to_out 178 | down_blocks.2.attentions.0.transformer_blocks.0.attn1.to_out.0 179 | down_blocks.2.attentions.0.transformer_blocks.0.attn1.to_out.1 180 | down_blocks.2.attentions.0.transformer_blocks.0.norm2 181 | down_blocks.2.attentions.0.transformer_blocks.0.attn2 182 | down_blocks.2.attentions.0.transformer_blocks.0.attn2.to_q 183 | down_blocks.2.attentions.0.transformer_blocks.0.attn2.to_k 184 | 
down_blocks.2.attentions.0.transformer_blocks.0.attn2.to_v 185 | down_blocks.2.attentions.0.transformer_blocks.0.attn2.to_out 186 | down_blocks.2.attentions.0.transformer_blocks.0.attn2.to_out.0 187 | down_blocks.2.attentions.0.transformer_blocks.0.attn2.to_out.1 188 | down_blocks.2.attentions.0.transformer_blocks.0.norm3 189 | down_blocks.2.attentions.0.transformer_blocks.0.ff 190 | down_blocks.2.attentions.0.transformer_blocks.0.ff.net 191 | down_blocks.2.attentions.0.transformer_blocks.0.ff.net.0 192 | down_blocks.2.attentions.0.transformer_blocks.0.ff.net.0.proj 193 | down_blocks.2.attentions.0.transformer_blocks.0.ff.net.1 194 | down_blocks.2.attentions.0.transformer_blocks.0.ff.net.2 195 | down_blocks.2.attentions.0.proj_out 196 | down_blocks.2.attentions.1 197 | down_blocks.2.attentions.1.norm 198 | down_blocks.2.attentions.1.proj_in 199 | down_blocks.2.attentions.1.transformer_blocks 200 | down_blocks.2.attentions.1.transformer_blocks.0 201 | down_blocks.2.attentions.1.transformer_blocks.0.norm1 202 | down_blocks.2.attentions.1.transformer_blocks.0.attn1 203 | down_blocks.2.attentions.1.transformer_blocks.0.attn1.to_q 204 | down_blocks.2.attentions.1.transformer_blocks.0.attn1.to_k 205 | down_blocks.2.attentions.1.transformer_blocks.0.attn1.to_v 206 | down_blocks.2.attentions.1.transformer_blocks.0.attn1.to_out 207 | down_blocks.2.attentions.1.transformer_blocks.0.attn1.to_out.0 208 | down_blocks.2.attentions.1.transformer_blocks.0.attn1.to_out.1 209 | down_blocks.2.attentions.1.transformer_blocks.0.norm2 210 | down_blocks.2.attentions.1.transformer_blocks.0.attn2 211 | down_blocks.2.attentions.1.transformer_blocks.0.attn2.to_q 212 | down_blocks.2.attentions.1.transformer_blocks.0.attn2.to_k 213 | down_blocks.2.attentions.1.transformer_blocks.0.attn2.to_v 214 | down_blocks.2.attentions.1.transformer_blocks.0.attn2.to_out 215 | down_blocks.2.attentions.1.transformer_blocks.0.attn2.to_out.0 216 | down_blocks.2.attentions.1.transformer_blocks.0.attn2.to_out.1 217 | down_blocks.2.attentions.1.transformer_blocks.0.norm3 218 | down_blocks.2.attentions.1.transformer_blocks.0.ff 219 | down_blocks.2.attentions.1.transformer_blocks.0.ff.net 220 | down_blocks.2.attentions.1.transformer_blocks.0.ff.net.0 221 | down_blocks.2.attentions.1.transformer_blocks.0.ff.net.0.proj 222 | down_blocks.2.attentions.1.transformer_blocks.0.ff.net.1 223 | down_blocks.2.attentions.1.transformer_blocks.0.ff.net.2 224 | down_blocks.2.attentions.1.proj_out 225 | down_blocks.2.resnets 226 | down_blocks.2.resnets.0 227 | down_blocks.2.resnets.0.norm1 228 | down_blocks.2.resnets.0.conv1 229 | down_blocks.2.resnets.0.time_emb_proj 230 | down_blocks.2.resnets.0.norm2 231 | down_blocks.2.resnets.0.dropout 232 | down_blocks.2.resnets.0.conv2 233 | down_blocks.2.resnets.0.conv_shortcut 234 | down_blocks.2.resnets.1 235 | down_blocks.2.resnets.1.norm1 236 | down_blocks.2.resnets.1.conv1 237 | down_blocks.2.resnets.1.time_emb_proj 238 | down_blocks.2.resnets.1.norm2 239 | down_blocks.2.resnets.1.dropout 240 | down_blocks.2.resnets.1.conv2 241 | down_blocks.2.downsamplers 242 | down_blocks.2.downsamplers.0 243 | down_blocks.2.downsamplers.0.conv 244 | down_blocks.3 245 | down_blocks.3.resnets 246 | down_blocks.3.resnets.0 247 | down_blocks.3.resnets.0.norm1 248 | down_blocks.3.resnets.0.conv1 249 | down_blocks.3.resnets.0.time_emb_proj 250 | down_blocks.3.resnets.0.norm2 251 | down_blocks.3.resnets.0.dropout 252 | down_blocks.3.resnets.0.conv2 253 | down_blocks.3.resnets.1 254 | down_blocks.3.resnets.1.norm1 255 | 
down_blocks.3.resnets.1.conv1 256 | down_blocks.3.resnets.1.time_emb_proj 257 | down_blocks.3.resnets.1.norm2 258 | down_blocks.3.resnets.1.dropout 259 | down_blocks.3.resnets.1.conv2 260 | up_blocks 261 | up_blocks.0 262 | up_blocks.0.resnets 263 | up_blocks.0.resnets.0 264 | up_blocks.0.resnets.0.norm1 265 | up_blocks.0.resnets.0.conv1 266 | up_blocks.0.resnets.0.time_emb_proj 267 | up_blocks.0.resnets.0.norm2 268 | up_blocks.0.resnets.0.dropout 269 | up_blocks.0.resnets.0.conv2 270 | up_blocks.0.resnets.0.conv_shortcut 271 | up_blocks.0.resnets.1 272 | up_blocks.0.resnets.1.norm1 273 | up_blocks.0.resnets.1.conv1 274 | up_blocks.0.resnets.1.time_emb_proj 275 | up_blocks.0.resnets.1.norm2 276 | up_blocks.0.resnets.1.dropout 277 | up_blocks.0.resnets.1.conv2 278 | up_blocks.0.resnets.1.conv_shortcut 279 | up_blocks.0.resnets.2 280 | up_blocks.0.resnets.2.norm1 281 | up_blocks.0.resnets.2.conv1 282 | up_blocks.0.resnets.2.time_emb_proj 283 | up_blocks.0.resnets.2.norm2 284 | up_blocks.0.resnets.2.dropout 285 | up_blocks.0.resnets.2.conv2 286 | up_blocks.0.resnets.2.conv_shortcut 287 | up_blocks.0.upsamplers 288 | up_blocks.0.upsamplers.0 289 | up_blocks.0.upsamplers.0.conv 290 | up_blocks.1 291 | up_blocks.1.attentions 292 | up_blocks.1.attentions.0 293 | up_blocks.1.attentions.0.norm 294 | up_blocks.1.attentions.0.proj_in 295 | up_blocks.1.attentions.0.transformer_blocks 296 | up_blocks.1.attentions.0.transformer_blocks.0 297 | up_blocks.1.attentions.0.transformer_blocks.0.norm1 298 | up_blocks.1.attentions.0.transformer_blocks.0.attn1 299 | up_blocks.1.attentions.0.transformer_blocks.0.attn1.to_q 300 | up_blocks.1.attentions.0.transformer_blocks.0.attn1.to_k 301 | up_blocks.1.attentions.0.transformer_blocks.0.attn1.to_v 302 | up_blocks.1.attentions.0.transformer_blocks.0.attn1.to_out 303 | up_blocks.1.attentions.0.transformer_blocks.0.attn1.to_out.0 304 | up_blocks.1.attentions.0.transformer_blocks.0.attn1.to_out.1 305 | up_blocks.1.attentions.0.transformer_blocks.0.norm2 306 | up_blocks.1.attentions.0.transformer_blocks.0.attn2 307 | up_blocks.1.attentions.0.transformer_blocks.0.attn2.to_q 308 | up_blocks.1.attentions.0.transformer_blocks.0.attn2.to_k 309 | up_blocks.1.attentions.0.transformer_blocks.0.attn2.to_v 310 | up_blocks.1.attentions.0.transformer_blocks.0.attn2.to_out 311 | up_blocks.1.attentions.0.transformer_blocks.0.attn2.to_out.0 312 | up_blocks.1.attentions.0.transformer_blocks.0.attn2.to_out.1 313 | up_blocks.1.attentions.0.transformer_blocks.0.norm3 314 | up_blocks.1.attentions.0.transformer_blocks.0.ff 315 | up_blocks.1.attentions.0.transformer_blocks.0.ff.net 316 | up_blocks.1.attentions.0.transformer_blocks.0.ff.net.0 317 | up_blocks.1.attentions.0.transformer_blocks.0.ff.net.0.proj 318 | up_blocks.1.attentions.0.transformer_blocks.0.ff.net.1 319 | up_blocks.1.attentions.0.transformer_blocks.0.ff.net.2 320 | up_blocks.1.attentions.0.proj_out 321 | up_blocks.1.attentions.1 322 | up_blocks.1.attentions.1.norm 323 | up_blocks.1.attentions.1.proj_in 324 | up_blocks.1.attentions.1.transformer_blocks 325 | up_blocks.1.attentions.1.transformer_blocks.0 326 | up_blocks.1.attentions.1.transformer_blocks.0.norm1 327 | up_blocks.1.attentions.1.transformer_blocks.0.attn1 328 | up_blocks.1.attentions.1.transformer_blocks.0.attn1.to_q 329 | up_blocks.1.attentions.1.transformer_blocks.0.attn1.to_k 330 | up_blocks.1.attentions.1.transformer_blocks.0.attn1.to_v 331 | up_blocks.1.attentions.1.transformer_blocks.0.attn1.to_out 332 | 
up_blocks.1.attentions.1.transformer_blocks.0.attn1.to_out.0 333 | up_blocks.1.attentions.1.transformer_blocks.0.attn1.to_out.1 334 | up_blocks.1.attentions.1.transformer_blocks.0.norm2 335 | up_blocks.1.attentions.1.transformer_blocks.0.attn2 336 | up_blocks.1.attentions.1.transformer_blocks.0.attn2.to_q 337 | up_blocks.1.attentions.1.transformer_blocks.0.attn2.to_k 338 | up_blocks.1.attentions.1.transformer_blocks.0.attn2.to_v 339 | up_blocks.1.attentions.1.transformer_blocks.0.attn2.to_out 340 | up_blocks.1.attentions.1.transformer_blocks.0.attn2.to_out.0 341 | up_blocks.1.attentions.1.transformer_blocks.0.attn2.to_out.1 342 | up_blocks.1.attentions.1.transformer_blocks.0.norm3 343 | up_blocks.1.attentions.1.transformer_blocks.0.ff 344 | up_blocks.1.attentions.1.transformer_blocks.0.ff.net 345 | up_blocks.1.attentions.1.transformer_blocks.0.ff.net.0 346 | up_blocks.1.attentions.1.transformer_blocks.0.ff.net.0.proj 347 | up_blocks.1.attentions.1.transformer_blocks.0.ff.net.1 348 | up_blocks.1.attentions.1.transformer_blocks.0.ff.net.2 349 | up_blocks.1.attentions.1.proj_out 350 | up_blocks.1.attentions.2 351 | up_blocks.1.attentions.2.norm 352 | up_blocks.1.attentions.2.proj_in 353 | up_blocks.1.attentions.2.transformer_blocks 354 | up_blocks.1.attentions.2.transformer_blocks.0 355 | up_blocks.1.attentions.2.transformer_blocks.0.norm1 356 | up_blocks.1.attentions.2.transformer_blocks.0.attn1 357 | up_blocks.1.attentions.2.transformer_blocks.0.attn1.to_q 358 | up_blocks.1.attentions.2.transformer_blocks.0.attn1.to_k 359 | up_blocks.1.attentions.2.transformer_blocks.0.attn1.to_v 360 | up_blocks.1.attentions.2.transformer_blocks.0.attn1.to_out 361 | up_blocks.1.attentions.2.transformer_blocks.0.attn1.to_out.0 362 | up_blocks.1.attentions.2.transformer_blocks.0.attn1.to_out.1 363 | up_blocks.1.attentions.2.transformer_blocks.0.norm2 364 | up_blocks.1.attentions.2.transformer_blocks.0.attn2 365 | up_blocks.1.attentions.2.transformer_blocks.0.attn2.to_q 366 | up_blocks.1.attentions.2.transformer_blocks.0.attn2.to_k 367 | up_blocks.1.attentions.2.transformer_blocks.0.attn2.to_v 368 | up_blocks.1.attentions.2.transformer_blocks.0.attn2.to_out 369 | up_blocks.1.attentions.2.transformer_blocks.0.attn2.to_out.0 370 | up_blocks.1.attentions.2.transformer_blocks.0.attn2.to_out.1 371 | up_blocks.1.attentions.2.transformer_blocks.0.norm3 372 | up_blocks.1.attentions.2.transformer_blocks.0.ff 373 | up_blocks.1.attentions.2.transformer_blocks.0.ff.net 374 | up_blocks.1.attentions.2.transformer_blocks.0.ff.net.0 375 | up_blocks.1.attentions.2.transformer_blocks.0.ff.net.0.proj 376 | up_blocks.1.attentions.2.transformer_blocks.0.ff.net.1 377 | up_blocks.1.attentions.2.transformer_blocks.0.ff.net.2 378 | up_blocks.1.attentions.2.proj_out 379 | up_blocks.1.resnets 380 | up_blocks.1.resnets.0 381 | up_blocks.1.resnets.0.norm1 382 | up_blocks.1.resnets.0.conv1 383 | up_blocks.1.resnets.0.time_emb_proj 384 | up_blocks.1.resnets.0.norm2 385 | up_blocks.1.resnets.0.dropout 386 | up_blocks.1.resnets.0.conv2 387 | up_blocks.1.resnets.0.conv_shortcut 388 | up_blocks.1.resnets.1 389 | up_blocks.1.resnets.1.norm1 390 | up_blocks.1.resnets.1.conv1 391 | up_blocks.1.resnets.1.time_emb_proj 392 | up_blocks.1.resnets.1.norm2 393 | up_blocks.1.resnets.1.dropout 394 | up_blocks.1.resnets.1.conv2 395 | up_blocks.1.resnets.1.conv_shortcut 396 | up_blocks.1.resnets.2 397 | up_blocks.1.resnets.2.norm1 398 | up_blocks.1.resnets.2.conv1 399 | up_blocks.1.resnets.2.time_emb_proj 400 | up_blocks.1.resnets.2.norm2 401 | 
up_blocks.1.resnets.2.dropout 402 | up_blocks.1.resnets.2.conv2 403 | up_blocks.1.resnets.2.conv_shortcut 404 | up_blocks.1.upsamplers 405 | up_blocks.1.upsamplers.0 406 | up_blocks.1.upsamplers.0.conv 407 | up_blocks.2 408 | up_blocks.2.attentions 409 | up_blocks.2.attentions.0 410 | up_blocks.2.attentions.0.norm 411 | up_blocks.2.attentions.0.proj_in 412 | up_blocks.2.attentions.0.transformer_blocks 413 | up_blocks.2.attentions.0.transformer_blocks.0 414 | up_blocks.2.attentions.0.transformer_blocks.0.norm1 415 | up_blocks.2.attentions.0.transformer_blocks.0.attn1 416 | up_blocks.2.attentions.0.transformer_blocks.0.attn1.to_q 417 | up_blocks.2.attentions.0.transformer_blocks.0.attn1.to_k 418 | up_blocks.2.attentions.0.transformer_blocks.0.attn1.to_v 419 | up_blocks.2.attentions.0.transformer_blocks.0.attn1.to_out 420 | up_blocks.2.attentions.0.transformer_blocks.0.attn1.to_out.0 421 | up_blocks.2.attentions.0.transformer_blocks.0.attn1.to_out.1 422 | up_blocks.2.attentions.0.transformer_blocks.0.norm2 423 | up_blocks.2.attentions.0.transformer_blocks.0.attn2 424 | up_blocks.2.attentions.0.transformer_blocks.0.attn2.to_q 425 | up_blocks.2.attentions.0.transformer_blocks.0.attn2.to_k 426 | up_blocks.2.attentions.0.transformer_blocks.0.attn2.to_v 427 | up_blocks.2.attentions.0.transformer_blocks.0.attn2.to_out 428 | up_blocks.2.attentions.0.transformer_blocks.0.attn2.to_out.0 429 | up_blocks.2.attentions.0.transformer_blocks.0.attn2.to_out.1 430 | up_blocks.2.attentions.0.transformer_blocks.0.norm3 431 | up_blocks.2.attentions.0.transformer_blocks.0.ff 432 | up_blocks.2.attentions.0.transformer_blocks.0.ff.net 433 | up_blocks.2.attentions.0.transformer_blocks.0.ff.net.0 434 | up_blocks.2.attentions.0.transformer_blocks.0.ff.net.0.proj 435 | up_blocks.2.attentions.0.transformer_blocks.0.ff.net.1 436 | up_blocks.2.attentions.0.transformer_blocks.0.ff.net.2 437 | up_blocks.2.attentions.0.proj_out 438 | up_blocks.2.attentions.1 439 | up_blocks.2.attentions.1.norm 440 | up_blocks.2.attentions.1.proj_in 441 | up_blocks.2.attentions.1.transformer_blocks 442 | up_blocks.2.attentions.1.transformer_blocks.0 443 | up_blocks.2.attentions.1.transformer_blocks.0.norm1 444 | up_blocks.2.attentions.1.transformer_blocks.0.attn1 445 | up_blocks.2.attentions.1.transformer_blocks.0.attn1.to_q 446 | up_blocks.2.attentions.1.transformer_blocks.0.attn1.to_k 447 | up_blocks.2.attentions.1.transformer_blocks.0.attn1.to_v 448 | up_blocks.2.attentions.1.transformer_blocks.0.attn1.to_out 449 | up_blocks.2.attentions.1.transformer_blocks.0.attn1.to_out.0 450 | up_blocks.2.attentions.1.transformer_blocks.0.attn1.to_out.1 451 | up_blocks.2.attentions.1.transformer_blocks.0.norm2 452 | up_blocks.2.attentions.1.transformer_blocks.0.attn2 453 | up_blocks.2.attentions.1.transformer_blocks.0.attn2.to_q 454 | up_blocks.2.attentions.1.transformer_blocks.0.attn2.to_k 455 | up_blocks.2.attentions.1.transformer_blocks.0.attn2.to_v 456 | up_blocks.2.attentions.1.transformer_blocks.0.attn2.to_out 457 | up_blocks.2.attentions.1.transformer_blocks.0.attn2.to_out.0 458 | up_blocks.2.attentions.1.transformer_blocks.0.attn2.to_out.1 459 | up_blocks.2.attentions.1.transformer_blocks.0.norm3 460 | up_blocks.2.attentions.1.transformer_blocks.0.ff 461 | up_blocks.2.attentions.1.transformer_blocks.0.ff.net 462 | up_blocks.2.attentions.1.transformer_blocks.0.ff.net.0 463 | up_blocks.2.attentions.1.transformer_blocks.0.ff.net.0.proj 464 | up_blocks.2.attentions.1.transformer_blocks.0.ff.net.1 465 | 
up_blocks.2.attentions.1.transformer_blocks.0.ff.net.2 466 | up_blocks.2.attentions.1.proj_out 467 | up_blocks.2.attentions.2 468 | up_blocks.2.attentions.2.norm 469 | up_blocks.2.attentions.2.proj_in 470 | up_blocks.2.attentions.2.transformer_blocks 471 | up_blocks.2.attentions.2.transformer_blocks.0 472 | up_blocks.2.attentions.2.transformer_blocks.0.norm1 473 | up_blocks.2.attentions.2.transformer_blocks.0.attn1 474 | up_blocks.2.attentions.2.transformer_blocks.0.attn1.to_q 475 | up_blocks.2.attentions.2.transformer_blocks.0.attn1.to_k 476 | up_blocks.2.attentions.2.transformer_blocks.0.attn1.to_v 477 | up_blocks.2.attentions.2.transformer_blocks.0.attn1.to_out 478 | up_blocks.2.attentions.2.transformer_blocks.0.attn1.to_out.0 479 | up_blocks.2.attentions.2.transformer_blocks.0.attn1.to_out.1 480 | up_blocks.2.attentions.2.transformer_blocks.0.norm2 481 | up_blocks.2.attentions.2.transformer_blocks.0.attn2 482 | up_blocks.2.attentions.2.transformer_blocks.0.attn2.to_q 483 | up_blocks.2.attentions.2.transformer_blocks.0.attn2.to_k 484 | up_blocks.2.attentions.2.transformer_blocks.0.attn2.to_v 485 | up_blocks.2.attentions.2.transformer_blocks.0.attn2.to_out 486 | up_blocks.2.attentions.2.transformer_blocks.0.attn2.to_out.0 487 | up_blocks.2.attentions.2.transformer_blocks.0.attn2.to_out.1 488 | up_blocks.2.attentions.2.transformer_blocks.0.norm3 489 | up_blocks.2.attentions.2.transformer_blocks.0.ff 490 | up_blocks.2.attentions.2.transformer_blocks.0.ff.net 491 | up_blocks.2.attentions.2.transformer_blocks.0.ff.net.0 492 | up_blocks.2.attentions.2.transformer_blocks.0.ff.net.0.proj 493 | up_blocks.2.attentions.2.transformer_blocks.0.ff.net.1 494 | up_blocks.2.attentions.2.transformer_blocks.0.ff.net.2 495 | up_blocks.2.attentions.2.proj_out 496 | up_blocks.2.resnets 497 | up_blocks.2.resnets.0 498 | up_blocks.2.resnets.0.norm1 499 | up_blocks.2.resnets.0.conv1 500 | up_blocks.2.resnets.0.time_emb_proj 501 | up_blocks.2.resnets.0.norm2 502 | up_blocks.2.resnets.0.dropout 503 | up_blocks.2.resnets.0.conv2 504 | up_blocks.2.resnets.0.conv_shortcut 505 | up_blocks.2.resnets.1 506 | up_blocks.2.resnets.1.norm1 507 | up_blocks.2.resnets.1.conv1 508 | up_blocks.2.resnets.1.time_emb_proj 509 | up_blocks.2.resnets.1.norm2 510 | up_blocks.2.resnets.1.dropout 511 | up_blocks.2.resnets.1.conv2 512 | up_blocks.2.resnets.1.conv_shortcut 513 | up_blocks.2.resnets.2 514 | up_blocks.2.resnets.2.norm1 515 | up_blocks.2.resnets.2.conv1 516 | up_blocks.2.resnets.2.time_emb_proj 517 | up_blocks.2.resnets.2.norm2 518 | up_blocks.2.resnets.2.dropout 519 | up_blocks.2.resnets.2.conv2 520 | up_blocks.2.resnets.2.conv_shortcut 521 | up_blocks.2.upsamplers 522 | up_blocks.2.upsamplers.0 523 | up_blocks.2.upsamplers.0.conv 524 | up_blocks.3 525 | up_blocks.3.attentions 526 | up_blocks.3.attentions.0 527 | up_blocks.3.attentions.0.norm 528 | up_blocks.3.attentions.0.proj_in 529 | up_blocks.3.attentions.0.transformer_blocks 530 | up_blocks.3.attentions.0.transformer_blocks.0 531 | up_blocks.3.attentions.0.transformer_blocks.0.norm1 532 | up_blocks.3.attentions.0.transformer_blocks.0.attn1 533 | up_blocks.3.attentions.0.transformer_blocks.0.attn1.to_q 534 | up_blocks.3.attentions.0.transformer_blocks.0.attn1.to_k 535 | up_blocks.3.attentions.0.transformer_blocks.0.attn1.to_v 536 | up_blocks.3.attentions.0.transformer_blocks.0.attn1.to_out 537 | up_blocks.3.attentions.0.transformer_blocks.0.attn1.to_out.0 538 | up_blocks.3.attentions.0.transformer_blocks.0.attn1.to_out.1 539 | 
up_blocks.3.attentions.0.transformer_blocks.0.norm2 540 | up_blocks.3.attentions.0.transformer_blocks.0.attn2 541 | up_blocks.3.attentions.0.transformer_blocks.0.attn2.to_q 542 | up_blocks.3.attentions.0.transformer_blocks.0.attn2.to_k 543 | up_blocks.3.attentions.0.transformer_blocks.0.attn2.to_v 544 | up_blocks.3.attentions.0.transformer_blocks.0.attn2.to_out 545 | up_blocks.3.attentions.0.transformer_blocks.0.attn2.to_out.0 546 | up_blocks.3.attentions.0.transformer_blocks.0.attn2.to_out.1 547 | up_blocks.3.attentions.0.transformer_blocks.0.norm3 548 | up_blocks.3.attentions.0.transformer_blocks.0.ff 549 | up_blocks.3.attentions.0.transformer_blocks.0.ff.net 550 | up_blocks.3.attentions.0.transformer_blocks.0.ff.net.0 551 | up_blocks.3.attentions.0.transformer_blocks.0.ff.net.0.proj 552 | up_blocks.3.attentions.0.transformer_blocks.0.ff.net.1 553 | up_blocks.3.attentions.0.transformer_blocks.0.ff.net.2 554 | up_blocks.3.attentions.0.proj_out 555 | up_blocks.3.attentions.1 556 | up_blocks.3.attentions.1.norm 557 | up_blocks.3.attentions.1.proj_in 558 | up_blocks.3.attentions.1.transformer_blocks 559 | up_blocks.3.attentions.1.transformer_blocks.0 560 | up_blocks.3.attentions.1.transformer_blocks.0.norm1 561 | up_blocks.3.attentions.1.transformer_blocks.0.attn1 562 | up_blocks.3.attentions.1.transformer_blocks.0.attn1.to_q 563 | up_blocks.3.attentions.1.transformer_blocks.0.attn1.to_k 564 | up_blocks.3.attentions.1.transformer_blocks.0.attn1.to_v 565 | up_blocks.3.attentions.1.transformer_blocks.0.attn1.to_out 566 | up_blocks.3.attentions.1.transformer_blocks.0.attn1.to_out.0 567 | up_blocks.3.attentions.1.transformer_blocks.0.attn1.to_out.1 568 | up_blocks.3.attentions.1.transformer_blocks.0.norm2 569 | up_blocks.3.attentions.1.transformer_blocks.0.attn2 570 | up_blocks.3.attentions.1.transformer_blocks.0.attn2.to_q 571 | up_blocks.3.attentions.1.transformer_blocks.0.attn2.to_k 572 | up_blocks.3.attentions.1.transformer_blocks.0.attn2.to_v 573 | up_blocks.3.attentions.1.transformer_blocks.0.attn2.to_out 574 | up_blocks.3.attentions.1.transformer_blocks.0.attn2.to_out.0 575 | up_blocks.3.attentions.1.transformer_blocks.0.attn2.to_out.1 576 | up_blocks.3.attentions.1.transformer_blocks.0.norm3 577 | up_blocks.3.attentions.1.transformer_blocks.0.ff 578 | up_blocks.3.attentions.1.transformer_blocks.0.ff.net 579 | up_blocks.3.attentions.1.transformer_blocks.0.ff.net.0 580 | up_blocks.3.attentions.1.transformer_blocks.0.ff.net.0.proj 581 | up_blocks.3.attentions.1.transformer_blocks.0.ff.net.1 582 | up_blocks.3.attentions.1.transformer_blocks.0.ff.net.2 583 | up_blocks.3.attentions.1.proj_out 584 | up_blocks.3.attentions.2 585 | up_blocks.3.attentions.2.norm 586 | up_blocks.3.attentions.2.proj_in 587 | up_blocks.3.attentions.2.transformer_blocks 588 | up_blocks.3.attentions.2.transformer_blocks.0 589 | up_blocks.3.attentions.2.transformer_blocks.0.norm1 590 | up_blocks.3.attentions.2.transformer_blocks.0.attn1 591 | up_blocks.3.attentions.2.transformer_blocks.0.attn1.to_q 592 | up_blocks.3.attentions.2.transformer_blocks.0.attn1.to_k 593 | up_blocks.3.attentions.2.transformer_blocks.0.attn1.to_v 594 | up_blocks.3.attentions.2.transformer_blocks.0.attn1.to_out 595 | up_blocks.3.attentions.2.transformer_blocks.0.attn1.to_out.0 596 | up_blocks.3.attentions.2.transformer_blocks.0.attn1.to_out.1 597 | up_blocks.3.attentions.2.transformer_blocks.0.norm2 598 | up_blocks.3.attentions.2.transformer_blocks.0.attn2 599 | up_blocks.3.attentions.2.transformer_blocks.0.attn2.to_q 600 | 
up_blocks.3.attentions.2.transformer_blocks.0.attn2.to_k 601 | up_blocks.3.attentions.2.transformer_blocks.0.attn2.to_v 602 | up_blocks.3.attentions.2.transformer_blocks.0.attn2.to_out 603 | up_blocks.3.attentions.2.transformer_blocks.0.attn2.to_out.0 604 | up_blocks.3.attentions.2.transformer_blocks.0.attn2.to_out.1 605 | up_blocks.3.attentions.2.transformer_blocks.0.norm3 606 | up_blocks.3.attentions.2.transformer_blocks.0.ff 607 | up_blocks.3.attentions.2.transformer_blocks.0.ff.net 608 | up_blocks.3.attentions.2.transformer_blocks.0.ff.net.0 609 | up_blocks.3.attentions.2.transformer_blocks.0.ff.net.0.proj 610 | up_blocks.3.attentions.2.transformer_blocks.0.ff.net.1 611 | up_blocks.3.attentions.2.transformer_blocks.0.ff.net.2 612 | up_blocks.3.attentions.2.proj_out 613 | up_blocks.3.resnets 614 | up_blocks.3.resnets.0 615 | up_blocks.3.resnets.0.norm1 616 | up_blocks.3.resnets.0.conv1 617 | up_blocks.3.resnets.0.time_emb_proj 618 | up_blocks.3.resnets.0.norm2 619 | up_blocks.3.resnets.0.dropout 620 | up_blocks.3.resnets.0.conv2 621 | up_blocks.3.resnets.0.conv_shortcut 622 | up_blocks.3.resnets.1 623 | up_blocks.3.resnets.1.norm1 624 | up_blocks.3.resnets.1.conv1 625 | up_blocks.3.resnets.1.time_emb_proj 626 | up_blocks.3.resnets.1.norm2 627 | up_blocks.3.resnets.1.dropout 628 | up_blocks.3.resnets.1.conv2 629 | up_blocks.3.resnets.1.conv_shortcut 630 | up_blocks.3.resnets.2 631 | up_blocks.3.resnets.2.norm1 632 | up_blocks.3.resnets.2.conv1 633 | up_blocks.3.resnets.2.time_emb_proj 634 | up_blocks.3.resnets.2.norm2 635 | up_blocks.3.resnets.2.dropout 636 | up_blocks.3.resnets.2.conv2 637 | up_blocks.3.resnets.2.conv_shortcut 638 | mid_block 639 | mid_block.attentions 640 | mid_block.attentions.0 641 | mid_block.attentions.0.norm 642 | mid_block.attentions.0.proj_in 643 | mid_block.attentions.0.transformer_blocks 644 | mid_block.attentions.0.transformer_blocks.0 645 | mid_block.attentions.0.transformer_blocks.0.norm1 646 | mid_block.attentions.0.transformer_blocks.0.attn1 647 | mid_block.attentions.0.transformer_blocks.0.attn1.to_q 648 | mid_block.attentions.0.transformer_blocks.0.attn1.to_k 649 | mid_block.attentions.0.transformer_blocks.0.attn1.to_v 650 | mid_block.attentions.0.transformer_blocks.0.attn1.to_out 651 | mid_block.attentions.0.transformer_blocks.0.attn1.to_out.0 652 | mid_block.attentions.0.transformer_blocks.0.attn1.to_out.1 653 | mid_block.attentions.0.transformer_blocks.0.norm2 654 | mid_block.attentions.0.transformer_blocks.0.attn2 655 | mid_block.attentions.0.transformer_blocks.0.attn2.to_q 656 | mid_block.attentions.0.transformer_blocks.0.attn2.to_k 657 | mid_block.attentions.0.transformer_blocks.0.attn2.to_v 658 | mid_block.attentions.0.transformer_blocks.0.attn2.to_out 659 | mid_block.attentions.0.transformer_blocks.0.attn2.to_out.0 660 | mid_block.attentions.0.transformer_blocks.0.attn2.to_out.1 661 | mid_block.attentions.0.transformer_blocks.0.norm3 662 | mid_block.attentions.0.transformer_blocks.0.ff 663 | mid_block.attentions.0.transformer_blocks.0.ff.net 664 | mid_block.attentions.0.transformer_blocks.0.ff.net.0 665 | mid_block.attentions.0.transformer_blocks.0.ff.net.0.proj 666 | mid_block.attentions.0.transformer_blocks.0.ff.net.1 667 | mid_block.attentions.0.transformer_blocks.0.ff.net.2 668 | mid_block.attentions.0.proj_out 669 | mid_block.resnets 670 | mid_block.resnets.0 671 | mid_block.resnets.0.norm1 672 | mid_block.resnets.0.conv1 673 | mid_block.resnets.0.time_emb_proj 674 | mid_block.resnets.0.norm2 675 | mid_block.resnets.0.dropout 676 | 
mid_block.resnets.0.conv2 677 | mid_block.resnets.1 678 | mid_block.resnets.1.norm1 679 | mid_block.resnets.1.conv1 680 | mid_block.resnets.1.time_emb_proj 681 | mid_block.resnets.1.norm2 682 | mid_block.resnets.1.dropout 683 | mid_block.resnets.1.conv2 684 | conv_norm_out 685 | conv_out 686 | -------------------------------------------------------------------------------- /hidiffusion/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def isinstance_str(x: object, cls_name: str): 5 | """ 6 | Checks whether x has any class *named* cls_name in its ancestry. 7 | Doesn't require access to the class's implementation. 8 | 9 | Useful for patching! 10 | """ 11 | 12 | for _cls in x.__class__.__mro__: 13 | if _cls.__name__ == cls_name: 14 | return True 15 | 16 | return False 17 | 18 | 19 | def init_generator(device: torch.device, fallback: torch.Generator=None): 20 | """ 21 | Forks the current default random generator given device. 22 | """ 23 | if device.type == "cpu": 24 | return torch.Generator(device="cpu").set_state(torch.get_rng_state()) 25 | elif device.type == "cuda": 26 | return torch.Generator(device=device).set_state(torch.cuda.get_rng_state()) 27 | else: 28 | if fallback is None: 29 | return init_generator(torch.device("cpu")) 30 | else: 31 | return fallback 32 | -------------------------------------------------------------------------------- /ip_adapter/__init__.py: -------------------------------------------------------------------------------- 1 | from .ip_adapter import IPAdapter, IPAdapterPlus, IPAdapterPlusXL, IPAdapterXL, IPAdapterFull 2 | 3 | __all__ = [ 4 | "IPAdapter", 5 | "IPAdapterPlus", 6 | "IPAdapterPlusXL", 7 | "IPAdapterXL", 8 | "IPAdapterFull", 9 | ] 10 | -------------------------------------------------------------------------------- /ip_adapter/attention_processor.py: -------------------------------------------------------------------------------- 1 | # modified from https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | 6 | 7 | class AttnProcessor(nn.Module): 8 | r""" 9 | Default processor for performing attention-related computations. 
10 | """ 11 | 12 | def __init__( 13 | self, 14 | hidden_size=None, 15 | cross_attention_dim=None, 16 | ): 17 | super().__init__() 18 | 19 | def __call__( 20 | self, 21 | attn, 22 | hidden_states, 23 | encoder_hidden_states=None, 24 | attention_mask=None, 25 | temb=None, 26 | ): 27 | residual = hidden_states 28 | 29 | if attn.spatial_norm is not None: 30 | hidden_states = attn.spatial_norm(hidden_states, temb) 31 | 32 | input_ndim = hidden_states.ndim 33 | 34 | if input_ndim == 4: 35 | batch_size, channel, height, width = hidden_states.shape 36 | hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2) 37 | 38 | batch_size, sequence_length, _ = ( 39 | hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape 40 | ) 41 | attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size) 42 | 43 | if attn.group_norm is not None: 44 | hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2) 45 | 46 | query = attn.to_q(hidden_states) 47 | 48 | if encoder_hidden_states is None: 49 | encoder_hidden_states = hidden_states 50 | elif attn.norm_cross: 51 | encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states) 52 | 53 | key = attn.to_k(encoder_hidden_states) 54 | value = attn.to_v(encoder_hidden_states) 55 | 56 | query = attn.head_to_batch_dim(query) 57 | key = attn.head_to_batch_dim(key) 58 | value = attn.head_to_batch_dim(value) 59 | 60 | attention_probs = attn.get_attention_scores(query, key, attention_mask) 61 | hidden_states = torch.bmm(attention_probs, value) 62 | hidden_states = attn.batch_to_head_dim(hidden_states) 63 | 64 | # linear proj 65 | hidden_states = attn.to_out[0](hidden_states) 66 | # dropout 67 | hidden_states = attn.to_out[1](hidden_states) 68 | 69 | if input_ndim == 4: 70 | hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width) 71 | 72 | if attn.residual_connection: 73 | hidden_states = hidden_states + residual 74 | 75 | hidden_states = hidden_states / attn.rescale_output_factor 76 | 77 | return hidden_states 78 | 79 | 80 | class IPAttnProcessor(nn.Module): 81 | r""" 82 | Attention processor for IP-Adapater. 83 | Args: 84 | hidden_size (`int`): 85 | The hidden size of the attention layer. 86 | cross_attention_dim (`int`): 87 | The number of channels in the `encoder_hidden_states`. 88 | scale (`float`, defaults to 1.0): 89 | the weight scale of image prompt. 90 | num_tokens (`int`, defaults to 4 when do ip_adapter_plus it should be 16): 91 | The context length of the image features. 
92 | """ 93 | 94 | def __init__(self, hidden_size, cross_attention_dim=None, scale=1.0, num_tokens=4, skip=False): 95 | super().__init__() 96 | 97 | self.hidden_size = hidden_size 98 | self.cross_attention_dim = cross_attention_dim 99 | self.scale = scale 100 | self.num_tokens = num_tokens 101 | self.skip = skip 102 | 103 | self.to_k_ip = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False) 104 | self.to_v_ip = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False) 105 | 106 | def __call__( 107 | self, 108 | attn, 109 | hidden_states, 110 | encoder_hidden_states=None, 111 | attention_mask=None, 112 | temb=None, 113 | ): 114 | residual = hidden_states 115 | 116 | if attn.spatial_norm is not None: 117 | hidden_states = attn.spatial_norm(hidden_states, temb) 118 | 119 | input_ndim = hidden_states.ndim 120 | 121 | if input_ndim == 4: 122 | batch_size, channel, height, width = hidden_states.shape 123 | hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2) 124 | 125 | batch_size, sequence_length, _ = ( 126 | hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape 127 | ) 128 | attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size) 129 | 130 | if attn.group_norm is not None: 131 | hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2) 132 | 133 | query = attn.to_q(hidden_states) 134 | 135 | if encoder_hidden_states is None: 136 | encoder_hidden_states = hidden_states 137 | else: 138 | # get encoder_hidden_states, ip_hidden_states 139 | end_pos = encoder_hidden_states.shape[1] - self.num_tokens 140 | encoder_hidden_states, ip_hidden_states = ( 141 | encoder_hidden_states[:, :end_pos, :], 142 | encoder_hidden_states[:, end_pos:, :], 143 | ) 144 | if attn.norm_cross: 145 | encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states) 146 | 147 | key = attn.to_k(encoder_hidden_states) 148 | value = attn.to_v(encoder_hidden_states) 149 | 150 | query = attn.head_to_batch_dim(query) 151 | key = attn.head_to_batch_dim(key) 152 | value = attn.head_to_batch_dim(value) 153 | 154 | attention_probs = attn.get_attention_scores(query, key, attention_mask) 155 | hidden_states = torch.bmm(attention_probs, value) 156 | hidden_states = attn.batch_to_head_dim(hidden_states) 157 | 158 | if not self.skip: 159 | # for ip-adapter 160 | ip_key = self.to_k_ip(ip_hidden_states) 161 | ip_value = self.to_v_ip(ip_hidden_states) 162 | 163 | ip_key = attn.head_to_batch_dim(ip_key) 164 | ip_value = attn.head_to_batch_dim(ip_value) 165 | 166 | ip_attention_probs = attn.get_attention_scores(query, ip_key, None) 167 | self.attn_map = ip_attention_probs 168 | ip_hidden_states = torch.bmm(ip_attention_probs, ip_value) 169 | ip_hidden_states = attn.batch_to_head_dim(ip_hidden_states) 170 | 171 | hidden_states = hidden_states + self.scale * ip_hidden_states 172 | 173 | # linear proj 174 | hidden_states = attn.to_out[0](hidden_states) 175 | # dropout 176 | hidden_states = attn.to_out[1](hidden_states) 177 | 178 | if input_ndim == 4: 179 | hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width) 180 | 181 | if attn.residual_connection: 182 | hidden_states = hidden_states + residual 183 | 184 | hidden_states = hidden_states / attn.rescale_output_factor 185 | 186 | return hidden_states 187 | 188 | 189 | class AttnProcessor2_0(torch.nn.Module): 190 | r""" 191 | Processor for implementing scaled dot-product attention (enabled by default 
if you're using PyTorch 2.0). 192 | """ 193 | 194 | def __init__( 195 | self, 196 | hidden_size=None, 197 | cross_attention_dim=None, 198 | ): 199 | super().__init__() 200 | if not hasattr(F, "scaled_dot_product_attention"): 201 | raise ImportError("AttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.") 202 | 203 | def __call__( 204 | self, 205 | attn, 206 | hidden_states, 207 | encoder_hidden_states=None, 208 | attention_mask=None, 209 | temb=None, 210 | ): 211 | residual = hidden_states 212 | 213 | if attn.spatial_norm is not None: 214 | hidden_states = attn.spatial_norm(hidden_states, temb) 215 | 216 | input_ndim = hidden_states.ndim 217 | 218 | if input_ndim == 4: 219 | batch_size, channel, height, width = hidden_states.shape 220 | hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2) 221 | 222 | batch_size, sequence_length, _ = ( 223 | hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape 224 | ) 225 | 226 | if attention_mask is not None: 227 | attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size) 228 | # scaled_dot_product_attention expects attention_mask shape to be 229 | # (batch, heads, source_length, target_length) 230 | attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1]) 231 | 232 | if attn.group_norm is not None: 233 | hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2) 234 | 235 | query = attn.to_q(hidden_states) 236 | 237 | if encoder_hidden_states is None: 238 | encoder_hidden_states = hidden_states 239 | elif attn.norm_cross: 240 | encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states) 241 | 242 | key = attn.to_k(encoder_hidden_states) 243 | value = attn.to_v(encoder_hidden_states) 244 | 245 | inner_dim = key.shape[-1] 246 | head_dim = inner_dim // attn.heads 247 | 248 | query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) 249 | 250 | key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) 251 | value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) 252 | 253 | # the output of sdp = (batch, num_heads, seq_len, head_dim) 254 | # TODO: add support for attn.scale when we move to Torch 2.1 255 | hidden_states = F.scaled_dot_product_attention( 256 | query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False 257 | ) 258 | 259 | hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim) 260 | hidden_states = hidden_states.to(query.dtype) 261 | 262 | # linear proj 263 | hidden_states = attn.to_out[0](hidden_states) 264 | # dropout 265 | hidden_states = attn.to_out[1](hidden_states) 266 | 267 | if input_ndim == 4: 268 | hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width) 269 | 270 | if attn.residual_connection: 271 | hidden_states = hidden_states + residual 272 | 273 | hidden_states = hidden_states / attn.rescale_output_factor 274 | 275 | return hidden_states 276 | 277 | 278 | class IPAttnProcessor2_0(torch.nn.Module): 279 | r""" 280 | Attention processor for IP-Adapater for PyTorch 2.0. 281 | Args: 282 | hidden_size (`int`): 283 | The hidden size of the attention layer. 284 | cross_attention_dim (`int`): 285 | The number of channels in the `encoder_hidden_states`. 286 | scale (`float`, defaults to 1.0): 287 | the weight scale of image prompt. 
288 | num_tokens (`int`, defaults to 4 when do ip_adapter_plus it should be 16): 289 | The context length of the image features. 290 | """ 291 | 292 | def __init__(self, hidden_size, cross_attention_dim=None, scale=1.0, num_tokens=4, skip=False): 293 | super().__init__() 294 | 295 | if not hasattr(F, "scaled_dot_product_attention"): 296 | raise ImportError("AttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.") 297 | 298 | self.hidden_size = hidden_size 299 | self.cross_attention_dim = cross_attention_dim 300 | self.scale = scale 301 | self.num_tokens = num_tokens 302 | self.skip = skip 303 | 304 | self.to_k_ip = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False) 305 | self.to_v_ip = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False) 306 | 307 | def __call__( 308 | self, 309 | attn, 310 | hidden_states, 311 | encoder_hidden_states=None, 312 | attention_mask=None, 313 | temb=None, 314 | ): 315 | residual = hidden_states 316 | 317 | if attn.spatial_norm is not None: 318 | hidden_states = attn.spatial_norm(hidden_states, temb) 319 | 320 | input_ndim = hidden_states.ndim 321 | 322 | if input_ndim == 4: 323 | batch_size, channel, height, width = hidden_states.shape 324 | hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2) 325 | 326 | batch_size, sequence_length, _ = ( 327 | hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape 328 | ) 329 | 330 | if attention_mask is not None: 331 | attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size) 332 | # scaled_dot_product_attention expects attention_mask shape to be 333 | # (batch, heads, source_length, target_length) 334 | attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1]) 335 | 336 | if attn.group_norm is not None: 337 | hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2) 338 | 339 | query = attn.to_q(hidden_states) 340 | 341 | if encoder_hidden_states is None: 342 | encoder_hidden_states = hidden_states 343 | else: 344 | # get encoder_hidden_states, ip_hidden_states 345 | end_pos = encoder_hidden_states.shape[1] - self.num_tokens 346 | encoder_hidden_states, ip_hidden_states = ( 347 | encoder_hidden_states[:, :end_pos, :], 348 | encoder_hidden_states[:, end_pos:, :], 349 | ) 350 | if attn.norm_cross: 351 | encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states) 352 | 353 | key = attn.to_k(encoder_hidden_states) 354 | value = attn.to_v(encoder_hidden_states) 355 | 356 | inner_dim = key.shape[-1] 357 | head_dim = inner_dim // attn.heads 358 | 359 | query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) 360 | 361 | key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) 362 | value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) 363 | 364 | # the output of sdp = (batch, num_heads, seq_len, head_dim) 365 | # TODO: add support for attn.scale when we move to Torch 2.1 366 | hidden_states = F.scaled_dot_product_attention( 367 | query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False 368 | ) 369 | 370 | hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim) 371 | hidden_states = hidden_states.to(query.dtype) 372 | 373 | if not self.skip: 374 | # for ip-adapter 375 | ip_key = self.to_k_ip(ip_hidden_states) 376 | ip_value = self.to_v_ip(ip_hidden_states) 377 | 378 | ip_key = ip_key.view(batch_size, 
-1, attn.heads, head_dim).transpose(1, 2) 379 | ip_value = ip_value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) 380 | 381 | # the output of sdp = (batch, num_heads, seq_len, head_dim) 382 | # TODO: add support for attn.scale when we move to Torch 2.1 383 | ip_hidden_states = F.scaled_dot_product_attention( 384 | query, ip_key, ip_value, attn_mask=None, dropout_p=0.0, is_causal=False 385 | ) 386 | with torch.no_grad(): 387 | self.attn_map = query @ ip_key.transpose(-2, -1).softmax(dim=-1) 388 | #print(self.attn_map.shape) 389 | 390 | ip_hidden_states = ip_hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim) 391 | ip_hidden_states = ip_hidden_states.to(query.dtype) 392 | 393 | hidden_states = hidden_states + self.scale * ip_hidden_states 394 | 395 | # linear proj 396 | hidden_states = attn.to_out[0](hidden_states) 397 | # dropout 398 | hidden_states = attn.to_out[1](hidden_states) 399 | 400 | if input_ndim == 4: 401 | hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width) 402 | 403 | if attn.residual_connection: 404 | hidden_states = hidden_states + residual 405 | 406 | hidden_states = hidden_states / attn.rescale_output_factor 407 | 408 | return hidden_states 409 | 410 | 411 | ## for controlnet 412 | class CNAttnProcessor: 413 | r""" 414 | Default processor for performing attention-related computations. 415 | """ 416 | 417 | def __init__(self, num_tokens=4): 418 | self.num_tokens = num_tokens 419 | 420 | def __call__(self, attn, hidden_states, encoder_hidden_states=None, attention_mask=None, temb=None): 421 | residual = hidden_states 422 | 423 | if attn.spatial_norm is not None: 424 | hidden_states = attn.spatial_norm(hidden_states, temb) 425 | 426 | input_ndim = hidden_states.ndim 427 | 428 | if input_ndim == 4: 429 | batch_size, channel, height, width = hidden_states.shape 430 | hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2) 431 | 432 | batch_size, sequence_length, _ = ( 433 | hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape 434 | ) 435 | attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size) 436 | 437 | if attn.group_norm is not None: 438 | hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2) 439 | 440 | query = attn.to_q(hidden_states) 441 | 442 | if encoder_hidden_states is None: 443 | encoder_hidden_states = hidden_states 444 | else: 445 | end_pos = encoder_hidden_states.shape[1] - self.num_tokens 446 | encoder_hidden_states = encoder_hidden_states[:, :end_pos] # only use text 447 | if attn.norm_cross: 448 | encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states) 449 | 450 | key = attn.to_k(encoder_hidden_states) 451 | value = attn.to_v(encoder_hidden_states) 452 | 453 | query = attn.head_to_batch_dim(query) 454 | key = attn.head_to_batch_dim(key) 455 | value = attn.head_to_batch_dim(value) 456 | 457 | attention_probs = attn.get_attention_scores(query, key, attention_mask) 458 | hidden_states = torch.bmm(attention_probs, value) 459 | hidden_states = attn.batch_to_head_dim(hidden_states) 460 | 461 | # linear proj 462 | hidden_states = attn.to_out[0](hidden_states) 463 | # dropout 464 | hidden_states = attn.to_out[1](hidden_states) 465 | 466 | if input_ndim == 4: 467 | hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width) 468 | 469 | if attn.residual_connection: 470 | hidden_states = hidden_states + residual 471 | 
472 | hidden_states = hidden_states / attn.rescale_output_factor 473 | 474 | return hidden_states 475 | 476 | 477 | class CNAttnProcessor2_0: 478 | r""" 479 | Processor for implementing scaled dot-product attention (enabled by default if you're using PyTorch 2.0). 480 | """ 481 | 482 | def __init__(self, num_tokens=4): 483 | if not hasattr(F, "scaled_dot_product_attention"): 484 | raise ImportError("AttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.") 485 | self.num_tokens = num_tokens 486 | 487 | def __call__( 488 | self, 489 | attn, 490 | hidden_states, 491 | encoder_hidden_states=None, 492 | attention_mask=None, 493 | temb=None, 494 | ): 495 | residual = hidden_states 496 | 497 | if attn.spatial_norm is not None: 498 | hidden_states = attn.spatial_norm(hidden_states, temb) 499 | 500 | input_ndim = hidden_states.ndim 501 | 502 | if input_ndim == 4: 503 | batch_size, channel, height, width = hidden_states.shape 504 | hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2) 505 | 506 | batch_size, sequence_length, _ = ( 507 | hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape 508 | ) 509 | 510 | if attention_mask is not None: 511 | attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size) 512 | # scaled_dot_product_attention expects attention_mask shape to be 513 | # (batch, heads, source_length, target_length) 514 | attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1]) 515 | 516 | if attn.group_norm is not None: 517 | hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2) 518 | 519 | query = attn.to_q(hidden_states) 520 | 521 | if encoder_hidden_states is None: 522 | encoder_hidden_states = hidden_states 523 | else: 524 | end_pos = encoder_hidden_states.shape[1] - self.num_tokens 525 | encoder_hidden_states = encoder_hidden_states[:, :end_pos] # only use text 526 | if attn.norm_cross: 527 | encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states) 528 | 529 | key = attn.to_k(encoder_hidden_states) 530 | value = attn.to_v(encoder_hidden_states) 531 | 532 | inner_dim = key.shape[-1] 533 | head_dim = inner_dim // attn.heads 534 | 535 | query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) 536 | 537 | key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) 538 | value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) 539 | 540 | # the output of sdp = (batch, num_heads, seq_len, head_dim) 541 | # TODO: add support for attn.scale when we move to Torch 2.1 542 | hidden_states = F.scaled_dot_product_attention( 543 | query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False 544 | ) 545 | 546 | hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim) 547 | hidden_states = hidden_states.to(query.dtype) 548 | 549 | # linear proj 550 | hidden_states = attn.to_out[0](hidden_states) 551 | # dropout 552 | hidden_states = attn.to_out[1](hidden_states) 553 | 554 | if input_ndim == 4: 555 | hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width) 556 | 557 | if attn.residual_connection: 558 | hidden_states = hidden_states + residual 559 | 560 | hidden_states = hidden_states / attn.rescale_output_factor 561 | 562 | return hidden_states 563 | -------------------------------------------------------------------------------- /ip_adapter/ip_adapter.py: 
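Before the file itself, a minimal usage sketch for the IPAdapter class defined below. It only relies on the constructor and generate() signatures that appear in this file; the pipeline, CLIP-vision object, checkpoint path, projection_dim value, and image tensor are illustrative assumptions, not part of the repo.

# Hypothetical setup: `pipe` is a diffusers StableDiffusionPipeline you have already
# built, `clip_vision` is any object whose .encode_image(tensor) result can be indexed
# with ["image_embeds"] (e.g. a model loaded via comfy.clip_vision.load), and
# `image_tensor` is a (1, H, W, 3) torch tensor. Paths and numbers below are examples.
from ip_adapter import IPAdapter  # via the package __init__ shown above; exact import path depends on installation

ip_model = IPAdapter(
    sd_pipe=pipe,                                    # diffusers StableDiffusionPipeline
    image_encoder=clip_vision,                       # must expose .encode_image(...)
    ip_ckpt="models/ipadapter/ip-adapter_sd15.bin",  # hypothetical checkpoint path
    device="cuda",
    image_encoder_config={"projection_dim": 1024},   # read this from the image encoder's config; 1024 is illustrative
    num_tokens=4,
)

images = ip_model.generate(
    pil_image=image_tensor,       # despite the name, this code path expects a 4-D torch tensor
    prompt="a photo of a cat",
    num_samples=1,
    seed=42,
    guidance_scale=7.5,
    num_inference_steps=30,
)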
-------------------------------------------------------------------------------- 1 | import os 2 | from typing import List 3 | 4 | import torch 5 | from diffusers import StableDiffusionPipeline 6 | from diffusers.pipelines.controlnet import MultiControlNetModel 7 | from PIL import Image 8 | from safetensors import safe_open 9 | from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection 10 | 11 | from .utils import is_torch2_available, get_generator 12 | from comfy.model_management import cleanup_models 13 | if is_torch2_available(): 14 | from .attention_processor import ( 15 | AttnProcessor2_0 as AttnProcessor, 16 | ) 17 | from .attention_processor import ( 18 | CNAttnProcessor2_0 as CNAttnProcessor, 19 | ) 20 | from .attention_processor import ( 21 | IPAttnProcessor2_0 as IPAttnProcessor, 22 | ) 23 | else: 24 | from .attention_processor import AttnProcessor, CNAttnProcessor, IPAttnProcessor 25 | from .resampler import Resampler 26 | 27 | 28 | class ImageProjModel(torch.nn.Module): 29 | """Projection Model""" 30 | 31 | def __init__(self, cross_attention_dim=1024, clip_embeddings_dim=1024, clip_extra_context_tokens=4): 32 | super().__init__() 33 | 34 | self.generator = None 35 | self.cross_attention_dim = cross_attention_dim 36 | self.clip_extra_context_tokens = clip_extra_context_tokens 37 | self.proj = torch.nn.Linear(clip_embeddings_dim, self.clip_extra_context_tokens * cross_attention_dim) 38 | self.norm = torch.nn.LayerNorm(cross_attention_dim) 39 | 40 | def forward(self, image_embeds): 41 | embeds = image_embeds 42 | clip_extra_context_tokens = self.proj(embeds).reshape( 43 | -1, self.clip_extra_context_tokens, self.cross_attention_dim 44 | ) 45 | clip_extra_context_tokens = self.norm(clip_extra_context_tokens) 46 | return clip_extra_context_tokens 47 | 48 | 49 | class MLPProjModel(torch.nn.Module): 50 | """SD model with image prompt""" 51 | def __init__(self, cross_attention_dim=1024, clip_embeddings_dim=1024): 52 | super().__init__() 53 | 54 | self.proj = torch.nn.Sequential( 55 | torch.nn.Linear(clip_embeddings_dim, clip_embeddings_dim), 56 | torch.nn.GELU(), 57 | torch.nn.Linear(clip_embeddings_dim, cross_attention_dim), 58 | torch.nn.LayerNorm(cross_attention_dim) 59 | ) 60 | 61 | def forward(self, image_embeds): 62 | clip_extra_context_tokens = self.proj(image_embeds) 63 | return clip_extra_context_tokens 64 | 65 | 66 | class IPAdapter: 67 | def __init__(self, sd_pipe, image_encoder, ip_ckpt, device,image_encoder_config, num_tokens=4, target_blocks=["block"]): 68 | self.device = device 69 | # self.image_encoder_path = image_encoder_path 70 | self.ip_ckpt = ip_ckpt 71 | self.num_tokens = num_tokens 72 | self.target_blocks = target_blocks 73 | self.image_encoder_config=image_encoder_config 74 | self.pipe = sd_pipe.to(self.device) 75 | self.set_ip_adapter() 76 | 77 | # load image encoder 78 | # self.image_encoder = CLIPVisionModelWithProjection.from_pretrained(self.image_encoder_path).to( 79 | # self.device, dtype=torch.float16 80 | # ) 81 | self.image_encoder = image_encoder.encode_image 82 | 83 | self.clip_image_processor = CLIPImageProcessor() 84 | # image proj model 85 | self.image_proj_model = self.init_proj() 86 | 87 | self.load_ip_adapter() 88 | 89 | def init_proj(self): 90 | image_proj_model = ImageProjModel( 91 | cross_attention_dim=self.pipe.unet.config.cross_attention_dim, 92 | clip_embeddings_dim=self.image_encoder_config["projection_dim"], 93 | clip_extra_context_tokens=self.num_tokens, 94 | ).to(self.device, dtype=torch.float16) 95 | return 
image_proj_model 96 | 97 | def set_ip_adapter(self): 98 | unet = self.pipe.unet 99 | attn_procs = {} 100 | for name in unet.attn_processors.keys(): 101 | cross_attention_dim = None if name.endswith("attn1.processor") else unet.config.cross_attention_dim 102 | if name.startswith("mid_block"): 103 | hidden_size = unet.config.block_out_channels[-1] 104 | elif name.startswith("up_blocks"): 105 | block_id = int(name[len("up_blocks.")]) 106 | hidden_size = list(reversed(unet.config.block_out_channels))[block_id] 107 | elif name.startswith("down_blocks"): 108 | block_id = int(name[len("down_blocks.")]) 109 | hidden_size = unet.config.block_out_channels[block_id] 110 | if cross_attention_dim is None: 111 | attn_procs[name] = AttnProcessor() 112 | else: 113 | selected = False 114 | for block_name in self.target_blocks: 115 | if block_name in name: 116 | selected = True 117 | break 118 | if selected: 119 | attn_procs[name] = IPAttnProcessor( 120 | hidden_size=hidden_size, 121 | cross_attention_dim=cross_attention_dim, 122 | scale=1.0, 123 | num_tokens=self.num_tokens, 124 | ).to(self.device, dtype=torch.float16) 125 | else: 126 | attn_procs[name] = IPAttnProcessor( 127 | hidden_size=hidden_size, 128 | cross_attention_dim=cross_attention_dim, 129 | scale=1.0, 130 | num_tokens=self.num_tokens, 131 | skip=True 132 | ).to(self.device, dtype=torch.float16) 133 | unet.set_attn_processor(attn_procs) 134 | if hasattr(self.pipe, "controlnet"): 135 | if isinstance(self.pipe.controlnet, MultiControlNetModel): 136 | for controlnet in self.pipe.controlnet.nets: 137 | controlnet.set_attn_processor(CNAttnProcessor(num_tokens=self.num_tokens)) 138 | else: 139 | self.pipe.controlnet.set_attn_processor(CNAttnProcessor(num_tokens=self.num_tokens)) 140 | 141 | def load_ip_adapter(self): 142 | if os.path.splitext(self.ip_ckpt)[-1] == ".safetensors": 143 | state_dict = {"image_proj": {}, "ip_adapter": {}} 144 | with safe_open(self.ip_ckpt, framework="pt", device="cpu") as f: 145 | for key in f.keys(): 146 | if key.startswith("image_proj."): 147 | state_dict["image_proj"][key.replace("image_proj.", "")] = f.get_tensor(key) 148 | elif key.startswith("ip_adapter."): 149 | state_dict["ip_adapter"][key.replace("ip_adapter.", "")] = f.get_tensor(key) 150 | else: 151 | state_dict = torch.load(self.ip_ckpt, map_location="cpu") 152 | self.image_proj_model.load_state_dict(state_dict["image_proj"]) 153 | ip_layers = torch.nn.ModuleList(self.pipe.unet.attn_processors.values()) 154 | ip_layers.load_state_dict(state_dict["ip_adapter"], strict=False) 155 | 156 | @torch.inference_mode() 157 | def get_image_embeds(self, pil_image=None, clip_image_embeds=None, content_prompt_embeds=None): 158 | 159 | if isinstance(pil_image, torch.Tensor): 160 | 161 | clip_image_embeds = self.image_encoder(pil_image)["image_embeds"] 162 | clip_image_embeds = clip_image_embeds.clone().detach().to(self.device, dtype=torch.float16) 163 | del self.image_encoder 164 | cleanup_models(keep_clone_weights_loaded=False) 165 | else: 166 | clip_image_embeds = clip_image_embeds.to(self.device, dtype=torch.float16) 167 | clip_image_embeds = clip_image_embeds.clone().detach().to(self.device, dtype=torch.float16) 168 | if content_prompt_embeds is not None: 169 | clip_image_embeds = clip_image_embeds - content_prompt_embeds 170 | clip_image_embeds = clip_image_embeds.clone().detach().to(self.device, dtype=torch.float16) 171 | image_prompt_embeds = self.image_proj_model(clip_image_embeds) 172 | uncond_image_prompt_embeds = 
self.image_proj_model(torch.zeros_like(clip_image_embeds)) 173 | return image_prompt_embeds, uncond_image_prompt_embeds 174 | 175 | def set_scale(self, scale): 176 | for attn_processor in self.pipe.unet.attn_processors.values(): 177 | if isinstance(attn_processor, IPAttnProcessor): 178 | attn_processor.scale = scale 179 | 180 | def generate( 181 | self, 182 | pil_image=None, 183 | clip_image_embeds=None, 184 | prompt=None, 185 | negative_prompt=None, 186 | scale=1.0, 187 | num_samples=4, 188 | seed=None, 189 | guidance_scale=7.5, 190 | num_inference_steps=30, 191 | neg_content_emb=None, 192 | **kwargs, 193 | ): 194 | self.set_scale(scale) 195 | if isinstance(pil_image, torch.Tensor): # unpack the batch size only when an image tensor is provided 196 | d1, _, _, _ = pil_image.size() 197 | num_prompts = d1 198 | else: 199 | num_prompts = clip_image_embeds.size(0) 200 | 201 | if prompt is None: 202 | prompt = "best quality, high quality" 203 | if negative_prompt is None: 204 | negative_prompt = "monochrome, lowres, bad anatomy, worst quality, low quality" 205 | 206 | if not isinstance(prompt, List): 207 | prompt = [prompt] * num_prompts 208 | if not isinstance(negative_prompt, List): 209 | negative_prompt = [negative_prompt] * num_prompts 210 | 211 | image_prompt_embeds, uncond_image_prompt_embeds = self.get_image_embeds( 212 | pil_image=pil_image, clip_image_embeds=clip_image_embeds, content_prompt_embeds=neg_content_emb 213 | ) 214 | bs_embed, seq_len, _ = image_prompt_embeds.shape 215 | image_prompt_embeds = image_prompt_embeds.repeat(1, num_samples, 1) 216 | image_prompt_embeds = image_prompt_embeds.view(bs_embed * num_samples, seq_len, -1) 217 | uncond_image_prompt_embeds = uncond_image_prompt_embeds.repeat(1, num_samples, 1) 218 | uncond_image_prompt_embeds = uncond_image_prompt_embeds.view(bs_embed * num_samples, seq_len, -1) 219 | 220 | with torch.inference_mode(): 221 | prompt_embeds_, negative_prompt_embeds_ = self.pipe.encode_prompt( 222 | prompt, 223 | device=self.device, 224 | num_images_per_prompt=num_samples, 225 | do_classifier_free_guidance=True, 226 | negative_prompt=negative_prompt, 227 | ) 228 | prompt_embeds = torch.cat([prompt_embeds_, image_prompt_embeds], dim=1) 229 | negative_prompt_embeds = torch.cat([negative_prompt_embeds_, uncond_image_prompt_embeds], dim=1) 230 | 231 | generator = get_generator(seed, self.device) 232 | 233 | images = self.pipe( 234 | prompt_embeds=prompt_embeds, 235 | negative_prompt_embeds=negative_prompt_embeds, 236 | guidance_scale=guidance_scale, 237 | num_inference_steps=num_inference_steps, 238 | generator=generator, 239 | **kwargs, 240 | ).images 241 | 242 | return images 243 | 244 | 245 | class IPAdapterXL(IPAdapter): 246 | """SDXL""" 247 | 248 | def generate( 249 | self, 250 | pil_image, 251 | prompt=None, 252 | negative_prompt=None, 253 | scale=1.0, 254 | num_samples=4, 255 | seed=None, 256 | num_inference_steps=30, 257 | neg_content_emb=None, 258 | neg_content_prompt=None, 259 | neg_content_scale=1.0, 260 | **kwargs, 261 | ): 262 | self.set_scale(scale) 263 | d1,_,_,_=pil_image.size() 264 | num_prompts = d1 265 | 266 | if prompt is None: 267 | prompt = "best quality, high quality" 268 | if negative_prompt is None: 269 | negative_prompt = "monochrome, lowres, bad anatomy, worst quality, low quality" 270 | 271 | if not isinstance(prompt, List): 272 | prompt = [prompt] * num_prompts 273 | if not isinstance(negative_prompt, List): 274 | negative_prompt = [negative_prompt] * num_prompts 275 | 276 | if neg_content_emb is None: 277 | if neg_content_prompt is not None: 278 | with torch.inference_mode(): 
279 | ( 280 | prompt_embeds_, # torch.Size([1, 77, 2048]) 281 | negative_prompt_embeds_, 282 | pooled_prompt_embeds_, # torch.Size([1, 1280]) 283 | negative_pooled_prompt_embeds_, 284 | ) = self.pipe.encode_prompt( 285 | neg_content_prompt, 286 | num_images_per_prompt=num_samples, 287 | do_classifier_free_guidance=True, 288 | negative_prompt=negative_prompt, 289 | ) 290 | pooled_prompt_embeds_ *= neg_content_scale 291 | else: 292 | pooled_prompt_embeds_ = neg_content_emb 293 | else: 294 | pooled_prompt_embeds_ = None 295 | 296 | image_prompt_embeds, uncond_image_prompt_embeds = self.get_image_embeds(pil_image, content_prompt_embeds=pooled_prompt_embeds_) 297 | bs_embed, seq_len, _ = image_prompt_embeds.shape 298 | image_prompt_embeds = image_prompt_embeds.repeat(1, num_samples, 1) 299 | image_prompt_embeds = image_prompt_embeds.view(bs_embed * num_samples, seq_len, -1) 300 | uncond_image_prompt_embeds = uncond_image_prompt_embeds.repeat(1, num_samples, 1) 301 | uncond_image_prompt_embeds = uncond_image_prompt_embeds.view(bs_embed * num_samples, seq_len, -1) 302 | 303 | with torch.inference_mode(): 304 | ( 305 | prompt_embeds, 306 | negative_prompt_embeds, 307 | pooled_prompt_embeds, 308 | negative_pooled_prompt_embeds, 309 | ) = self.pipe.encode_prompt( 310 | prompt, 311 | num_images_per_prompt=num_samples, 312 | do_classifier_free_guidance=True, 313 | negative_prompt=negative_prompt, 314 | ) 315 | prompt_embeds = torch.cat([prompt_embeds, image_prompt_embeds], dim=1) 316 | negative_prompt_embeds = torch.cat([negative_prompt_embeds, uncond_image_prompt_embeds], dim=1) 317 | 318 | self.generator = get_generator(seed, self.device) 319 | 320 | cleanup_models(keep_clone_weights_loaded=False) 321 | images = self.pipe( 322 | prompt_embeds=prompt_embeds, 323 | negative_prompt_embeds=negative_prompt_embeds, 324 | pooled_prompt_embeds=pooled_prompt_embeds, 325 | negative_pooled_prompt_embeds=negative_pooled_prompt_embeds, 326 | num_inference_steps=num_inference_steps, 327 | generator=self.generator, 328 | **kwargs, 329 | ).images 330 | 331 | return images 332 | 333 | 334 | class IPAdapterPlus(IPAdapter): 335 | """IP-Adapter with fine-grained features""" 336 | 337 | def init_proj(self): 338 | image_proj_model = Resampler( 339 | dim=self.pipe.unet.config.cross_attention_dim, 340 | depth=4, 341 | dim_head=64, 342 | heads=12, 343 | num_queries=self.num_tokens, 344 | embedding_dim=self.image_encoder.config.hidden_size, 345 | output_dim=self.pipe.unet.config.cross_attention_dim, 346 | ff_mult=4, 347 | ).to(self.device, dtype=torch.float16) 348 | return image_proj_model 349 | 350 | @torch.inference_mode() 351 | def get_image_embeds(self, pil_image=None, clip_image_embeds=None): 352 | if isinstance(pil_image, Image.Image): 353 | pil_image = [pil_image] 354 | clip_image = self.clip_image_processor(images=pil_image, return_tensors="pt").pixel_values 355 | clip_image = clip_image.to(self.device, dtype=torch.float16) 356 | clip_image_embeds = self.image_encoder(clip_image, output_hidden_states=True).hidden_states[-2] 357 | image_prompt_embeds = self.image_proj_model(clip_image_embeds) 358 | uncond_clip_image_embeds = self.image_encoder( 359 | torch.zeros_like(clip_image), output_hidden_states=True 360 | ).hidden_states[-2] 361 | uncond_image_prompt_embeds = self.image_proj_model(uncond_clip_image_embeds) 362 | return image_prompt_embeds, uncond_image_prompt_embeds 363 | 364 | 365 | class IPAdapterFull(IPAdapterPlus): 366 | """IP-Adapter with full features""" 367 | 368 | def init_proj(self): 369 | 
image_proj_model = MLPProjModel( 370 | cross_attention_dim=self.pipe.unet.config.cross_attention_dim, 371 | clip_embeddings_dim=self.image_encoder.config.hidden_size, 372 | ).to(self.device, dtype=torch.float16) 373 | return image_proj_model 374 | 375 | 376 | class IPAdapterPlusXL(IPAdapter): 377 | """SDXL""" 378 | 379 | def init_proj(self): 380 | image_proj_model = Resampler( 381 | dim=1280, 382 | depth=4, 383 | dim_head=64, 384 | heads=20, 385 | num_queries=self.num_tokens, 386 | embedding_dim=self.image_encoder.config.hidden_size, 387 | output_dim=self.pipe.unet.config.cross_attention_dim, 388 | ff_mult=4, 389 | ).to(self.device, dtype=torch.float16) 390 | return image_proj_model 391 | 392 | @torch.inference_mode() 393 | def get_image_embeds(self, pil_image): 394 | if isinstance(pil_image, Image.Image): 395 | pil_image = [pil_image] 396 | clip_image = self.clip_image_processor(images=pil_image, return_tensors="pt").pixel_values 397 | clip_image = clip_image.to(self.device, dtype=torch.float16) 398 | clip_image_embeds = self.image_encoder(clip_image, output_hidden_states=True).hidden_states[-2] 399 | image_prompt_embeds = self.image_proj_model(clip_image_embeds) 400 | uncond_clip_image_embeds = self.image_encoder( 401 | torch.zeros_like(clip_image), output_hidden_states=True 402 | ).hidden_states[-2] 403 | uncond_image_prompt_embeds = self.image_proj_model(uncond_clip_image_embeds) 404 | return image_prompt_embeds, uncond_image_prompt_embeds 405 | 406 | def generate( 407 | self, 408 | pil_image, 409 | prompt=None, 410 | negative_prompt=None, 411 | scale=1.0, 412 | num_samples=4, 413 | seed=None, 414 | num_inference_steps=30, 415 | **kwargs, 416 | ): 417 | self.set_scale(scale) 418 | 419 | num_prompts = 1 if isinstance(pil_image, Image.Image) else len(pil_image) 420 | 421 | if prompt is None: 422 | prompt = "best quality, high quality" 423 | if negative_prompt is None: 424 | negative_prompt = "monochrome, lowres, bad anatomy, worst quality, low quality" 425 | 426 | if not isinstance(prompt, List): 427 | prompt = [prompt] * num_prompts 428 | if not isinstance(negative_prompt, List): 429 | negative_prompt = [negative_prompt] * num_prompts 430 | 431 | image_prompt_embeds, uncond_image_prompt_embeds = self.get_image_embeds(pil_image) 432 | bs_embed, seq_len, _ = image_prompt_embeds.shape 433 | image_prompt_embeds = image_prompt_embeds.repeat(1, num_samples, 1) 434 | image_prompt_embeds = image_prompt_embeds.view(bs_embed * num_samples, seq_len, -1) 435 | uncond_image_prompt_embeds = uncond_image_prompt_embeds.repeat(1, num_samples, 1) 436 | uncond_image_prompt_embeds = uncond_image_prompt_embeds.view(bs_embed * num_samples, seq_len, -1) 437 | 438 | with torch.inference_mode(): 439 | ( 440 | prompt_embeds, 441 | negative_prompt_embeds, 442 | pooled_prompt_embeds, 443 | negative_pooled_prompt_embeds, 444 | ) = self.pipe.encode_prompt( 445 | prompt, 446 | num_images_per_prompt=num_samples, 447 | do_classifier_free_guidance=True, 448 | negative_prompt=negative_prompt, 449 | ) 450 | prompt_embeds = torch.cat([prompt_embeds, image_prompt_embeds], dim=1) 451 | negative_prompt_embeds = torch.cat([negative_prompt_embeds, uncond_image_prompt_embeds], dim=1) 452 | 453 | generator = get_generator(seed, self.device) 454 | 455 | images = self.pipe( 456 | prompt_embeds=prompt_embeds, 457 | negative_prompt_embeds=negative_prompt_embeds, 458 | pooled_prompt_embeds=pooled_prompt_embeds, 459 | negative_pooled_prompt_embeds=negative_pooled_prompt_embeds, 460 | num_inference_steps=num_inference_steps, 461 | 
generator=generator, 462 | **kwargs, 463 | ).images 464 | 465 | return images 466 | -------------------------------------------------------------------------------- /ip_adapter/resampler.py: -------------------------------------------------------------------------------- 1 | # modified from https://github.com/mlfoundations/open_flamingo/blob/main/open_flamingo/src/helpers.py 2 | # and https://github.com/lucidrains/imagen-pytorch/blob/main/imagen_pytorch/imagen_pytorch.py 3 | 4 | import math 5 | 6 | import torch 7 | import torch.nn as nn 8 | from einops import rearrange 9 | from einops.layers.torch import Rearrange 10 | 11 | 12 | # FFN 13 | def FeedForward(dim, mult=4): 14 | inner_dim = int(dim * mult) 15 | return nn.Sequential( 16 | nn.LayerNorm(dim), 17 | nn.Linear(dim, inner_dim, bias=False), 18 | nn.GELU(), 19 | nn.Linear(inner_dim, dim, bias=False), 20 | ) 21 | 22 | 23 | def reshape_tensor(x, heads): 24 | bs, length, width = x.shape 25 | # (bs, length, width) --> (bs, length, n_heads, dim_per_head) 26 | x = x.view(bs, length, heads, -1) 27 | # (bs, length, n_heads, dim_per_head) --> (bs, n_heads, length, dim_per_head) 28 | x = x.transpose(1, 2) 29 | # (bs, n_heads, length, dim_per_head) --> (bs*n_heads, length, dim_per_head) 30 | x = x.reshape(bs, heads, length, -1) 31 | return x 32 | 33 | 34 | class PerceiverAttention(nn.Module): 35 | def __init__(self, *, dim, dim_head=64, heads=8): 36 | super().__init__() 37 | self.scale = dim_head**-0.5 38 | self.dim_head = dim_head 39 | self.heads = heads 40 | inner_dim = dim_head * heads 41 | 42 | self.norm1 = nn.LayerNorm(dim) 43 | self.norm2 = nn.LayerNorm(dim) 44 | 45 | self.to_q = nn.Linear(dim, inner_dim, bias=False) 46 | self.to_kv = nn.Linear(dim, inner_dim * 2, bias=False) 47 | self.to_out = nn.Linear(inner_dim, dim, bias=False) 48 | 49 | def forward(self, x, latents): 50 | """ 51 | Args: 52 | x (torch.Tensor): image features 53 | shape (b, n1, D) 54 | latent (torch.Tensor): latent features 55 | shape (b, n2, D) 56 | """ 57 | x = self.norm1(x) 58 | latents = self.norm2(latents) 59 | 60 | b, l, _ = latents.shape 61 | 62 | q = self.to_q(latents) 63 | kv_input = torch.cat((x, latents), dim=-2) 64 | k, v = self.to_kv(kv_input).chunk(2, dim=-1) 65 | 66 | q = reshape_tensor(q, self.heads) 67 | k = reshape_tensor(k, self.heads) 68 | v = reshape_tensor(v, self.heads) 69 | 70 | # attention 71 | scale = 1 / math.sqrt(math.sqrt(self.dim_head)) 72 | weight = (q * scale) @ (k * scale).transpose(-2, -1) # More stable with f16 than dividing afterwards 73 | weight = torch.softmax(weight.float(), dim=-1).type(weight.dtype) 74 | out = weight @ v 75 | 76 | out = out.permute(0, 2, 1, 3).reshape(b, l, -1) 77 | 78 | return self.to_out(out) 79 | 80 | 81 | class Resampler(nn.Module): 82 | def __init__( 83 | self, 84 | dim=1024, 85 | depth=8, 86 | dim_head=64, 87 | heads=16, 88 | num_queries=8, 89 | embedding_dim=768, 90 | output_dim=1024, 91 | ff_mult=4, 92 | max_seq_len: int = 257, # CLIP tokens + CLS token 93 | apply_pos_emb: bool = False, 94 | num_latents_mean_pooled: int = 0, # number of latents derived from mean pooled representation of the sequence 95 | ): 96 | super().__init__() 97 | self.pos_emb = nn.Embedding(max_seq_len, embedding_dim) if apply_pos_emb else None 98 | 99 | self.latents = nn.Parameter(torch.randn(1, num_queries, dim) / dim**0.5) 100 | 101 | self.proj_in = nn.Linear(embedding_dim, dim) 102 | 103 | self.proj_out = nn.Linear(dim, output_dim) 104 | self.norm_out = nn.LayerNorm(output_dim) 105 | 106 | self.to_latents_from_mean_pooled_seq = ( 
107 | nn.Sequential( 108 | nn.LayerNorm(dim), 109 | nn.Linear(dim, dim * num_latents_mean_pooled), 110 | Rearrange("b (n d) -> b n d", n=num_latents_mean_pooled), 111 | ) 112 | if num_latents_mean_pooled > 0 113 | else None 114 | ) 115 | 116 | self.layers = nn.ModuleList([]) 117 | for _ in range(depth): 118 | self.layers.append( 119 | nn.ModuleList( 120 | [ 121 | PerceiverAttention(dim=dim, dim_head=dim_head, heads=heads), 122 | FeedForward(dim=dim, mult=ff_mult), 123 | ] 124 | ) 125 | ) 126 | 127 | def forward(self, x): 128 | if self.pos_emb is not None: 129 | n, device = x.shape[1], x.device 130 | pos_emb = self.pos_emb(torch.arange(n, device=device)) 131 | x = x + pos_emb 132 | 133 | latents = self.latents.repeat(x.size(0), 1, 1) 134 | 135 | x = self.proj_in(x) 136 | 137 | if self.to_latents_from_mean_pooled_seq: 138 | meanpooled_seq = masked_mean(x, dim=1, mask=torch.ones(x.shape[:2], device=x.device, dtype=torch.bool)) 139 | meanpooled_latents = self.to_latents_from_mean_pooled_seq(meanpooled_seq) 140 | latents = torch.cat((meanpooled_latents, latents), dim=-2) 141 | 142 | for attn, ff in self.layers: 143 | latents = attn(x, latents) + latents 144 | latents = ff(latents) + latents 145 | 146 | latents = self.proj_out(latents) 147 | return self.norm_out(latents) 148 | 149 | 150 | def masked_mean(t, *, dim, mask=None): 151 | if mask is None: 152 | return t.mean(dim=dim) 153 | 154 | denom = mask.sum(dim=dim, keepdim=True) 155 | mask = rearrange(mask, "b n -> b n 1") 156 | masked_t = t.masked_fill(~mask, 0.0) 157 | 158 | return masked_t.sum(dim=dim) / denom.clamp(min=1e-5) 159 | -------------------------------------------------------------------------------- /ip_adapter/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | import numpy as np 4 | from PIL import Image 5 | 6 | attn_maps = {} 7 | def hook_fn(name): 8 | def forward_hook(module, input, output): 9 | if hasattr(module.processor, "attn_map"): 10 | attn_maps[name] = module.processor.attn_map 11 | del module.processor.attn_map 12 | 13 | return forward_hook 14 | 15 | def register_cross_attention_hook(unet): 16 | for name, module in unet.named_modules(): 17 | if name.split('.')[-1].startswith('attn2'): 18 | module.register_forward_hook(hook_fn(name)) 19 | 20 | return unet 21 | 22 | def upscale(attn_map, target_size): 23 | attn_map = torch.mean(attn_map, dim=0) 24 | attn_map = attn_map.permute(1,0) 25 | temp_size = None 26 | 27 | for i in range(0,5): 28 | scale = 2 ** i 29 | if ( target_size[0] // scale ) * ( target_size[1] // scale) == attn_map.shape[1]*64: 30 | temp_size = (target_size[0]//(scale*8), target_size[1]//(scale*8)) 31 | break 32 | 33 | assert temp_size is not None, "temp_size cannot is None" 34 | 35 | attn_map = attn_map.view(attn_map.shape[0], *temp_size) 36 | 37 | attn_map = F.interpolate( 38 | attn_map.unsqueeze(0).to(dtype=torch.float32), 39 | size=target_size, 40 | mode='bilinear', 41 | align_corners=False 42 | )[0] 43 | 44 | attn_map = torch.softmax(attn_map, dim=0) 45 | return attn_map 46 | def get_net_attn_map(image_size, batch_size=2, instance_or_negative=False, detach=True): 47 | 48 | idx = 0 if instance_or_negative else 1 49 | net_attn_maps = [] 50 | 51 | for name, attn_map in attn_maps.items(): 52 | attn_map = attn_map.cpu() if detach else attn_map 53 | attn_map = torch.chunk(attn_map, batch_size)[idx].squeeze() 54 | attn_map = upscale(attn_map, image_size) 55 | net_attn_maps.append(attn_map) 56 | 57 | net_attn_maps = 
torch.mean(torch.stack(net_attn_maps,dim=0),dim=0) 58 | 59 | return net_attn_maps 60 | 61 | def attnmaps2images(net_attn_maps): 62 | 63 | #total_attn_scores = 0 64 | images = [] 65 | 66 | for attn_map in net_attn_maps: 67 | attn_map = attn_map.cpu().numpy() 68 | #total_attn_scores += attn_map.mean().item() 69 | 70 | normalized_attn_map = (attn_map - np.min(attn_map)) / (np.max(attn_map) - np.min(attn_map)) * 255 71 | normalized_attn_map = normalized_attn_map.astype(np.uint8) 72 | #print("norm: ", normalized_attn_map.shape) 73 | image = Image.fromarray(normalized_attn_map) 74 | 75 | #image = fix_save_attn_map(attn_map) 76 | images.append(image) 77 | 78 | #print(total_attn_scores) 79 | return images 80 | def is_torch2_available(): 81 | return hasattr(F, "scaled_dot_product_attention") 82 | 83 | def get_generator(seed, device): 84 | 85 | if seed is not None: 86 | if isinstance(seed, list): 87 | generator = [torch.Generator(device).manual_seed(seed_item) for seed_item in seed] 88 | else: 89 | generator = torch.Generator(device).manual_seed(seed) 90 | else: 91 | generator = None 92 | 93 | return generator -------------------------------------------------------------------------------- /model.yaml: -------------------------------------------------------------------------------- 1 | lightning_unet: 2 | - sdxl_lightning_1step_unet_x0.safetensors #repo ByteDance/SDXL-Lightning 3 | - sdxl_lightning_2step_unet.safetensors #repo ByteDance/SDXL-Lightning 4 | - sdxl_lightning_4step_unet.safetensors 5 | - sdxl_lightning_8step_unet.safetensors 6 | - Hyper-SDXL-1step-Unet.safetensors #repo ByteDance/Hyper-SD 7 | - lcm-sdxl-base-1.0.safetensors #repo ckpt/lcm-sdxl-unet you can change model name like example 8 | - dmd2_sdxl_1step_unet_fp16.bin #repo tianweiy/DMD2 9 | - dmd2_sdxl_4step_unet_fp16.bin 10 | surport_model: 11 | - stable-diffusion-v1-5 #repo runwayml/stable-diffusion-v1-5 12 | - stable-diffusion-2-1-base #repo stabilityai/stable-diffusion-2-1-base 13 | - playground-v2-1024px-aesthetic #repo playgroundai/playground-v2-1024px-aesthetic 14 | - Ghibli-Diffusion #repo nitrosocke/Ghibli-Diffusion 15 | surport_controlnet: 16 | - controlnet-canny-sdxl-1.0 #repo diffusers/controlnet-canny-sdxl-1.0 17 | - MistoLine #repo TheMistoAI/MistoLine 18 | - controlnet-openpose-sdxl-1.0 19 | - controlnet-scribble-sdxl-1.0 20 | - stable-diffusion-xl-1.0-inpainting-0.1 21 | - controlnet-tile-sdxl-1.0 22 | sdxl_model: 23 | - stable-diffusion-xl-base-1.0 #repo stabilityai/stable-diffusion-xl-base-1.0 24 | - sdxl-flash #repo sd-community/sdxl-flash 25 | lightning_lora: 26 | - Hyper-SD15-12steps-CFG-lora.safetensors 27 | - Hyper-SD15-1step-lora.safetensors 28 | - Hyper-SD15-2steps-lora.safetensors 29 | - Hyper-SD15-4steps-lora.safetensors 30 | - Hyper-SD15-8steps-CFG-lora.safetensors 31 | - Hyper-SD15-8steps-lora.safetensors 32 | - pcm_sd15_lcmlike_lora_converted.safetensors 33 | - pcm_sd15_normalcfg_16step_converted.safetensors 34 | - pcm_sd15_normalcfg_4step_converted.safetensors 35 | - pcm_sd15_smallcfg_16step_converted.safetensor 36 | - pcm_sd15_smallcfg_2step_converted.safetensors 37 | - pcm_sd15_smallcfg_4step_converted.safetensors 38 | - pcm_sd15_smallcfg_8step_converted.safetensors 39 | - lcm-lora-sdv1-5.safetensors 40 | - TCD-SD15-LoRA.safetensors #need rename and TCD 41 | lightning_xl_lora: 42 | - Hyper-SDXL-12steps-CFG-lora.safetensors 43 | - Hyper-SDXL-1step-lora.safetensors 44 | - Hyper-SDXL-2step-lora.safetensors 45 | - Hyper-SDXL-4step-lora.safetensors 46 | - Hyper-SDXL-8step-lora.safetensors 47 | - 
Hyper-SDXL-8steps-CFG-lora.safetensors 48 | - sdxl_lightning_2step_lora.safetensors 49 | - sdxl_lightning_4step_lora.safetensors 50 | - sdxl_lightning_8step_lora.safetensors 51 | - pcm_sdxl_lcmlike_lora_converted.safetensors 52 | - pcm_sdxl_normalcfg_16step_converted.safetensors 53 | - pcm_sdxl_normalcfg_4step_converted.safetensors 54 | - pcm_sdxl_normalcfg_8step_converted.safetensors 55 | - pcm_sdxl_smallcfg_16step_converted.safetensors 56 | - pcm_sdxl_smallcfg_2step_converted.safetensors 57 | - pcm_sdxl_smallcfg_4step_converted.safetensors 58 | - pcm_sdxl_smallcfg_8step_converted.safetensors 59 | - lcm-lora-sdxl.safetensors 60 | - dmd2_sdxl_4step_lora.safetensors 61 | - dmd2_sdxl_4step_lora_fp16.safetensors 62 | - TCD-SDXL-LoRA.safetensors #need rename and TCD 63 | - manne_turbo.safetensors 64 | 65 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "comfyui_hidiffusion_pro" 3 | description = "A HiDiffusion node for ComfyUI." 4 | version = "1.0.1" 5 | license = { file = "LICENSE" } 6 | 7 | [project.urls] 8 | Repository = "https://github.com/smthemex/ComfyUI_HiDiffusion_Pro" 9 | # Used by Comfy Registry https://comfyregistry.org 10 | 11 | [tool.comfy] 12 | PublisherId = "smthemex" 13 | DisplayName = "ComfyUI_HiDiffusion_Pro" 14 | Icon = "" 15 | -------------------------------------------------------------------------------- /sd15_config/feature_extractor/preprocessor_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "crop_size": { 3 | "height": 224, 4 | "width": 224 5 | }, 6 | "do_center_crop": true, 7 | "do_convert_rgb": true, 8 | "do_normalize": true, 9 | "do_rescale": true, 10 | "do_resize": true, 11 | "feature_extractor_type": "CLIPFeatureExtractor", 12 | "image_mean": [ 13 | 0.48145466, 14 | 0.4578275, 15 | 0.40821073 16 | ], 17 | "image_processor_type": "CLIPFeatureExtractor", 18 | "image_std": [ 19 | 0.26862954, 20 | 0.26130258, 21 | 0.27577711 22 | ], 23 | "resample": 3, 24 | "rescale_factor": 0.00392156862745098, 25 | "size": { 26 | "shortest_edge": 224 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /sd15_config/model_index.json: -------------------------------------------------------------------------------- 1 | { 2 | "_class_name": "StableDiffusionPipeline", 3 | "_diffusers_version": "0.21.0.dev0", 4 | "_name_or_path": "lykon-models/dreamshaper-8", 5 | "feature_extractor": [ 6 | "transformers", 7 | "CLIPFeatureExtractor" 8 | ], 9 | "requires_safety_checker": true, 10 | "safety_checker": [ 11 | "stable_diffusion", 12 | "StableDiffusionSafetyChecker" 13 | ], 14 | "scheduler": [ 15 | "diffusers", 16 | "DEISMultistepScheduler" 17 | ], 18 | "text_encoder": [ 19 | "transformers", 20 | "CLIPTextModel" 21 | ], 22 | "tokenizer": [ 23 | "transformers", 24 | "CLIPTokenizer" 25 | ], 26 | "unet": [ 27 | "diffusers", 28 | "UNet2DConditionModel" 29 | ], 30 | "vae": [ 31 | "diffusers", 32 | "AutoencoderKL" 33 | ] 34 | } 35 | -------------------------------------------------------------------------------- /sd15_config/safety_checker/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "_name_or_path": "/home/patrick/.cache/huggingface/hub/models--lykon-models--dreamshaper-8/snapshots/7e855e3f481832419503d1fa18d4a4379597f04b/safety_checker", 3 | "architectures": [ 4 | 
"StableDiffusionSafetyChecker" 5 | ], 6 | "initializer_factor": 1.0, 7 | "logit_scale_init_value": 2.6592, 8 | "model_type": "clip", 9 | "projection_dim": 768, 10 | "text_config": { 11 | "dropout": 0.0, 12 | "hidden_size": 768, 13 | "intermediate_size": 3072, 14 | "model_type": "clip_text_model", 15 | "num_attention_heads": 12 16 | }, 17 | "torch_dtype": "float16", 18 | "transformers_version": "4.33.0.dev0", 19 | "vision_config": { 20 | "dropout": 0.0, 21 | "hidden_size": 1024, 22 | "intermediate_size": 4096, 23 | "model_type": "clip_vision_model", 24 | "num_attention_heads": 16, 25 | "num_hidden_layers": 24, 26 | "patch_size": 14 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /sd15_config/scheduler/scheduler_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "_class_name": "DEISMultistepScheduler", 3 | "_diffusers_version": "0.21.0.dev0", 4 | "algorithm_type": "deis", 5 | "beta_end": 0.012, 6 | "beta_schedule": "scaled_linear", 7 | "beta_start": 0.00085, 8 | "clip_sample": false, 9 | "dynamic_thresholding_ratio": 0.995, 10 | "lower_order_final": true, 11 | "num_train_timesteps": 1000, 12 | "prediction_type": "epsilon", 13 | "sample_max_value": 1.0, 14 | "set_alpha_to_one": false, 15 | "skip_prk_steps": true, 16 | "solver_order": 2, 17 | "solver_type": "logrho", 18 | "steps_offset": 1, 19 | "thresholding": false, 20 | "timestep_spacing": "leading", 21 | "trained_betas": null, 22 | "use_karras_sigmas": false 23 | } 24 | -------------------------------------------------------------------------------- /sd15_config/text_encoder/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "_name_or_path": "/home/patrick/.cache/huggingface/hub/models--lykon-models--dreamshaper-8/snapshots/7e855e3f481832419503d1fa18d4a4379597f04b/text_encoder", 3 | "architectures": [ 4 | "CLIPTextModel" 5 | ], 6 | "attention_dropout": 0.0, 7 | "bos_token_id": 0, 8 | "dropout": 0.0, 9 | "eos_token_id": 2, 10 | "hidden_act": "quick_gelu", 11 | "hidden_size": 768, 12 | "initializer_factor": 1.0, 13 | "initializer_range": 0.02, 14 | "intermediate_size": 3072, 15 | "layer_norm_eps": 1e-05, 16 | "max_position_embeddings": 77, 17 | "model_type": "clip_text_model", 18 | "num_attention_heads": 12, 19 | "num_hidden_layers": 12, 20 | "pad_token_id": 1, 21 | "projection_dim": 768, 22 | "torch_dtype": "float16", 23 | "transformers_version": "4.33.0.dev0", 24 | "vocab_size": 49408 25 | } 26 | -------------------------------------------------------------------------------- /sd15_config/tokenizer/special_tokens_map.json: -------------------------------------------------------------------------------- 1 | { 2 | "bos_token": { 3 | "content": "<|startoftext|>", 4 | "lstrip": false, 5 | "normalized": true, 6 | "rstrip": false, 7 | "single_word": false 8 | }, 9 | "eos_token": { 10 | "content": "<|endoftext|>", 11 | "lstrip": false, 12 | "normalized": true, 13 | "rstrip": false, 14 | "single_word": false 15 | }, 16 | "pad_token": "<|endoftext|>", 17 | "unk_token": { 18 | "content": "<|endoftext|>", 19 | "lstrip": false, 20 | "normalized": true, 21 | "rstrip": false, 22 | "single_word": false 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /sd15_config/tokenizer/tokenizer_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "add_prefix_space": false, 3 | "bos_token": { 4 | "__type": 
"AddedToken", 5 | "content": "<|startoftext|>", 6 | "lstrip": false, 7 | "normalized": true, 8 | "rstrip": false, 9 | "single_word": false 10 | }, 11 | "clean_up_tokenization_spaces": true, 12 | "do_lower_case": true, 13 | "eos_token": { 14 | "__type": "AddedToken", 15 | "content": "<|endoftext|>", 16 | "lstrip": false, 17 | "normalized": true, 18 | "rstrip": false, 19 | "single_word": false 20 | }, 21 | "errors": "replace", 22 | "model_max_length": 77, 23 | "pad_token": "<|endoftext|>", 24 | "tokenizer_class": "CLIPTokenizer", 25 | "unk_token": { 26 | "__type": "AddedToken", 27 | "content": "<|endoftext|>", 28 | "lstrip": false, 29 | "normalized": true, 30 | "rstrip": false, 31 | "single_word": false 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /sd15_config/unet/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "_class_name": "UNet2DConditionModel", 3 | "_diffusers_version": "0.21.0.dev0", 4 | "_name_or_path": "/home/patrick/.cache/huggingface/hub/models--lykon-models--dreamshaper-8/snapshots/7e855e3f481832419503d1fa18d4a4379597f04b/unet", 5 | "act_fn": "silu", 6 | "addition_embed_type": null, 7 | "addition_embed_type_num_heads": 64, 8 | "addition_time_embed_dim": null, 9 | "attention_head_dim": 8, 10 | "attention_type": "default", 11 | "block_out_channels": [ 12 | 320, 13 | 640, 14 | 1280, 15 | 1280 16 | ], 17 | "center_input_sample": false, 18 | "class_embed_type": null, 19 | "class_embeddings_concat": false, 20 | "conv_in_kernel": 3, 21 | "conv_out_kernel": 3, 22 | "cross_attention_dim": 768, 23 | "cross_attention_norm": null, 24 | "down_block_types": [ 25 | "CrossAttnDownBlock2D", 26 | "CrossAttnDownBlock2D", 27 | "CrossAttnDownBlock2D", 28 | "DownBlock2D" 29 | ], 30 | "downsample_padding": 1, 31 | "dual_cross_attention": false, 32 | "encoder_hid_dim": null, 33 | "encoder_hid_dim_type": null, 34 | "flip_sin_to_cos": true, 35 | "freq_shift": 0, 36 | "in_channels": 4, 37 | "layers_per_block": 2, 38 | "mid_block_only_cross_attention": null, 39 | "mid_block_scale_factor": 1, 40 | "mid_block_type": "UNetMidBlock2DCrossAttn", 41 | "norm_eps": 1e-05, 42 | "norm_num_groups": 32, 43 | "num_attention_heads": null, 44 | "num_class_embeds": null, 45 | "only_cross_attention": false, 46 | "out_channels": 4, 47 | "projection_class_embeddings_input_dim": null, 48 | "resnet_out_scale_factor": 1.0, 49 | "resnet_skip_time_act": false, 50 | "resnet_time_scale_shift": "default", 51 | "sample_size": 64, 52 | "time_cond_proj_dim": null, 53 | "time_embedding_act_fn": null, 54 | "time_embedding_dim": null, 55 | "time_embedding_type": "positional", 56 | "timestep_post_act": null, 57 | "transformer_layers_per_block": 1, 58 | "up_block_types": [ 59 | "UpBlock2D", 60 | "CrossAttnUpBlock2D", 61 | "CrossAttnUpBlock2D", 62 | "CrossAttnUpBlock2D" 63 | ], 64 | "upcast_attention": null, 65 | "use_linear_projection": false 66 | } 67 | -------------------------------------------------------------------------------- /sd15_config/vae/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "_class_name": "AutoencoderKL", 3 | "_diffusers_version": "0.21.0.dev0", 4 | "_name_or_path": "/home/patrick/.cache/huggingface/hub/models--lykon-models--dreamshaper-8/snapshots/7e855e3f481832419503d1fa18d4a4379597f04b/vae", 5 | "act_fn": "silu", 6 | "block_out_channels": [ 7 | 128, 8 | 256, 9 | 512, 10 | 512 11 | ], 12 | "down_block_types": [ 13 | "DownEncoderBlock2D", 14 | 
"DownEncoderBlock2D", 15 | "DownEncoderBlock2D", 16 | "DownEncoderBlock2D" 17 | ], 18 | "force_upcast": true, 19 | "in_channels": 3, 20 | "latent_channels": 4, 21 | "layers_per_block": 2, 22 | "norm_num_groups": 32, 23 | "out_channels": 3, 24 | "sample_size": 512, 25 | "scaling_factor": 0.18215, 26 | "up_block_types": [ 27 | "UpDecoderBlock2D", 28 | "UpDecoderBlock2D", 29 | "UpDecoderBlock2D", 30 | "UpDecoderBlock2D" 31 | ] 32 | } 33 | -------------------------------------------------------------------------------- /sdxl_config/model_index.json: -------------------------------------------------------------------------------- 1 | { 2 | "_class_name": "StableDiffusionXLPipeline", 3 | "_diffusers_version": "0.19.0.dev0", 4 | "force_zeros_for_empty_prompt": true, 5 | "add_watermarker": null, 6 | "scheduler": [ 7 | "diffusers", 8 | "EulerDiscreteScheduler" 9 | ], 10 | "text_encoder": [ 11 | "transformers", 12 | "CLIPTextModel" 13 | ], 14 | "text_encoder_2": [ 15 | "transformers", 16 | "CLIPTextModelWithProjection" 17 | ], 18 | "tokenizer": [ 19 | "transformers", 20 | "CLIPTokenizer" 21 | ], 22 | "tokenizer_2": [ 23 | "transformers", 24 | "CLIPTokenizer" 25 | ], 26 | "unet": [ 27 | "diffusers", 28 | "UNet2DConditionModel" 29 | ], 30 | "vae": [ 31 | "diffusers", 32 | "AutoencoderKL" 33 | ] 34 | } 35 | -------------------------------------------------------------------------------- /sdxl_config/scheduler/scheduler_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "_class_name": "EulerDiscreteScheduler", 3 | "_diffusers_version": "0.19.0.dev0", 4 | "beta_end": 0.012, 5 | "beta_schedule": "scaled_linear", 6 | "beta_start": 0.00085, 7 | "clip_sample": false, 8 | "interpolation_type": "linear", 9 | "num_train_timesteps": 1000, 10 | "prediction_type": "epsilon", 11 | "sample_max_value": 1.0, 12 | "set_alpha_to_one": false, 13 | "skip_prk_steps": true, 14 | "steps_offset": 1, 15 | "timestep_spacing": "leading", 16 | "trained_betas": null, 17 | "use_karras_sigmas": false 18 | } 19 | -------------------------------------------------------------------------------- /sdxl_config/text_encoder/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "CLIPTextModel" 4 | ], 5 | "attention_dropout": 0.0, 6 | "bos_token_id": 0, 7 | "dropout": 0.0, 8 | "eos_token_id": 2, 9 | "hidden_act": "quick_gelu", 10 | "hidden_size": 768, 11 | "initializer_factor": 1.0, 12 | "initializer_range": 0.02, 13 | "intermediate_size": 3072, 14 | "layer_norm_eps": 1e-05, 15 | "max_position_embeddings": 77, 16 | "model_type": "clip_text_model", 17 | "num_attention_heads": 12, 18 | "num_hidden_layers": 12, 19 | "pad_token_id": 1, 20 | "projection_dim": 768, 21 | "torch_dtype": "float16", 22 | "transformers_version": "4.32.0.dev0", 23 | "vocab_size": 49408 24 | } 25 | -------------------------------------------------------------------------------- /sdxl_config/text_encoder_2/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "CLIPTextModelWithProjection" 4 | ], 5 | "attention_dropout": 0.0, 6 | "bos_token_id": 0, 7 | "dropout": 0.0, 8 | "eos_token_id": 2, 9 | "hidden_act": "gelu", 10 | "hidden_size": 1280, 11 | "initializer_factor": 1.0, 12 | "initializer_range": 0.02, 13 | "intermediate_size": 5120, 14 | "layer_norm_eps": 1e-05, 15 | "max_position_embeddings": 77, 16 | "model_type": "clip_text_model", 17 | "num_attention_heads": 20, 18 | 
"num_hidden_layers": 32, 19 | "pad_token_id": 1, 20 | "projection_dim": 1280, 21 | "torch_dtype": "float16", 22 | "transformers_version": "4.32.0.dev0", 23 | "vocab_size": 49408 24 | } 25 | -------------------------------------------------------------------------------- /sdxl_config/tokenizer/special_tokens_map.json: -------------------------------------------------------------------------------- 1 | { 2 | "bos_token": { 3 | "content": "<|startoftext|>", 4 | "lstrip": false, 5 | "normalized": true, 6 | "rstrip": false, 7 | "single_word": false 8 | }, 9 | "eos_token": { 10 | "content": "<|endoftext|>", 11 | "lstrip": false, 12 | "normalized": true, 13 | "rstrip": false, 14 | "single_word": false 15 | }, 16 | "pad_token": "<|endoftext|>", 17 | "unk_token": { 18 | "content": "<|endoftext|>", 19 | "lstrip": false, 20 | "normalized": true, 21 | "rstrip": false, 22 | "single_word": false 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /sdxl_config/tokenizer/tokenizer_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "add_prefix_space": false, 3 | "bos_token": { 4 | "__type": "AddedToken", 5 | "content": "<|startoftext|>", 6 | "lstrip": false, 7 | "normalized": true, 8 | "rstrip": false, 9 | "single_word": false 10 | }, 11 | "clean_up_tokenization_spaces": true, 12 | "do_lower_case": true, 13 | "eos_token": { 14 | "__type": "AddedToken", 15 | "content": "<|endoftext|>", 16 | "lstrip": false, 17 | "normalized": true, 18 | "rstrip": false, 19 | "single_word": false 20 | }, 21 | "errors": "replace", 22 | "model_max_length": 77, 23 | "pad_token": "<|endoftext|>", 24 | "tokenizer_class": "CLIPTokenizer", 25 | "unk_token": { 26 | "__type": "AddedToken", 27 | "content": "<|endoftext|>", 28 | "lstrip": false, 29 | "normalized": true, 30 | "rstrip": false, 31 | "single_word": false 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /sdxl_config/tokenizer_2/special_tokens_map.json: -------------------------------------------------------------------------------- 1 | { 2 | "bos_token": { 3 | "content": "<|startoftext|>", 4 | "lstrip": false, 5 | "normalized": true, 6 | "rstrip": false, 7 | "single_word": false 8 | }, 9 | "eos_token": { 10 | "content": "<|endoftext|>", 11 | "lstrip": false, 12 | "normalized": true, 13 | "rstrip": false, 14 | "single_word": false 15 | }, 16 | "pad_token": "!", 17 | "unk_token": { 18 | "content": "<|endoftext|>", 19 | "lstrip": false, 20 | "normalized": true, 21 | "rstrip": false, 22 | "single_word": false 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /sdxl_config/tokenizer_2/tokenizer_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "add_prefix_space": false, 3 | "bos_token": { 4 | "__type": "AddedToken", 5 | "content": "<|startoftext|>", 6 | "lstrip": false, 7 | "normalized": true, 8 | "rstrip": false, 9 | "single_word": false 10 | }, 11 | "clean_up_tokenization_spaces": true, 12 | "do_lower_case": true, 13 | "eos_token": { 14 | "__type": "AddedToken", 15 | "content": "<|endoftext|>", 16 | "lstrip": false, 17 | "normalized": true, 18 | "rstrip": false, 19 | "single_word": false 20 | }, 21 | "errors": "replace", 22 | "model_max_length": 77, 23 | "pad_token": "!", 24 | "tokenizer_class": "CLIPTokenizer", 25 | "unk_token": { 26 | "__type": "AddedToken", 27 | "content": "<|endoftext|>", 28 | "lstrip": false, 29 | 
"normalized": true, 30 | "rstrip": false, 31 | "single_word": false 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /sdxl_config/unet/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "_class_name": "UNet2DConditionModel", 3 | "_diffusers_version": "0.19.0.dev0", 4 | "act_fn": "silu", 5 | "addition_embed_type": "text_time", 6 | "addition_embed_type_num_heads": 64, 7 | "addition_time_embed_dim": 256, 8 | "attention_head_dim": [ 9 | 5, 10 | 10, 11 | 20 12 | ], 13 | "block_out_channels": [ 14 | 320, 15 | 640, 16 | 1280 17 | ], 18 | "center_input_sample": false, 19 | "class_embed_type": null, 20 | "class_embeddings_concat": false, 21 | "conv_in_kernel": 3, 22 | "conv_out_kernel": 3, 23 | "cross_attention_dim": 2048, 24 | "cross_attention_norm": null, 25 | "down_block_types": [ 26 | "DownBlock2D", 27 | "CrossAttnDownBlock2D", 28 | "CrossAttnDownBlock2D" 29 | ], 30 | "downsample_padding": 1, 31 | "dual_cross_attention": false, 32 | "encoder_hid_dim": null, 33 | "encoder_hid_dim_type": null, 34 | "flip_sin_to_cos": true, 35 | "freq_shift": 0, 36 | "in_channels": 4, 37 | "layers_per_block": 2, 38 | "mid_block_only_cross_attention": null, 39 | "mid_block_scale_factor": 1, 40 | "mid_block_type": "UNetMidBlock2DCrossAttn", 41 | "norm_eps": 1e-05, 42 | "norm_num_groups": 32, 43 | "num_attention_heads": null, 44 | "num_class_embeds": null, 45 | "only_cross_attention": false, 46 | "out_channels": 4, 47 | "projection_class_embeddings_input_dim": 2816, 48 | "resnet_out_scale_factor": 1.0, 49 | "resnet_skip_time_act": false, 50 | "resnet_time_scale_shift": "default", 51 | "sample_size": 128, 52 | "time_cond_proj_dim": null, 53 | "time_embedding_act_fn": null, 54 | "time_embedding_dim": null, 55 | "time_embedding_type": "positional", 56 | "timestep_post_act": null, 57 | "transformer_layers_per_block": [ 58 | 1, 59 | 2, 60 | 10 61 | ], 62 | "up_block_types": [ 63 | "CrossAttnUpBlock2D", 64 | "CrossAttnUpBlock2D", 65 | "UpBlock2D" 66 | ], 67 | "upcast_attention": null, 68 | "use_linear_projection": true 69 | } 70 | -------------------------------------------------------------------------------- /sdxl_config/vae/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "_class_name": "AutoencoderKL", 3 | "_diffusers_version": "0.20.0.dev0", 4 | "_name_or_path": "../sdxl-vae/", 5 | "act_fn": "silu", 6 | "block_out_channels": [ 7 | 128, 8 | 256, 9 | 512, 10 | 512 11 | ], 12 | "down_block_types": [ 13 | "DownEncoderBlock2D", 14 | "DownEncoderBlock2D", 15 | "DownEncoderBlock2D", 16 | "DownEncoderBlock2D" 17 | ], 18 | "force_upcast": true, 19 | "in_channels": 3, 20 | "latent_channels": 4, 21 | "layers_per_block": 2, 22 | "norm_num_groups": 32, 23 | "out_channels": 3, 24 | "sample_size": 1024, 25 | "scaling_factor": 0.13025, 26 | "up_block_types": [ 27 | "UpDecoderBlock2D", 28 | "UpDecoderBlock2D", 29 | "UpDecoderBlock2D", 30 | "UpDecoderBlock2D" 31 | ] 32 | } 33 | -------------------------------------------------------------------------------- /sdxl_config/vae_1_0/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "_class_name": "AutoencoderKL", 3 | "_diffusers_version": "0.19.0.dev0", 4 | "act_fn": "silu", 5 | "block_out_channels": [ 6 | 128, 7 | 256, 8 | 512, 9 | 512 10 | ], 11 | "down_block_types": [ 12 | "DownEncoderBlock2D", 13 | "DownEncoderBlock2D", 14 | "DownEncoderBlock2D", 15 | "DownEncoderBlock2D" 16 | 
], 17 | "force_upcast": true, 18 | "in_channels": 3, 19 | "latent_channels": 4, 20 | "layers_per_block": 2, 21 | "norm_num_groups": 32, 22 | "out_channels": 3, 23 | "sample_size": 1024, 24 | "scaling_factor": 0.13025, 25 | "up_block_types": [ 26 | "UpDecoderBlock2D", 27 | "UpDecoderBlock2D", 28 | "UpDecoderBlock2D", 29 | "UpDecoderBlock2D" 30 | ] 31 | } 32 | -------------------------------------------------------------------------------- /sdxl_config/vae_decoder/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "_class_name": "AutoencoderKL", 3 | "_diffusers_version": "0.19.0.dev0", 4 | "act_fn": "silu", 5 | "block_out_channels": [ 6 | 128, 7 | 256, 8 | 512, 9 | 512 10 | ], 11 | "down_block_types": [ 12 | "DownEncoderBlock2D", 13 | "DownEncoderBlock2D", 14 | "DownEncoderBlock2D", 15 | "DownEncoderBlock2D" 16 | ], 17 | "force_upcast": true, 18 | "in_channels": 3, 19 | "latent_channels": 4, 20 | "layers_per_block": 2, 21 | "norm_num_groups": 32, 22 | "out_channels": 3, 23 | "sample_size": 1024, 24 | "scaling_factor": 0.13025, 25 | "up_block_types": [ 26 | "UpDecoderBlock2D", 27 | "UpDecoderBlock2D", 28 | "UpDecoderBlock2D", 29 | "UpDecoderBlock2D" 30 | ] 31 | } 32 | -------------------------------------------------------------------------------- /sdxl_config/vae_encoder/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "_class_name": "AutoencoderKL", 3 | "_diffusers_version": "0.19.0.dev0", 4 | "act_fn": "silu", 5 | "block_out_channels": [ 6 | 128, 7 | 256, 8 | 512, 9 | 512 10 | ], 11 | "down_block_types": [ 12 | "DownEncoderBlock2D", 13 | "DownEncoderBlock2D", 14 | "DownEncoderBlock2D", 15 | "DownEncoderBlock2D" 16 | ], 17 | "force_upcast": true, 18 | "in_channels": 3, 19 | "latent_channels": 4, 20 | "layers_per_block": 2, 21 | "norm_num_groups": 32, 22 | "out_channels": 3, 23 | "sample_size": 1024, 24 | "scaling_factor": 0.13025, 25 | "up_block_types": [ 26 | "UpDecoderBlock2D", 27 | "UpDecoderBlock2D", 28 | "UpDecoderBlock2D", 29 | "UpDecoderBlock2D" 30 | ] 31 | } 32 | -------------------------------------------------------------------------------- /weights/playground/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "_class_name": "AutoencoderKL", 3 | "_diffusers_version": "0.27.0.dev0", 4 | "act_fn": "silu", 5 | "block_out_channels": [ 6 | 128, 7 | 256, 8 | 512, 9 | 512 10 | ], 11 | "down_block_types": [ 12 | "DownEncoderBlock2D", 13 | "DownEncoderBlock2D", 14 | "DownEncoderBlock2D", 15 | "DownEncoderBlock2D" 16 | ], 17 | "force_upcast": true, 18 | "in_channels": 3, 19 | "latent_channels": 4, 20 | "layers_per_block": 2, 21 | "norm_num_groups": 32, 22 | "out_channels": 3, 23 | "sample_size": 1024, 24 | "up_block_types": [ 25 | "UpDecoderBlock2D", 26 | "UpDecoderBlock2D", 27 | "UpDecoderBlock2D", 28 | "UpDecoderBlock2D" 29 | ], 30 | "latents_mean": [ 31 | -1.6574, 32 | 1.886, 33 | -1.383, 34 | 2.5155 35 | ], 36 | "latents_std": [ 37 | 8.4927, 38 | 5.9022, 39 | 6.5498, 40 | 5.2299 41 | ], 42 | "scaling_factor": 0.5 43 | } 44 | -------------------------------------------------------------------------------- /weights/sd15/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "_name_or_path": "./image_encoder", 3 | "architectures": [ 4 | "CLIPVisionModelWithProjection" 5 | ], 6 | "attention_dropout": 0.0, 7 | "dropout": 0.0, 8 | "hidden_act": "gelu", 9 | "hidden_size": 1280, 10 | 
"image_size": 224, 11 | "initializer_factor": 1.0, 12 | "initializer_range": 0.02, 13 | "intermediate_size": 5120, 14 | "layer_norm_eps": 1e-05, 15 | "model_type": "clip_vision_model", 16 | "num_attention_heads": 16, 17 | "num_channels": 3, 18 | "num_hidden_layers": 32, 19 | "patch_size": 14, 20 | "projection_dim": 1024, 21 | "torch_dtype": "float16", 22 | "transformers_version": "4.28.0.dev0" 23 | } 24 | -------------------------------------------------------------------------------- /weights/sd_xl_base.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | target: sgm.models.diffusion.DiffusionEngine 3 | params: 4 | scale_factor: 0.13025 5 | disable_first_stage_autocast: True 6 | 7 | denoiser_config: 8 | target: sgm.modules.diffusionmodules.denoiser.DiscreteDenoiser 9 | params: 10 | num_idx: 1000 11 | 12 | scaling_config: 13 | target: sgm.modules.diffusionmodules.denoiser_scaling.EpsScaling 14 | discretization_config: 15 | target: sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization 16 | 17 | network_config: 18 | target: sgm.modules.diffusionmodules.openaimodel.UNetModel 19 | params: 20 | adm_in_channels: 2816 21 | num_classes: sequential 22 | use_checkpoint: True 23 | in_channels: 4 24 | out_channels: 4 25 | model_channels: 320 26 | attention_resolutions: [4, 2] 27 | num_res_blocks: 2 28 | channel_mult: [1, 2, 4] 29 | num_head_channels: 64 30 | use_linear_in_transformer: True 31 | transformer_depth: [1, 2, 10] 32 | context_dim: 2048 33 | spatial_transformer_attn_type: softmax-xformers 34 | 35 | conditioner_config: 36 | target: sgm.modules.GeneralConditioner 37 | params: 38 | emb_models: 39 | - is_trainable: False 40 | input_key: txt 41 | target: sgm.modules.encoders.modules.FrozenCLIPEmbedder 42 | params: 43 | layer: hidden 44 | layer_idx: 11 45 | 46 | - is_trainable: False 47 | input_key: txt 48 | target: sgm.modules.encoders.modules.FrozenOpenCLIPEmbedder2 49 | params: 50 | arch: ViT-bigG-14 51 | version: laion2b_s39b_b160k 52 | freeze: True 53 | layer: penultimate 54 | always_return_pooled: True 55 | legacy: False 56 | 57 | - is_trainable: False 58 | input_key: original_size_as_tuple 59 | target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND 60 | params: 61 | outdim: 256 62 | 63 | - is_trainable: False 64 | input_key: crop_coords_top_left 65 | target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND 66 | params: 67 | outdim: 256 68 | 69 | - is_trainable: False 70 | input_key: target_size_as_tuple 71 | target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND 72 | params: 73 | outdim: 256 74 | 75 | first_stage_config: 76 | target: sgm.models.autoencoder.AutoencoderKL 77 | params: 78 | embed_dim: 4 79 | monitor: val/rec_loss 80 | ddconfig: 81 | attn_type: vanilla-xformers 82 | double_z: true 83 | z_channels: 4 84 | resolution: 256 85 | in_channels: 3 86 | out_ch: 3 87 | ch: 128 88 | ch_mult: [1, 2, 4, 4] 89 | num_res_blocks: 2 90 | attn_resolutions: [] 91 | dropout: 0.0 92 | lossconfig: 93 | target: torch.nn.Identity 94 | -------------------------------------------------------------------------------- /weights/sdxl/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "CLIPVisionModelWithProjection" 4 | ], 5 | "_name_or_path": "", 6 | "add_cross_attention": false, 7 | "architectures_": null, 8 | "attention_dropout": 0.0, 9 | "bad_words_ids": null, 10 | "begin_suppress_tokens": null, 11 | "bos_token_id": null, 12 | 
"chunk_size_feed_forward": 0, 13 | "cross_attention_hidden_size": null, 14 | "decoder_start_token_id": null, 15 | "diversity_penalty": 0.0, 16 | "do_sample": false, 17 | "dropout": 0.0, 18 | "early_stopping": false, 19 | "encoder_no_repeat_ngram_size": 0, 20 | "eos_token_id": null, 21 | "exponential_decay_length_penalty": null, 22 | "finetuning_task": null, 23 | "forced_bos_token_id": null, 24 | "forced_eos_token_id": null, 25 | "hidden_act": "gelu", 26 | "hidden_size": 1664, 27 | "id2label": { 28 | "0": "LABEL_0", 29 | "1": "LABEL_1" 30 | }, 31 | "image_size": 224, 32 | "initializer_factor": 1.0, 33 | "initializer_range": 0.02, 34 | "intermediate_size": 8192, 35 | "is_decoder": false, 36 | "is_encoder_decoder": false, 37 | "label2id": { 38 | "LABEL_0": 0, 39 | "LABEL_1": 1 40 | }, 41 | "layer_norm_eps": 1e-05, 42 | "length_penalty": 1.0, 43 | "max_length": 20, 44 | "min_length": 0, 45 | "model_type": "clip_vision_model", 46 | "no_repeat_ngram_size": 0, 47 | "num_attention_heads": 16, 48 | "num_beam_groups": 1, 49 | "num_beams": 1, 50 | "num_channels": 3, 51 | "num_hidden_layers": 48, 52 | "num_return_sequences": 1, 53 | "output_attentions": false, 54 | "output_hidden_states": false, 55 | "output_scores": false, 56 | "pad_token_id": null, 57 | "patch_size": 14, 58 | "prefix": null, 59 | "problem_type": null, 60 | "pruned_heads": {}, 61 | "remove_invalid_values": false, 62 | "repetition_penalty": 1.0, 63 | "return_dict": true, 64 | "return_dict_in_generate": false, 65 | "sep_token_id": null, 66 | "suppress_tokens": null, 67 | "task_specific_params": null, 68 | "temperature": 1.0, 69 | "tf_legacy_loss": false, 70 | "tie_encoder_decoder": false, 71 | "tie_word_embeddings": true, 72 | "tokenizer_class": null, 73 | "top_k": 50, 74 | "top_p": 1.0, 75 | "torch_dtype": null, 76 | "torchscript": false, 77 | "transformers_version": "4.24.0", 78 | "typical_p": 1.0, 79 | "use_bfloat16": false, 80 | "projection_dim": 1280 81 | } 82 | --------------------------------------------------------------------------------