├── Hidiffusion_node.py ├── LICENSE ├── README.md ├── __init__.py ├── example ├── controlnet_img2img1.png ├── img2img_lora1.png ├── lightingUnet1.png ├── new.json ├── new.png └── sd15ipstyle1.png ├── guided_filter.py ├── hidiffusion ├── __init__.py ├── hidiffusion.py ├── sd_module_key │ ├── sd15_module_key.txt │ └── sdxl_module_key.txt └── utils.py ├── ip_adapter ├── __init__.py ├── attention_processor.py ├── ip_adapter.py ├── resampler.py └── utils.py ├── model.yaml ├── pyproject.toml ├── sd15_config ├── feature_extractor │ └── preprocessor_config.json ├── model_index.json ├── safety_checker │ └── config.json ├── scheduler │ └── scheduler_config.json ├── text_encoder │ └── config.json ├── tokenizer │ ├── merges.txt │ ├── special_tokens_map.json │ ├── tokenizer_config.json │ └── vocab.json ├── unet │ └── config.json └── vae │ └── config.json ├── sdxl_config ├── model_index.json ├── scheduler │ └── scheduler_config.json ├── text_encoder │ └── config.json ├── text_encoder_2 │ └── config.json ├── tokenizer │ ├── merges.txt │ ├── special_tokens_map.json │ ├── tokenizer_config.json │ └── vocab.json ├── tokenizer_2 │ ├── merges.txt │ ├── special_tokens_map.json │ ├── tokenizer_config.json │ └── vocab.json ├── unet │ └── config.json ├── vae │ └── config.json ├── vae_1_0 │ └── config.json ├── vae_decoder │ └── config.json └── vae_encoder │ └── config.json └── weights ├── playground └── config.json ├── sd15 └── config.json ├── sd_xl_base.yaml └── sdxl └── config.json /Hidiffusion_node.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | # -*- coding: UTF-8 -*- 3 | import cv2 4 | import torch 5 | import os 6 | from PIL import Image 7 | import numpy as np 8 | from diffusers import (StableDiffusionXLPipeline, DiffusionPipeline, DDIMScheduler, ControlNetModel, 9 | KDPM2AncestralDiscreteScheduler, LMSDiscreteScheduler, 10 | AutoPipelineForInpainting, DPMSolverMultistepScheduler, DPMSolverSinglestepScheduler, 11 | EulerDiscreteScheduler, HeunDiscreteScheduler, UNet2DConditionModel, 12 | StableDiffusionXLImg2ImgPipeline, AutoPipelineForImage2Image, 13 | AutoPipelineForText2Image, StableDiffusionXLControlNetImg2ImgPipeline, KDPM2DiscreteScheduler, 14 | EulerAncestralDiscreteScheduler, UniPCMultistepScheduler, AutoencoderKL, 15 | StableDiffusionXLControlNetPipeline, DDPMScheduler, TCDScheduler, LCMScheduler, 16 | StableDiffusionPipeline, StableDiffusionControlNetPipeline, StableDiffusionXLInpaintPipeline) 17 | try: 18 | from diffusers.loaders.single_file_utils import load_single_file_checkpoint,infer_diffusers_model_type 19 | except: 20 | from diffusers.loaders.single_file_utils import load_single_file_model_checkpoint as load_single_file_checkpoint,infer_model_type as infer_diffusers_model_type 21 | from .hidiffusion.hidiffusion import apply_hidiffusion,remove_hidiffusion 22 | import folder_paths 23 | from safetensors.torch import load_file 24 | import yaml 25 | import diffusers 26 | import random 27 | from omegaconf import OmegaConf 28 | from comfy.model_management import cleanup_models 29 | from comfy.clip_vision import load as load_clip 30 | 31 | dif_version = str(diffusers.__version__) 32 | dif_version_int = int(dif_version.split(".")[1]) 33 | if dif_version_int >= 28: 34 | from diffusers.models.unets.unet_2d_condition import UNet2DConditionModel 35 | else: 36 | from diffusers.models.unet_2d_condition import UNet2DConditionModel 37 | from comfy.utils import common_upscale 38 | from .guided_filter import FastGuidedFilter 39 | from 
.ip_adapter import IPAdapterXL,IPAdapter 40 | 41 | dir_path = os.path.dirname(os.path.abspath(__file__)) 42 | path_dir = os.path.dirname(dir_path) 43 | file_path = os.path.dirname(path_dir) 44 | 45 | 46 | scheduler_list = ["DDIM", 47 | "Euler", 48 | "Euler a", 49 | "DDPM", 50 | "DPM++ 2M", 51 | "DPM++ 2M Karras", 52 | "DPM++ 2M SDE", 53 | "DPM++ 2M SDE Karras", 54 | "DPM++ SDE", 55 | "DPM++ SDE Karras", 56 | "DPM2", 57 | "DPM2 Karras", 58 | "DPM2 a", 59 | "DPM2 a Karras", 60 | "Heun", 61 | "LCM", 62 | "LMS", 63 | "LMS Karras", 64 | "UniPC", 65 | ] 66 | 67 | fs = open(os.path.join(dir_path, "model.yaml"), encoding="UTF-8") 68 | datas = yaml.load(fs, Loader=yaml.FullLoader) 69 | 70 | normal_model_list = datas["surport_model"] 71 | sdxl_lightning_list = datas["lightning_unet"] 72 | controlnet_suport = datas["surport_controlnet"] 73 | xl_model_support = datas["sdxl_model"] 74 | lightning_lora=datas["lightning_lora"] 75 | lightning_xl_lora=datas["lightning_xl_lora"] 76 | 77 | lcm_unet = ["dmd2_sdxl_4step_unet_fp16.bin", "dmd2_sdxl_1step_unet_fp16.bin", "lcm-sdxl-base-1.0.safetensors", 78 | "Hyper-SDXL-1step-Unet.safetensors"] 79 | 80 | def tensor_to_image(tensor): 81 | image_np = tensor.squeeze().mul(255).clamp(0, 255).byte().numpy() 82 | image = Image.fromarray(image_np, mode='RGB') 83 | return image 84 | 85 | def nomarl_upscale(img_tensor, width, height): 86 | samples = img_tensor.movedim(-1, 1) 87 | img = common_upscale(samples, width, height, "nearest-exact", "center") 88 | samples = img.movedim(1, -1) 89 | img_pil = tensor_to_image(samples) 90 | return img_pil 91 | 92 | def resize_image_control(control_image, resolution): 93 | HH, WW, _ = control_image.shape 94 | crop_h = random.randint(0, HH - resolution[1]) 95 | crop_w = random.randint(0, WW - resolution[0]) 96 | crop_image = control_image[crop_h:crop_h+resolution[1], crop_w:crop_w+resolution[0], :] 97 | return crop_image, crop_w, crop_h 98 | 99 | def apply_gaussian_blur(image_np, ksize=5, sigmaX=1.0): 100 | if ksize % 2 == 0: 101 | ksize += 1 # ksize must be odd 102 | blurred_image = cv2.GaussianBlur(image_np, (ksize, ksize), sigmaX=sigmaX) 103 | return blurred_image 104 | 105 | def apply_guided_filter(image_np, radius, eps, scale): 106 | filter = FastGuidedFilter(image_np, radius, eps, scale) 107 | return filter.filter(image_np) 108 | 109 | def input_size_adaptation_output(img_tensor,base_in, width, height): 110 | #basein=1024 111 | if width == height: 112 | img_pil = nomarl_upscale(img_tensor, base_in, base_in) # 2pil 113 | else: 114 | if min(1,width/ height)==1: #高 115 | r=height/base_in 116 | img_pil = nomarl_upscale(img_tensor, round(width/r), base_in) # 2pil 117 | else: #宽 118 | r=width/base_in 119 | img_pil = nomarl_upscale(img_tensor, base_in, round(height/r)) # 2pil 120 | return img_pil 121 | 122 | def get_sheduler(name): 123 | scheduler = False 124 | if name == "Euler": 125 | scheduler = EulerDiscreteScheduler() 126 | elif name == "Euler a": 127 | scheduler = EulerAncestralDiscreteScheduler() 128 | elif name == "DDIM": 129 | scheduler = DDIMScheduler() 130 | elif name == "DDPM": 131 | scheduler = DDPMScheduler() 132 | elif name == "DPM++ 2M": 133 | scheduler = DPMSolverMultistepScheduler() 134 | elif name == "DPM++ 2M Karras": 135 | scheduler = DPMSolverMultistepScheduler(use_karras_sigmas=True) 136 | elif name == "DPM++ 2M SDE": 137 | scheduler = DPMSolverMultistepScheduler(algorithm_type="sde-dpmsolver++") 138 | elif name == "DPM++ 2M SDE Karras": 139 | scheduler = DPMSolverMultistepScheduler(use_karras_sigmas=True, 
algorithm_type="sde-dpmsolver++") 140 | elif name == "DPM++ SDE": 141 | scheduler = DPMSolverSinglestepScheduler() 142 | elif name == "DPM++ SDE Karras": 143 | scheduler = DPMSolverSinglestepScheduler(use_karras_sigmas=True) 144 | elif name == "DPM2": 145 | scheduler = KDPM2DiscreteScheduler() 146 | elif name == "DPM2 Karras": 147 | scheduler = KDPM2DiscreteScheduler(use_karras_sigmas=True) 148 | elif name == "DPM2 a": 149 | scheduler = KDPM2AncestralDiscreteScheduler() 150 | elif name == "DPM2 a Karras": 151 | scheduler = KDPM2AncestralDiscreteScheduler(use_karras_sigmas=True) 152 | elif name == "Heun": 153 | scheduler = HeunDiscreteScheduler() 154 | elif name == "LCM": 155 | scheduler = LCMScheduler() 156 | elif name == "LMS": 157 | scheduler = LMSDiscreteScheduler() 158 | elif name == "LMS Karras": 159 | scheduler = LMSDiscreteScheduler(use_karras_sigmas=True) 160 | elif name == "UniPC": 161 | scheduler = UniPCMultistepScheduler() 162 | return scheduler 163 | 164 | 165 | class HI_Diffusers_Model_Loader: 166 | def __init__(self): 167 | pass 168 | 169 | @classmethod 170 | def INPUT_TYPES(cls): 171 | return { 172 | "required": { 173 | "function_choice": (["txt2img", "img2img", ],), 174 | "ckpt_name": (folder_paths.get_filename_list("checkpoints"),), 175 | "vae_id": (["none"] + folder_paths.get_filename_list("vae"),), 176 | "unet_model": (["none"] + folder_paths.get_filename_list("unet"),), 177 | "controlnet_model": (["none"] + folder_paths.get_filename_list("controlnet"),), 178 | "lora": (["none"] + folder_paths.get_filename_list("loras"),), 179 | "lora_scale": ("FLOAT", {"default": 0.8, "min": 0.1, "max": 1.0, "step": 0.1}), 180 | "trigger_words": ("STRING", {"default": "best quality"}), 181 | "scheduler": (scheduler_list,), 182 | "apply_window_attn":("BOOLEAN", {"default": False},), 183 | "ip_ckpt": (["none"] + folder_paths.get_filename_list("photomaker"),), 184 | "clip_vision": (["none"] + folder_paths.get_filename_list("clip_vision"),), 185 | 186 | } 187 | } 188 | 189 | 190 | RETURN_TYPES = ("HIDIF_MODEL", ) 191 | RETURN_NAMES = ("pipe", ) 192 | FUNCTION = "loader_models" 193 | CATEGORY = "Hidiffusion_Pro" 194 | 195 | def loader_models(self,function_choice, ckpt_name,vae_id,unet_model, controlnet_model, 196 | lora,lora_scale,trigger_words,scheduler,apply_window_attn,ip_ckpt,clip_vision): 197 | 198 | ckpt_path = folder_paths.get_full_path("checkpoints", ckpt_name) if ckpt_name!="none" else None 199 | sd_type="" 200 | if ckpt_path: 201 | sd = load_single_file_checkpoint(ckpt_path) 202 | try: 203 | sd_type = infer_diffusers_model_type(sd) 204 | del sd 205 | except: 206 | raise "diffuser need >0.27.2" 207 | 208 | 209 | vae_id=vae_id if vae_id!="none" else None 210 | controlnet_path=folder_paths.get_full_path("controlnet", controlnet_model) if controlnet_model!="none" else None 211 | unet_ckpt = folder_paths.get_full_path("unet", unet_model) if unet_model!="none" else None 212 | ip_ckpt = folder_paths.get_full_path("photomaker", ip_ckpt) if ip_ckpt != "none" else None 213 | clip_vision = folder_paths.get_full_path("clip_vision", clip_vision) if clip_vision != "none" else None 214 | 215 | scheduler_used = get_sheduler(scheduler) 216 | 217 | if sd_type == "v1" or sd_type == "v2": 218 | model_type="stable-diffusion-v1-5" 219 | model_config=os.path.join(dir_path,"sd15_config") 220 | original_config_file = os.path.join(folder_paths.models_dir, "configs", "v1-inference.yaml") 221 | if dif_version_int >= 28: 222 | model = StableDiffusionPipeline.from_single_file( 223 | ckpt_path, 
config=model_config,original_config=original_config_file, torch_dtype=torch.float16).to("cuda") 224 | else: 225 | model = StableDiffusionPipeline.from_single_file( 226 | ckpt_path,config=model_config, original_config_file=original_config_file, torch_dtype=torch.float16).to("cuda") 227 | 228 | elif sd_type =="playground-v2-5": 229 | model_type = "playground-v2-1024px-aesthetic" 230 | model_config ="playgroundai/playground-v2.5-1024px-aesthetic" 231 | model = StableDiffusionXLPipeline.from_single_file(ckpt_path, config=model_config,torch_dtype=torch.float16).to("cuda") 232 | 233 | elif sd_type == "xl_inpaint": 234 | model_type ="stable-diffusion-xl-1.0-inpainting-0.1" 235 | model_config = "diffusers/stable-diffusion-xl-1.0-inpainting-0.1" 236 | original_config_file = os.path.join(dir_path, "weights", "sd_xl_base.yaml") 237 | if dif_version_int >= 28: 238 | model = StableDiffusionXLInpaintPipeline.from_single_file(ckpt_path,config=model_config, 239 | original_config=original_config_file, 240 | torch_dtype=torch.float16, 241 | ) 242 | else: 243 | model = StableDiffusionXLInpaintPipeline.from_single_file(ckpt_path,config=model_config, 244 | original_config_file=original_config_file, 245 | torch_dtype=torch.float16, 246 | ) 247 | if unet_model in sdxl_lightning_list: 248 | if unet_model.rsplit('.', 1)[-1] == "bin": 249 | model.unet.load_state_dict(torch.load(unet_ckpt),strict=False,) 250 | else: 251 | model.unet.load_state_dict(load_file(unet_ckpt), strict=False, ) 252 | elif sd_type == "xl_base": 253 | model_type = "stable-diffusion-xl-base-1.0" 254 | model_config=os.path.join(dir_path,"sdxl_config") 255 | original_config_file = os.path.join(dir_path, "weights", "sd_xl_base.yaml") 256 | 257 | if dif_version_int >= 28: 258 | model = StableDiffusionXLPipeline.from_single_file( 259 | ckpt_path, config=model_config,original_config=original_config_file, torch_dtype=torch.float16) 260 | else: 261 | model = StableDiffusionXLPipeline.from_single_file( 262 | ckpt_path,config=model_config, original_config_file=original_config_file, torch_dtype=torch.float16) 263 | 264 | if controlnet_path: 265 | controlnet = ControlNetModel.from_unet(model.unet) 266 | cn_state_dict = load_file(controlnet_path) 267 | controlnet.load_state_dict(cn_state_dict, strict=False) 268 | controlnet.to(torch.float16) 269 | if function_choice == "img2img": 270 | model = StableDiffusionXLControlNetImg2ImgPipeline.from_pipe(model,controlnet=controlnet) 271 | else: 272 | model = StableDiffusionXLControlNetPipeline.from_pipe(model,controlnet=controlnet) 273 | 274 | if unet_model in sdxl_lightning_list: 275 | if unet_model.rsplit('.', 1)[-1] == "bin": 276 | model.unet.load_state_dict(torch.load(unet_ckpt), strict=False,) 277 | else: 278 | model.unet.load_state_dict(load_file(unet_ckpt), strict=False, ) 279 | else: 280 | raise ValueError("Unsupported model type; expected a SD1.5/SD2.x, SDXL, SDXL-inpaint or Playground v2.5 checkpoint")
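# For reference, the sd_type branching above is driven by diffusers' single-file helpers
# imported at the top of this file. A minimal standalone sketch of that detection step,
# for recent diffusers releases (older ones expose the same helpers under different names,
# as handled by the try/except at the top); the checkpoint path is a placeholder:
#
#   from diffusers.loaders.single_file_utils import (
#       load_single_file_checkpoint, infer_diffusers_model_type)
#
#   state_dict = load_single_file_checkpoint("/path/to/checkpoint.safetensors")
#   sd_type = infer_diffusers_model_type(state_dict)
#   # values handled by this loader: "v1", "v2", "xl_base", "xl_inpaint", "playground-v2-5";
#   # anything else ends up in the ValueError branch above.
#   del state_dict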
281 | if vae_id: 282 | vae_id = folder_paths.get_full_path("vae", vae_id) 283 | if sd_type == "xl_base" or sd_type == "xl_inpaint": 284 | vae_config=os.path.join(dir_path,"sdxl_config","vae") 285 | elif sd_type == "v1" or sd_type == "v2" : 286 | vae_config=os.path.join(dir_path, "sd15_config","vae") 287 | elif sd_type == "playground-v2-5" : 288 | vae_config=os.path.join(dir_path,"weights/playground") 289 | else: 290 | raise ValueError("No VAE config available for this model type") 291 | model.vae = AutoencoderKL.from_single_file(vae_id,config=vae_config, torch_dtype=torch.float16).to("cuda") 292 | if sd_type == "xl_inpaint": 293 | model.scheduler =scheduler_used.from_pretrained(os.path.join(dir_path,"sdxl_config"), subfolder="scheduler") 294 | else: 295 | model.scheduler = scheduler_used.from_config(model.scheduler.config, timestep_spacing="trailing") 296 | 297 | if lora!="none": 298 | lora_path = folder_paths.get_full_path("loras", lora) 299 | model.load_lora_weights(lora_path, adapter_name=trigger_words) 300 | model.fuse_lora(lora_scale=lora_scale, adapter_names=[trigger_words,]) 301 | 302 | model.enable_xformers_memory_efficient_attention() 303 | model.enable_vae_tiling() 304 | apply_hidiffusion(model,apply_window_attn=apply_window_attn,model_type_str=model_type) 305 | model.enable_model_cpu_offload() # must be called after apply_hidiffusion(model) 306 | ip_adapter = False 307 | if ip_ckpt is not None and clip_vision is not None: 308 | model.enable_freeu(s1=0.6, s2=0.4, b1=1.1, b2=1.2) 309 | device = "cuda" 310 | remove_hidiffusion(model) 311 | image_encoder = load_clip(clip_vision) 312 | if sd_type == "xl_base": 313 | config_path=os.path.join(dir_path,"weights","sdxl","config.json") 314 | image_encoder_config = OmegaConf.load(config_path) 315 | model = IPAdapterXL(model, image_encoder, ip_ckpt, device,image_encoder_config, 316 | target_blocks=["up_blocks.0.attentions.1"]) 317 | elif sd_type == "v1": 318 | config_path = os.path.join(dir_path, "weights", "sd15","config.json") 319 | image_encoder_config = OmegaConf.load(config_path) 320 | model = IPAdapter(model, image_encoder, ip_ckpt, device,image_encoder_config, target_blocks=["block"]) 321 | else: 322 | raise ValueError("Unsupported model for IP-Adapter: only SDXL and SD1.5 are supported") 323 | torch.cuda.empty_cache() 324 | ip_adapter=True 325 | 326 | torch.cuda.empty_cache() 327 | pipe={"model":model,"controlnet_path":controlnet_path,"sd_type":sd_type,"lora":lora,"trigger_words":trigger_words,"ip_adapter":ip_adapter,"function_choice":function_choice} 328 | torch.cuda.empty_cache() 329 | return (pipe,) 330 | 331 | 332 | class Hi_Sampler: 333 | def __init__(self): 334 | pass 335 | 336 | @classmethod 337 | def INPUT_TYPES(cls): 338 | return { 339 | "required": { 340 | "pipe": ("HIDIF_MODEL",), 341 | "prompt": ("STRING", {"multiline": True, 342 | "default": "a girl,8k,smile,best quality"}), 343 | "negative_prompt": ("STRING", {"multiline": True, 344 | "default": "text, watermark, lowres, low quality, worst quality, deformed, glitch, low contrast, noisy, saturation, blurry"}), 345 | "controlnet_scale": ("FLOAT", {"default": 0.5, "min": 0.1, "max": 1.0, "step": 0.1}), 346 | "clip_skip": ("INT", {"default": 1, "min": -5, "max": 100,"step": 1}), 347 | "pre_input": ("INT", {"default": 512, "min": 256, "max": 1024, "step": 64}), 348 | "seed": ("INT", {"default": 0, "min": 0, "max": 0xffffffffffffffff}), 349 | "steps": ("INT", {"default": 30, "min": 1, "max": 10000}), 350 | "cfg": ("FLOAT", {"default": 7.5, "min": 0.0, "max": 100.0, "step": 0.1, "round": 0.01}), 351 | "width": ("INT", {"default": 2048, "min": 64, "max": 8192,
"step": 64, "display": "number"}), 352 | "height": ("INT", {"default": 2048, "min": 64, "max": 8192, "step": 64, "display": "number"}), 353 | "adapter_scale": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 1.0, "step": 0.1,}), 354 | }, 355 | "optional": {"image": ("IMAGE",), 356 | "control_image": ("IMAGE",), 357 | "ip_image": ("IMAGE",)} 358 | } 359 | 360 | RETURN_TYPES = ("IMAGE",) 361 | RETURN_NAMES = ("image",) 362 | FUNCTION = "hi_sampler" 363 | CATEGORY = "Hidiffusion_Pro" 364 | 365 | 366 | def hi_sampler(self, pipe, prompt, negative_prompt,controlnet_scale,clip_skip,pre_input, 367 | seed,steps, cfg, width,height,adapter_scale,**kwargs): 368 | model=pipe.get("model",None) 369 | controlnet_path = pipe["controlnet_path"] 370 | sd_type = pipe["sd_type"] 371 | lora = pipe["lora"] 372 | trigger_words = pipe["trigger_words"] 373 | ip_adapter = pipe["ip_adapter"] 374 | function_choice =pipe["function_choice"] 375 | 376 | if ip_adapter: 377 | ip_image = kwargs.get("ip_image") 378 | #ip_image = input_size_adaptation_output(ip_image, pre_input, width, height) 379 | if lora != "none": 380 | prompt = prompt + " " + trigger_words 381 | if controlnet_path is None: 382 | if function_choice == "img2img": 383 | image = kwargs.get("image") 384 | image = input_size_adaptation_output(image, pre_input, width, height) 385 | images = \ 386 | model.generate(prompt=prompt, negative_prompt=negative_prompt,pil_image=ip_image, image=image, scale=adapter_scale,num_inference_steps=steps, 387 | guidance_scale=cfg, clip_skip=clip_skip, 388 | height=height, width=width, seed=seed, ) 389 | else: 390 | images = \ 391 | model.generate(pil_image=ip_image,prompt=prompt, negative_prompt=negative_prompt,scale=adapter_scale, num_inference_steps=steps, 392 | guidance_scale=cfg, clip_skip=clip_skip, 393 | height=height, width=width, seed=seed, ) 394 | 395 | else: 396 | control_image = kwargs.get("control_image") 397 | if "tile" in controlnet_path: 398 | control_image = input_size_adaptation_output(control_image, pre_input, width, height) 399 | controlnet_img = cv2.cvtColor(np.asarray(control_image), cv2.COLOR_RGB2BGR) 400 | new_height, new_width, _ = controlnet_img.shape 401 | ratio = np.sqrt(1024. * 1024. / (new_width * new_height)) 402 | W, H = int(new_width * ratio), int(new_height * ratio) 403 | 404 | crop_w, crop_h = 0, 0 405 | controlnet_img = cv2.resize(controlnet_img, (W, H)) 406 | 407 | blur_strength = random.sample([i / 10. for i in range(10, 201, 2)], k=1)[0] 408 | radius = random.sample([i for i in range(1, 40, 2)], k=1)[0] 409 | eps = random.sample([i / 1000. for i in range(1, 101, 2)], k=1)[0] 410 | scale_factor = random.sample([i / 10. 
for i in range(10, 181, 5)], k=1)[0] 411 | 412 | if random.random() > 0.5: 413 | controlnet_img = apply_gaussian_blur(controlnet_img, ksize=int(blur_strength), 414 | sigmaX=blur_strength / 2) 415 | 416 | if random.random() > 0.5: 417 | # Apply Guided Filter 418 | controlnet_img = apply_guided_filter(controlnet_img, radius, eps, scale_factor) 419 | 420 | # Resize image 421 | controlnet_img = cv2.resize(controlnet_img, (int(W / scale_factor), int(H / scale_factor)), 422 | interpolation=cv2.INTER_AREA) 423 | controlnet_img = cv2.resize(controlnet_img, (W, H), interpolation=cv2.INTER_CUBIC) 424 | 425 | controlnet_img = cv2.cvtColor(controlnet_img, cv2.COLOR_BGR2RGB) 426 | control_image = Image.fromarray(controlnet_img) 427 | else: 428 | control_image = input_size_adaptation_output(control_image, pre_input, width, height) 429 | if function_choice == "img2img": 430 | image = kwargs["image"] 431 | image = input_size_adaptation_output(image, pre_input, width, height) 432 | if sd_type == "xl_inpaint": 433 | images = model.generate(prompt=prompt, negative_prompt=negative_prompt, image=image,pil_image=ip_image, scale=adapter_scale,mask_image=control_image, 434 | num_inference_steps=steps, guidance_scale=cfg, height=height, clip_skip=clip_skip, 435 | width=width, controlnet_conditioning_scale=controlnet_scale, 436 | seed=seed, ) 437 | else: 438 | images = model.generate(prompt=prompt, negative_prompt=negative_prompt, image=image, pil_image=ip_image,scale=adapter_scale,control_image=control_image, 439 | num_inference_steps=steps, guidance_scale=cfg, height=height, width=width, 440 | clip_skip=clip_skip, 441 | controlnet_conditioning_scale=controlnet_scale, 442 | seed=seed, ) 443 | else: 444 | images = model.generate(prompt=prompt, negative_prompt=negative_prompt, pil_image=ip_image,scale=adapter_scale,control_image=control_image, 445 | num_inference_steps=steps, guidance_scale=cfg, height=height, width=width, 446 | clip_skip=clip_skip, 447 | controlnet_conditioning_scale=controlnet_scale, 448 | seed=seed, ) 449 | images = images[0] 450 | else: 451 | if lora != "none": 452 | prompt = prompt + " " + trigger_words 453 | # print(model_type, unet_model, control_net, function_choice) 454 | if controlnet_path is None: 455 | if function_choice == "img2img": 456 | image = kwargs["image"] 457 | image = input_size_adaptation_output(image, pre_input, width, height) 458 | images = \ 459 | model(prompt, negative_prompt=negative_prompt, image=image, num_inference_steps=steps, 460 | guidance_scale=cfg, clip_skip=clip_skip, 461 | height=height, width=width, seed=seed, ).images[0] 462 | else: 463 | images = \ 464 | model(prompt, negative_prompt=negative_prompt, num_inference_steps=steps, 465 | guidance_scale=cfg, clip_skip=clip_skip, 466 | height=height, width=width, seed=seed, ).images[0] 467 | else: 468 | control_image = kwargs["control_image"] 469 | if "tile" in controlnet_path: 470 | control_image = input_size_adaptation_output(control_image, pre_input, width, height) 471 | controlnet_img = cv2.cvtColor(np.asarray(control_image), cv2.COLOR_RGB2BGR) 472 | new_height, new_width, _ = controlnet_img.shape 473 | ratio = np.sqrt(1024. * 1024. / (new_width * new_height)) 474 | W, H = int(new_width * ratio), int(new_height * ratio) 475 | 476 | crop_w, crop_h = 0, 0 477 | controlnet_img = cv2.resize(controlnet_img, (W, H)) 478 | 479 | blur_strength = random.sample([i / 10. for i in range(10, 201, 2)], k=1)[0] 480 | radius = random.sample([i for i in range(1, 40, 2)], k=1)[0] 481 | eps = random.sample([i / 1000. 
for i in range(1, 101, 2)], k=1)[0] 482 | scale_factor = random.sample([i / 10. for i in range(10, 181, 5)], k=1)[0] 483 | 484 | if random.random() > 0.5: 485 | controlnet_img = apply_gaussian_blur(controlnet_img, ksize=int(blur_strength), 486 | sigmaX=blur_strength / 2) 487 | 488 | if random.random() > 0.5: 489 | # Apply Guided Filter 490 | controlnet_img = apply_guided_filter(controlnet_img, radius, eps, scale_factor) 491 | 492 | # Resize image 493 | controlnet_img = cv2.resize(controlnet_img, (int(W / scale_factor), int(H / scale_factor)), 494 | interpolation=cv2.INTER_AREA) 495 | controlnet_img = cv2.resize(controlnet_img, (W, H), interpolation=cv2.INTER_CUBIC) 496 | 497 | controlnet_img = cv2.cvtColor(controlnet_img, cv2.COLOR_BGR2RGB) 498 | control_image = Image.fromarray(controlnet_img) 499 | else: 500 | control_image = input_size_adaptation_output(control_image, pre_input, width, height) 501 | 502 | if function_choice == "img2img": 503 | image = kwargs["image"] 504 | image = input_size_adaptation_output(image, pre_input, width, height) 505 | if sd_type == "xl_inpaint": 506 | print("controlnet inpainting") 507 | images = \ 508 | model(prompt, negative_prompt=negative_prompt, image=image, mask_image=control_image, 509 | num_inference_steps=steps, guidance_scale=cfg, height=height, clip_skip=clip_skip, 510 | width=width, controlnet_conditioning_scale=controlnet_scale, 511 | seed=seed, ).images[0] 512 | else: 513 | print("controlnet img2img") 514 | images = model(prompt, negative_prompt=negative_prompt, image=image, control_image=control_image, 515 | num_inference_steps=steps, guidance_scale=cfg, height=height, width=width, 516 | clip_skip=clip_skip, 517 | controlnet_conditioning_scale=controlnet_scale, 518 | seed=seed, ).images[0] 519 | else: 520 | print("controlnet txt2img") 521 | images = model(prompt,control_image=control_image, negative_prompt=negative_prompt, 522 | num_inference_steps=steps, guidance_scale=cfg, height=height, width=width, 523 | clip_skip=clip_skip, 524 | controlnet_conditioning_scale=controlnet_scale, 525 | seed=seed, ).images[0] 526 | 527 | 528 | output_image = torch.from_numpy(np.array(images).astype(np.float32) / 255.0).unsqueeze(0) 529 | if lora != "none": 530 | if not ip_adapter: # only unfuse the LoRA when the pipeline is not wrapped by an IP-Adapter 531 | model.unfuse_lora() 532 | torch.cuda.empty_cache() 533 | return (output_image,) 534 | 535 | 536 | NODE_CLASS_MAPPINGS = { 537 | "HI_Diffusers_Model_Loader": HI_Diffusers_Model_Loader, 538 | "Hi_Sampler": Hi_Sampler 539 | } 540 | 541 | NODE_DISPLAY_NAME_MAPPINGS = { 542 | "HI_Diffusers_Model_Loader": "HI_Diffusers_Model_Loader", 543 | "Hi_Sampler": "Hi_Sampler" 544 | } 545 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity.
For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 
134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 
193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ComfyUI_HiDiffusion_Pro 2 | A HiDiffusion node for ComfyUI 3 | 4 | HiDiffusion From: [link](https://github.com/megvii-research/HiDiffusion) 5 | ---- 6 | 7 | Update 8 | ---- 9 | 10 | **09/08** 11 | * adapter style now uses a single model file. 12 | * adapter style改成单体文件模式 13 | 14 | 15 | **Previous updates** 16 | *修复runwaybug / 去掉repo加载模型的方式 /自动选择模型的类别 17 | * 增加adapter_style支持，SDXL需求的显存较大，虽然能跑CPU，但是不推荐，会爆显存，SD1.5测试没问题。 18 | * 增加 manne加速Lora 19 | * 加入controlnet-tile-sdxl的支持，内置图片预处理，默认512尺寸，新增apply_window_attn 条件控制。 20 | * 修复节点连接逻辑，现在文生图模式，无需接入image，无controlnet也无需接入control_image 21 | * 支持SDXL-lighting\Hyper\LCM\DMD2\的加速Unet，建议适当提高步数； 22 | * 基于官方的更新，加入lora支持，需要填关键词； 23 | * 加入skip，去掉意义不大的其他参数； 24 | 25 | * Fixed the runway bug / removed repo-based model loading / the model type is now detected automatically 26 | * Added adapter_style support. SDXL needs a lot of VRAM; it can run on CPU, but this is not recommended because it easily runs out of memory. SD1.5 works fine in testing. 27 | * Added the manne acceleration LoRA 28 | * Added controlnet-tile-sdxl support with built-in image preprocessing (default size 512) and a new apply_window_attn option. 29 | * Fixed the node connection logic: in txt2img mode there is no need to connect an image, and without a controlnet there is no need to connect a control_image. 30 | * Support accelerated UNets such as SDXL-Lightning, Hyper, LCM and DMD2;
it is recommended to increase the number of steps appropriately; 31 | * Following the official updates, LoRA support was added; the trigger words need to be filled in; 32 | * Added clip_skip and removed other parameters of little significance; 33 | 34 | 35 | 1.Installation 36 | ----- 37 | 1.1 In the ./ComfyUI/custom_nodes directory, run the following: 38 | 39 | ``` 40 | git clone https://github.com/smthemex/ComfyUI_HiDiffusion_Pro.git 41 | ``` 42 | 1.2 Use it. 43 | 44 | 2.requirements 45 | ---- 46 | diffusers >= 0.28.0 # 0.28.0 or newer works best 47 | pyyaml, omegaconf 48 | 49 | 3 About models 50 | ---- 51 | 3.1 base ckpt 52 | ``` 53 | ├──comfyUI/models/checkpoints/ 54 | | ├──sd1.5 or sd2.1 or sdxl or playground 55 | ├──comfyUI/models/vae/ 56 | | ├──any VAE that fits the ckpt 57 | ``` 58 | 3.2 if using an SDXL controlnet 59 | ``` 60 | ├──comfyUI/models/controlnet/ 61 | | ├──any SDXL controlnet 62 | ``` 63 | 3.3 if using a Lightning UNet 64 | ``` 65 | ├──comfyUI/models/unet/ 66 | | ├──any SDXL Lightning UNet 67 | ``` 68 | 3.4 if using adapter style 69 | 70 | ``` 71 | ├── ComfyUI/models/photomaker 72 | | ├── ip-adapter_sd15.bin 73 | | ├── ip-adapter_sdxl.bin 74 | ├── ComfyUI/models/clip_vision 75 | | ├── sdxl_model.safetensors # renamed from sdxl/encoder/model.safetensors 76 | | ├── sd15_model.safetensors # renamed from sd15/encoder/model.safetensors 77 | 78 | ``` 79 | 80 | 4 example 81 | ----- 82 | 83 | new workflow example 84 | ![](https://github.com/smthemex/ComfyUI_HiDiffusion_Pro/blob/main/example/new.png) 85 | 86 | sd1.5 using ip_adapter_style 使用ip_adapter_style 87 | ![](https://github.com/smthemex/ComfyUI_HiDiffusion_Pro/blob/main/example/sd15ipstyle1.png) 88 | 89 | img2img use lora 图生图和lora 90 | ![](https://github.com/smthemex/ComfyUI_HiDiffusion_Pro/blob/main/example/img2img_lora1.png) 91 | 92 | 93 | img2img + controlnet 图生图加controlnet 94 | ![](https://github.com/smthemex/ComfyUI_HiDiffusion_Pro/blob/main/example/controlnet_img2img1.png) 95 | 96 | img2img use Hyper unet 图生图加加速unet 97 | ![](https://github.com/smthemex/ComfyUI_HiDiffusion_Pro/blob/main/example/lightingUnet1.png) 98 | 99 | 5 Citation 100 | ------ 101 | 102 | ``` 103 | @article{zhang2023hidiffusion, 104 | title={HiDiffusion: Unlocking Higher-Resolution Creativity and Efficiency in Pretrained Diffusion Models}, 105 | author={Zhang, Shen and Chen, Zhaowei and Zhao, Zhenyu and Chen, Yuhao and Tang, Yao and Liang, Jiajun}, 106 | journal={arXiv preprint arXiv:2311.17528}, 107 | year={2023} 108 | } 109 | ``` -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | python = sys.executable 4 | 5 | from .Hidiffusion_node import NODE_CLASS_MAPPINGS, NODE_DISPLAY_NAME_MAPPINGS 6 | 7 | 8 | __all__ = ['NODE_CLASS_MAPPINGS', 'NODE_DISPLAY_NAME_MAPPINGS'] 9 | -------------------------------------------------------------------------------- /example/controlnet_img2img1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/smthemex/ComfyUI_HiDiffusion_Pro/ef7e9ed7594d79d64f9031b9fe132908f512133a/example/controlnet_img2img1.png -------------------------------------------------------------------------------- /example/img2img_lora1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/smthemex/ComfyUI_HiDiffusion_Pro/ef7e9ed7594d79d64f9031b9fe132908f512133a/example/img2img_lora1.png
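For readers who want a rough idea of what the loader and sampler nodes in the README above boil down to outside ComfyUI, here is a diffusers-only sketch of a plain SDXL text-to-image run. It mirrors the calls made in Hidiffusion_node.py (apply_hidiffusion with apply_window_attn and model_type_str, "trailing" timestep spacing, VAE tiling, CPU offload after patching); the checkpoint path is a placeholder, the import path assumes the bundled hidiffusion package is importable, and the whole snippet is an approximation rather than a drop-in replacement for the node.

```python
import torch
from diffusers import StableDiffusionXLPipeline, EulerDiscreteScheduler
from hidiffusion.hidiffusion import apply_hidiffusion, remove_hidiffusion

# Any SDXL single-file checkpoint (placeholder path).
pipe = StableDiffusionXLPipeline.from_single_file(
    "/path/to/sdxl_checkpoint.safetensors", torch_dtype=torch.float16)
pipe.scheduler = EulerDiscreteScheduler.from_config(
    pipe.scheduler.config, timestep_spacing="trailing")
pipe.enable_vae_tiling()

# Patch the UNet/attention for high-resolution generation, then offload.
apply_hidiffusion(pipe, apply_window_attn=False,
                  model_type_str="stable-diffusion-xl-base-1.0")
pipe.enable_model_cpu_offload()  # must come after apply_hidiffusion

image = pipe("a girl,8k,smile,best quality",
             negative_prompt="text, watermark, lowres, low quality, worst quality",
             num_inference_steps=30, guidance_scale=7.5,
             height=2048, width=2048).images[0]
image.save("hidiffusion_2048.png")

remove_hidiffusion(pipe)  # restore the vanilla pipeline afterwards if needed
```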
-------------------------------------------------------------------------------- /example/lightingUnet1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/smthemex/ComfyUI_HiDiffusion_Pro/ef7e9ed7594d79d64f9031b9fe132908f512133a/example/lightingUnet1.png -------------------------------------------------------------------------------- /example/new.json: -------------------------------------------------------------------------------- 1 | { 2 | "last_node_id": 9, 3 | "last_link_id": 10, 4 | "nodes": [ 5 | { 6 | "id": 3, 7 | "type": "SaveImage", 8 | "pos": [ 9 | 4311, 10 | -264 11 | ], 12 | "size": { 13 | "0": 367.2918701171875, 14 | "1": 381.46820068359375 15 | }, 16 | "flags": {}, 17 | "order": 3, 18 | "mode": 0, 19 | "inputs": [ 20 | { 21 | "name": "images", 22 | "type": "IMAGE", 23 | "link": 9, 24 | "label": "images" 25 | } 26 | ], 27 | "properties": {}, 28 | "widgets_values": [ 29 | "ComfyUI" 30 | ] 31 | }, 32 | { 33 | "id": 6, 34 | "type": "LoadImage", 35 | "pos": [ 36 | 3473, 37 | 10 38 | ], 39 | "size": { 40 | "0": 315, 41 | "1": 314 42 | }, 43 | "flags": {}, 44 | "order": 0, 45 | "mode": 0, 46 | "outputs": [ 47 | { 48 | "name": "IMAGE", 49 | "type": "IMAGE", 50 | "links": [ 51 | 10 52 | ], 53 | "shape": 3, 54 | "label": "IMAGE", 55 | "slot_index": 0 56 | }, 57 | { 58 | "name": "MASK", 59 | "type": "MASK", 60 | "links": null, 61 | "shape": 3, 62 | "label": "MASK" 63 | } 64 | ], 65 | "properties": { 66 | "Node name for S&R": "LoadImage" 67 | }, 68 | "widgets_values": [ 69 | "4.jpg", 70 | "image" 71 | ] 72 | }, 73 | { 74 | "id": 8, 75 | "type": "HI_Diffusers_Model_Loader", 76 | "pos": [ 77 | 3472, 78 | -365 79 | ], 80 | "size": { 81 | "0": 315, 82 | "1": 322 83 | }, 84 | "flags": {}, 85 | "order": 1, 86 | "mode": 0, 87 | "outputs": [ 88 | { 89 | "name": "pipe", 90 | "type": "HIDIF_MODEL", 91 | "links": [ 92 | 7 93 | ], 94 | "slot_index": 0, 95 | "shape": 3, 96 | "label": "pipe" 97 | } 98 | ], 99 | "properties": { 100 | "Node name for S&R": "HI_Diffusers_Model_Loader" 101 | }, 102 | "widgets_values": [ 103 | "img2img", 104 | "0SDXL\\juggernautXL_v8Rundiffusion.safetensors", 105 | "none", 106 | "sdxl_lightning_4step_unet.safetensors", 107 | "none", 108 | "none", 109 | 0.8, 110 | "best quality", 111 | "Euler", 112 | false, 113 | "none", 114 | "none" 115 | ] 116 | }, 117 | { 118 | "id": 9, 119 | "type": "Hi_Sampler", 120 | "pos": [ 121 | 3875, 122 | -292 123 | ], 124 | "size": { 125 | "0": 400, 126 | "1": 426 127 | }, 128 | "flags": {}, 129 | "order": 2, 130 | "mode": 0, 131 | "inputs": [ 132 | { 133 | "name": "pipe", 134 | "type": "HIDIF_MODEL", 135 | "link": 7, 136 | "label": "pipe" 137 | }, 138 | { 139 | "name": "image", 140 | "type": "IMAGE", 141 | "link": 10, 142 | "label": "image" 143 | }, 144 | { 145 | "name": "control_image", 146 | "type": "IMAGE", 147 | "link": null, 148 | "label": "control_image" 149 | }, 150 | { 151 | "name": "ip_image", 152 | "type": "IMAGE", 153 | "link": null, 154 | "label": "ip_image" 155 | } 156 | ], 157 | "outputs": [ 158 | { 159 | "name": "image", 160 | "type": "IMAGE", 161 | "links": [ 162 | 9 163 | ], 164 | "slot_index": 0, 165 | "shape": 3, 166 | "label": "image" 167 | } 168 | ], 169 | "properties": { 170 | "Node name for S&R": "Hi_Sampler" 171 | }, 172 | "widgets_values": [ 173 | "a girl,8k,smile,best quality", 174 | "text, watermark, lowres, low quality, worst quality, deformed, glitch, low contrast, noisy, saturation, blurry", 175 | 0.5, 176 | 1, 177 | 512, 178 | 1108821181920656, 
179 | "randomize", 180 | 12, 181 | 3, 182 | 2048, 183 | 2048, 184 | 1 185 | ] 186 | } 187 | ], 188 | "links": [ 189 | [ 190 | 7, 191 | 8, 192 | 0, 193 | 9, 194 | 0, 195 | "HIDIF_MODEL" 196 | ], 197 | [ 198 | 9, 199 | 9, 200 | 0, 201 | 3, 202 | 0, 203 | "IMAGE" 204 | ], 205 | [ 206 | 10, 207 | 6, 208 | 0, 209 | 9, 210 | 1, 211 | "IMAGE" 212 | ] 213 | ], 214 | "groups": [], 215 | "config": {}, 216 | "extra": { 217 | "ds": { 218 | "scale": 1.0610764609500176, 219 | "offset": [ 220 | -3283.5070365830798, 221 | 461.2881609587324 222 | ] 223 | } 224 | }, 225 | "version": 0.4 226 | } -------------------------------------------------------------------------------- /example/new.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/smthemex/ComfyUI_HiDiffusion_Pro/ef7e9ed7594d79d64f9031b9fe132908f512133a/example/new.png -------------------------------------------------------------------------------- /example/sd15ipstyle1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/smthemex/ComfyUI_HiDiffusion_Pro/ef7e9ed7594d79d64f9031b9fe132908f512133a/example/sd15ipstyle1.png -------------------------------------------------------------------------------- /guided_filter.py: -------------------------------------------------------------------------------- 1 | 2 | # -*- coding: utf-8 -*- 3 | ## @package guided_filter.core.filters 4 | # 5 | # Implementation of guided filter. 6 | # * GuidedFilter: Original guided filter. 7 | # * FastGuidedFilter: Fast version of the guided filter. 8 | # @author tody 9 | # @date 2015/08/26 10 | 11 | import numpy as np 12 | import cv2 13 | 14 | ## Convert image into float32 type. 15 | def to32F(img): 16 | if img.dtype == np.float32: 17 | return img 18 | return (1.0 / 255.0) * np.float32(img) 19 | 20 | ## Convert image into uint8 type. 21 | def to8U(img): 22 | if img.dtype == np.uint8: 23 | return img 24 | return np.clip(np.uint8(255.0 * img), 0, 255) 25 | 26 | ## Return if the input image is gray or not. 27 | def _isGray(I): 28 | return len(I.shape) == 2 29 | 30 | 31 | ## Return down sampled image. 32 | # @param scale (w/s, h/s) image will be created. 33 | # @param shape I.shape[:2]=(h, w). numpy friendly size parameter. 34 | def _downSample(I, scale=4, shape=None): 35 | if shape is not None: 36 | h, w = shape 37 | return cv2.resize(I, (w, h), interpolation=cv2.INTER_NEAREST) 38 | 39 | h, w = I.shape[:2] 40 | return cv2.resize(I, (int(w / scale), int(h / scale)), interpolation=cv2.INTER_NEAREST) 41 | 42 | 43 | ## Return up sampled image. 44 | # @param scale (w*s, h*s) image will be created. 45 | # @param shape I.shape[:2]=(h, w). numpy friendly size parameter. 46 | def _upSample(I, scale=2, shape=None): 47 | if shape is not None: 48 | h, w = shape 49 | return cv2.resize(I, (w, h), interpolation=cv2.INTER_LINEAR) 50 | 51 | h, w = I.shape[:2] 52 | return cv2.resize(I, (int(w * scale), int(h * scale)), interpolation=cv2.INTER_LINEAR) 53 | 54 | ## Fast guide filter. 55 | class FastGuidedFilter: 56 | ## Constructor. 57 | # @param I Input guidance image. Color or gray. 58 | # @param radius Radius of Guided Filter. 59 | # @param epsilon Regularization term of Guided Filter. 60 | # @param scale Down sampled scale. 
61 | def __init__(self, I, radius=5, epsilon=0.4, scale=4): 62 | I_32F = to32F(I) 63 | self._I = I_32F 64 | h, w = I.shape[:2] 65 | 66 | I_sub = _downSample(I_32F, scale) 67 | 68 | self._I_sub = I_sub 69 | radius = int(radius / scale) 70 | 71 | if _isGray(I): 72 | self._guided_filter = GuidedFilterGray(I_sub, radius, epsilon) 73 | else: 74 | self._guided_filter = GuidedFilterColor(I_sub, radius, epsilon) 75 | 76 | ## Apply filter for the input image. 77 | # @param p Input image for the filtering. 78 | def filter(self, p): 79 | p_32F = to32F(p) 80 | shape_original = p.shape[:2] 81 | 82 | p_sub = _downSample(p_32F, shape=self._I_sub.shape[:2]) 83 | 84 | if _isGray(p_sub): 85 | return self._filterGray(p_sub, shape_original) 86 | 87 | cs = p.shape[2] 88 | q = np.array(p_32F) 89 | 90 | for ci in range(cs): 91 | q[:, :, ci] = self._filterGray(p_sub[:, :, ci], shape_original) 92 | return to8U(q) 93 | 94 | def _filterGray(self, p_sub, shape_original): 95 | ab_sub = self._guided_filter._computeCoefficients(p_sub) 96 | ab = [_upSample(abi, shape=shape_original) for abi in ab_sub] 97 | return self._guided_filter._computeOutput(ab, self._I) 98 | 99 | 100 | ## Guide filter. 101 | class GuidedFilter: 102 | ## Constructor. 103 | # @param I Input guidance image. Color or gray. 104 | # @param radius Radius of Guided Filter. 105 | # @param epsilon Regularization term of Guided Filter. 106 | def __init__(self, I, radius=5, epsilon=0.4): 107 | I_32F = to32F(I) 108 | 109 | if _isGray(I): 110 | self._guided_filter = GuidedFilterGray(I_32F, radius, epsilon) 111 | else: 112 | self._guided_filter = GuidedFilterColor(I_32F, radius, epsilon) 113 | 114 | ## Apply filter for the input image. 115 | # @param p Input image for the filtering. 116 | def filter(self, p): 117 | return to8U(self._guided_filter.filter(p)) 118 | 119 | 120 | ## Common parts of guided filter. 121 | # 122 | # This class is used by guided_filter class. GuidedFilterGray and GuidedFilterColor. 123 | # Based on guided_filter._computeCoefficients, guided_filter._computeOutput, 124 | # GuidedFilterCommon.filter computes filtered image for color and gray. 125 | class GuidedFilterCommon: 126 | def __init__(self, guided_filter): 127 | self._guided_filter = guided_filter 128 | 129 | ## Apply filter for the input image. 130 | # @param p Input image for the filtering. 131 | def filter(self, p): 132 | p_32F = to32F(p) 133 | if _isGray(p_32F): 134 | return self._filterGray(p_32F) 135 | 136 | cs = p.shape[2] 137 | q = np.array(p_32F) 138 | 139 | for ci in range(cs): 140 | q[:, :, ci] = self._filterGray(p_32F[:, :, ci]) 141 | return q 142 | 143 | def _filterGray(self, p): 144 | ab = self._guided_filter._computeCoefficients(p) 145 | return self._guided_filter._computeOutput(ab, self._guided_filter._I) 146 | 147 | 148 | ## Guided filter for gray guidance image. 149 | class GuidedFilterGray: 150 | # @param I Input gray guidance image. 151 | # @param radius Radius of Guided Filter. 152 | # @param epsilon Regularization term of Guided Filter. 153 | def __init__(self, I, radius=5, epsilon=0.4): 154 | self._radius = 2 * radius + 1 155 | self._epsilon = epsilon 156 | self._I = to32F(I) 157 | self._initFilter() 158 | self._filter_common = GuidedFilterCommon(self) 159 | 160 | ## Apply filter for the input image. 161 | # @param p Input image for the filtering. 
162 | def filter(self, p): 163 | return self._filter_common.filter(p) 164 | 165 | def _initFilter(self): 166 | I = self._I 167 | r = self._radius 168 | self._I_mean = cv2.blur(I, (r, r)) 169 | I_mean_sq = cv2.blur(I ** 2, (r, r)) 170 | self._I_var = I_mean_sq - self._I_mean ** 2 171 | 172 | def _computeCoefficients(self, p): 173 | r = self._radius 174 | p_mean = cv2.blur(p, (r, r)) 175 | p_cov = p_mean - self._I_mean * p_mean 176 | a = p_cov / (self._I_var + self._epsilon) 177 | b = p_mean - a * self._I_mean 178 | a_mean = cv2.blur(a, (r, r)) 179 | b_mean = cv2.blur(b, (r, r)) 180 | return a_mean, b_mean 181 | 182 | def _computeOutput(self, ab, I): 183 | a_mean, b_mean = ab 184 | return a_mean * I + b_mean 185 | 186 | 187 | ## Guided filter for color guidance image. 188 | class GuidedFilterColor: 189 | # @param I Input color guidance image. 190 | # @param radius Radius of Guided Filter. 191 | # @param epsilon Regularization term of Guided Filter. 192 | def __init__(self, I, radius=5, epsilon=0.2): 193 | self._radius = 2 * radius + 1 194 | self._epsilon = epsilon 195 | self._I = to32F(I) 196 | self._initFilter() 197 | self._filter_common = GuidedFilterCommon(self) 198 | 199 | ## Apply filter for the input image. 200 | # @param p Input image for the filtering. 201 | def filter(self, p): 202 | return self._filter_common.filter(p) 203 | 204 | def _initFilter(self): 205 | I = self._I 206 | r = self._radius 207 | eps = self._epsilon 208 | 209 | Ir, Ig, Ib = I[:, :, 0], I[:, :, 1], I[:, :, 2] 210 | 211 | self._Ir_mean = cv2.blur(Ir, (r, r)) 212 | self._Ig_mean = cv2.blur(Ig, (r, r)) 213 | self._Ib_mean = cv2.blur(Ib, (r, r)) 214 | 215 | Irr_var = cv2.blur(Ir ** 2, (r, r)) - self._Ir_mean ** 2 + eps 216 | Irg_var = cv2.blur(Ir * Ig, (r, r)) - self._Ir_mean * self._Ig_mean 217 | Irb_var = cv2.blur(Ir * Ib, (r, r)) - self._Ir_mean * self._Ib_mean 218 | Igg_var = cv2.blur(Ig * Ig, (r, r)) - self._Ig_mean * self._Ig_mean + eps 219 | Igb_var = cv2.blur(Ig * Ib, (r, r)) - self._Ig_mean * self._Ib_mean 220 | Ibb_var = cv2.blur(Ib * Ib, (r, r)) - self._Ib_mean * self._Ib_mean + eps 221 | 222 | Irr_inv = Igg_var * Ibb_var - Igb_var * Igb_var 223 | Irg_inv = Igb_var * Irb_var - Irg_var * Ibb_var 224 | Irb_inv = Irg_var * Igb_var - Igg_var * Irb_var 225 | Igg_inv = Irr_var * Ibb_var - Irb_var * Irb_var 226 | Igb_inv = Irb_var * Irg_var - Irr_var * Igb_var 227 | Ibb_inv = Irr_var * Igg_var - Irg_var * Irg_var 228 | 229 | I_cov = Irr_inv * Irr_var + Irg_inv * Irg_var + Irb_inv * Irb_var 230 | Irr_inv /= I_cov 231 | Irg_inv /= I_cov 232 | Irb_inv /= I_cov 233 | Igg_inv /= I_cov 234 | Igb_inv /= I_cov 235 | Ibb_inv /= I_cov 236 | 237 | self._Irr_inv = Irr_inv 238 | self._Irg_inv = Irg_inv 239 | self._Irb_inv = Irb_inv 240 | self._Igg_inv = Igg_inv 241 | self._Igb_inv = Igb_inv 242 | self._Ibb_inv = Ibb_inv 243 | 244 | def _computeCoefficients(self, p): 245 | r = self._radius 246 | I = self._I 247 | Ir, Ig, Ib = I[:, :, 0], I[:, :, 1], I[:, :, 2] 248 | 249 | p_mean = cv2.blur(p, (r, r)) 250 | 251 | Ipr_mean = cv2.blur(Ir * p, (r, r)) 252 | Ipg_mean = cv2.blur(Ig * p, (r, r)) 253 | Ipb_mean = cv2.blur(Ib * p, (r, r)) 254 | 255 | Ipr_cov = Ipr_mean - self._Ir_mean * p_mean 256 | Ipg_cov = Ipg_mean - self._Ig_mean * p_mean 257 | Ipb_cov = Ipb_mean - self._Ib_mean * p_mean 258 | 259 | ar = self._Irr_inv * Ipr_cov + self._Irg_inv * Ipg_cov + self._Irb_inv * Ipb_cov 260 | ag = self._Irg_inv * Ipr_cov + self._Igg_inv * Ipg_cov + self._Igb_inv * Ipb_cov 261 | ab = self._Irb_inv * Ipr_cov + self._Igb_inv * Ipg_cov + 
self._Ibb_inv * Ipb_cov 262 | b = p_mean - ar * self._Ir_mean - ag * self._Ig_mean - ab * self._Ib_mean 263 | 264 | ar_mean = cv2.blur(ar, (r, r)) 265 | ag_mean = cv2.blur(ag, (r, r)) 266 | ab_mean = cv2.blur(ab, (r, r)) 267 | b_mean = cv2.blur(b, (r, r)) 268 | 269 | return ar_mean, ag_mean, ab_mean, b_mean 270 | 271 | def _computeOutput(self, ab, I): 272 | ar_mean, ag_mean, ab_mean, b_mean = ab 273 | 274 | Ir, Ig, Ib = I[:, :, 0], I[:, :, 1], I[:, :, 2] 275 | 276 | q = (ar_mean * Ir + 277 | ag_mean * Ig + 278 | ab_mean * Ib + 279 | b_mean) 280 | 281 | return q -------------------------------------------------------------------------------- /hidiffusion/__init__.py: -------------------------------------------------------------------------------- 1 | from .hidiffusion import apply_hidiffusion, remove_hidiffusion 2 | 3 | __all__ = ["apply_hidiffusion", "remove_hidiffusion"] 4 | -------------------------------------------------------------------------------- /hidiffusion/sd_module_key/sd15_module_key.txt: -------------------------------------------------------------------------------- 1 | conv_in 2 | time_proj 3 | time_embedding 4 | time_embedding.linear_1 5 | time_embedding.act 6 | time_embedding.linear_2 7 | down_blocks 8 | down_blocks.0 9 | down_blocks.0.attentions 10 | down_blocks.0.attentions.0 11 | down_blocks.0.attentions.0.norm 12 | down_blocks.0.attentions.0.proj_in 13 | down_blocks.0.attentions.0.transformer_blocks 14 | down_blocks.0.attentions.0.transformer_blocks.0 15 | down_blocks.0.attentions.0.transformer_blocks.0.norm1 16 | down_blocks.0.attentions.0.transformer_blocks.0.attn1 17 | down_blocks.0.attentions.0.transformer_blocks.0.attn1.to_q 18 | down_blocks.0.attentions.0.transformer_blocks.0.attn1.to_k 19 | down_blocks.0.attentions.0.transformer_blocks.0.attn1.to_v 20 | down_blocks.0.attentions.0.transformer_blocks.0.attn1.to_out 21 | down_blocks.0.attentions.0.transformer_blocks.0.attn1.to_out.0 22 | down_blocks.0.attentions.0.transformer_blocks.0.attn1.to_out.1 23 | down_blocks.0.attentions.0.transformer_blocks.0.norm2 24 | down_blocks.0.attentions.0.transformer_blocks.0.attn2 25 | down_blocks.0.attentions.0.transformer_blocks.0.attn2.to_q 26 | down_blocks.0.attentions.0.transformer_blocks.0.attn2.to_k 27 | down_blocks.0.attentions.0.transformer_blocks.0.attn2.to_v 28 | down_blocks.0.attentions.0.transformer_blocks.0.attn2.to_out 29 | down_blocks.0.attentions.0.transformer_blocks.0.attn2.to_out.0 30 | down_blocks.0.attentions.0.transformer_blocks.0.attn2.to_out.1 31 | down_blocks.0.attentions.0.transformer_blocks.0.norm3 32 | down_blocks.0.attentions.0.transformer_blocks.0.ff 33 | down_blocks.0.attentions.0.transformer_blocks.0.ff.net 34 | down_blocks.0.attentions.0.transformer_blocks.0.ff.net.0 35 | down_blocks.0.attentions.0.transformer_blocks.0.ff.net.0.proj 36 | down_blocks.0.attentions.0.transformer_blocks.0.ff.net.1 37 | down_blocks.0.attentions.0.transformer_blocks.0.ff.net.2 38 | down_blocks.0.attentions.0.proj_out 39 | down_blocks.0.attentions.1 40 | down_blocks.0.attentions.1.norm 41 | down_blocks.0.attentions.1.proj_in 42 | down_blocks.0.attentions.1.transformer_blocks 43 | down_blocks.0.attentions.1.transformer_blocks.0 44 | down_blocks.0.attentions.1.transformer_blocks.0.norm1 45 | down_blocks.0.attentions.1.transformer_blocks.0.attn1 46 | down_blocks.0.attentions.1.transformer_blocks.0.attn1.to_q 47 | down_blocks.0.attentions.1.transformer_blocks.0.attn1.to_k 48 | down_blocks.0.attentions.1.transformer_blocks.0.attn1.to_v 49 | 
down_blocks.0.attentions.1.transformer_blocks.0.attn1.to_out 50 | down_blocks.0.attentions.1.transformer_blocks.0.attn1.to_out.0 51 | down_blocks.0.attentions.1.transformer_blocks.0.attn1.to_out.1 52 | down_blocks.0.attentions.1.transformer_blocks.0.norm2 53 | down_blocks.0.attentions.1.transformer_blocks.0.attn2 54 | down_blocks.0.attentions.1.transformer_blocks.0.attn2.to_q 55 | down_blocks.0.attentions.1.transformer_blocks.0.attn2.to_k 56 | down_blocks.0.attentions.1.transformer_blocks.0.attn2.to_v 57 | down_blocks.0.attentions.1.transformer_blocks.0.attn2.to_out 58 | down_blocks.0.attentions.1.transformer_blocks.0.attn2.to_out.0 59 | down_blocks.0.attentions.1.transformer_blocks.0.attn2.to_out.1 60 | down_blocks.0.attentions.1.transformer_blocks.0.norm3 61 | down_blocks.0.attentions.1.transformer_blocks.0.ff 62 | down_blocks.0.attentions.1.transformer_blocks.0.ff.net 63 | down_blocks.0.attentions.1.transformer_blocks.0.ff.net.0 64 | down_blocks.0.attentions.1.transformer_blocks.0.ff.net.0.proj 65 | down_blocks.0.attentions.1.transformer_blocks.0.ff.net.1 66 | down_blocks.0.attentions.1.transformer_blocks.0.ff.net.2 67 | down_blocks.0.attentions.1.proj_out 68 | down_blocks.0.resnets 69 | down_blocks.0.resnets.0 70 | down_blocks.0.resnets.0.norm1 71 | down_blocks.0.resnets.0.conv1 72 | down_blocks.0.resnets.0.time_emb_proj 73 | down_blocks.0.resnets.0.norm2 74 | down_blocks.0.resnets.0.dropout 75 | down_blocks.0.resnets.0.conv2 76 | down_blocks.0.resnets.1 77 | down_blocks.0.resnets.1.norm1 78 | down_blocks.0.resnets.1.conv1 79 | down_blocks.0.resnets.1.time_emb_proj 80 | down_blocks.0.resnets.1.norm2 81 | down_blocks.0.resnets.1.dropout 82 | down_blocks.0.resnets.1.conv2 83 | down_blocks.0.downsamplers 84 | down_blocks.0.downsamplers.0 85 | down_blocks.0.downsamplers.0.conv 86 | down_blocks.1 87 | down_blocks.1.attentions 88 | down_blocks.1.attentions.0 89 | down_blocks.1.attentions.0.norm 90 | down_blocks.1.attentions.0.proj_in 91 | down_blocks.1.attentions.0.transformer_blocks 92 | down_blocks.1.attentions.0.transformer_blocks.0 93 | down_blocks.1.attentions.0.transformer_blocks.0.norm1 94 | down_blocks.1.attentions.0.transformer_blocks.0.attn1 95 | down_blocks.1.attentions.0.transformer_blocks.0.attn1.to_q 96 | down_blocks.1.attentions.0.transformer_blocks.0.attn1.to_k 97 | down_blocks.1.attentions.0.transformer_blocks.0.attn1.to_v 98 | down_blocks.1.attentions.0.transformer_blocks.0.attn1.to_out 99 | down_blocks.1.attentions.0.transformer_blocks.0.attn1.to_out.0 100 | down_blocks.1.attentions.0.transformer_blocks.0.attn1.to_out.1 101 | down_blocks.1.attentions.0.transformer_blocks.0.norm2 102 | down_blocks.1.attentions.0.transformer_blocks.0.attn2 103 | down_blocks.1.attentions.0.transformer_blocks.0.attn2.to_q 104 | down_blocks.1.attentions.0.transformer_blocks.0.attn2.to_k 105 | down_blocks.1.attentions.0.transformer_blocks.0.attn2.to_v 106 | down_blocks.1.attentions.0.transformer_blocks.0.attn2.to_out 107 | down_blocks.1.attentions.0.transformer_blocks.0.attn2.to_out.0 108 | down_blocks.1.attentions.0.transformer_blocks.0.attn2.to_out.1 109 | down_blocks.1.attentions.0.transformer_blocks.0.norm3 110 | down_blocks.1.attentions.0.transformer_blocks.0.ff 111 | down_blocks.1.attentions.0.transformer_blocks.0.ff.net 112 | down_blocks.1.attentions.0.transformer_blocks.0.ff.net.0 113 | down_blocks.1.attentions.0.transformer_blocks.0.ff.net.0.proj 114 | down_blocks.1.attentions.0.transformer_blocks.0.ff.net.1 115 | down_blocks.1.attentions.0.transformer_blocks.0.ff.net.2 116 | 
down_blocks.1.attentions.0.proj_out 117 | down_blocks.1.attentions.1 118 | down_blocks.1.attentions.1.norm 119 | down_blocks.1.attentions.1.proj_in 120 | down_blocks.1.attentions.1.transformer_blocks 121 | down_blocks.1.attentions.1.transformer_blocks.0 122 | down_blocks.1.attentions.1.transformer_blocks.0.norm1 123 | down_blocks.1.attentions.1.transformer_blocks.0.attn1 124 | down_blocks.1.attentions.1.transformer_blocks.0.attn1.to_q 125 | down_blocks.1.attentions.1.transformer_blocks.0.attn1.to_k 126 | down_blocks.1.attentions.1.transformer_blocks.0.attn1.to_v 127 | down_blocks.1.attentions.1.transformer_blocks.0.attn1.to_out 128 | down_blocks.1.attentions.1.transformer_blocks.0.attn1.to_out.0 129 | down_blocks.1.attentions.1.transformer_blocks.0.attn1.to_out.1 130 | down_blocks.1.attentions.1.transformer_blocks.0.norm2 131 | down_blocks.1.attentions.1.transformer_blocks.0.attn2 132 | down_blocks.1.attentions.1.transformer_blocks.0.attn2.to_q 133 | down_blocks.1.attentions.1.transformer_blocks.0.attn2.to_k 134 | down_blocks.1.attentions.1.transformer_blocks.0.attn2.to_v 135 | down_blocks.1.attentions.1.transformer_blocks.0.attn2.to_out 136 | down_blocks.1.attentions.1.transformer_blocks.0.attn2.to_out.0 137 | down_blocks.1.attentions.1.transformer_blocks.0.attn2.to_out.1 138 | down_blocks.1.attentions.1.transformer_blocks.0.norm3 139 | down_blocks.1.attentions.1.transformer_blocks.0.ff 140 | down_blocks.1.attentions.1.transformer_blocks.0.ff.net 141 | down_blocks.1.attentions.1.transformer_blocks.0.ff.net.0 142 | down_blocks.1.attentions.1.transformer_blocks.0.ff.net.0.proj 143 | down_blocks.1.attentions.1.transformer_blocks.0.ff.net.1 144 | down_blocks.1.attentions.1.transformer_blocks.0.ff.net.2 145 | down_blocks.1.attentions.1.proj_out 146 | down_blocks.1.resnets 147 | down_blocks.1.resnets.0 148 | down_blocks.1.resnets.0.norm1 149 | down_blocks.1.resnets.0.conv1 150 | down_blocks.1.resnets.0.time_emb_proj 151 | down_blocks.1.resnets.0.norm2 152 | down_blocks.1.resnets.0.dropout 153 | down_blocks.1.resnets.0.conv2 154 | down_blocks.1.resnets.0.conv_shortcut 155 | down_blocks.1.resnets.1 156 | down_blocks.1.resnets.1.norm1 157 | down_blocks.1.resnets.1.conv1 158 | down_blocks.1.resnets.1.time_emb_proj 159 | down_blocks.1.resnets.1.norm2 160 | down_blocks.1.resnets.1.dropout 161 | down_blocks.1.resnets.1.conv2 162 | down_blocks.1.downsamplers 163 | down_blocks.1.downsamplers.0 164 | down_blocks.1.downsamplers.0.conv 165 | down_blocks.2 166 | down_blocks.2.attentions 167 | down_blocks.2.attentions.0 168 | down_blocks.2.attentions.0.norm 169 | down_blocks.2.attentions.0.proj_in 170 | down_blocks.2.attentions.0.transformer_blocks 171 | down_blocks.2.attentions.0.transformer_blocks.0 172 | down_blocks.2.attentions.0.transformer_blocks.0.norm1 173 | down_blocks.2.attentions.0.transformer_blocks.0.attn1 174 | down_blocks.2.attentions.0.transformer_blocks.0.attn1.to_q 175 | down_blocks.2.attentions.0.transformer_blocks.0.attn1.to_k 176 | down_blocks.2.attentions.0.transformer_blocks.0.attn1.to_v 177 | down_blocks.2.attentions.0.transformer_blocks.0.attn1.to_out 178 | down_blocks.2.attentions.0.transformer_blocks.0.attn1.to_out.0 179 | down_blocks.2.attentions.0.transformer_blocks.0.attn1.to_out.1 180 | down_blocks.2.attentions.0.transformer_blocks.0.norm2 181 | down_blocks.2.attentions.0.transformer_blocks.0.attn2 182 | down_blocks.2.attentions.0.transformer_blocks.0.attn2.to_q 183 | down_blocks.2.attentions.0.transformer_blocks.0.attn2.to_k 184 | 
down_blocks.2.attentions.0.transformer_blocks.0.attn2.to_v 185 | down_blocks.2.attentions.0.transformer_blocks.0.attn2.to_out 186 | down_blocks.2.attentions.0.transformer_blocks.0.attn2.to_out.0 187 | down_blocks.2.attentions.0.transformer_blocks.0.attn2.to_out.1 188 | down_blocks.2.attentions.0.transformer_blocks.0.norm3 189 | down_blocks.2.attentions.0.transformer_blocks.0.ff 190 | down_blocks.2.attentions.0.transformer_blocks.0.ff.net 191 | down_blocks.2.attentions.0.transformer_blocks.0.ff.net.0 192 | down_blocks.2.attentions.0.transformer_blocks.0.ff.net.0.proj 193 | down_blocks.2.attentions.0.transformer_blocks.0.ff.net.1 194 | down_blocks.2.attentions.0.transformer_blocks.0.ff.net.2 195 | down_blocks.2.attentions.0.proj_out 196 | down_blocks.2.attentions.1 197 | down_blocks.2.attentions.1.norm 198 | down_blocks.2.attentions.1.proj_in 199 | down_blocks.2.attentions.1.transformer_blocks 200 | down_blocks.2.attentions.1.transformer_blocks.0 201 | down_blocks.2.attentions.1.transformer_blocks.0.norm1 202 | down_blocks.2.attentions.1.transformer_blocks.0.attn1 203 | down_blocks.2.attentions.1.transformer_blocks.0.attn1.to_q 204 | down_blocks.2.attentions.1.transformer_blocks.0.attn1.to_k 205 | down_blocks.2.attentions.1.transformer_blocks.0.attn1.to_v 206 | down_blocks.2.attentions.1.transformer_blocks.0.attn1.to_out 207 | down_blocks.2.attentions.1.transformer_blocks.0.attn1.to_out.0 208 | down_blocks.2.attentions.1.transformer_blocks.0.attn1.to_out.1 209 | down_blocks.2.attentions.1.transformer_blocks.0.norm2 210 | down_blocks.2.attentions.1.transformer_blocks.0.attn2 211 | down_blocks.2.attentions.1.transformer_blocks.0.attn2.to_q 212 | down_blocks.2.attentions.1.transformer_blocks.0.attn2.to_k 213 | down_blocks.2.attentions.1.transformer_blocks.0.attn2.to_v 214 | down_blocks.2.attentions.1.transformer_blocks.0.attn2.to_out 215 | down_blocks.2.attentions.1.transformer_blocks.0.attn2.to_out.0 216 | down_blocks.2.attentions.1.transformer_blocks.0.attn2.to_out.1 217 | down_blocks.2.attentions.1.transformer_blocks.0.norm3 218 | down_blocks.2.attentions.1.transformer_blocks.0.ff 219 | down_blocks.2.attentions.1.transformer_blocks.0.ff.net 220 | down_blocks.2.attentions.1.transformer_blocks.0.ff.net.0 221 | down_blocks.2.attentions.1.transformer_blocks.0.ff.net.0.proj 222 | down_blocks.2.attentions.1.transformer_blocks.0.ff.net.1 223 | down_blocks.2.attentions.1.transformer_blocks.0.ff.net.2 224 | down_blocks.2.attentions.1.proj_out 225 | down_blocks.2.resnets 226 | down_blocks.2.resnets.0 227 | down_blocks.2.resnets.0.norm1 228 | down_blocks.2.resnets.0.conv1 229 | down_blocks.2.resnets.0.time_emb_proj 230 | down_blocks.2.resnets.0.norm2 231 | down_blocks.2.resnets.0.dropout 232 | down_blocks.2.resnets.0.conv2 233 | down_blocks.2.resnets.0.conv_shortcut 234 | down_blocks.2.resnets.1 235 | down_blocks.2.resnets.1.norm1 236 | down_blocks.2.resnets.1.conv1 237 | down_blocks.2.resnets.1.time_emb_proj 238 | down_blocks.2.resnets.1.norm2 239 | down_blocks.2.resnets.1.dropout 240 | down_blocks.2.resnets.1.conv2 241 | down_blocks.2.downsamplers 242 | down_blocks.2.downsamplers.0 243 | down_blocks.2.downsamplers.0.conv 244 | down_blocks.3 245 | down_blocks.3.resnets 246 | down_blocks.3.resnets.0 247 | down_blocks.3.resnets.0.norm1 248 | down_blocks.3.resnets.0.conv1 249 | down_blocks.3.resnets.0.time_emb_proj 250 | down_blocks.3.resnets.0.norm2 251 | down_blocks.3.resnets.0.dropout 252 | down_blocks.3.resnets.0.conv2 253 | down_blocks.3.resnets.1 254 | down_blocks.3.resnets.1.norm1 255 | 
down_blocks.3.resnets.1.conv1 256 | down_blocks.3.resnets.1.time_emb_proj 257 | down_blocks.3.resnets.1.norm2 258 | down_blocks.3.resnets.1.dropout 259 | down_blocks.3.resnets.1.conv2 260 | up_blocks 261 | up_blocks.0 262 | up_blocks.0.resnets 263 | up_blocks.0.resnets.0 264 | up_blocks.0.resnets.0.norm1 265 | up_blocks.0.resnets.0.conv1 266 | up_blocks.0.resnets.0.time_emb_proj 267 | up_blocks.0.resnets.0.norm2 268 | up_blocks.0.resnets.0.dropout 269 | up_blocks.0.resnets.0.conv2 270 | up_blocks.0.resnets.0.conv_shortcut 271 | up_blocks.0.resnets.1 272 | up_blocks.0.resnets.1.norm1 273 | up_blocks.0.resnets.1.conv1 274 | up_blocks.0.resnets.1.time_emb_proj 275 | up_blocks.0.resnets.1.norm2 276 | up_blocks.0.resnets.1.dropout 277 | up_blocks.0.resnets.1.conv2 278 | up_blocks.0.resnets.1.conv_shortcut 279 | up_blocks.0.resnets.2 280 | up_blocks.0.resnets.2.norm1 281 | up_blocks.0.resnets.2.conv1 282 | up_blocks.0.resnets.2.time_emb_proj 283 | up_blocks.0.resnets.2.norm2 284 | up_blocks.0.resnets.2.dropout 285 | up_blocks.0.resnets.2.conv2 286 | up_blocks.0.resnets.2.conv_shortcut 287 | up_blocks.0.upsamplers 288 | up_blocks.0.upsamplers.0 289 | up_blocks.0.upsamplers.0.conv 290 | up_blocks.1 291 | up_blocks.1.attentions 292 | up_blocks.1.attentions.0 293 | up_blocks.1.attentions.0.norm 294 | up_blocks.1.attentions.0.proj_in 295 | up_blocks.1.attentions.0.transformer_blocks 296 | up_blocks.1.attentions.0.transformer_blocks.0 297 | up_blocks.1.attentions.0.transformer_blocks.0.norm1 298 | up_blocks.1.attentions.0.transformer_blocks.0.attn1 299 | up_blocks.1.attentions.0.transformer_blocks.0.attn1.to_q 300 | up_blocks.1.attentions.0.transformer_blocks.0.attn1.to_k 301 | up_blocks.1.attentions.0.transformer_blocks.0.attn1.to_v 302 | up_blocks.1.attentions.0.transformer_blocks.0.attn1.to_out 303 | up_blocks.1.attentions.0.transformer_blocks.0.attn1.to_out.0 304 | up_blocks.1.attentions.0.transformer_blocks.0.attn1.to_out.1 305 | up_blocks.1.attentions.0.transformer_blocks.0.norm2 306 | up_blocks.1.attentions.0.transformer_blocks.0.attn2 307 | up_blocks.1.attentions.0.transformer_blocks.0.attn2.to_q 308 | up_blocks.1.attentions.0.transformer_blocks.0.attn2.to_k 309 | up_blocks.1.attentions.0.transformer_blocks.0.attn2.to_v 310 | up_blocks.1.attentions.0.transformer_blocks.0.attn2.to_out 311 | up_blocks.1.attentions.0.transformer_blocks.0.attn2.to_out.0 312 | up_blocks.1.attentions.0.transformer_blocks.0.attn2.to_out.1 313 | up_blocks.1.attentions.0.transformer_blocks.0.norm3 314 | up_blocks.1.attentions.0.transformer_blocks.0.ff 315 | up_blocks.1.attentions.0.transformer_blocks.0.ff.net 316 | up_blocks.1.attentions.0.transformer_blocks.0.ff.net.0 317 | up_blocks.1.attentions.0.transformer_blocks.0.ff.net.0.proj 318 | up_blocks.1.attentions.0.transformer_blocks.0.ff.net.1 319 | up_blocks.1.attentions.0.transformer_blocks.0.ff.net.2 320 | up_blocks.1.attentions.0.proj_out 321 | up_blocks.1.attentions.1 322 | up_blocks.1.attentions.1.norm 323 | up_blocks.1.attentions.1.proj_in 324 | up_blocks.1.attentions.1.transformer_blocks 325 | up_blocks.1.attentions.1.transformer_blocks.0 326 | up_blocks.1.attentions.1.transformer_blocks.0.norm1 327 | up_blocks.1.attentions.1.transformer_blocks.0.attn1 328 | up_blocks.1.attentions.1.transformer_blocks.0.attn1.to_q 329 | up_blocks.1.attentions.1.transformer_blocks.0.attn1.to_k 330 | up_blocks.1.attentions.1.transformer_blocks.0.attn1.to_v 331 | up_blocks.1.attentions.1.transformer_blocks.0.attn1.to_out 332 | 
up_blocks.1.attentions.1.transformer_blocks.0.attn1.to_out.0 333 | up_blocks.1.attentions.1.transformer_blocks.0.attn1.to_out.1 334 | up_blocks.1.attentions.1.transformer_blocks.0.norm2 335 | up_blocks.1.attentions.1.transformer_blocks.0.attn2 336 | up_blocks.1.attentions.1.transformer_blocks.0.attn2.to_q 337 | up_blocks.1.attentions.1.transformer_blocks.0.attn2.to_k 338 | up_blocks.1.attentions.1.transformer_blocks.0.attn2.to_v 339 | up_blocks.1.attentions.1.transformer_blocks.0.attn2.to_out 340 | up_blocks.1.attentions.1.transformer_blocks.0.attn2.to_out.0 341 | up_blocks.1.attentions.1.transformer_blocks.0.attn2.to_out.1 342 | up_blocks.1.attentions.1.transformer_blocks.0.norm3 343 | up_blocks.1.attentions.1.transformer_blocks.0.ff 344 | up_blocks.1.attentions.1.transformer_blocks.0.ff.net 345 | up_blocks.1.attentions.1.transformer_blocks.0.ff.net.0 346 | up_blocks.1.attentions.1.transformer_blocks.0.ff.net.0.proj 347 | up_blocks.1.attentions.1.transformer_blocks.0.ff.net.1 348 | up_blocks.1.attentions.1.transformer_blocks.0.ff.net.2 349 | up_blocks.1.attentions.1.proj_out 350 | up_blocks.1.attentions.2 351 | up_blocks.1.attentions.2.norm 352 | up_blocks.1.attentions.2.proj_in 353 | up_blocks.1.attentions.2.transformer_blocks 354 | up_blocks.1.attentions.2.transformer_blocks.0 355 | up_blocks.1.attentions.2.transformer_blocks.0.norm1 356 | up_blocks.1.attentions.2.transformer_blocks.0.attn1 357 | up_blocks.1.attentions.2.transformer_blocks.0.attn1.to_q 358 | up_blocks.1.attentions.2.transformer_blocks.0.attn1.to_k 359 | up_blocks.1.attentions.2.transformer_blocks.0.attn1.to_v 360 | up_blocks.1.attentions.2.transformer_blocks.0.attn1.to_out 361 | up_blocks.1.attentions.2.transformer_blocks.0.attn1.to_out.0 362 | up_blocks.1.attentions.2.transformer_blocks.0.attn1.to_out.1 363 | up_blocks.1.attentions.2.transformer_blocks.0.norm2 364 | up_blocks.1.attentions.2.transformer_blocks.0.attn2 365 | up_blocks.1.attentions.2.transformer_blocks.0.attn2.to_q 366 | up_blocks.1.attentions.2.transformer_blocks.0.attn2.to_k 367 | up_blocks.1.attentions.2.transformer_blocks.0.attn2.to_v 368 | up_blocks.1.attentions.2.transformer_blocks.0.attn2.to_out 369 | up_blocks.1.attentions.2.transformer_blocks.0.attn2.to_out.0 370 | up_blocks.1.attentions.2.transformer_blocks.0.attn2.to_out.1 371 | up_blocks.1.attentions.2.transformer_blocks.0.norm3 372 | up_blocks.1.attentions.2.transformer_blocks.0.ff 373 | up_blocks.1.attentions.2.transformer_blocks.0.ff.net 374 | up_blocks.1.attentions.2.transformer_blocks.0.ff.net.0 375 | up_blocks.1.attentions.2.transformer_blocks.0.ff.net.0.proj 376 | up_blocks.1.attentions.2.transformer_blocks.0.ff.net.1 377 | up_blocks.1.attentions.2.transformer_blocks.0.ff.net.2 378 | up_blocks.1.attentions.2.proj_out 379 | up_blocks.1.resnets 380 | up_blocks.1.resnets.0 381 | up_blocks.1.resnets.0.norm1 382 | up_blocks.1.resnets.0.conv1 383 | up_blocks.1.resnets.0.time_emb_proj 384 | up_blocks.1.resnets.0.norm2 385 | up_blocks.1.resnets.0.dropout 386 | up_blocks.1.resnets.0.conv2 387 | up_blocks.1.resnets.0.conv_shortcut 388 | up_blocks.1.resnets.1 389 | up_blocks.1.resnets.1.norm1 390 | up_blocks.1.resnets.1.conv1 391 | up_blocks.1.resnets.1.time_emb_proj 392 | up_blocks.1.resnets.1.norm2 393 | up_blocks.1.resnets.1.dropout 394 | up_blocks.1.resnets.1.conv2 395 | up_blocks.1.resnets.1.conv_shortcut 396 | up_blocks.1.resnets.2 397 | up_blocks.1.resnets.2.norm1 398 | up_blocks.1.resnets.2.conv1 399 | up_blocks.1.resnets.2.time_emb_proj 400 | up_blocks.1.resnets.2.norm2 401 | 
up_blocks.1.resnets.2.dropout 402 | up_blocks.1.resnets.2.conv2 403 | up_blocks.1.resnets.2.conv_shortcut 404 | up_blocks.1.upsamplers 405 | up_blocks.1.upsamplers.0 406 | up_blocks.1.upsamplers.0.conv 407 | up_blocks.2 408 | up_blocks.2.attentions 409 | up_blocks.2.attentions.0 410 | up_blocks.2.attentions.0.norm 411 | up_blocks.2.attentions.0.proj_in 412 | up_blocks.2.attentions.0.transformer_blocks 413 | up_blocks.2.attentions.0.transformer_blocks.0 414 | up_blocks.2.attentions.0.transformer_blocks.0.norm1 415 | up_blocks.2.attentions.0.transformer_blocks.0.attn1 416 | up_blocks.2.attentions.0.transformer_blocks.0.attn1.to_q 417 | up_blocks.2.attentions.0.transformer_blocks.0.attn1.to_k 418 | up_blocks.2.attentions.0.transformer_blocks.0.attn1.to_v 419 | up_blocks.2.attentions.0.transformer_blocks.0.attn1.to_out 420 | up_blocks.2.attentions.0.transformer_blocks.0.attn1.to_out.0 421 | up_blocks.2.attentions.0.transformer_blocks.0.attn1.to_out.1 422 | up_blocks.2.attentions.0.transformer_blocks.0.norm2 423 | up_blocks.2.attentions.0.transformer_blocks.0.attn2 424 | up_blocks.2.attentions.0.transformer_blocks.0.attn2.to_q 425 | up_blocks.2.attentions.0.transformer_blocks.0.attn2.to_k 426 | up_blocks.2.attentions.0.transformer_blocks.0.attn2.to_v 427 | up_blocks.2.attentions.0.transformer_blocks.0.attn2.to_out 428 | up_blocks.2.attentions.0.transformer_blocks.0.attn2.to_out.0 429 | up_blocks.2.attentions.0.transformer_blocks.0.attn2.to_out.1 430 | up_blocks.2.attentions.0.transformer_blocks.0.norm3 431 | up_blocks.2.attentions.0.transformer_blocks.0.ff 432 | up_blocks.2.attentions.0.transformer_blocks.0.ff.net 433 | up_blocks.2.attentions.0.transformer_blocks.0.ff.net.0 434 | up_blocks.2.attentions.0.transformer_blocks.0.ff.net.0.proj 435 | up_blocks.2.attentions.0.transformer_blocks.0.ff.net.1 436 | up_blocks.2.attentions.0.transformer_blocks.0.ff.net.2 437 | up_blocks.2.attentions.0.proj_out 438 | up_blocks.2.attentions.1 439 | up_blocks.2.attentions.1.norm 440 | up_blocks.2.attentions.1.proj_in 441 | up_blocks.2.attentions.1.transformer_blocks 442 | up_blocks.2.attentions.1.transformer_blocks.0 443 | up_blocks.2.attentions.1.transformer_blocks.0.norm1 444 | up_blocks.2.attentions.1.transformer_blocks.0.attn1 445 | up_blocks.2.attentions.1.transformer_blocks.0.attn1.to_q 446 | up_blocks.2.attentions.1.transformer_blocks.0.attn1.to_k 447 | up_blocks.2.attentions.1.transformer_blocks.0.attn1.to_v 448 | up_blocks.2.attentions.1.transformer_blocks.0.attn1.to_out 449 | up_blocks.2.attentions.1.transformer_blocks.0.attn1.to_out.0 450 | up_blocks.2.attentions.1.transformer_blocks.0.attn1.to_out.1 451 | up_blocks.2.attentions.1.transformer_blocks.0.norm2 452 | up_blocks.2.attentions.1.transformer_blocks.0.attn2 453 | up_blocks.2.attentions.1.transformer_blocks.0.attn2.to_q 454 | up_blocks.2.attentions.1.transformer_blocks.0.attn2.to_k 455 | up_blocks.2.attentions.1.transformer_blocks.0.attn2.to_v 456 | up_blocks.2.attentions.1.transformer_blocks.0.attn2.to_out 457 | up_blocks.2.attentions.1.transformer_blocks.0.attn2.to_out.0 458 | up_blocks.2.attentions.1.transformer_blocks.0.attn2.to_out.1 459 | up_blocks.2.attentions.1.transformer_blocks.0.norm3 460 | up_blocks.2.attentions.1.transformer_blocks.0.ff 461 | up_blocks.2.attentions.1.transformer_blocks.0.ff.net 462 | up_blocks.2.attentions.1.transformer_blocks.0.ff.net.0 463 | up_blocks.2.attentions.1.transformer_blocks.0.ff.net.0.proj 464 | up_blocks.2.attentions.1.transformer_blocks.0.ff.net.1 465 | 
up_blocks.2.attentions.1.transformer_blocks.0.ff.net.2 466 | up_blocks.2.attentions.1.proj_out 467 | up_blocks.2.attentions.2 468 | up_blocks.2.attentions.2.norm 469 | up_blocks.2.attentions.2.proj_in 470 | up_blocks.2.attentions.2.transformer_blocks 471 | up_blocks.2.attentions.2.transformer_blocks.0 472 | up_blocks.2.attentions.2.transformer_blocks.0.norm1 473 | up_blocks.2.attentions.2.transformer_blocks.0.attn1 474 | up_blocks.2.attentions.2.transformer_blocks.0.attn1.to_q 475 | up_blocks.2.attentions.2.transformer_blocks.0.attn1.to_k 476 | up_blocks.2.attentions.2.transformer_blocks.0.attn1.to_v 477 | up_blocks.2.attentions.2.transformer_blocks.0.attn1.to_out 478 | up_blocks.2.attentions.2.transformer_blocks.0.attn1.to_out.0 479 | up_blocks.2.attentions.2.transformer_blocks.0.attn1.to_out.1 480 | up_blocks.2.attentions.2.transformer_blocks.0.norm2 481 | up_blocks.2.attentions.2.transformer_blocks.0.attn2 482 | up_blocks.2.attentions.2.transformer_blocks.0.attn2.to_q 483 | up_blocks.2.attentions.2.transformer_blocks.0.attn2.to_k 484 | up_blocks.2.attentions.2.transformer_blocks.0.attn2.to_v 485 | up_blocks.2.attentions.2.transformer_blocks.0.attn2.to_out 486 | up_blocks.2.attentions.2.transformer_blocks.0.attn2.to_out.0 487 | up_blocks.2.attentions.2.transformer_blocks.0.attn2.to_out.1 488 | up_blocks.2.attentions.2.transformer_blocks.0.norm3 489 | up_blocks.2.attentions.2.transformer_blocks.0.ff 490 | up_blocks.2.attentions.2.transformer_blocks.0.ff.net 491 | up_blocks.2.attentions.2.transformer_blocks.0.ff.net.0 492 | up_blocks.2.attentions.2.transformer_blocks.0.ff.net.0.proj 493 | up_blocks.2.attentions.2.transformer_blocks.0.ff.net.1 494 | up_blocks.2.attentions.2.transformer_blocks.0.ff.net.2 495 | up_blocks.2.attentions.2.proj_out 496 | up_blocks.2.resnets 497 | up_blocks.2.resnets.0 498 | up_blocks.2.resnets.0.norm1 499 | up_blocks.2.resnets.0.conv1 500 | up_blocks.2.resnets.0.time_emb_proj 501 | up_blocks.2.resnets.0.norm2 502 | up_blocks.2.resnets.0.dropout 503 | up_blocks.2.resnets.0.conv2 504 | up_blocks.2.resnets.0.conv_shortcut 505 | up_blocks.2.resnets.1 506 | up_blocks.2.resnets.1.norm1 507 | up_blocks.2.resnets.1.conv1 508 | up_blocks.2.resnets.1.time_emb_proj 509 | up_blocks.2.resnets.1.norm2 510 | up_blocks.2.resnets.1.dropout 511 | up_blocks.2.resnets.1.conv2 512 | up_blocks.2.resnets.1.conv_shortcut 513 | up_blocks.2.resnets.2 514 | up_blocks.2.resnets.2.norm1 515 | up_blocks.2.resnets.2.conv1 516 | up_blocks.2.resnets.2.time_emb_proj 517 | up_blocks.2.resnets.2.norm2 518 | up_blocks.2.resnets.2.dropout 519 | up_blocks.2.resnets.2.conv2 520 | up_blocks.2.resnets.2.conv_shortcut 521 | up_blocks.2.upsamplers 522 | up_blocks.2.upsamplers.0 523 | up_blocks.2.upsamplers.0.conv 524 | up_blocks.3 525 | up_blocks.3.attentions 526 | up_blocks.3.attentions.0 527 | up_blocks.3.attentions.0.norm 528 | up_blocks.3.attentions.0.proj_in 529 | up_blocks.3.attentions.0.transformer_blocks 530 | up_blocks.3.attentions.0.transformer_blocks.0 531 | up_blocks.3.attentions.0.transformer_blocks.0.norm1 532 | up_blocks.3.attentions.0.transformer_blocks.0.attn1 533 | up_blocks.3.attentions.0.transformer_blocks.0.attn1.to_q 534 | up_blocks.3.attentions.0.transformer_blocks.0.attn1.to_k 535 | up_blocks.3.attentions.0.transformer_blocks.0.attn1.to_v 536 | up_blocks.3.attentions.0.transformer_blocks.0.attn1.to_out 537 | up_blocks.3.attentions.0.transformer_blocks.0.attn1.to_out.0 538 | up_blocks.3.attentions.0.transformer_blocks.0.attn1.to_out.1 539 | 
up_blocks.3.attentions.0.transformer_blocks.0.norm2 540 | up_blocks.3.attentions.0.transformer_blocks.0.attn2 541 | up_blocks.3.attentions.0.transformer_blocks.0.attn2.to_q 542 | up_blocks.3.attentions.0.transformer_blocks.0.attn2.to_k 543 | up_blocks.3.attentions.0.transformer_blocks.0.attn2.to_v 544 | up_blocks.3.attentions.0.transformer_blocks.0.attn2.to_out 545 | up_blocks.3.attentions.0.transformer_blocks.0.attn2.to_out.0 546 | up_blocks.3.attentions.0.transformer_blocks.0.attn2.to_out.1 547 | up_blocks.3.attentions.0.transformer_blocks.0.norm3 548 | up_blocks.3.attentions.0.transformer_blocks.0.ff 549 | up_blocks.3.attentions.0.transformer_blocks.0.ff.net 550 | up_blocks.3.attentions.0.transformer_blocks.0.ff.net.0 551 | up_blocks.3.attentions.0.transformer_blocks.0.ff.net.0.proj 552 | up_blocks.3.attentions.0.transformer_blocks.0.ff.net.1 553 | up_blocks.3.attentions.0.transformer_blocks.0.ff.net.2 554 | up_blocks.3.attentions.0.proj_out 555 | up_blocks.3.attentions.1 556 | up_blocks.3.attentions.1.norm 557 | up_blocks.3.attentions.1.proj_in 558 | up_blocks.3.attentions.1.transformer_blocks 559 | up_blocks.3.attentions.1.transformer_blocks.0 560 | up_blocks.3.attentions.1.transformer_blocks.0.norm1 561 | up_blocks.3.attentions.1.transformer_blocks.0.attn1 562 | up_blocks.3.attentions.1.transformer_blocks.0.attn1.to_q 563 | up_blocks.3.attentions.1.transformer_blocks.0.attn1.to_k 564 | up_blocks.3.attentions.1.transformer_blocks.0.attn1.to_v 565 | up_blocks.3.attentions.1.transformer_blocks.0.attn1.to_out 566 | up_blocks.3.attentions.1.transformer_blocks.0.attn1.to_out.0 567 | up_blocks.3.attentions.1.transformer_blocks.0.attn1.to_out.1 568 | up_blocks.3.attentions.1.transformer_blocks.0.norm2 569 | up_blocks.3.attentions.1.transformer_blocks.0.attn2 570 | up_blocks.3.attentions.1.transformer_blocks.0.attn2.to_q 571 | up_blocks.3.attentions.1.transformer_blocks.0.attn2.to_k 572 | up_blocks.3.attentions.1.transformer_blocks.0.attn2.to_v 573 | up_blocks.3.attentions.1.transformer_blocks.0.attn2.to_out 574 | up_blocks.3.attentions.1.transformer_blocks.0.attn2.to_out.0 575 | up_blocks.3.attentions.1.transformer_blocks.0.attn2.to_out.1 576 | up_blocks.3.attentions.1.transformer_blocks.0.norm3 577 | up_blocks.3.attentions.1.transformer_blocks.0.ff 578 | up_blocks.3.attentions.1.transformer_blocks.0.ff.net 579 | up_blocks.3.attentions.1.transformer_blocks.0.ff.net.0 580 | up_blocks.3.attentions.1.transformer_blocks.0.ff.net.0.proj 581 | up_blocks.3.attentions.1.transformer_blocks.0.ff.net.1 582 | up_blocks.3.attentions.1.transformer_blocks.0.ff.net.2 583 | up_blocks.3.attentions.1.proj_out 584 | up_blocks.3.attentions.2 585 | up_blocks.3.attentions.2.norm 586 | up_blocks.3.attentions.2.proj_in 587 | up_blocks.3.attentions.2.transformer_blocks 588 | up_blocks.3.attentions.2.transformer_blocks.0 589 | up_blocks.3.attentions.2.transformer_blocks.0.norm1 590 | up_blocks.3.attentions.2.transformer_blocks.0.attn1 591 | up_blocks.3.attentions.2.transformer_blocks.0.attn1.to_q 592 | up_blocks.3.attentions.2.transformer_blocks.0.attn1.to_k 593 | up_blocks.3.attentions.2.transformer_blocks.0.attn1.to_v 594 | up_blocks.3.attentions.2.transformer_blocks.0.attn1.to_out 595 | up_blocks.3.attentions.2.transformer_blocks.0.attn1.to_out.0 596 | up_blocks.3.attentions.2.transformer_blocks.0.attn1.to_out.1 597 | up_blocks.3.attentions.2.transformer_blocks.0.norm2 598 | up_blocks.3.attentions.2.transformer_blocks.0.attn2 599 | up_blocks.3.attentions.2.transformer_blocks.0.attn2.to_q 600 | 
up_blocks.3.attentions.2.transformer_blocks.0.attn2.to_k 601 | up_blocks.3.attentions.2.transformer_blocks.0.attn2.to_v 602 | up_blocks.3.attentions.2.transformer_blocks.0.attn2.to_out 603 | up_blocks.3.attentions.2.transformer_blocks.0.attn2.to_out.0 604 | up_blocks.3.attentions.2.transformer_blocks.0.attn2.to_out.1 605 | up_blocks.3.attentions.2.transformer_blocks.0.norm3 606 | up_blocks.3.attentions.2.transformer_blocks.0.ff 607 | up_blocks.3.attentions.2.transformer_blocks.0.ff.net 608 | up_blocks.3.attentions.2.transformer_blocks.0.ff.net.0 609 | up_blocks.3.attentions.2.transformer_blocks.0.ff.net.0.proj 610 | up_blocks.3.attentions.2.transformer_blocks.0.ff.net.1 611 | up_blocks.3.attentions.2.transformer_blocks.0.ff.net.2 612 | up_blocks.3.attentions.2.proj_out 613 | up_blocks.3.resnets 614 | up_blocks.3.resnets.0 615 | up_blocks.3.resnets.0.norm1 616 | up_blocks.3.resnets.0.conv1 617 | up_blocks.3.resnets.0.time_emb_proj 618 | up_blocks.3.resnets.0.norm2 619 | up_blocks.3.resnets.0.dropout 620 | up_blocks.3.resnets.0.conv2 621 | up_blocks.3.resnets.0.conv_shortcut 622 | up_blocks.3.resnets.1 623 | up_blocks.3.resnets.1.norm1 624 | up_blocks.3.resnets.1.conv1 625 | up_blocks.3.resnets.1.time_emb_proj 626 | up_blocks.3.resnets.1.norm2 627 | up_blocks.3.resnets.1.dropout 628 | up_blocks.3.resnets.1.conv2 629 | up_blocks.3.resnets.1.conv_shortcut 630 | up_blocks.3.resnets.2 631 | up_blocks.3.resnets.2.norm1 632 | up_blocks.3.resnets.2.conv1 633 | up_blocks.3.resnets.2.time_emb_proj 634 | up_blocks.3.resnets.2.norm2 635 | up_blocks.3.resnets.2.dropout 636 | up_blocks.3.resnets.2.conv2 637 | up_blocks.3.resnets.2.conv_shortcut 638 | mid_block 639 | mid_block.attentions 640 | mid_block.attentions.0 641 | mid_block.attentions.0.norm 642 | mid_block.attentions.0.proj_in 643 | mid_block.attentions.0.transformer_blocks 644 | mid_block.attentions.0.transformer_blocks.0 645 | mid_block.attentions.0.transformer_blocks.0.norm1 646 | mid_block.attentions.0.transformer_blocks.0.attn1 647 | mid_block.attentions.0.transformer_blocks.0.attn1.to_q 648 | mid_block.attentions.0.transformer_blocks.0.attn1.to_k 649 | mid_block.attentions.0.transformer_blocks.0.attn1.to_v 650 | mid_block.attentions.0.transformer_blocks.0.attn1.to_out 651 | mid_block.attentions.0.transformer_blocks.0.attn1.to_out.0 652 | mid_block.attentions.0.transformer_blocks.0.attn1.to_out.1 653 | mid_block.attentions.0.transformer_blocks.0.norm2 654 | mid_block.attentions.0.transformer_blocks.0.attn2 655 | mid_block.attentions.0.transformer_blocks.0.attn2.to_q 656 | mid_block.attentions.0.transformer_blocks.0.attn2.to_k 657 | mid_block.attentions.0.transformer_blocks.0.attn2.to_v 658 | mid_block.attentions.0.transformer_blocks.0.attn2.to_out 659 | mid_block.attentions.0.transformer_blocks.0.attn2.to_out.0 660 | mid_block.attentions.0.transformer_blocks.0.attn2.to_out.1 661 | mid_block.attentions.0.transformer_blocks.0.norm3 662 | mid_block.attentions.0.transformer_blocks.0.ff 663 | mid_block.attentions.0.transformer_blocks.0.ff.net 664 | mid_block.attentions.0.transformer_blocks.0.ff.net.0 665 | mid_block.attentions.0.transformer_blocks.0.ff.net.0.proj 666 | mid_block.attentions.0.transformer_blocks.0.ff.net.1 667 | mid_block.attentions.0.transformer_blocks.0.ff.net.2 668 | mid_block.attentions.0.proj_out 669 | mid_block.resnets 670 | mid_block.resnets.0 671 | mid_block.resnets.0.norm1 672 | mid_block.resnets.0.conv1 673 | mid_block.resnets.0.time_emb_proj 674 | mid_block.resnets.0.norm2 675 | mid_block.resnets.0.dropout 676 | 
mid_block.resnets.0.conv2 677 | mid_block.resnets.1 678 | mid_block.resnets.1.norm1 679 | mid_block.resnets.1.conv1 680 | mid_block.resnets.1.time_emb_proj 681 | mid_block.resnets.1.norm2 682 | mid_block.resnets.1.dropout 683 | mid_block.resnets.1.conv2 684 | conv_norm_out 685 | conv_out 686 | -------------------------------------------------------------------------------- /hidiffusion/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def isinstance_str(x: object, cls_name: str): 5 | """ 6 | Checks whether x has any class *named* cls_name in its ancestry. 7 | Doesn't require access to the class's implementation. 8 | 9 | Useful for patching! 10 | """ 11 | 12 | for _cls in x.__class__.__mro__: 13 | if _cls.__name__ == cls_name: 14 | return True 15 | 16 | return False 17 | 18 | 19 | def init_generator(device: torch.device, fallback: torch.Generator=None): 20 | """ 21 | Forks the current default random generator given device. 22 | """ 23 | if device.type == "cpu": 24 | return torch.Generator(device="cpu").set_state(torch.get_rng_state()) 25 | elif device.type == "cuda": 26 | return torch.Generator(device=device).set_state(torch.cuda.get_rng_state()) 27 | else: 28 | if fallback is None: 29 | return init_generator(torch.device("cpu")) 30 | else: 31 | return fallback 32 | -------------------------------------------------------------------------------- /ip_adapter/__init__.py: -------------------------------------------------------------------------------- 1 | from .ip_adapter import IPAdapter, IPAdapterPlus, IPAdapterPlusXL, IPAdapterXL, IPAdapterFull 2 | 3 | __all__ = [ 4 | "IPAdapter", 5 | "IPAdapterPlus", 6 | "IPAdapterPlusXL", 7 | "IPAdapterXL", 8 | "IPAdapterFull", 9 | ] 10 | -------------------------------------------------------------------------------- /ip_adapter/attention_processor.py: -------------------------------------------------------------------------------- 1 | # modified from https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | 6 | 7 | class AttnProcessor(nn.Module): 8 | r""" 9 | Default processor for performing attention-related computations. 
10 | """ 11 | 12 | def __init__( 13 | self, 14 | hidden_size=None, 15 | cross_attention_dim=None, 16 | ): 17 | super().__init__() 18 | 19 | def __call__( 20 | self, 21 | attn, 22 | hidden_states, 23 | encoder_hidden_states=None, 24 | attention_mask=None, 25 | temb=None, 26 | ): 27 | residual = hidden_states 28 | 29 | if attn.spatial_norm is not None: 30 | hidden_states = attn.spatial_norm(hidden_states, temb) 31 | 32 | input_ndim = hidden_states.ndim 33 | 34 | if input_ndim == 4: 35 | batch_size, channel, height, width = hidden_states.shape 36 | hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2) 37 | 38 | batch_size, sequence_length, _ = ( 39 | hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape 40 | ) 41 | attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size) 42 | 43 | if attn.group_norm is not None: 44 | hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2) 45 | 46 | query = attn.to_q(hidden_states) 47 | 48 | if encoder_hidden_states is None: 49 | encoder_hidden_states = hidden_states 50 | elif attn.norm_cross: 51 | encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states) 52 | 53 | key = attn.to_k(encoder_hidden_states) 54 | value = attn.to_v(encoder_hidden_states) 55 | 56 | query = attn.head_to_batch_dim(query) 57 | key = attn.head_to_batch_dim(key) 58 | value = attn.head_to_batch_dim(value) 59 | 60 | attention_probs = attn.get_attention_scores(query, key, attention_mask) 61 | hidden_states = torch.bmm(attention_probs, value) 62 | hidden_states = attn.batch_to_head_dim(hidden_states) 63 | 64 | # linear proj 65 | hidden_states = attn.to_out[0](hidden_states) 66 | # dropout 67 | hidden_states = attn.to_out[1](hidden_states) 68 | 69 | if input_ndim == 4: 70 | hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width) 71 | 72 | if attn.residual_connection: 73 | hidden_states = hidden_states + residual 74 | 75 | hidden_states = hidden_states / attn.rescale_output_factor 76 | 77 | return hidden_states 78 | 79 | 80 | class IPAttnProcessor(nn.Module): 81 | r""" 82 | Attention processor for IP-Adapater. 83 | Args: 84 | hidden_size (`int`): 85 | The hidden size of the attention layer. 86 | cross_attention_dim (`int`): 87 | The number of channels in the `encoder_hidden_states`. 88 | scale (`float`, defaults to 1.0): 89 | the weight scale of image prompt. 90 | num_tokens (`int`, defaults to 4 when do ip_adapter_plus it should be 16): 91 | The context length of the image features. 
92 | """ 93 | 94 | def __init__(self, hidden_size, cross_attention_dim=None, scale=1.0, num_tokens=4, skip=False): 95 | super().__init__() 96 | 97 | self.hidden_size = hidden_size 98 | self.cross_attention_dim = cross_attention_dim 99 | self.scale = scale 100 | self.num_tokens = num_tokens 101 | self.skip = skip 102 | 103 | self.to_k_ip = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False) 104 | self.to_v_ip = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False) 105 | 106 | def __call__( 107 | self, 108 | attn, 109 | hidden_states, 110 | encoder_hidden_states=None, 111 | attention_mask=None, 112 | temb=None, 113 | ): 114 | residual = hidden_states 115 | 116 | if attn.spatial_norm is not None: 117 | hidden_states = attn.spatial_norm(hidden_states, temb) 118 | 119 | input_ndim = hidden_states.ndim 120 | 121 | if input_ndim == 4: 122 | batch_size, channel, height, width = hidden_states.shape 123 | hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2) 124 | 125 | batch_size, sequence_length, _ = ( 126 | hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape 127 | ) 128 | attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size) 129 | 130 | if attn.group_norm is not None: 131 | hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2) 132 | 133 | query = attn.to_q(hidden_states) 134 | 135 | if encoder_hidden_states is None: 136 | encoder_hidden_states = hidden_states 137 | else: 138 | # get encoder_hidden_states, ip_hidden_states 139 | end_pos = encoder_hidden_states.shape[1] - self.num_tokens 140 | encoder_hidden_states, ip_hidden_states = ( 141 | encoder_hidden_states[:, :end_pos, :], 142 | encoder_hidden_states[:, end_pos:, :], 143 | ) 144 | if attn.norm_cross: 145 | encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states) 146 | 147 | key = attn.to_k(encoder_hidden_states) 148 | value = attn.to_v(encoder_hidden_states) 149 | 150 | query = attn.head_to_batch_dim(query) 151 | key = attn.head_to_batch_dim(key) 152 | value = attn.head_to_batch_dim(value) 153 | 154 | attention_probs = attn.get_attention_scores(query, key, attention_mask) 155 | hidden_states = torch.bmm(attention_probs, value) 156 | hidden_states = attn.batch_to_head_dim(hidden_states) 157 | 158 | if not self.skip: 159 | # for ip-adapter 160 | ip_key = self.to_k_ip(ip_hidden_states) 161 | ip_value = self.to_v_ip(ip_hidden_states) 162 | 163 | ip_key = attn.head_to_batch_dim(ip_key) 164 | ip_value = attn.head_to_batch_dim(ip_value) 165 | 166 | ip_attention_probs = attn.get_attention_scores(query, ip_key, None) 167 | self.attn_map = ip_attention_probs 168 | ip_hidden_states = torch.bmm(ip_attention_probs, ip_value) 169 | ip_hidden_states = attn.batch_to_head_dim(ip_hidden_states) 170 | 171 | hidden_states = hidden_states + self.scale * ip_hidden_states 172 | 173 | # linear proj 174 | hidden_states = attn.to_out[0](hidden_states) 175 | # dropout 176 | hidden_states = attn.to_out[1](hidden_states) 177 | 178 | if input_ndim == 4: 179 | hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width) 180 | 181 | if attn.residual_connection: 182 | hidden_states = hidden_states + residual 183 | 184 | hidden_states = hidden_states / attn.rescale_output_factor 185 | 186 | return hidden_states 187 | 188 | 189 | class AttnProcessor2_0(torch.nn.Module): 190 | r""" 191 | Processor for implementing scaled dot-product attention (enabled by default 
if you're using PyTorch 2.0). 192 | """ 193 | 194 | def __init__( 195 | self, 196 | hidden_size=None, 197 | cross_attention_dim=None, 198 | ): 199 | super().__init__() 200 | if not hasattr(F, "scaled_dot_product_attention"): 201 | raise ImportError("AttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.") 202 | 203 | def __call__( 204 | self, 205 | attn, 206 | hidden_states, 207 | encoder_hidden_states=None, 208 | attention_mask=None, 209 | temb=None, 210 | ): 211 | residual = hidden_states 212 | 213 | if attn.spatial_norm is not None: 214 | hidden_states = attn.spatial_norm(hidden_states, temb) 215 | 216 | input_ndim = hidden_states.ndim 217 | 218 | if input_ndim == 4: 219 | batch_size, channel, height, width = hidden_states.shape 220 | hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2) 221 | 222 | batch_size, sequence_length, _ = ( 223 | hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape 224 | ) 225 | 226 | if attention_mask is not None: 227 | attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size) 228 | # scaled_dot_product_attention expects attention_mask shape to be 229 | # (batch, heads, source_length, target_length) 230 | attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1]) 231 | 232 | if attn.group_norm is not None: 233 | hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2) 234 | 235 | query = attn.to_q(hidden_states) 236 | 237 | if encoder_hidden_states is None: 238 | encoder_hidden_states = hidden_states 239 | elif attn.norm_cross: 240 | encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states) 241 | 242 | key = attn.to_k(encoder_hidden_states) 243 | value = attn.to_v(encoder_hidden_states) 244 | 245 | inner_dim = key.shape[-1] 246 | head_dim = inner_dim // attn.heads 247 | 248 | query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) 249 | 250 | key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) 251 | value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) 252 | 253 | # the output of sdp = (batch, num_heads, seq_len, head_dim) 254 | # TODO: add support for attn.scale when we move to Torch 2.1 255 | hidden_states = F.scaled_dot_product_attention( 256 | query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False 257 | ) 258 | 259 | hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim) 260 | hidden_states = hidden_states.to(query.dtype) 261 | 262 | # linear proj 263 | hidden_states = attn.to_out[0](hidden_states) 264 | # dropout 265 | hidden_states = attn.to_out[1](hidden_states) 266 | 267 | if input_ndim == 4: 268 | hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width) 269 | 270 | if attn.residual_connection: 271 | hidden_states = hidden_states + residual 272 | 273 | hidden_states = hidden_states / attn.rescale_output_factor 274 | 275 | return hidden_states 276 | 277 | 278 | class IPAttnProcessor2_0(torch.nn.Module): 279 | r""" 280 | Attention processor for IP-Adapater for PyTorch 2.0. 281 | Args: 282 | hidden_size (`int`): 283 | The hidden size of the attention layer. 284 | cross_attention_dim (`int`): 285 | The number of channels in the `encoder_hidden_states`. 286 | scale (`float`, defaults to 1.0): 287 | the weight scale of image prompt. 
288 | num_tokens (`int`, defaults to 4 when do ip_adapter_plus it should be 16): 289 | The context length of the image features. 290 | """ 291 | 292 | def __init__(self, hidden_size, cross_attention_dim=None, scale=1.0, num_tokens=4, skip=False): 293 | super().__init__() 294 | 295 | if not hasattr(F, "scaled_dot_product_attention"): 296 | raise ImportError("AttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.") 297 | 298 | self.hidden_size = hidden_size 299 | self.cross_attention_dim = cross_attention_dim 300 | self.scale = scale 301 | self.num_tokens = num_tokens 302 | self.skip = skip 303 | 304 | self.to_k_ip = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False) 305 | self.to_v_ip = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False) 306 | 307 | def __call__( 308 | self, 309 | attn, 310 | hidden_states, 311 | encoder_hidden_states=None, 312 | attention_mask=None, 313 | temb=None, 314 | ): 315 | residual = hidden_states 316 | 317 | if attn.spatial_norm is not None: 318 | hidden_states = attn.spatial_norm(hidden_states, temb) 319 | 320 | input_ndim = hidden_states.ndim 321 | 322 | if input_ndim == 4: 323 | batch_size, channel, height, width = hidden_states.shape 324 | hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2) 325 | 326 | batch_size, sequence_length, _ = ( 327 | hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape 328 | ) 329 | 330 | if attention_mask is not None: 331 | attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size) 332 | # scaled_dot_product_attention expects attention_mask shape to be 333 | # (batch, heads, source_length, target_length) 334 | attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1]) 335 | 336 | if attn.group_norm is not None: 337 | hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2) 338 | 339 | query = attn.to_q(hidden_states) 340 | 341 | if encoder_hidden_states is None: 342 | encoder_hidden_states = hidden_states 343 | else: 344 | # get encoder_hidden_states, ip_hidden_states 345 | end_pos = encoder_hidden_states.shape[1] - self.num_tokens 346 | encoder_hidden_states, ip_hidden_states = ( 347 | encoder_hidden_states[:, :end_pos, :], 348 | encoder_hidden_states[:, end_pos:, :], 349 | ) 350 | if attn.norm_cross: 351 | encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states) 352 | 353 | key = attn.to_k(encoder_hidden_states) 354 | value = attn.to_v(encoder_hidden_states) 355 | 356 | inner_dim = key.shape[-1] 357 | head_dim = inner_dim // attn.heads 358 | 359 | query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) 360 | 361 | key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) 362 | value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) 363 | 364 | # the output of sdp = (batch, num_heads, seq_len, head_dim) 365 | # TODO: add support for attn.scale when we move to Torch 2.1 366 | hidden_states = F.scaled_dot_product_attention( 367 | query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False 368 | ) 369 | 370 | hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim) 371 | hidden_states = hidden_states.to(query.dtype) 372 | 373 | if not self.skip: 374 | # for ip-adapter 375 | ip_key = self.to_k_ip(ip_hidden_states) 376 | ip_value = self.to_v_ip(ip_hidden_states) 377 | 378 | ip_key = ip_key.view(batch_size, 
-1, attn.heads, head_dim).transpose(1, 2) 379 | ip_value = ip_value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) 380 | 381 | # the output of sdp = (batch, num_heads, seq_len, head_dim) 382 | # TODO: add support for attn.scale when we move to Torch 2.1 383 | ip_hidden_states = F.scaled_dot_product_attention( 384 | query, ip_key, ip_value, attn_mask=None, dropout_p=0.0, is_causal=False 385 | ) 386 | with torch.no_grad(): 387 | self.attn_map = query @ ip_key.transpose(-2, -1).softmax(dim=-1) 388 | #print(self.attn_map.shape) 389 | 390 | ip_hidden_states = ip_hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim) 391 | ip_hidden_states = ip_hidden_states.to(query.dtype) 392 | 393 | hidden_states = hidden_states + self.scale * ip_hidden_states 394 | 395 | # linear proj 396 | hidden_states = attn.to_out[0](hidden_states) 397 | # dropout 398 | hidden_states = attn.to_out[1](hidden_states) 399 | 400 | if input_ndim == 4: 401 | hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width) 402 | 403 | if attn.residual_connection: 404 | hidden_states = hidden_states + residual 405 | 406 | hidden_states = hidden_states / attn.rescale_output_factor 407 | 408 | return hidden_states 409 | 410 | 411 | ## for controlnet 412 | class CNAttnProcessor: 413 | r""" 414 | Default processor for performing attention-related computations. 415 | """ 416 | 417 | def __init__(self, num_tokens=4): 418 | self.num_tokens = num_tokens 419 | 420 | def __call__(self, attn, hidden_states, encoder_hidden_states=None, attention_mask=None, temb=None): 421 | residual = hidden_states 422 | 423 | if attn.spatial_norm is not None: 424 | hidden_states = attn.spatial_norm(hidden_states, temb) 425 | 426 | input_ndim = hidden_states.ndim 427 | 428 | if input_ndim == 4: 429 | batch_size, channel, height, width = hidden_states.shape 430 | hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2) 431 | 432 | batch_size, sequence_length, _ = ( 433 | hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape 434 | ) 435 | attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size) 436 | 437 | if attn.group_norm is not None: 438 | hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2) 439 | 440 | query = attn.to_q(hidden_states) 441 | 442 | if encoder_hidden_states is None: 443 | encoder_hidden_states = hidden_states 444 | else: 445 | end_pos = encoder_hidden_states.shape[1] - self.num_tokens 446 | encoder_hidden_states = encoder_hidden_states[:, :end_pos] # only use text 447 | if attn.norm_cross: 448 | encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states) 449 | 450 | key = attn.to_k(encoder_hidden_states) 451 | value = attn.to_v(encoder_hidden_states) 452 | 453 | query = attn.head_to_batch_dim(query) 454 | key = attn.head_to_batch_dim(key) 455 | value = attn.head_to_batch_dim(value) 456 | 457 | attention_probs = attn.get_attention_scores(query, key, attention_mask) 458 | hidden_states = torch.bmm(attention_probs, value) 459 | hidden_states = attn.batch_to_head_dim(hidden_states) 460 | 461 | # linear proj 462 | hidden_states = attn.to_out[0](hidden_states) 463 | # dropout 464 | hidden_states = attn.to_out[1](hidden_states) 465 | 466 | if input_ndim == 4: 467 | hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width) 468 | 469 | if attn.residual_connection: 470 | hidden_states = hidden_states + residual 471 | 
472 | hidden_states = hidden_states / attn.rescale_output_factor 473 | 474 | return hidden_states 475 | 476 | 477 | class CNAttnProcessor2_0: 478 | r""" 479 | Processor for implementing scaled dot-product attention (enabled by default if you're using PyTorch 2.0). 480 | """ 481 | 482 | def __init__(self, num_tokens=4): 483 | if not hasattr(F, "scaled_dot_product_attention"): 484 | raise ImportError("AttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.") 485 | self.num_tokens = num_tokens 486 | 487 | def __call__( 488 | self, 489 | attn, 490 | hidden_states, 491 | encoder_hidden_states=None, 492 | attention_mask=None, 493 | temb=None, 494 | ): 495 | residual = hidden_states 496 | 497 | if attn.spatial_norm is not None: 498 | hidden_states = attn.spatial_norm(hidden_states, temb) 499 | 500 | input_ndim = hidden_states.ndim 501 | 502 | if input_ndim == 4: 503 | batch_size, channel, height, width = hidden_states.shape 504 | hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2) 505 | 506 | batch_size, sequence_length, _ = ( 507 | hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape 508 | ) 509 | 510 | if attention_mask is not None: 511 | attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size) 512 | # scaled_dot_product_attention expects attention_mask shape to be 513 | # (batch, heads, source_length, target_length) 514 | attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1]) 515 | 516 | if attn.group_norm is not None: 517 | hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2) 518 | 519 | query = attn.to_q(hidden_states) 520 | 521 | if encoder_hidden_states is None: 522 | encoder_hidden_states = hidden_states 523 | else: 524 | end_pos = encoder_hidden_states.shape[1] - self.num_tokens 525 | encoder_hidden_states = encoder_hidden_states[:, :end_pos] # only use text 526 | if attn.norm_cross: 527 | encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states) 528 | 529 | key = attn.to_k(encoder_hidden_states) 530 | value = attn.to_v(encoder_hidden_states) 531 | 532 | inner_dim = key.shape[-1] 533 | head_dim = inner_dim // attn.heads 534 | 535 | query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) 536 | 537 | key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) 538 | value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) 539 | 540 | # the output of sdp = (batch, num_heads, seq_len, head_dim) 541 | # TODO: add support for attn.scale when we move to Torch 2.1 542 | hidden_states = F.scaled_dot_product_attention( 543 | query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False 544 | ) 545 | 546 | hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim) 547 | hidden_states = hidden_states.to(query.dtype) 548 | 549 | # linear proj 550 | hidden_states = attn.to_out[0](hidden_states) 551 | # dropout 552 | hidden_states = attn.to_out[1](hidden_states) 553 | 554 | if input_ndim == 4: 555 | hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width) 556 | 557 | if attn.residual_connection: 558 | hidden_states = hidden_states + residual 559 | 560 | hidden_states = hidden_states / attn.rescale_output_factor 561 | 562 | return hidden_states 563 | -------------------------------------------------------------------------------- /ip_adapter/ip_adapter.py: 
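Before the file itself, a minimal usage sketch for the IPAdapter class defined below. It only relies on the constructor and generate() signatures that appear in this file; the pipeline, CLIP-vision object, checkpoint path, projection_dim value, and image tensor are illustrative assumptions, not part of the repo.

# Hypothetical setup: `pipe` is a diffusers StableDiffusionPipeline you have already
# built, `clip_vision` is any object whose .encode_image(tensor) result can be indexed
# with ["image_embeds"] (e.g. a model loaded via comfy.clip_vision.load), and
# `image_tensor` is a (1, H, W, 3) torch tensor. Paths and numbers below are examples.
from ip_adapter import IPAdapter  # via the package __init__ shown above; exact import path depends on installation

ip_model = IPAdapter(
    sd_pipe=pipe,                                    # diffusers StableDiffusionPipeline
    image_encoder=clip_vision,                       # must expose .encode_image(...)
    ip_ckpt="models/ipadapter/ip-adapter_sd15.bin",  # hypothetical checkpoint path
    device="cuda",
    image_encoder_config={"projection_dim": 1024},   # read this from the image encoder's config; 1024 is illustrative
    num_tokens=4,
)

images = ip_model.generate(
    pil_image=image_tensor,       # despite the name, this code path expects a 4-D torch tensor
    prompt="a photo of a cat",
    num_samples=1,
    seed=42,
    guidance_scale=7.5,
    num_inference_steps=30,
)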
-------------------------------------------------------------------------------- 1 | import os 2 | from typing import List 3 | 4 | import torch 5 | from diffusers import StableDiffusionPipeline 6 | from diffusers.pipelines.controlnet import MultiControlNetModel 7 | from PIL import Image 8 | from safetensors import safe_open 9 | from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection 10 | 11 | from .utils import is_torch2_available, get_generator 12 | from comfy.model_management import cleanup_models 13 | if is_torch2_available(): 14 | from .attention_processor import ( 15 | AttnProcessor2_0 as AttnProcessor, 16 | ) 17 | from .attention_processor import ( 18 | CNAttnProcessor2_0 as CNAttnProcessor, 19 | ) 20 | from .attention_processor import ( 21 | IPAttnProcessor2_0 as IPAttnProcessor, 22 | ) 23 | else: 24 | from .attention_processor import AttnProcessor, CNAttnProcessor, IPAttnProcessor 25 | from .resampler import Resampler 26 | 27 | 28 | class ImageProjModel(torch.nn.Module): 29 | """Projection Model""" 30 | 31 | def __init__(self, cross_attention_dim=1024, clip_embeddings_dim=1024, clip_extra_context_tokens=4): 32 | super().__init__() 33 | 34 | self.generator = None 35 | self.cross_attention_dim = cross_attention_dim 36 | self.clip_extra_context_tokens = clip_extra_context_tokens 37 | self.proj = torch.nn.Linear(clip_embeddings_dim, self.clip_extra_context_tokens * cross_attention_dim) 38 | self.norm = torch.nn.LayerNorm(cross_attention_dim) 39 | 40 | def forward(self, image_embeds): 41 | embeds = image_embeds 42 | clip_extra_context_tokens = self.proj(embeds).reshape( 43 | -1, self.clip_extra_context_tokens, self.cross_attention_dim 44 | ) 45 | clip_extra_context_tokens = self.norm(clip_extra_context_tokens) 46 | return clip_extra_context_tokens 47 | 48 | 49 | class MLPProjModel(torch.nn.Module): 50 | """SD model with image prompt""" 51 | def __init__(self, cross_attention_dim=1024, clip_embeddings_dim=1024): 52 | super().__init__() 53 | 54 | self.proj = torch.nn.Sequential( 55 | torch.nn.Linear(clip_embeddings_dim, clip_embeddings_dim), 56 | torch.nn.GELU(), 57 | torch.nn.Linear(clip_embeddings_dim, cross_attention_dim), 58 | torch.nn.LayerNorm(cross_attention_dim) 59 | ) 60 | 61 | def forward(self, image_embeds): 62 | clip_extra_context_tokens = self.proj(image_embeds) 63 | return clip_extra_context_tokens 64 | 65 | 66 | class IPAdapter: 67 | def __init__(self, sd_pipe, image_encoder, ip_ckpt, device,image_encoder_config, num_tokens=4, target_blocks=["block"]): 68 | self.device = device 69 | # self.image_encoder_path = image_encoder_path 70 | self.ip_ckpt = ip_ckpt 71 | self.num_tokens = num_tokens 72 | self.target_blocks = target_blocks 73 | self.image_encoder_config=image_encoder_config 74 | self.pipe = sd_pipe.to(self.device) 75 | self.set_ip_adapter() 76 | 77 | # load image encoder 78 | # self.image_encoder = CLIPVisionModelWithProjection.from_pretrained(self.image_encoder_path).to( 79 | # self.device, dtype=torch.float16 80 | # ) 81 | self.image_encoder = image_encoder.encode_image 82 | 83 | self.clip_image_processor = CLIPImageProcessor() 84 | # image proj model 85 | self.image_proj_model = self.init_proj() 86 | 87 | self.load_ip_adapter() 88 | 89 | def init_proj(self): 90 | image_proj_model = ImageProjModel( 91 | cross_attention_dim=self.pipe.unet.config.cross_attention_dim, 92 | clip_embeddings_dim=self.image_encoder_config["projection_dim"], 93 | clip_extra_context_tokens=self.num_tokens, 94 | ).to(self.device, dtype=torch.float16) 95 | return 
image_proj_model 96 | 97 | def set_ip_adapter(self): 98 | unet = self.pipe.unet 99 | attn_procs = {} 100 | for name in unet.attn_processors.keys(): 101 | cross_attention_dim = None if name.endswith("attn1.processor") else unet.config.cross_attention_dim 102 | if name.startswith("mid_block"): 103 | hidden_size = unet.config.block_out_channels[-1] 104 | elif name.startswith("up_blocks"): 105 | block_id = int(name[len("up_blocks.")]) 106 | hidden_size = list(reversed(unet.config.block_out_channels))[block_id] 107 | elif name.startswith("down_blocks"): 108 | block_id = int(name[len("down_blocks.")]) 109 | hidden_size = unet.config.block_out_channels[block_id] 110 | if cross_attention_dim is None: 111 | attn_procs[name] = AttnProcessor() 112 | else: 113 | selected = False 114 | for block_name in self.target_blocks: 115 | if block_name in name: 116 | selected = True 117 | break 118 | if selected: 119 | attn_procs[name] = IPAttnProcessor( 120 | hidden_size=hidden_size, 121 | cross_attention_dim=cross_attention_dim, 122 | scale=1.0, 123 | num_tokens=self.num_tokens, 124 | ).to(self.device, dtype=torch.float16) 125 | else: 126 | attn_procs[name] = IPAttnProcessor( 127 | hidden_size=hidden_size, 128 | cross_attention_dim=cross_attention_dim, 129 | scale=1.0, 130 | num_tokens=self.num_tokens, 131 | skip=True 132 | ).to(self.device, dtype=torch.float16) 133 | unet.set_attn_processor(attn_procs) 134 | if hasattr(self.pipe, "controlnet"): 135 | if isinstance(self.pipe.controlnet, MultiControlNetModel): 136 | for controlnet in self.pipe.controlnet.nets: 137 | controlnet.set_attn_processor(CNAttnProcessor(num_tokens=self.num_tokens)) 138 | else: 139 | self.pipe.controlnet.set_attn_processor(CNAttnProcessor(num_tokens=self.num_tokens)) 140 | 141 | def load_ip_adapter(self): 142 | if os.path.splitext(self.ip_ckpt)[-1] == ".safetensors": 143 | state_dict = {"image_proj": {}, "ip_adapter": {}} 144 | with safe_open(self.ip_ckpt, framework="pt", device="cpu") as f: 145 | for key in f.keys(): 146 | if key.startswith("image_proj."): 147 | state_dict["image_proj"][key.replace("image_proj.", "")] = f.get_tensor(key) 148 | elif key.startswith("ip_adapter."): 149 | state_dict["ip_adapter"][key.replace("ip_adapter.", "")] = f.get_tensor(key) 150 | else: 151 | state_dict = torch.load(self.ip_ckpt, map_location="cpu") 152 | self.image_proj_model.load_state_dict(state_dict["image_proj"]) 153 | ip_layers = torch.nn.ModuleList(self.pipe.unet.attn_processors.values()) 154 | ip_layers.load_state_dict(state_dict["ip_adapter"], strict=False) 155 | 156 | @torch.inference_mode() 157 | def get_image_embeds(self, pil_image=None, clip_image_embeds=None, content_prompt_embeds=None): 158 | 159 | if isinstance(pil_image, torch.Tensor): 160 | 161 | clip_image_embeds = self.image_encoder(pil_image)["image_embeds"] 162 | clip_image_embeds = clip_image_embeds.clone().detach().to(self.device, dtype=torch.float16) 163 | del self.image_encoder 164 | cleanup_models(keep_clone_weights_loaded=False) 165 | else: 166 | clip_image_embeds = clip_image_embeds.to(self.device, dtype=torch.float16) 167 | clip_image_embeds = clip_image_embeds.clone().detach().to(self.device, dtype=torch.float16) 168 | if content_prompt_embeds is not None: 169 | clip_image_embeds = clip_image_embeds - content_prompt_embeds 170 | clip_image_embeds = clip_image_embeds.clone().detach().to(self.device, dtype=torch.float16) 171 | image_prompt_embeds = self.image_proj_model(clip_image_embeds) 172 | uncond_image_prompt_embeds = 
self.image_proj_model(torch.zeros_like(clip_image_embeds)) 173 | return image_prompt_embeds, uncond_image_prompt_embeds 174 | 175 | def set_scale(self, scale): 176 | for attn_processor in self.pipe.unet.attn_processors.values(): 177 | if isinstance(attn_processor, IPAttnProcessor): 178 | attn_processor.scale = scale 179 | 180 | def generate( 181 | self, 182 | pil_image=None, 183 | clip_image_embeds=None, 184 | prompt=None, 185 | negative_prompt=None, 186 | scale=1.0, 187 | num_samples=4, 188 | seed=None, 189 | guidance_scale=7.5, 190 | num_inference_steps=30, 191 | neg_content_emb=None, 192 | **kwargs, 193 | ): 194 | self.set_scale(scale) 195 | if isinstance(pil_image, torch.Tensor): # unpack the batch size only when an image tensor is provided 196 | d1, _, _, _ = pil_image.size() 197 | num_prompts = d1 198 | else: 199 | num_prompts = clip_image_embeds.size(0) 200 | 201 | if prompt is None: 202 | prompt = "best quality, high quality" 203 | if negative_prompt is None: 204 | negative_prompt = "monochrome, lowres, bad anatomy, worst quality, low quality" 205 | 206 | if not isinstance(prompt, List): 207 | prompt = [prompt] * num_prompts 208 | if not isinstance(negative_prompt, List): 209 | negative_prompt = [negative_prompt] * num_prompts 210 | 211 | image_prompt_embeds, uncond_image_prompt_embeds = self.get_image_embeds( 212 | pil_image=pil_image, clip_image_embeds=clip_image_embeds, content_prompt_embeds=neg_content_emb 213 | ) 214 | bs_embed, seq_len, _ = image_prompt_embeds.shape 215 | image_prompt_embeds = image_prompt_embeds.repeat(1, num_samples, 1) 216 | image_prompt_embeds = image_prompt_embeds.view(bs_embed * num_samples, seq_len, -1) 217 | uncond_image_prompt_embeds = uncond_image_prompt_embeds.repeat(1, num_samples, 1) 218 | uncond_image_prompt_embeds = uncond_image_prompt_embeds.view(bs_embed * num_samples, seq_len, -1) 219 | 220 | with torch.inference_mode(): 221 | prompt_embeds_, negative_prompt_embeds_ = self.pipe.encode_prompt( 222 | prompt, 223 | device=self.device, 224 | num_images_per_prompt=num_samples, 225 | do_classifier_free_guidance=True, 226 | negative_prompt=negative_prompt, 227 | ) 228 | prompt_embeds = torch.cat([prompt_embeds_, image_prompt_embeds], dim=1) 229 | negative_prompt_embeds = torch.cat([negative_prompt_embeds_, uncond_image_prompt_embeds], dim=1) 230 | 231 | generator = get_generator(seed, self.device) 232 | 233 | images = self.pipe( 234 | prompt_embeds=prompt_embeds, 235 | negative_prompt_embeds=negative_prompt_embeds, 236 | guidance_scale=guidance_scale, 237 | num_inference_steps=num_inference_steps, 238 | generator=generator, 239 | **kwargs, 240 | ).images 241 | 242 | return images 243 | 244 | 245 | class IPAdapterXL(IPAdapter): 246 | """SDXL""" 247 | 248 | def generate( 249 | self, 250 | pil_image, 251 | prompt=None, 252 | negative_prompt=None, 253 | scale=1.0, 254 | num_samples=4, 255 | seed=None, 256 | num_inference_steps=30, 257 | neg_content_emb=None, 258 | neg_content_prompt=None, 259 | neg_content_scale=1.0, 260 | **kwargs, 261 | ): 262 | self.set_scale(scale) 263 | d1,_,_,_=pil_image.size() 264 | num_prompts = d1 265 | 266 | if prompt is None: 267 | prompt = "best quality, high quality" 268 | if negative_prompt is None: 269 | negative_prompt = "monochrome, lowres, bad anatomy, worst quality, low quality" 270 | 271 | if not isinstance(prompt, List): 272 | prompt = [prompt] * num_prompts 273 | if not isinstance(negative_prompt, List): 274 | negative_prompt = [negative_prompt] * num_prompts 275 | 276 | if neg_content_emb is None: 277 | if neg_content_prompt is not None: 278 | with torch.inference_mode(): 
279 | ( 280 | prompt_embeds_, # torch.Size([1, 77, 2048]) 281 | negative_prompt_embeds_, 282 | pooled_prompt_embeds_, # torch.Size([1, 1280]) 283 | negative_pooled_prompt_embeds_, 284 | ) = self.pipe.encode_prompt( 285 | neg_content_prompt, 286 | num_images_per_prompt=num_samples, 287 | do_classifier_free_guidance=True, 288 | negative_prompt=negative_prompt, 289 | ) 290 | pooled_prompt_embeds_ *= neg_content_scale 291 | else: 292 | pooled_prompt_embeds_ = neg_content_emb 293 | else: 294 | pooled_prompt_embeds_ = None 295 | 296 | image_prompt_embeds, uncond_image_prompt_embeds = self.get_image_embeds(pil_image, content_prompt_embeds=pooled_prompt_embeds_) 297 | bs_embed, seq_len, _ = image_prompt_embeds.shape 298 | image_prompt_embeds = image_prompt_embeds.repeat(1, num_samples, 1) 299 | image_prompt_embeds = image_prompt_embeds.view(bs_embed * num_samples, seq_len, -1) 300 | uncond_image_prompt_embeds = uncond_image_prompt_embeds.repeat(1, num_samples, 1) 301 | uncond_image_prompt_embeds = uncond_image_prompt_embeds.view(bs_embed * num_samples, seq_len, -1) 302 | 303 | with torch.inference_mode(): 304 | ( 305 | prompt_embeds, 306 | negative_prompt_embeds, 307 | pooled_prompt_embeds, 308 | negative_pooled_prompt_embeds, 309 | ) = self.pipe.encode_prompt( 310 | prompt, 311 | num_images_per_prompt=num_samples, 312 | do_classifier_free_guidance=True, 313 | negative_prompt=negative_prompt, 314 | ) 315 | prompt_embeds = torch.cat([prompt_embeds, image_prompt_embeds], dim=1) 316 | negative_prompt_embeds = torch.cat([negative_prompt_embeds, uncond_image_prompt_embeds], dim=1) 317 | 318 | self.generator = get_generator(seed, self.device) 319 | 320 | cleanup_models(keep_clone_weights_loaded=False) 321 | images = self.pipe( 322 | prompt_embeds=prompt_embeds, 323 | negative_prompt_embeds=negative_prompt_embeds, 324 | pooled_prompt_embeds=pooled_prompt_embeds, 325 | negative_pooled_prompt_embeds=negative_pooled_prompt_embeds, 326 | num_inference_steps=num_inference_steps, 327 | generator=self.generator, 328 | **kwargs, 329 | ).images 330 | 331 | return images 332 | 333 | 334 | class IPAdapterPlus(IPAdapter): 335 | """IP-Adapter with fine-grained features""" 336 | 337 | def init_proj(self): 338 | image_proj_model = Resampler( 339 | dim=self.pipe.unet.config.cross_attention_dim, 340 | depth=4, 341 | dim_head=64, 342 | heads=12, 343 | num_queries=self.num_tokens, 344 | embedding_dim=self.image_encoder.config.hidden_size, 345 | output_dim=self.pipe.unet.config.cross_attention_dim, 346 | ff_mult=4, 347 | ).to(self.device, dtype=torch.float16) 348 | return image_proj_model 349 | 350 | @torch.inference_mode() 351 | def get_image_embeds(self, pil_image=None, clip_image_embeds=None): 352 | if isinstance(pil_image, Image.Image): 353 | pil_image = [pil_image] 354 | clip_image = self.clip_image_processor(images=pil_image, return_tensors="pt").pixel_values 355 | clip_image = clip_image.to(self.device, dtype=torch.float16) 356 | clip_image_embeds = self.image_encoder(clip_image, output_hidden_states=True).hidden_states[-2] 357 | image_prompt_embeds = self.image_proj_model(clip_image_embeds) 358 | uncond_clip_image_embeds = self.image_encoder( 359 | torch.zeros_like(clip_image), output_hidden_states=True 360 | ).hidden_states[-2] 361 | uncond_image_prompt_embeds = self.image_proj_model(uncond_clip_image_embeds) 362 | return image_prompt_embeds, uncond_image_prompt_embeds 363 | 364 | 365 | class IPAdapterFull(IPAdapterPlus): 366 | """IP-Adapter with full features""" 367 | 368 | def init_proj(self): 369 | 
image_proj_model = MLPProjModel( 370 | cross_attention_dim=self.pipe.unet.config.cross_attention_dim, 371 | clip_embeddings_dim=self.image_encoder.config.hidden_size, 372 | ).to(self.device, dtype=torch.float16) 373 | return image_proj_model 374 | 375 | 376 | class IPAdapterPlusXL(IPAdapter): 377 | """SDXL""" 378 | 379 | def init_proj(self): 380 | image_proj_model = Resampler( 381 | dim=1280, 382 | depth=4, 383 | dim_head=64, 384 | heads=20, 385 | num_queries=self.num_tokens, 386 | embedding_dim=self.image_encoder.config.hidden_size, 387 | output_dim=self.pipe.unet.config.cross_attention_dim, 388 | ff_mult=4, 389 | ).to(self.device, dtype=torch.float16) 390 | return image_proj_model 391 | 392 | @torch.inference_mode() 393 | def get_image_embeds(self, pil_image): 394 | if isinstance(pil_image, Image.Image): 395 | pil_image = [pil_image] 396 | clip_image = self.clip_image_processor(images=pil_image, return_tensors="pt").pixel_values 397 | clip_image = clip_image.to(self.device, dtype=torch.float16) 398 | clip_image_embeds = self.image_encoder(clip_image, output_hidden_states=True).hidden_states[-2] 399 | image_prompt_embeds = self.image_proj_model(clip_image_embeds) 400 | uncond_clip_image_embeds = self.image_encoder( 401 | torch.zeros_like(clip_image), output_hidden_states=True 402 | ).hidden_states[-2] 403 | uncond_image_prompt_embeds = self.image_proj_model(uncond_clip_image_embeds) 404 | return image_prompt_embeds, uncond_image_prompt_embeds 405 | 406 | def generate( 407 | self, 408 | pil_image, 409 | prompt=None, 410 | negative_prompt=None, 411 | scale=1.0, 412 | num_samples=4, 413 | seed=None, 414 | num_inference_steps=30, 415 | **kwargs, 416 | ): 417 | self.set_scale(scale) 418 | 419 | num_prompts = 1 if isinstance(pil_image, Image.Image) else len(pil_image) 420 | 421 | if prompt is None: 422 | prompt = "best quality, high quality" 423 | if negative_prompt is None: 424 | negative_prompt = "monochrome, lowres, bad anatomy, worst quality, low quality" 425 | 426 | if not isinstance(prompt, List): 427 | prompt = [prompt] * num_prompts 428 | if not isinstance(negative_prompt, List): 429 | negative_prompt = [negative_prompt] * num_prompts 430 | 431 | image_prompt_embeds, uncond_image_prompt_embeds = self.get_image_embeds(pil_image) 432 | bs_embed, seq_len, _ = image_prompt_embeds.shape 433 | image_prompt_embeds = image_prompt_embeds.repeat(1, num_samples, 1) 434 | image_prompt_embeds = image_prompt_embeds.view(bs_embed * num_samples, seq_len, -1) 435 | uncond_image_prompt_embeds = uncond_image_prompt_embeds.repeat(1, num_samples, 1) 436 | uncond_image_prompt_embeds = uncond_image_prompt_embeds.view(bs_embed * num_samples, seq_len, -1) 437 | 438 | with torch.inference_mode(): 439 | ( 440 | prompt_embeds, 441 | negative_prompt_embeds, 442 | pooled_prompt_embeds, 443 | negative_pooled_prompt_embeds, 444 | ) = self.pipe.encode_prompt( 445 | prompt, 446 | num_images_per_prompt=num_samples, 447 | do_classifier_free_guidance=True, 448 | negative_prompt=negative_prompt, 449 | ) 450 | prompt_embeds = torch.cat([prompt_embeds, image_prompt_embeds], dim=1) 451 | negative_prompt_embeds = torch.cat([negative_prompt_embeds, uncond_image_prompt_embeds], dim=1) 452 | 453 | generator = get_generator(seed, self.device) 454 | 455 | images = self.pipe( 456 | prompt_embeds=prompt_embeds, 457 | negative_prompt_embeds=negative_prompt_embeds, 458 | pooled_prompt_embeds=pooled_prompt_embeds, 459 | negative_pooled_prompt_embeds=negative_pooled_prompt_embeds, 460 | num_inference_steps=num_inference_steps, 461 | 
generator=generator, 462 | **kwargs, 463 | ).images 464 | 465 | return images 466 | -------------------------------------------------------------------------------- /ip_adapter/resampler.py: -------------------------------------------------------------------------------- 1 | # modified from https://github.com/mlfoundations/open_flamingo/blob/main/open_flamingo/src/helpers.py 2 | # and https://github.com/lucidrains/imagen-pytorch/blob/main/imagen_pytorch/imagen_pytorch.py 3 | 4 | import math 5 | 6 | import torch 7 | import torch.nn as nn 8 | from einops import rearrange 9 | from einops.layers.torch import Rearrange 10 | 11 | 12 | # FFN 13 | def FeedForward(dim, mult=4): 14 | inner_dim = int(dim * mult) 15 | return nn.Sequential( 16 | nn.LayerNorm(dim), 17 | nn.Linear(dim, inner_dim, bias=False), 18 | nn.GELU(), 19 | nn.Linear(inner_dim, dim, bias=False), 20 | ) 21 | 22 | 23 | def reshape_tensor(x, heads): 24 | bs, length, width = x.shape 25 | # (bs, length, width) --> (bs, length, n_heads, dim_per_head) 26 | x = x.view(bs, length, heads, -1) 27 | # (bs, length, n_heads, dim_per_head) --> (bs, n_heads, length, dim_per_head) 28 | x = x.transpose(1, 2) 29 | # (bs, n_heads, length, dim_per_head) --> (bs*n_heads, length, dim_per_head) 30 | x = x.reshape(bs, heads, length, -1) 31 | return x 32 | 33 | 34 | class PerceiverAttention(nn.Module): 35 | def __init__(self, *, dim, dim_head=64, heads=8): 36 | super().__init__() 37 | self.scale = dim_head**-0.5 38 | self.dim_head = dim_head 39 | self.heads = heads 40 | inner_dim = dim_head * heads 41 | 42 | self.norm1 = nn.LayerNorm(dim) 43 | self.norm2 = nn.LayerNorm(dim) 44 | 45 | self.to_q = nn.Linear(dim, inner_dim, bias=False) 46 | self.to_kv = nn.Linear(dim, inner_dim * 2, bias=False) 47 | self.to_out = nn.Linear(inner_dim, dim, bias=False) 48 | 49 | def forward(self, x, latents): 50 | """ 51 | Args: 52 | x (torch.Tensor): image features 53 | shape (b, n1, D) 54 | latent (torch.Tensor): latent features 55 | shape (b, n2, D) 56 | """ 57 | x = self.norm1(x) 58 | latents = self.norm2(latents) 59 | 60 | b, l, _ = latents.shape 61 | 62 | q = self.to_q(latents) 63 | kv_input = torch.cat((x, latents), dim=-2) 64 | k, v = self.to_kv(kv_input).chunk(2, dim=-1) 65 | 66 | q = reshape_tensor(q, self.heads) 67 | k = reshape_tensor(k, self.heads) 68 | v = reshape_tensor(v, self.heads) 69 | 70 | # attention 71 | scale = 1 / math.sqrt(math.sqrt(self.dim_head)) 72 | weight = (q * scale) @ (k * scale).transpose(-2, -1) # More stable with f16 than dividing afterwards 73 | weight = torch.softmax(weight.float(), dim=-1).type(weight.dtype) 74 | out = weight @ v 75 | 76 | out = out.permute(0, 2, 1, 3).reshape(b, l, -1) 77 | 78 | return self.to_out(out) 79 | 80 | 81 | class Resampler(nn.Module): 82 | def __init__( 83 | self, 84 | dim=1024, 85 | depth=8, 86 | dim_head=64, 87 | heads=16, 88 | num_queries=8, 89 | embedding_dim=768, 90 | output_dim=1024, 91 | ff_mult=4, 92 | max_seq_len: int = 257, # CLIP tokens + CLS token 93 | apply_pos_emb: bool = False, 94 | num_latents_mean_pooled: int = 0, # number of latents derived from mean pooled representation of the sequence 95 | ): 96 | super().__init__() 97 | self.pos_emb = nn.Embedding(max_seq_len, embedding_dim) if apply_pos_emb else None 98 | 99 | self.latents = nn.Parameter(torch.randn(1, num_queries, dim) / dim**0.5) 100 | 101 | self.proj_in = nn.Linear(embedding_dim, dim) 102 | 103 | self.proj_out = nn.Linear(dim, output_dim) 104 | self.norm_out = nn.LayerNorm(output_dim) 105 | 106 | self.to_latents_from_mean_pooled_seq = ( 
107 | nn.Sequential( 108 | nn.LayerNorm(dim), 109 | nn.Linear(dim, dim * num_latents_mean_pooled), 110 | Rearrange("b (n d) -> b n d", n=num_latents_mean_pooled), 111 | ) 112 | if num_latents_mean_pooled > 0 113 | else None 114 | ) 115 | 116 | self.layers = nn.ModuleList([]) 117 | for _ in range(depth): 118 | self.layers.append( 119 | nn.ModuleList( 120 | [ 121 | PerceiverAttention(dim=dim, dim_head=dim_head, heads=heads), 122 | FeedForward(dim=dim, mult=ff_mult), 123 | ] 124 | ) 125 | ) 126 | 127 | def forward(self, x): 128 | if self.pos_emb is not None: 129 | n, device = x.shape[1], x.device 130 | pos_emb = self.pos_emb(torch.arange(n, device=device)) 131 | x = x + pos_emb 132 | 133 | latents = self.latents.repeat(x.size(0), 1, 1) 134 | 135 | x = self.proj_in(x) 136 | 137 | if self.to_latents_from_mean_pooled_seq: 138 | meanpooled_seq = masked_mean(x, dim=1, mask=torch.ones(x.shape[:2], device=x.device, dtype=torch.bool)) 139 | meanpooled_latents = self.to_latents_from_mean_pooled_seq(meanpooled_seq) 140 | latents = torch.cat((meanpooled_latents, latents), dim=-2) 141 | 142 | for attn, ff in self.layers: 143 | latents = attn(x, latents) + latents 144 | latents = ff(latents) + latents 145 | 146 | latents = self.proj_out(latents) 147 | return self.norm_out(latents) 148 | 149 | 150 | def masked_mean(t, *, dim, mask=None): 151 | if mask is None: 152 | return t.mean(dim=dim) 153 | 154 | denom = mask.sum(dim=dim, keepdim=True) 155 | mask = rearrange(mask, "b n -> b n 1") 156 | masked_t = t.masked_fill(~mask, 0.0) 157 | 158 | return masked_t.sum(dim=dim) / denom.clamp(min=1e-5) 159 | -------------------------------------------------------------------------------- /ip_adapter/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | import numpy as np 4 | from PIL import Image 5 | 6 | attn_maps = {} 7 | def hook_fn(name): 8 | def forward_hook(module, input, output): 9 | if hasattr(module.processor, "attn_map"): 10 | attn_maps[name] = module.processor.attn_map 11 | del module.processor.attn_map 12 | 13 | return forward_hook 14 | 15 | def register_cross_attention_hook(unet): 16 | for name, module in unet.named_modules(): 17 | if name.split('.')[-1].startswith('attn2'): 18 | module.register_forward_hook(hook_fn(name)) 19 | 20 | return unet 21 | 22 | def upscale(attn_map, target_size): 23 | attn_map = torch.mean(attn_map, dim=0) 24 | attn_map = attn_map.permute(1,0) 25 | temp_size = None 26 | 27 | for i in range(0,5): 28 | scale = 2 ** i 29 | if ( target_size[0] // scale ) * ( target_size[1] // scale) == attn_map.shape[1]*64: 30 | temp_size = (target_size[0]//(scale*8), target_size[1]//(scale*8)) 31 | break 32 | 33 | assert temp_size is not None, "temp_size cannot is None" 34 | 35 | attn_map = attn_map.view(attn_map.shape[0], *temp_size) 36 | 37 | attn_map = F.interpolate( 38 | attn_map.unsqueeze(0).to(dtype=torch.float32), 39 | size=target_size, 40 | mode='bilinear', 41 | align_corners=False 42 | )[0] 43 | 44 | attn_map = torch.softmax(attn_map, dim=0) 45 | return attn_map 46 | def get_net_attn_map(image_size, batch_size=2, instance_or_negative=False, detach=True): 47 | 48 | idx = 0 if instance_or_negative else 1 49 | net_attn_maps = [] 50 | 51 | for name, attn_map in attn_maps.items(): 52 | attn_map = attn_map.cpu() if detach else attn_map 53 | attn_map = torch.chunk(attn_map, batch_size)[idx].squeeze() 54 | attn_map = upscale(attn_map, image_size) 55 | net_attn_maps.append(attn_map) 56 | 57 | net_attn_maps = 
torch.mean(torch.stack(net_attn_maps,dim=0),dim=0) 58 | 59 | return net_attn_maps 60 | 61 | def attnmaps2images(net_attn_maps): 62 | 63 | #total_attn_scores = 0 64 | images = [] 65 | 66 | for attn_map in net_attn_maps: 67 | attn_map = attn_map.cpu().numpy() 68 | #total_attn_scores += attn_map.mean().item() 69 | 70 | normalized_attn_map = (attn_map - np.min(attn_map)) / (np.max(attn_map) - np.min(attn_map)) * 255 71 | normalized_attn_map = normalized_attn_map.astype(np.uint8) 72 | #print("norm: ", normalized_attn_map.shape) 73 | image = Image.fromarray(normalized_attn_map) 74 | 75 | #image = fix_save_attn_map(attn_map) 76 | images.append(image) 77 | 78 | #print(total_attn_scores) 79 | return images 80 | def is_torch2_available(): 81 | return hasattr(F, "scaled_dot_product_attention") 82 | 83 | def get_generator(seed, device): 84 | 85 | if seed is not None: 86 | if isinstance(seed, list): 87 | generator = [torch.Generator(device).manual_seed(seed_item) for seed_item in seed] 88 | else: 89 | generator = torch.Generator(device).manual_seed(seed) 90 | else: 91 | generator = None 92 | 93 | return generator -------------------------------------------------------------------------------- /model.yaml: -------------------------------------------------------------------------------- 1 | lightning_unet: 2 | - sdxl_lightning_1step_unet_x0.safetensors #repo ByteDance/SDXL-Lightning 3 | - sdxl_lightning_2step_unet.safetensors #repo ByteDance/SDXL-Lightning 4 | - sdxl_lightning_4step_unet.safetensors 5 | - sdxl_lightning_8step_unet.safetensors 6 | - Hyper-SDXL-1step-Unet.safetensors #repo ByteDance/Hyper-SD 7 | - lcm-sdxl-base-1.0.safetensors #repo ckpt/lcm-sdxl-unet you can change model name like example 8 | - dmd2_sdxl_1step_unet_fp16.bin #repo tianweiy/DMD2 9 | - dmd2_sdxl_4step_unet_fp16.bin 10 | surport_model: 11 | - stable-diffusion-v1-5 #repo runwayml/stable-diffusion-v1-5 12 | - stable-diffusion-2-1-base #repo stabilityai/stable-diffusion-2-1-base 13 | - playground-v2-1024px-aesthetic #repo playgroundai/playground-v2-1024px-aesthetic 14 | - Ghibli-Diffusion #repo nitrosocke/Ghibli-Diffusion 15 | surport_controlnet: 16 | - controlnet-canny-sdxl-1.0 #repo diffusers/controlnet-canny-sdxl-1.0 17 | - MistoLine #repo TheMistoAI/MistoLine 18 | - controlnet-openpose-sdxl-1.0 19 | - controlnet-scribble-sdxl-1.0 20 | - stable-diffusion-xl-1.0-inpainting-0.1 21 | - controlnet-tile-sdxl-1.0 22 | sdxl_model: 23 | - stable-diffusion-xl-base-1.0 #repo stabilityai/stable-diffusion-xl-base-1.0 24 | - sdxl-flash #repo sd-community/sdxl-flash 25 | lightning_lora: 26 | - Hyper-SD15-12steps-CFG-lora.safetensors 27 | - Hyper-SD15-1step-lora.safetensors 28 | - Hyper-SD15-2steps-lora.safetensors 29 | - Hyper-SD15-4steps-lora.safetensors 30 | - Hyper-SD15-8steps-CFG-lora.safetensors 31 | - Hyper-SD15-8steps-lora.safetensors 32 | - pcm_sd15_lcmlike_lora_converted.safetensors 33 | - pcm_sd15_normalcfg_16step_converted.safetensors 34 | - pcm_sd15_normalcfg_4step_converted.safetensors 35 | - pcm_sd15_smallcfg_16step_converted.safetensor 36 | - pcm_sd15_smallcfg_2step_converted.safetensors 37 | - pcm_sd15_smallcfg_4step_converted.safetensors 38 | - pcm_sd15_smallcfg_8step_converted.safetensors 39 | - lcm-lora-sdv1-5.safetensors 40 | - TCD-SD15-LoRA.safetensors #need rename and TCD 41 | lightning_xl_lora: 42 | - Hyper-SDXL-12steps-CFG-lora.safetensors 43 | - Hyper-SDXL-1step-lora.safetensors 44 | - Hyper-SDXL-2step-lora.safetensors 45 | - Hyper-SDXL-4step-lora.safetensors 46 | - Hyper-SDXL-8step-lora.safetensors 47 | - 
Hyper-SDXL-8steps-CFG-lora.safetensors 48 | - sdxl_lightning_2step_lora.safetensors 49 | - sdxl_lightning_4step_lora.safetensors 50 | - sdxl_lightning_8step_lora.safetensors 51 | - pcm_sdxl_lcmlike_lora_converted.safetensors 52 | - pcm_sdxl_normalcfg_16step_converted.safetensors 53 | - pcm_sdxl_normalcfg_4step_converted.safetensors 54 | - pcm_sdxl_normalcfg_8step_converted.safetensors 55 | - pcm_sdxl_smallcfg_16step_converted.safetensors 56 | - pcm_sdxl_smallcfg_2step_converted.safetensors 57 | - pcm_sdxl_smallcfg_4step_converted.safetensors 58 | - pcm_sdxl_smallcfg_8step_converted.safetensors 59 | - lcm-lora-sdxl.safetensors 60 | - dmd2_sdxl_4step_lora.safetensors 61 | - dmd2_sdxl_4step_lora_fp16.safetensors 62 | - TCD-SDXL-LoRA.safetensors #need rename and TCD 63 | - manne_turbo.safetensors 64 | 65 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "comfyui_hidiffusion_pro" 3 | description = "A HiDiffusion node for ComfyUI." 4 | version = "1.0.1" 5 | license = { file = "LICENSE" } 6 | 7 | [project.urls] 8 | Repository = "https://github.com/smthemex/ComfyUI_HiDiffusion_Pro" 9 | # Used by Comfy Registry https://comfyregistry.org 10 | 11 | [tool.comfy] 12 | PublisherId = "smthemex" 13 | DisplayName = "ComfyUI_HiDiffusion_Pro" 14 | Icon = "" 15 | -------------------------------------------------------------------------------- /sd15_config/feature_extractor/preprocessor_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "crop_size": { 3 | "height": 224, 4 | "width": 224 5 | }, 6 | "do_center_crop": true, 7 | "do_convert_rgb": true, 8 | "do_normalize": true, 9 | "do_rescale": true, 10 | "do_resize": true, 11 | "feature_extractor_type": "CLIPFeatureExtractor", 12 | "image_mean": [ 13 | 0.48145466, 14 | 0.4578275, 15 | 0.40821073 16 | ], 17 | "image_processor_type": "CLIPFeatureExtractor", 18 | "image_std": [ 19 | 0.26862954, 20 | 0.26130258, 21 | 0.27577711 22 | ], 23 | "resample": 3, 24 | "rescale_factor": 0.00392156862745098, 25 | "size": { 26 | "shortest_edge": 224 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /sd15_config/model_index.json: -------------------------------------------------------------------------------- 1 | { 2 | "_class_name": "StableDiffusionPipeline", 3 | "_diffusers_version": "0.21.0.dev0", 4 | "_name_or_path": "lykon-models/dreamshaper-8", 5 | "feature_extractor": [ 6 | "transformers", 7 | "CLIPFeatureExtractor" 8 | ], 9 | "requires_safety_checker": true, 10 | "safety_checker": [ 11 | "stable_diffusion", 12 | "StableDiffusionSafetyChecker" 13 | ], 14 | "scheduler": [ 15 | "diffusers", 16 | "DEISMultistepScheduler" 17 | ], 18 | "text_encoder": [ 19 | "transformers", 20 | "CLIPTextModel" 21 | ], 22 | "tokenizer": [ 23 | "transformers", 24 | "CLIPTokenizer" 25 | ], 26 | "unet": [ 27 | "diffusers", 28 | "UNet2DConditionModel" 29 | ], 30 | "vae": [ 31 | "diffusers", 32 | "AutoencoderKL" 33 | ] 34 | } 35 | -------------------------------------------------------------------------------- /sd15_config/safety_checker/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "_name_or_path": "/home/patrick/.cache/huggingface/hub/models--lykon-models--dreamshaper-8/snapshots/7e855e3f481832419503d1fa18d4a4379597f04b/safety_checker", 3 | "architectures": [ 4 | 
"StableDiffusionSafetyChecker" 5 | ], 6 | "initializer_factor": 1.0, 7 | "logit_scale_init_value": 2.6592, 8 | "model_type": "clip", 9 | "projection_dim": 768, 10 | "text_config": { 11 | "dropout": 0.0, 12 | "hidden_size": 768, 13 | "intermediate_size": 3072, 14 | "model_type": "clip_text_model", 15 | "num_attention_heads": 12 16 | }, 17 | "torch_dtype": "float16", 18 | "transformers_version": "4.33.0.dev0", 19 | "vision_config": { 20 | "dropout": 0.0, 21 | "hidden_size": 1024, 22 | "intermediate_size": 4096, 23 | "model_type": "clip_vision_model", 24 | "num_attention_heads": 16, 25 | "num_hidden_layers": 24, 26 | "patch_size": 14 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /sd15_config/scheduler/scheduler_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "_class_name": "DEISMultistepScheduler", 3 | "_diffusers_version": "0.21.0.dev0", 4 | "algorithm_type": "deis", 5 | "beta_end": 0.012, 6 | "beta_schedule": "scaled_linear", 7 | "beta_start": 0.00085, 8 | "clip_sample": false, 9 | "dynamic_thresholding_ratio": 0.995, 10 | "lower_order_final": true, 11 | "num_train_timesteps": 1000, 12 | "prediction_type": "epsilon", 13 | "sample_max_value": 1.0, 14 | "set_alpha_to_one": false, 15 | "skip_prk_steps": true, 16 | "solver_order": 2, 17 | "solver_type": "logrho", 18 | "steps_offset": 1, 19 | "thresholding": false, 20 | "timestep_spacing": "leading", 21 | "trained_betas": null, 22 | "use_karras_sigmas": false 23 | } 24 | -------------------------------------------------------------------------------- /sd15_config/text_encoder/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "_name_or_path": "/home/patrick/.cache/huggingface/hub/models--lykon-models--dreamshaper-8/snapshots/7e855e3f481832419503d1fa18d4a4379597f04b/text_encoder", 3 | "architectures": [ 4 | "CLIPTextModel" 5 | ], 6 | "attention_dropout": 0.0, 7 | "bos_token_id": 0, 8 | "dropout": 0.0, 9 | "eos_token_id": 2, 10 | "hidden_act": "quick_gelu", 11 | "hidden_size": 768, 12 | "initializer_factor": 1.0, 13 | "initializer_range": 0.02, 14 | "intermediate_size": 3072, 15 | "layer_norm_eps": 1e-05, 16 | "max_position_embeddings": 77, 17 | "model_type": "clip_text_model", 18 | "num_attention_heads": 12, 19 | "num_hidden_layers": 12, 20 | "pad_token_id": 1, 21 | "projection_dim": 768, 22 | "torch_dtype": "float16", 23 | "transformers_version": "4.33.0.dev0", 24 | "vocab_size": 49408 25 | } 26 | -------------------------------------------------------------------------------- /sd15_config/tokenizer/special_tokens_map.json: -------------------------------------------------------------------------------- 1 | { 2 | "bos_token": { 3 | "content": "<|startoftext|>", 4 | "lstrip": false, 5 | "normalized": true, 6 | "rstrip": false, 7 | "single_word": false 8 | }, 9 | "eos_token": { 10 | "content": "<|endoftext|>", 11 | "lstrip": false, 12 | "normalized": true, 13 | "rstrip": false, 14 | "single_word": false 15 | }, 16 | "pad_token": "<|endoftext|>", 17 | "unk_token": { 18 | "content": "<|endoftext|>", 19 | "lstrip": false, 20 | "normalized": true, 21 | "rstrip": false, 22 | "single_word": false 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /sd15_config/tokenizer/tokenizer_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "add_prefix_space": false, 3 | "bos_token": { 4 | "__type": 
"AddedToken", 5 | "content": "<|startoftext|>", 6 | "lstrip": false, 7 | "normalized": true, 8 | "rstrip": false, 9 | "single_word": false 10 | }, 11 | "clean_up_tokenization_spaces": true, 12 | "do_lower_case": true, 13 | "eos_token": { 14 | "__type": "AddedToken", 15 | "content": "<|endoftext|>", 16 | "lstrip": false, 17 | "normalized": true, 18 | "rstrip": false, 19 | "single_word": false 20 | }, 21 | "errors": "replace", 22 | "model_max_length": 77, 23 | "pad_token": "<|endoftext|>", 24 | "tokenizer_class": "CLIPTokenizer", 25 | "unk_token": { 26 | "__type": "AddedToken", 27 | "content": "<|endoftext|>", 28 | "lstrip": false, 29 | "normalized": true, 30 | "rstrip": false, 31 | "single_word": false 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /sd15_config/unet/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "_class_name": "UNet2DConditionModel", 3 | "_diffusers_version": "0.21.0.dev0", 4 | "_name_or_path": "/home/patrick/.cache/huggingface/hub/models--lykon-models--dreamshaper-8/snapshots/7e855e3f481832419503d1fa18d4a4379597f04b/unet", 5 | "act_fn": "silu", 6 | "addition_embed_type": null, 7 | "addition_embed_type_num_heads": 64, 8 | "addition_time_embed_dim": null, 9 | "attention_head_dim": 8, 10 | "attention_type": "default", 11 | "block_out_channels": [ 12 | 320, 13 | 640, 14 | 1280, 15 | 1280 16 | ], 17 | "center_input_sample": false, 18 | "class_embed_type": null, 19 | "class_embeddings_concat": false, 20 | "conv_in_kernel": 3, 21 | "conv_out_kernel": 3, 22 | "cross_attention_dim": 768, 23 | "cross_attention_norm": null, 24 | "down_block_types": [ 25 | "CrossAttnDownBlock2D", 26 | "CrossAttnDownBlock2D", 27 | "CrossAttnDownBlock2D", 28 | "DownBlock2D" 29 | ], 30 | "downsample_padding": 1, 31 | "dual_cross_attention": false, 32 | "encoder_hid_dim": null, 33 | "encoder_hid_dim_type": null, 34 | "flip_sin_to_cos": true, 35 | "freq_shift": 0, 36 | "in_channels": 4, 37 | "layers_per_block": 2, 38 | "mid_block_only_cross_attention": null, 39 | "mid_block_scale_factor": 1, 40 | "mid_block_type": "UNetMidBlock2DCrossAttn", 41 | "norm_eps": 1e-05, 42 | "norm_num_groups": 32, 43 | "num_attention_heads": null, 44 | "num_class_embeds": null, 45 | "only_cross_attention": false, 46 | "out_channels": 4, 47 | "projection_class_embeddings_input_dim": null, 48 | "resnet_out_scale_factor": 1.0, 49 | "resnet_skip_time_act": false, 50 | "resnet_time_scale_shift": "default", 51 | "sample_size": 64, 52 | "time_cond_proj_dim": null, 53 | "time_embedding_act_fn": null, 54 | "time_embedding_dim": null, 55 | "time_embedding_type": "positional", 56 | "timestep_post_act": null, 57 | "transformer_layers_per_block": 1, 58 | "up_block_types": [ 59 | "UpBlock2D", 60 | "CrossAttnUpBlock2D", 61 | "CrossAttnUpBlock2D", 62 | "CrossAttnUpBlock2D" 63 | ], 64 | "upcast_attention": null, 65 | "use_linear_projection": false 66 | } 67 | -------------------------------------------------------------------------------- /sd15_config/vae/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "_class_name": "AutoencoderKL", 3 | "_diffusers_version": "0.21.0.dev0", 4 | "_name_or_path": "/home/patrick/.cache/huggingface/hub/models--lykon-models--dreamshaper-8/snapshots/7e855e3f481832419503d1fa18d4a4379597f04b/vae", 5 | "act_fn": "silu", 6 | "block_out_channels": [ 7 | 128, 8 | 256, 9 | 512, 10 | 512 11 | ], 12 | "down_block_types": [ 13 | "DownEncoderBlock2D", 14 | 
"DownEncoderBlock2D", 15 | "DownEncoderBlock2D", 16 | "DownEncoderBlock2D" 17 | ], 18 | "force_upcast": true, 19 | "in_channels": 3, 20 | "latent_channels": 4, 21 | "layers_per_block": 2, 22 | "norm_num_groups": 32, 23 | "out_channels": 3, 24 | "sample_size": 512, 25 | "scaling_factor": 0.18215, 26 | "up_block_types": [ 27 | "UpDecoderBlock2D", 28 | "UpDecoderBlock2D", 29 | "UpDecoderBlock2D", 30 | "UpDecoderBlock2D" 31 | ] 32 | } 33 | -------------------------------------------------------------------------------- /sdxl_config/model_index.json: -------------------------------------------------------------------------------- 1 | { 2 | "_class_name": "StableDiffusionXLPipeline", 3 | "_diffusers_version": "0.19.0.dev0", 4 | "force_zeros_for_empty_prompt": true, 5 | "add_watermarker": null, 6 | "scheduler": [ 7 | "diffusers", 8 | "EulerDiscreteScheduler" 9 | ], 10 | "text_encoder": [ 11 | "transformers", 12 | "CLIPTextModel" 13 | ], 14 | "text_encoder_2": [ 15 | "transformers", 16 | "CLIPTextModelWithProjection" 17 | ], 18 | "tokenizer": [ 19 | "transformers", 20 | "CLIPTokenizer" 21 | ], 22 | "tokenizer_2": [ 23 | "transformers", 24 | "CLIPTokenizer" 25 | ], 26 | "unet": [ 27 | "diffusers", 28 | "UNet2DConditionModel" 29 | ], 30 | "vae": [ 31 | "diffusers", 32 | "AutoencoderKL" 33 | ] 34 | } 35 | -------------------------------------------------------------------------------- /sdxl_config/scheduler/scheduler_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "_class_name": "EulerDiscreteScheduler", 3 | "_diffusers_version": "0.19.0.dev0", 4 | "beta_end": 0.012, 5 | "beta_schedule": "scaled_linear", 6 | "beta_start": 0.00085, 7 | "clip_sample": false, 8 | "interpolation_type": "linear", 9 | "num_train_timesteps": 1000, 10 | "prediction_type": "epsilon", 11 | "sample_max_value": 1.0, 12 | "set_alpha_to_one": false, 13 | "skip_prk_steps": true, 14 | "steps_offset": 1, 15 | "timestep_spacing": "leading", 16 | "trained_betas": null, 17 | "use_karras_sigmas": false 18 | } 19 | -------------------------------------------------------------------------------- /sdxl_config/text_encoder/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "CLIPTextModel" 4 | ], 5 | "attention_dropout": 0.0, 6 | "bos_token_id": 0, 7 | "dropout": 0.0, 8 | "eos_token_id": 2, 9 | "hidden_act": "quick_gelu", 10 | "hidden_size": 768, 11 | "initializer_factor": 1.0, 12 | "initializer_range": 0.02, 13 | "intermediate_size": 3072, 14 | "layer_norm_eps": 1e-05, 15 | "max_position_embeddings": 77, 16 | "model_type": "clip_text_model", 17 | "num_attention_heads": 12, 18 | "num_hidden_layers": 12, 19 | "pad_token_id": 1, 20 | "projection_dim": 768, 21 | "torch_dtype": "float16", 22 | "transformers_version": "4.32.0.dev0", 23 | "vocab_size": 49408 24 | } 25 | -------------------------------------------------------------------------------- /sdxl_config/text_encoder_2/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "CLIPTextModelWithProjection" 4 | ], 5 | "attention_dropout": 0.0, 6 | "bos_token_id": 0, 7 | "dropout": 0.0, 8 | "eos_token_id": 2, 9 | "hidden_act": "gelu", 10 | "hidden_size": 1280, 11 | "initializer_factor": 1.0, 12 | "initializer_range": 0.02, 13 | "intermediate_size": 5120, 14 | "layer_norm_eps": 1e-05, 15 | "max_position_embeddings": 77, 16 | "model_type": "clip_text_model", 17 | "num_attention_heads": 20, 18 | 
"num_hidden_layers": 32, 19 | "pad_token_id": 1, 20 | "projection_dim": 1280, 21 | "torch_dtype": "float16", 22 | "transformers_version": "4.32.0.dev0", 23 | "vocab_size": 49408 24 | } 25 | -------------------------------------------------------------------------------- /sdxl_config/tokenizer/special_tokens_map.json: -------------------------------------------------------------------------------- 1 | { 2 | "bos_token": { 3 | "content": "<|startoftext|>", 4 | "lstrip": false, 5 | "normalized": true, 6 | "rstrip": false, 7 | "single_word": false 8 | }, 9 | "eos_token": { 10 | "content": "<|endoftext|>", 11 | "lstrip": false, 12 | "normalized": true, 13 | "rstrip": false, 14 | "single_word": false 15 | }, 16 | "pad_token": "<|endoftext|>", 17 | "unk_token": { 18 | "content": "<|endoftext|>", 19 | "lstrip": false, 20 | "normalized": true, 21 | "rstrip": false, 22 | "single_word": false 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /sdxl_config/tokenizer/tokenizer_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "add_prefix_space": false, 3 | "bos_token": { 4 | "__type": "AddedToken", 5 | "content": "<|startoftext|>", 6 | "lstrip": false, 7 | "normalized": true, 8 | "rstrip": false, 9 | "single_word": false 10 | }, 11 | "clean_up_tokenization_spaces": true, 12 | "do_lower_case": true, 13 | "eos_token": { 14 | "__type": "AddedToken", 15 | "content": "<|endoftext|>", 16 | "lstrip": false, 17 | "normalized": true, 18 | "rstrip": false, 19 | "single_word": false 20 | }, 21 | "errors": "replace", 22 | "model_max_length": 77, 23 | "pad_token": "<|endoftext|>", 24 | "tokenizer_class": "CLIPTokenizer", 25 | "unk_token": { 26 | "__type": "AddedToken", 27 | "content": "<|endoftext|>", 28 | "lstrip": false, 29 | "normalized": true, 30 | "rstrip": false, 31 | "single_word": false 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /sdxl_config/tokenizer_2/special_tokens_map.json: -------------------------------------------------------------------------------- 1 | { 2 | "bos_token": { 3 | "content": "<|startoftext|>", 4 | "lstrip": false, 5 | "normalized": true, 6 | "rstrip": false, 7 | "single_word": false 8 | }, 9 | "eos_token": { 10 | "content": "<|endoftext|>", 11 | "lstrip": false, 12 | "normalized": true, 13 | "rstrip": false, 14 | "single_word": false 15 | }, 16 | "pad_token": "!", 17 | "unk_token": { 18 | "content": "<|endoftext|>", 19 | "lstrip": false, 20 | "normalized": true, 21 | "rstrip": false, 22 | "single_word": false 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /sdxl_config/tokenizer_2/tokenizer_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "add_prefix_space": false, 3 | "bos_token": { 4 | "__type": "AddedToken", 5 | "content": "<|startoftext|>", 6 | "lstrip": false, 7 | "normalized": true, 8 | "rstrip": false, 9 | "single_word": false 10 | }, 11 | "clean_up_tokenization_spaces": true, 12 | "do_lower_case": true, 13 | "eos_token": { 14 | "__type": "AddedToken", 15 | "content": "<|endoftext|>", 16 | "lstrip": false, 17 | "normalized": true, 18 | "rstrip": false, 19 | "single_word": false 20 | }, 21 | "errors": "replace", 22 | "model_max_length": 77, 23 | "pad_token": "!", 24 | "tokenizer_class": "CLIPTokenizer", 25 | "unk_token": { 26 | "__type": "AddedToken", 27 | "content": "<|endoftext|>", 28 | "lstrip": false, 29 | 
"normalized": true, 30 | "rstrip": false, 31 | "single_word": false 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /sdxl_config/unet/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "_class_name": "UNet2DConditionModel", 3 | "_diffusers_version": "0.19.0.dev0", 4 | "act_fn": "silu", 5 | "addition_embed_type": "text_time", 6 | "addition_embed_type_num_heads": 64, 7 | "addition_time_embed_dim": 256, 8 | "attention_head_dim": [ 9 | 5, 10 | 10, 11 | 20 12 | ], 13 | "block_out_channels": [ 14 | 320, 15 | 640, 16 | 1280 17 | ], 18 | "center_input_sample": false, 19 | "class_embed_type": null, 20 | "class_embeddings_concat": false, 21 | "conv_in_kernel": 3, 22 | "conv_out_kernel": 3, 23 | "cross_attention_dim": 2048, 24 | "cross_attention_norm": null, 25 | "down_block_types": [ 26 | "DownBlock2D", 27 | "CrossAttnDownBlock2D", 28 | "CrossAttnDownBlock2D" 29 | ], 30 | "downsample_padding": 1, 31 | "dual_cross_attention": false, 32 | "encoder_hid_dim": null, 33 | "encoder_hid_dim_type": null, 34 | "flip_sin_to_cos": true, 35 | "freq_shift": 0, 36 | "in_channels": 4, 37 | "layers_per_block": 2, 38 | "mid_block_only_cross_attention": null, 39 | "mid_block_scale_factor": 1, 40 | "mid_block_type": "UNetMidBlock2DCrossAttn", 41 | "norm_eps": 1e-05, 42 | "norm_num_groups": 32, 43 | "num_attention_heads": null, 44 | "num_class_embeds": null, 45 | "only_cross_attention": false, 46 | "out_channels": 4, 47 | "projection_class_embeddings_input_dim": 2816, 48 | "resnet_out_scale_factor": 1.0, 49 | "resnet_skip_time_act": false, 50 | "resnet_time_scale_shift": "default", 51 | "sample_size": 128, 52 | "time_cond_proj_dim": null, 53 | "time_embedding_act_fn": null, 54 | "time_embedding_dim": null, 55 | "time_embedding_type": "positional", 56 | "timestep_post_act": null, 57 | "transformer_layers_per_block": [ 58 | 1, 59 | 2, 60 | 10 61 | ], 62 | "up_block_types": [ 63 | "CrossAttnUpBlock2D", 64 | "CrossAttnUpBlock2D", 65 | "UpBlock2D" 66 | ], 67 | "upcast_attention": null, 68 | "use_linear_projection": true 69 | } 70 | -------------------------------------------------------------------------------- /sdxl_config/vae/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "_class_name": "AutoencoderKL", 3 | "_diffusers_version": "0.20.0.dev0", 4 | "_name_or_path": "../sdxl-vae/", 5 | "act_fn": "silu", 6 | "block_out_channels": [ 7 | 128, 8 | 256, 9 | 512, 10 | 512 11 | ], 12 | "down_block_types": [ 13 | "DownEncoderBlock2D", 14 | "DownEncoderBlock2D", 15 | "DownEncoderBlock2D", 16 | "DownEncoderBlock2D" 17 | ], 18 | "force_upcast": true, 19 | "in_channels": 3, 20 | "latent_channels": 4, 21 | "layers_per_block": 2, 22 | "norm_num_groups": 32, 23 | "out_channels": 3, 24 | "sample_size": 1024, 25 | "scaling_factor": 0.13025, 26 | "up_block_types": [ 27 | "UpDecoderBlock2D", 28 | "UpDecoderBlock2D", 29 | "UpDecoderBlock2D", 30 | "UpDecoderBlock2D" 31 | ] 32 | } 33 | -------------------------------------------------------------------------------- /sdxl_config/vae_1_0/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "_class_name": "AutoencoderKL", 3 | "_diffusers_version": "0.19.0.dev0", 4 | "act_fn": "silu", 5 | "block_out_channels": [ 6 | 128, 7 | 256, 8 | 512, 9 | 512 10 | ], 11 | "down_block_types": [ 12 | "DownEncoderBlock2D", 13 | "DownEncoderBlock2D", 14 | "DownEncoderBlock2D", 15 | "DownEncoderBlock2D" 16 | 
], 17 | "force_upcast": true, 18 | "in_channels": 3, 19 | "latent_channels": 4, 20 | "layers_per_block": 2, 21 | "norm_num_groups": 32, 22 | "out_channels": 3, 23 | "sample_size": 1024, 24 | "scaling_factor": 0.13025, 25 | "up_block_types": [ 26 | "UpDecoderBlock2D", 27 | "UpDecoderBlock2D", 28 | "UpDecoderBlock2D", 29 | "UpDecoderBlock2D" 30 | ] 31 | } 32 | -------------------------------------------------------------------------------- /sdxl_config/vae_decoder/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "_class_name": "AutoencoderKL", 3 | "_diffusers_version": "0.19.0.dev0", 4 | "act_fn": "silu", 5 | "block_out_channels": [ 6 | 128, 7 | 256, 8 | 512, 9 | 512 10 | ], 11 | "down_block_types": [ 12 | "DownEncoderBlock2D", 13 | "DownEncoderBlock2D", 14 | "DownEncoderBlock2D", 15 | "DownEncoderBlock2D" 16 | ], 17 | "force_upcast": true, 18 | "in_channels": 3, 19 | "latent_channels": 4, 20 | "layers_per_block": 2, 21 | "norm_num_groups": 32, 22 | "out_channels": 3, 23 | "sample_size": 1024, 24 | "scaling_factor": 0.13025, 25 | "up_block_types": [ 26 | "UpDecoderBlock2D", 27 | "UpDecoderBlock2D", 28 | "UpDecoderBlock2D", 29 | "UpDecoderBlock2D" 30 | ] 31 | } 32 | -------------------------------------------------------------------------------- /sdxl_config/vae_encoder/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "_class_name": "AutoencoderKL", 3 | "_diffusers_version": "0.19.0.dev0", 4 | "act_fn": "silu", 5 | "block_out_channels": [ 6 | 128, 7 | 256, 8 | 512, 9 | 512 10 | ], 11 | "down_block_types": [ 12 | "DownEncoderBlock2D", 13 | "DownEncoderBlock2D", 14 | "DownEncoderBlock2D", 15 | "DownEncoderBlock2D" 16 | ], 17 | "force_upcast": true, 18 | "in_channels": 3, 19 | "latent_channels": 4, 20 | "layers_per_block": 2, 21 | "norm_num_groups": 32, 22 | "out_channels": 3, 23 | "sample_size": 1024, 24 | "scaling_factor": 0.13025, 25 | "up_block_types": [ 26 | "UpDecoderBlock2D", 27 | "UpDecoderBlock2D", 28 | "UpDecoderBlock2D", 29 | "UpDecoderBlock2D" 30 | ] 31 | } 32 | -------------------------------------------------------------------------------- /weights/playground/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "_class_name": "AutoencoderKL", 3 | "_diffusers_version": "0.27.0.dev0", 4 | "act_fn": "silu", 5 | "block_out_channels": [ 6 | 128, 7 | 256, 8 | 512, 9 | 512 10 | ], 11 | "down_block_types": [ 12 | "DownEncoderBlock2D", 13 | "DownEncoderBlock2D", 14 | "DownEncoderBlock2D", 15 | "DownEncoderBlock2D" 16 | ], 17 | "force_upcast": true, 18 | "in_channels": 3, 19 | "latent_channels": 4, 20 | "layers_per_block": 2, 21 | "norm_num_groups": 32, 22 | "out_channels": 3, 23 | "sample_size": 1024, 24 | "up_block_types": [ 25 | "UpDecoderBlock2D", 26 | "UpDecoderBlock2D", 27 | "UpDecoderBlock2D", 28 | "UpDecoderBlock2D" 29 | ], 30 | "latents_mean": [ 31 | -1.6574, 32 | 1.886, 33 | -1.383, 34 | 2.5155 35 | ], 36 | "latents_std": [ 37 | 8.4927, 38 | 5.9022, 39 | 6.5498, 40 | 5.2299 41 | ], 42 | "scaling_factor": 0.5 43 | } 44 | -------------------------------------------------------------------------------- /weights/sd15/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "_name_or_path": "./image_encoder", 3 | "architectures": [ 4 | "CLIPVisionModelWithProjection" 5 | ], 6 | "attention_dropout": 0.0, 7 | "dropout": 0.0, 8 | "hidden_act": "gelu", 9 | "hidden_size": 1280, 10 | 
"image_size": 224, 11 | "initializer_factor": 1.0, 12 | "initializer_range": 0.02, 13 | "intermediate_size": 5120, 14 | "layer_norm_eps": 1e-05, 15 | "model_type": "clip_vision_model", 16 | "num_attention_heads": 16, 17 | "num_channels": 3, 18 | "num_hidden_layers": 32, 19 | "patch_size": 14, 20 | "projection_dim": 1024, 21 | "torch_dtype": "float16", 22 | "transformers_version": "4.28.0.dev0" 23 | } 24 | -------------------------------------------------------------------------------- /weights/sd_xl_base.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | target: sgm.models.diffusion.DiffusionEngine 3 | params: 4 | scale_factor: 0.13025 5 | disable_first_stage_autocast: True 6 | 7 | denoiser_config: 8 | target: sgm.modules.diffusionmodules.denoiser.DiscreteDenoiser 9 | params: 10 | num_idx: 1000 11 | 12 | scaling_config: 13 | target: sgm.modules.diffusionmodules.denoiser_scaling.EpsScaling 14 | discretization_config: 15 | target: sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization 16 | 17 | network_config: 18 | target: sgm.modules.diffusionmodules.openaimodel.UNetModel 19 | params: 20 | adm_in_channels: 2816 21 | num_classes: sequential 22 | use_checkpoint: True 23 | in_channels: 4 24 | out_channels: 4 25 | model_channels: 320 26 | attention_resolutions: [4, 2] 27 | num_res_blocks: 2 28 | channel_mult: [1, 2, 4] 29 | num_head_channels: 64 30 | use_linear_in_transformer: True 31 | transformer_depth: [1, 2, 10] 32 | context_dim: 2048 33 | spatial_transformer_attn_type: softmax-xformers 34 | 35 | conditioner_config: 36 | target: sgm.modules.GeneralConditioner 37 | params: 38 | emb_models: 39 | - is_trainable: False 40 | input_key: txt 41 | target: sgm.modules.encoders.modules.FrozenCLIPEmbedder 42 | params: 43 | layer: hidden 44 | layer_idx: 11 45 | 46 | - is_trainable: False 47 | input_key: txt 48 | target: sgm.modules.encoders.modules.FrozenOpenCLIPEmbedder2 49 | params: 50 | arch: ViT-bigG-14 51 | version: laion2b_s39b_b160k 52 | freeze: True 53 | layer: penultimate 54 | always_return_pooled: True 55 | legacy: False 56 | 57 | - is_trainable: False 58 | input_key: original_size_as_tuple 59 | target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND 60 | params: 61 | outdim: 256 62 | 63 | - is_trainable: False 64 | input_key: crop_coords_top_left 65 | target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND 66 | params: 67 | outdim: 256 68 | 69 | - is_trainable: False 70 | input_key: target_size_as_tuple 71 | target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND 72 | params: 73 | outdim: 256 74 | 75 | first_stage_config: 76 | target: sgm.models.autoencoder.AutoencoderKL 77 | params: 78 | embed_dim: 4 79 | monitor: val/rec_loss 80 | ddconfig: 81 | attn_type: vanilla-xformers 82 | double_z: true 83 | z_channels: 4 84 | resolution: 256 85 | in_channels: 3 86 | out_ch: 3 87 | ch: 128 88 | ch_mult: [1, 2, 4, 4] 89 | num_res_blocks: 2 90 | attn_resolutions: [] 91 | dropout: 0.0 92 | lossconfig: 93 | target: torch.nn.Identity 94 | -------------------------------------------------------------------------------- /weights/sdxl/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "CLIPVisionModelWithProjection" 4 | ], 5 | "_name_or_path": "", 6 | "add_cross_attention": false, 7 | "architectures_": null, 8 | "attention_dropout": 0.0, 9 | "bad_words_ids": null, 10 | "begin_suppress_tokens": null, 11 | "bos_token_id": null, 12 | 
"chunk_size_feed_forward": 0, 13 | "cross_attention_hidden_size": null, 14 | "decoder_start_token_id": null, 15 | "diversity_penalty": 0.0, 16 | "do_sample": false, 17 | "dropout": 0.0, 18 | "early_stopping": false, 19 | "encoder_no_repeat_ngram_size": 0, 20 | "eos_token_id": null, 21 | "exponential_decay_length_penalty": null, 22 | "finetuning_task": null, 23 | "forced_bos_token_id": null, 24 | "forced_eos_token_id": null, 25 | "hidden_act": "gelu", 26 | "hidden_size": 1664, 27 | "id2label": { 28 | "0": "LABEL_0", 29 | "1": "LABEL_1" 30 | }, 31 | "image_size": 224, 32 | "initializer_factor": 1.0, 33 | "initializer_range": 0.02, 34 | "intermediate_size": 8192, 35 | "is_decoder": false, 36 | "is_encoder_decoder": false, 37 | "label2id": { 38 | "LABEL_0": 0, 39 | "LABEL_1": 1 40 | }, 41 | "layer_norm_eps": 1e-05, 42 | "length_penalty": 1.0, 43 | "max_length": 20, 44 | "min_length": 0, 45 | "model_type": "clip_vision_model", 46 | "no_repeat_ngram_size": 0, 47 | "num_attention_heads": 16, 48 | "num_beam_groups": 1, 49 | "num_beams": 1, 50 | "num_channels": 3, 51 | "num_hidden_layers": 48, 52 | "num_return_sequences": 1, 53 | "output_attentions": false, 54 | "output_hidden_states": false, 55 | "output_scores": false, 56 | "pad_token_id": null, 57 | "patch_size": 14, 58 | "prefix": null, 59 | "problem_type": null, 60 | "pruned_heads": {}, 61 | "remove_invalid_values": false, 62 | "repetition_penalty": 1.0, 63 | "return_dict": true, 64 | "return_dict_in_generate": false, 65 | "sep_token_id": null, 66 | "suppress_tokens": null, 67 | "task_specific_params": null, 68 | "temperature": 1.0, 69 | "tf_legacy_loss": false, 70 | "tie_encoder_decoder": false, 71 | "tie_word_embeddings": true, 72 | "tokenizer_class": null, 73 | "top_k": 50, 74 | "top_p": 1.0, 75 | "torch_dtype": null, 76 | "torchscript": false, 77 | "transformers_version": "4.24.0", 78 | "typical_p": 1.0, 79 | "use_bfloat16": false, 80 | "projection_dim": 1280 81 | } 82 | --------------------------------------------------------------------------------