├── convert_original_stable_diffusion_to_diffusers.py └── convert_diffusers_to_sd.py /convert_original_stable_diffusion_to_diffusers.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2022 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """ Conversion script for the LDM checkpoints. """ 16 | 17 | import argparse 18 | import os 19 | 20 | import torch 21 | 22 | 23 | try: 24 | from omegaconf import OmegaConf 25 | except ImportError: 26 | raise ImportError( 27 | "OmegaConf is required to convert the LDM checkpoints. Please install it with `pip install OmegaConf`." 28 | ) 29 | 30 | from diffusers import ( 31 | AutoencoderKL, 32 | DDIMScheduler, 33 | LDMTextToImagePipeline, 34 | LMSDiscreteScheduler, 35 | PNDMScheduler, 36 | StableDiffusionPipeline, 37 | UNet2DConditionModel, 38 | ) 39 | from diffusers.pipelines.latent_diffusion.pipeline_latent_diffusion import LDMBertConfig, LDMBertModel 40 | from diffusers.pipelines.stable_diffusion import StableDiffusionSafetyChecker 41 | from transformers import AutoFeatureExtractor, BertTokenizerFast, CLIPTextModel, CLIPTokenizer 42 | 43 | 44 | def shave_segments(path, n_shave_prefix_segments=1): 45 | """ 46 | Removes segments. Positive values shave the first segments, negative shave the last segments. 
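For example, shave_segments("a.b.c.d", 1) returns "b.c.d", while shave_segments("a.b.c.d", -1) returns "a.b.c".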
47 | """ 48 | if n_shave_prefix_segments >= 0: 49 | return ".".join(path.split(".")[n_shave_prefix_segments:]) 50 | else: 51 | return ".".join(path.split(".")[:n_shave_prefix_segments]) 52 | 53 | 54 | def renew_resnet_paths(old_list, n_shave_prefix_segments=0): 55 | """ 56 | Updates paths inside resnets to the new naming scheme (local renaming) 57 | """ 58 | mapping = [] 59 | for old_item in old_list: 60 | new_item = old_item.replace("in_layers.0", "norm1") 61 | new_item = new_item.replace("in_layers.2", "conv1") 62 | 63 | new_item = new_item.replace("out_layers.0", "norm2") 64 | new_item = new_item.replace("out_layers.3", "conv2") 65 | 66 | new_item = new_item.replace("emb_layers.1", "time_emb_proj") 67 | new_item = new_item.replace("skip_connection", "conv_shortcut") 68 | 69 | new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments) 70 | 71 | mapping.append({"old": old_item, "new": new_item}) 72 | 73 | return mapping 74 | 75 | 76 | def renew_vae_resnet_paths(old_list, n_shave_prefix_segments=0): 77 | """ 78 | Updates paths inside resnets to the new naming scheme (local renaming) 79 | """ 80 | mapping = [] 81 | for old_item in old_list: 82 | new_item = old_item 83 | 84 | new_item = new_item.replace("nin_shortcut", "conv_shortcut") 85 | new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments) 86 | 87 | mapping.append({"old": old_item, "new": new_item}) 88 | 89 | return mapping 90 | 91 | 92 | def renew_attention_paths(old_list, n_shave_prefix_segments=0): 93 | """ 94 | Updates paths inside attentions to the new naming scheme (local renaming) 95 | """ 96 | mapping = [] 97 | for old_item in old_list: 98 | new_item = old_item 99 | 100 | # new_item = new_item.replace('norm.weight', 'group_norm.weight') 101 | # new_item = new_item.replace('norm.bias', 'group_norm.bias') 102 | 103 | # new_item = new_item.replace('proj_out.weight', 'proj_attn.weight') 104 | # new_item = new_item.replace('proj_out.bias', 'proj_attn.bias') 105 | 106 | # new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments) 107 | 108 | mapping.append({"old": old_item, "new": new_item}) 109 | 110 | return mapping 111 | 112 | 113 | def renew_vae_attention_paths(old_list, n_shave_prefix_segments=0): 114 | """ 115 | Updates paths inside attentions to the new naming scheme (local renaming) 116 | """ 117 | mapping = [] 118 | for old_item in old_list: 119 | new_item = old_item 120 | 121 | new_item = new_item.replace("norm.weight", "group_norm.weight") 122 | new_item = new_item.replace("norm.bias", "group_norm.bias") 123 | 124 | new_item = new_item.replace("q.weight", "query.weight") 125 | new_item = new_item.replace("q.bias", "query.bias") 126 | 127 | new_item = new_item.replace("k.weight", "key.weight") 128 | new_item = new_item.replace("k.bias", "key.bias") 129 | 130 | new_item = new_item.replace("v.weight", "value.weight") 131 | new_item = new_item.replace("v.bias", "value.bias") 132 | 133 | new_item = new_item.replace("proj_out.weight", "proj_attn.weight") 134 | new_item = new_item.replace("proj_out.bias", "proj_attn.bias") 135 | 136 | new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments) 137 | 138 | mapping.append({"old": old_item, "new": new_item}) 139 | 140 | return mapping 141 | 142 | 143 | def assign_to_checkpoint( 144 | paths, checkpoint, old_checkpoint, attention_paths_to_split=None, additional_replacements=None, config=None 145 | ): 146 | """ 147 | This does the final conversion step: take locally converted 
weights and apply a global renaming 148 | to them. It splits attention layers, and takes into account additional replacements 149 | that may arise. 150 | 151 | Assigns the weights to the new checkpoint. 152 | """ 153 | assert isinstance(paths, list), "Paths should be a list of dicts containing 'old' and 'new' keys." 154 | 155 | # Splits the attention layers into three variables. 156 | if attention_paths_to_split is not None: 157 | for path, path_map in attention_paths_to_split.items(): 158 | old_tensor = old_checkpoint[path] 159 | channels = old_tensor.shape[0] // 3 160 | 161 | target_shape = (-1, channels) if len(old_tensor.shape) == 3 else (-1) 162 | 163 | num_heads = old_tensor.shape[0] // config["num_head_channels"] // 3 164 | 165 | old_tensor = old_tensor.reshape((num_heads, 3 * channels // num_heads) + old_tensor.shape[1:]) 166 | query, key, value = old_tensor.split(channels // num_heads, dim=1) 167 | 168 | checkpoint[path_map["query"]] = query.reshape(target_shape) 169 | checkpoint[path_map["key"]] = key.reshape(target_shape) 170 | checkpoint[path_map["value"]] = value.reshape(target_shape) 171 | 172 | for path in paths: 173 | new_path = path["new"] 174 | 175 | # These have already been assigned 176 | if attention_paths_to_split is not None and new_path in attention_paths_to_split: 177 | continue 178 | 179 | # Global renaming happens here 180 | new_path = new_path.replace("middle_block.0", "mid_block.resnets.0") 181 | new_path = new_path.replace("middle_block.1", "mid_block.attentions.0") 182 | new_path = new_path.replace("middle_block.2", "mid_block.resnets.1") 183 | 184 | if additional_replacements is not None: 185 | for replacement in additional_replacements: 186 | new_path = new_path.replace(replacement["old"], replacement["new"]) 187 | 188 | # proj_attn.weight has to be converted from conv 1D to linear 189 | if "proj_attn.weight" in new_path: 190 | checkpoint[new_path] = old_checkpoint[path["old"]][:, :, 0] 191 | else: 192 | checkpoint[new_path] = old_checkpoint[path["old"]] 193 | 194 | 195 | def conv_attn_to_linear(checkpoint): 196 | keys = list(checkpoint.keys()) 197 | attn_keys = ["query.weight", "key.weight", "value.weight"] 198 | for key in keys: 199 | if ".".join(key.split(".")[-2:]) in attn_keys: 200 | if checkpoint[key].ndim > 2: 201 | checkpoint[key] = checkpoint[key][:, :, 0, 0] 202 | elif "proj_attn.weight" in key: 203 | if checkpoint[key].ndim > 2: 204 | checkpoint[key] = checkpoint[key][:, :, 0] 205 | 206 | 207 | def create_unet_diffusers_config(original_config): 208 | """ 209 | Creates a config for the diffusers based on the config of the LDM model. 
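For the SD v1 UNet (model_channels=320, channel_mult=[1, 2, 4, 4], attention_resolutions=[4, 2, 1]) this yields block_out_channels (320, 640, 1280, 1280), three CrossAttnDownBlock2D blocks followed by one DownBlock2D, and the mirrored up-block sequence.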
210 | """ 211 | unet_params = original_config.model.params.unet_config.params 212 | 213 | block_out_channels = [unet_params.model_channels * mult for mult in unet_params.channel_mult] 214 | 215 | down_block_types = [] 216 | resolution = 1 217 | for i in range(len(block_out_channels)): 218 | block_type = "CrossAttnDownBlock2D" if resolution in unet_params.attention_resolutions else "DownBlock2D" 219 | down_block_types.append(block_type) 220 | if i != len(block_out_channels) - 1: 221 | resolution *= 2 222 | 223 | up_block_types = [] 224 | for i in range(len(block_out_channels)): 225 | block_type = "CrossAttnUpBlock2D" if resolution in unet_params.attention_resolutions else "UpBlock2D" 226 | up_block_types.append(block_type) 227 | resolution //= 2 228 | 229 | config = dict( 230 | sample_size=unet_params.image_size, 231 | in_channels=unet_params.in_channels, 232 | out_channels=unet_params.out_channels, 233 | down_block_types=tuple(down_block_types), 234 | up_block_types=tuple(up_block_types), 235 | block_out_channels=tuple(block_out_channels), 236 | layers_per_block=unet_params.num_res_blocks, 237 | cross_attention_dim=unet_params.context_dim, 238 | attention_head_dim=unet_params.num_heads, 239 | ) 240 | 241 | return config 242 | 243 | 244 | def create_vae_diffusers_config(original_config): 245 | """ 246 | Creates a config for the diffusers based on the config of the LDM model. 247 | """ 248 | vae_params = original_config.model.params.first_stage_config.params.ddconfig 249 | _ = original_config.model.params.first_stage_config.params.embed_dim 250 | 251 | block_out_channels = [vae_params.ch * mult for mult in vae_params.ch_mult] 252 | down_block_types = ["DownEncoderBlock2D"] * len(block_out_channels) 253 | up_block_types = ["UpDecoderBlock2D"] * len(block_out_channels) 254 | 255 | config = dict( 256 | sample_size=vae_params.resolution, 257 | in_channels=vae_params.in_channels, 258 | out_channels=vae_params.out_ch, 259 | down_block_types=tuple(down_block_types), 260 | up_block_types=tuple(up_block_types), 261 | block_out_channels=tuple(block_out_channels), 262 | latent_channels=vae_params.z_channels, 263 | layers_per_block=vae_params.num_res_blocks, 264 | ) 265 | return config 266 | 267 | 268 | def create_diffusers_schedular(original_config): 269 | schedular = DDIMScheduler( 270 | num_train_timesteps=original_config.model.params.timesteps, 271 | beta_start=original_config.model.params.linear_start, 272 | beta_end=original_config.model.params.linear_end, 273 | beta_schedule="scaled_linear", 274 | ) 275 | return schedular 276 | 277 | 278 | def create_ldm_bert_config(original_config): 279 | bert_params = original_config.model.parms.cond_stage_config.params 280 | config = LDMBertConfig( 281 | d_model=bert_params.n_embed, 282 | encoder_layers=bert_params.n_layer, 283 | encoder_ffn_dim=bert_params.n_embed * 4, 284 | ) 285 | return config 286 | 287 | 288 | def convert_ldm_unet_checkpoint(checkpoint, config): 289 | """ 290 | Takes a state dict and a config, and returns a converted checkpoint. 291 | """ 292 | 293 | # extract state_dict for UNet 294 | unet_state_dict = {} 295 | unet_key = "model.diffusion_model." 
296 | keys = list(checkpoint.keys()) 297 | for key in keys: 298 | if key.startswith(unet_key): 299 | unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop(key) 300 | 301 | new_checkpoint = {} 302 | 303 | new_checkpoint["time_embedding.linear_1.weight"] = unet_state_dict["time_embed.0.weight"] 304 | new_checkpoint["time_embedding.linear_1.bias"] = unet_state_dict["time_embed.0.bias"] 305 | new_checkpoint["time_embedding.linear_2.weight"] = unet_state_dict["time_embed.2.weight"] 306 | new_checkpoint["time_embedding.linear_2.bias"] = unet_state_dict["time_embed.2.bias"] 307 | 308 | new_checkpoint["conv_in.weight"] = unet_state_dict["input_blocks.0.0.weight"] 309 | new_checkpoint["conv_in.bias"] = unet_state_dict["input_blocks.0.0.bias"] 310 | 311 | new_checkpoint["conv_norm_out.weight"] = unet_state_dict["out.0.weight"] 312 | new_checkpoint["conv_norm_out.bias"] = unet_state_dict["out.0.bias"] 313 | new_checkpoint["conv_out.weight"] = unet_state_dict["out.2.weight"] 314 | new_checkpoint["conv_out.bias"] = unet_state_dict["out.2.bias"] 315 | 316 | # Retrieves the keys for the input blocks only 317 | num_input_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "input_blocks" in layer}) 318 | input_blocks = { 319 | layer_id: [key for key in unet_state_dict if f"input_blocks.{layer_id}" in key] 320 | for layer_id in range(num_input_blocks) 321 | } 322 | 323 | # Retrieves the keys for the middle blocks only 324 | num_middle_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "middle_block" in layer}) 325 | middle_blocks = { 326 | layer_id: [key for key in unet_state_dict if f"middle_block.{layer_id}" in key] 327 | for layer_id in range(num_middle_blocks) 328 | } 329 | 330 | # Retrieves the keys for the output blocks only 331 | num_output_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "output_blocks" in layer}) 332 | output_blocks = { 333 | layer_id: [key for key in unet_state_dict if f"output_blocks.{layer_id}" in key] 334 | for layer_id in range(num_output_blocks) 335 | } 336 | 337 | for i in range(1, num_input_blocks): 338 | block_id = (i - 1) // (config["layers_per_block"] + 1) 339 | layer_in_block_id = (i - 1) % (config["layers_per_block"] + 1) 340 | 341 | resnets = [ 342 | key for key in input_blocks[i] if f"input_blocks.{i}.0" in key and f"input_blocks.{i}.0.op" not in key 343 | ] 344 | attentions = [key for key in input_blocks[i] if f"input_blocks.{i}.1" in key] 345 | 346 | if f"input_blocks.{i}.0.op.weight" in unet_state_dict: 347 | new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.weight"] = unet_state_dict.pop( 348 | f"input_blocks.{i}.0.op.weight" 349 | ) 350 | new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.bias"] = unet_state_dict.pop( 351 | f"input_blocks.{i}.0.op.bias" 352 | ) 353 | 354 | paths = renew_resnet_paths(resnets) 355 | meta_path = {"old": f"input_blocks.{i}.0", "new": f"down_blocks.{block_id}.resnets.{layer_in_block_id}"} 356 | assign_to_checkpoint( 357 | paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config 358 | ) 359 | 360 | if len(attentions): 361 | paths = renew_attention_paths(attentions) 362 | meta_path = {"old": f"input_blocks.{i}.1", "new": f"down_blocks.{block_id}.attentions.{layer_in_block_id}"} 363 | assign_to_checkpoint( 364 | paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config 365 | ) 366 | 367 | resnet_0 = middle_blocks[0] 368 | attentions = middle_blocks[1] 369 | resnet_1 = 
middle_blocks[2]
370 | 
371 | resnet_0_paths = renew_resnet_paths(resnet_0)
372 | assign_to_checkpoint(resnet_0_paths, new_checkpoint, unet_state_dict, config=config)
373 | 
374 | resnet_1_paths = renew_resnet_paths(resnet_1)
375 | assign_to_checkpoint(resnet_1_paths, new_checkpoint, unet_state_dict, config=config)
376 | 
377 | attentions_paths = renew_attention_paths(attentions)
378 | meta_path = {"old": "middle_block.1", "new": "mid_block.attentions.0"}
379 | assign_to_checkpoint(
380 | attentions_paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
381 | )
382 | 
383 | for i in range(num_output_blocks):
384 | block_id = i // (config["layers_per_block"] + 1)
385 | layer_in_block_id = i % (config["layers_per_block"] + 1)
386 | output_block_layers = [shave_segments(name, 2) for name in output_blocks[i]]
387 | output_block_list = {}
388 | 
389 | for layer in output_block_layers:
390 | layer_id, layer_name = layer.split(".")[0], shave_segments(layer, 1)
391 | if layer_id in output_block_list:
392 | output_block_list[layer_id].append(layer_name)
393 | else:
394 | output_block_list[layer_id] = [layer_name]
395 | 
396 | if len(output_block_list) > 1:
397 | resnets = [key for key in output_blocks[i] if f"output_blocks.{i}.0" in key]
398 | attentions = [key for key in output_blocks[i] if f"output_blocks.{i}.1" in key]
399 | 
400 | # map the resnet keys of this output block to their diffusers names
401 | paths = renew_resnet_paths(resnets)
402 | 
403 | meta_path = {"old": f"output_blocks.{i}.0", "new": f"up_blocks.{block_id}.resnets.{layer_in_block_id}"}
404 | assign_to_checkpoint(
405 | paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
406 | )
407 | 
408 | if ["conv.weight", "conv.bias"] in output_block_list.values():  # this output block also carries an upsampler
409 | index = list(output_block_list.values()).index(["conv.weight", "conv.bias"])
410 | new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.weight"] = unet_state_dict[
411 | f"output_blocks.{i}.{index}.conv.weight"
412 | ]
413 | new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.bias"] = unet_state_dict[
414 | f"output_blocks.{i}.{index}.conv.bias"
415 | ]
416 | 
417 | # Clear attentions as they have been attributed above.
418 | if len(attentions) == 2:
419 | attentions = []
420 | 
421 | if len(attentions):
422 | paths = renew_attention_paths(attentions)
423 | meta_path = {
424 | "old": f"output_blocks.{i}.1",
425 | "new": f"up_blocks.{block_id}.attentions.{layer_in_block_id}",
426 | }
427 | assign_to_checkpoint(
428 | paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
429 | )
430 | else:
431 | resnet_0_paths = renew_resnet_paths(output_block_layers, n_shave_prefix_segments=1)
432 | for path in resnet_0_paths:
433 | old_path = ".".join(["output_blocks", str(i), path["old"]])
434 | new_path = ".".join(["up_blocks", str(block_id), "resnets", str(layer_in_block_id), path["new"]])
435 | 
436 | new_checkpoint[new_path] = unet_state_dict[old_path]
437 | 
438 | return new_checkpoint
439 | 
440 | 
441 | def convert_ldm_vae_checkpoint(checkpoint, config):
442 | # extract state dict for VAE
443 | vae_state_dict = {}
444 | vae_key = "first_stage_model."
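# e.g. "first_stage_model.encoder.norm_out.weight" becomes "encoder.conv_norm_out.weight" below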
445 | keys = list(checkpoint.keys()) 446 | for key in keys: 447 | if key.startswith(vae_key): 448 | vae_state_dict[key.replace(vae_key, "")] = checkpoint.get(key) 449 | 450 | new_checkpoint = {} 451 | 452 | new_checkpoint["encoder.conv_in.weight"] = vae_state_dict["encoder.conv_in.weight"] 453 | new_checkpoint["encoder.conv_in.bias"] = vae_state_dict["encoder.conv_in.bias"] 454 | new_checkpoint["encoder.conv_out.weight"] = vae_state_dict["encoder.conv_out.weight"] 455 | new_checkpoint["encoder.conv_out.bias"] = vae_state_dict["encoder.conv_out.bias"] 456 | new_checkpoint["encoder.conv_norm_out.weight"] = vae_state_dict["encoder.norm_out.weight"] 457 | new_checkpoint["encoder.conv_norm_out.bias"] = vae_state_dict["encoder.norm_out.bias"] 458 | 459 | new_checkpoint["decoder.conv_in.weight"] = vae_state_dict["decoder.conv_in.weight"] 460 | new_checkpoint["decoder.conv_in.bias"] = vae_state_dict["decoder.conv_in.bias"] 461 | new_checkpoint["decoder.conv_out.weight"] = vae_state_dict["decoder.conv_out.weight"] 462 | new_checkpoint["decoder.conv_out.bias"] = vae_state_dict["decoder.conv_out.bias"] 463 | new_checkpoint["decoder.conv_norm_out.weight"] = vae_state_dict["decoder.norm_out.weight"] 464 | new_checkpoint["decoder.conv_norm_out.bias"] = vae_state_dict["decoder.norm_out.bias"] 465 | 466 | new_checkpoint["quant_conv.weight"] = vae_state_dict["quant_conv.weight"] 467 | new_checkpoint["quant_conv.bias"] = vae_state_dict["quant_conv.bias"] 468 | new_checkpoint["post_quant_conv.weight"] = vae_state_dict["post_quant_conv.weight"] 469 | new_checkpoint["post_quant_conv.bias"] = vae_state_dict["post_quant_conv.bias"] 470 | 471 | # Retrieves the keys for the encoder down blocks only 472 | num_down_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "encoder.down" in layer}) 473 | down_blocks = { 474 | layer_id: [key for key in vae_state_dict if f"down.{layer_id}" in key] for layer_id in range(num_down_blocks) 475 | } 476 | 477 | # Retrieves the keys for the decoder up blocks only 478 | num_up_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "decoder.up" in layer}) 479 | up_blocks = { 480 | layer_id: [key for key in vae_state_dict if f"up.{layer_id}" in key] for layer_id in range(num_up_blocks) 481 | } 482 | 483 | for i in range(num_down_blocks): 484 | resnets = [key for key in down_blocks[i] if f"down.{i}" in key and f"down.{i}.downsample" not in key] 485 | 486 | if f"encoder.down.{i}.downsample.conv.weight" in vae_state_dict: 487 | new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.weight"] = vae_state_dict.pop( 488 | f"encoder.down.{i}.downsample.conv.weight" 489 | ) 490 | new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.bias"] = vae_state_dict.pop( 491 | f"encoder.down.{i}.downsample.conv.bias" 492 | ) 493 | 494 | paths = renew_vae_resnet_paths(resnets) 495 | meta_path = {"old": f"down.{i}.block", "new": f"down_blocks.{i}.resnets"} 496 | assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) 497 | 498 | mid_resnets = [key for key in vae_state_dict if "encoder.mid.block" in key] 499 | num_mid_res_blocks = 2 500 | for i in range(1, num_mid_res_blocks + 1): 501 | resnets = [key for key in mid_resnets if f"encoder.mid.block_{i}" in key] 502 | 503 | paths = renew_vae_resnet_paths(resnets) 504 | meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"} 505 | assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], 
config=config)
506 | 
507 | mid_attentions = [key for key in vae_state_dict if "encoder.mid.attn" in key]
508 | paths = renew_vae_attention_paths(mid_attentions)
509 | meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"}
510 | assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
511 | conv_attn_to_linear(new_checkpoint)
512 | 
513 | for i in range(num_up_blocks):
514 | block_id = num_up_blocks - 1 - i
515 | resnets = [
516 | key for key in up_blocks[block_id] if f"up.{block_id}" in key and f"up.{block_id}.upsample" not in key
517 | ]
518 | 
519 | if f"decoder.up.{block_id}.upsample.conv.weight" in vae_state_dict:
520 | new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.weight"] = vae_state_dict[
521 | f"decoder.up.{block_id}.upsample.conv.weight"
522 | ]
523 | new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.bias"] = vae_state_dict[
524 | f"decoder.up.{block_id}.upsample.conv.bias"
525 | ]
526 | 
527 | paths = renew_vae_resnet_paths(resnets)
528 | meta_path = {"old": f"up.{block_id}.block", "new": f"up_blocks.{i}.resnets"}
529 | assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
530 | 
531 | mid_resnets = [key for key in vae_state_dict if "decoder.mid.block" in key]
532 | num_mid_res_blocks = 2
533 | for i in range(1, num_mid_res_blocks + 1):
534 | resnets = [key for key in mid_resnets if f"decoder.mid.block_{i}" in key]
535 | 
536 | paths = renew_vae_resnet_paths(resnets)
537 | meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"}
538 | assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
539 | 
540 | mid_attentions = [key for key in vae_state_dict if "decoder.mid.attn" in key]
541 | paths = renew_vae_attention_paths(mid_attentions)
542 | meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"}
543 | assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
544 | conv_attn_to_linear(new_checkpoint)
545 | return new_checkpoint
546 | 
547 | 
548 | def convert_ldm_bert_checkpoint(checkpoint, config):
549 | def _copy_attn_layer(hf_attn_layer, pt_attn_layer):
550 | hf_attn_layer.q_proj.weight.data = pt_attn_layer.to_q.weight
551 | hf_attn_layer.k_proj.weight.data = pt_attn_layer.to_k.weight
552 | hf_attn_layer.v_proj.weight.data = pt_attn_layer.to_v.weight
553 | 
554 | hf_attn_layer.out_proj.weight = pt_attn_layer.to_out.weight
555 | hf_attn_layer.out_proj.bias = pt_attn_layer.to_out.bias
556 | 
557 | def _copy_linear(hf_linear, pt_linear):
558 | hf_linear.weight = pt_linear.weight
559 | hf_linear.bias = pt_linear.bias
560 | 
561 | def _copy_layer(hf_layer, pt_layer):
562 | # copy layer norms
563 | _copy_linear(hf_layer.self_attn_layer_norm, pt_layer[0][0])
564 | _copy_linear(hf_layer.final_layer_norm, pt_layer[1][0])
565 | 
566 | # copy attn
567 | _copy_attn_layer(hf_layer.self_attn, pt_layer[0][1])
568 | 
569 | # copy MLP
570 | pt_mlp = pt_layer[1][1]
571 | _copy_linear(hf_layer.fc1, pt_mlp.net[0][0])
572 | _copy_linear(hf_layer.fc2, pt_mlp.net[2])
573 | 
574 | def _copy_layers(hf_layers, pt_layers):
575 | for i, hf_layer in enumerate(hf_layers):
576 | # each HF layer corresponds to two consecutive pt sub-layers (attention block, then MLP block)
577 | i *= 2
578 | pt_layer = pt_layers[i : i + 2]
579 | _copy_layer(hf_layer, pt_layer)
580 | 
581 | hf_model = LDMBertModel(config).eval()
582 | 
583 | # copy embeds
584 | hf_model.model.embed_tokens.weight = checkpoint.transformer.token_emb.weight
585 | 
hf_model.model.embed_positions.weight.data = checkpoint.transformer.pos_emb.emb.weight 586 | 587 | # copy layer norm 588 | _copy_linear(hf_model.model.layer_norm, checkpoint.transformer.norm) 589 | 590 | # copy hidden layers 591 | _copy_layers(hf_model.model.layers, checkpoint.transformer.attn_layers.layers) 592 | 593 | _copy_linear(hf_model.to_logits, checkpoint.transformer.to_logits) 594 | 595 | return hf_model 596 | 597 | 598 | def convert_ldm_clip_checkpoint(checkpoint): 599 | text_model = CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14") 600 | 601 | keys = list(checkpoint.keys()) 602 | 603 | text_model_dict = {} 604 | 605 | for key in keys: 606 | if key.startswith("cond_stage_model.transformer"): 607 | text_model_dict[key[len("cond_stage_model.transformer.") :]] = checkpoint[key] 608 | 609 | text_model.load_state_dict(text_model_dict) 610 | 611 | return text_model 612 | 613 | 614 | if __name__ == "__main__": 615 | parser = argparse.ArgumentParser() 616 | 617 | parser.add_argument( 618 | "--checkpoint_path", default=None, type=str, required=True, help="Path to the checkpoint to convert." 619 | ) 620 | # !wget https://raw.githubusercontent.com/CompVis/stable-diffusion/main/configs/stable-diffusion/v1-inference.yaml 621 | parser.add_argument( 622 | "--original_config_file", 623 | default=None, 624 | type=str, 625 | help="The YAML config file corresponding to the original architecture.", 626 | ) 627 | parser.add_argument( 628 | "--scheduler_type", 629 | default="pndm", 630 | type=str, 631 | help="Type of scheduler to use. Should be one of ['pndm', 'lms', 'ddim']", 632 | ) 633 | parser.add_argument("--dump_path", default=None, type=str, required=True, help="Path to the output model.") 634 | 635 | args = parser.parse_args() 636 | 637 | if args.original_config_file is None: 638 | os.system( 639 | "wget https://raw.githubusercontent.com/CompVis/stable-diffusion/main/configs/stable-diffusion/v1-inference.yaml" 640 | ) 641 | args.original_config_file = "./v1-inference.yaml" 642 | 643 | original_config = OmegaConf.load(args.original_config_file) 644 | checkpoint = torch.load(args.checkpoint_path)["state_dict"] 645 | print("checkpoint loaded") 646 | 647 | num_train_timesteps = original_config.model.params.timesteps 648 | beta_start = original_config.model.params.linear_start 649 | beta_end = original_config.model.params.linear_end 650 | if args.scheduler_type == "pndm": 651 | scheduler = PNDMScheduler( 652 | beta_end=beta_end, 653 | beta_schedule="scaled_linear", 654 | beta_start=beta_start, 655 | num_train_timesteps=num_train_timesteps, 656 | skip_prk_steps=True, 657 | ) 658 | elif args.scheduler_type == "lms": 659 | scheduler = LMSDiscreteScheduler(beta_start=beta_start, beta_end=beta_end, beta_schedule="scaled_linear") 660 | elif args.scheduler_type == "ddim": 661 | scheduler = DDIMScheduler( 662 | beta_start=beta_start, 663 | beta_end=beta_end, 664 | beta_schedule="scaled_linear", 665 | clip_sample=False, 666 | set_alpha_to_one=False, 667 | ) 668 | else: 669 | raise ValueError(f"Scheduler of type {args.scheduler_type} doesn't exist!") 670 | 671 | # Convert the UNet2DConditionModel model. 
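# (build the diffusers UNet config from the YAML, then remap the "model.diffusion_model.*" weights onto it)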
672 | unet_config = create_unet_diffusers_config(original_config)
673 | converted_unet_checkpoint = convert_ldm_unet_checkpoint(checkpoint, unet_config)
674 | print("unet converted")
675 | 
676 | unet = UNet2DConditionModel(**unet_config)
677 | unet.load_state_dict(converted_unet_checkpoint)
678 | del converted_unet_checkpoint
679 | 
680 | print("unet state dict loaded")
681 | 
682 | # Convert the VAE model.
683 | vae_config = create_vae_diffusers_config(original_config)
684 | converted_vae_checkpoint = convert_ldm_vae_checkpoint(checkpoint, vae_config)
685 | 
686 | print("vae converted")
687 | 
688 | vae = AutoencoderKL(**vae_config)
689 | vae.load_state_dict(converted_vae_checkpoint)
690 | del converted_vae_checkpoint
691 | 
692 | print("vae loaded")
693 | 
694 | # Convert the text model.
695 | text_model_type = original_config.model.params.cond_stage_config.target.split(".")[-1]
696 | if text_model_type == "FrozenCLIPEmbedder":
697 | print("converting clip")
698 | text_model = convert_ldm_clip_checkpoint(checkpoint)
699 | del checkpoint
700 | print("converting clip done")
701 | tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
702 | print("safety checker")
703 | safety_checker = StableDiffusionSafetyChecker.from_pretrained("CompVis/stable-diffusion-safety-checker")
704 | print("feature extractor")
705 | feature_extractor = AutoFeatureExtractor.from_pretrained("CompVis/stable-diffusion-safety-checker")
706 | print("pipe")
707 | pipe = StableDiffusionPipeline(
708 | vae=vae,
709 | text_encoder=text_model,
710 | tokenizer=tokenizer,
711 | unet=unet,
712 | scheduler=scheduler,
713 | safety_checker=safety_checker,
714 | feature_extractor=feature_extractor,
715 | )
716 | else:
717 | text_config = create_ldm_bert_config(original_config)
718 | text_model = convert_ldm_bert_checkpoint(checkpoint, text_config)
719 | tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
720 | pipe = LDMTextToImagePipeline(vqvae=vae, bert=text_model, tokenizer=tokenizer, unet=unet, scheduler=scheduler)
721 | 
722 | print("saving")
723 | pipe.save_pretrained(args.dump_path)
724 | 
--------------------------------------------------------------------------------
/convert_diffusers_to_sd.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | 
3 | # NOTE: use https://gist.github.com/jachiam/8a5c0b607e38fcc585168b90c686eb05
4 | # it's nicer
5 | 
6 | # by ratwithashotgun, freely usable
7 | # converts Hugging Face diffusers StableDiffusionPipeline models to original Stable Diffusion ckpt files.
8 | # this requires you to have the original model that the diffusers model was created from; for most DreamBooth cases that's currently SD 1.4.
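# rough idea (assuming the usual template approach): load the original ckpt, then for every
# sd_key -> hf_key pair in KeyMap below, overwrite ckpt["state_dict"][sd_key] with the
# corresponding tensor from the diffusers pipeline, and save the result as a new ckpt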
9 | 10 | # requirements: pip install torch diffusers transformers pytorch-lightning 11 | # usage: ./convert_diffusers_to_sd.py path_to_your_huggingface_model path_to_sd-v1-4.ckpt where_to_save.ckpt 12 | 13 | import argparse 14 | import os 15 | 16 | import torch 17 | from diffusers import StableDiffusionPipeline, DiffusionPipeline 18 | 19 | # yes this is horrid 20 | KeyMap = { 21 | "model.diffusion_model.time_embed.0.weight": "time_embedding.linear_1.weight", 22 | "model.diffusion_model.time_embed.0.bias": "time_embedding.linear_1.bias", 23 | "model.diffusion_model.time_embed.2.weight": "time_embedding.linear_2.weight", 24 | "model.diffusion_model.time_embed.2.bias": "time_embedding.linear_2.bias", 25 | "model.diffusion_model.input_blocks.0.0.weight": "conv_in.weight", 26 | "model.diffusion_model.input_blocks.0.0.bias": "conv_in.bias", 27 | "model.diffusion_model.out.0.weight": "conv_norm_out.weight", 28 | "model.diffusion_model.out.0.bias": "conv_norm_out.bias", 29 | "model.diffusion_model.out.2.weight": "conv_out.weight", 30 | "model.diffusion_model.out.2.bias": "conv_out.bias", 31 | "model.diffusion_model.input_blocks.1.0.in_layers.0.weight": "down_blocks.0.resnets.0.norm1.weight", 32 | "model.diffusion_model.input_blocks.1.0.in_layers.0.bias": "down_blocks.0.resnets.0.norm1.bias", 33 | "model.diffusion_model.input_blocks.1.0.in_layers.2.weight": "down_blocks.0.resnets.0.conv1.weight", 34 | "model.diffusion_model.input_blocks.1.0.in_layers.2.bias": "down_blocks.0.resnets.0.conv1.bias", 35 | "model.diffusion_model.input_blocks.1.0.emb_layers.1.weight": "down_blocks.0.resnets.0.time_emb_proj.weight", 36 | "model.diffusion_model.input_blocks.1.0.emb_layers.1.bias": "down_blocks.0.resnets.0.time_emb_proj.bias", 37 | "model.diffusion_model.input_blocks.1.0.out_layers.0.weight": "down_blocks.0.resnets.0.norm2.weight", 38 | "model.diffusion_model.input_blocks.1.0.out_layers.0.bias": "down_blocks.0.resnets.0.norm2.bias", 39 | "model.diffusion_model.input_blocks.1.0.out_layers.3.weight": "down_blocks.0.resnets.0.conv2.weight", 40 | "model.diffusion_model.input_blocks.1.0.out_layers.3.bias": "down_blocks.0.resnets.0.conv2.bias", 41 | "model.diffusion_model.input_blocks.1.1.norm.weight": "down_blocks.0.attentions.0.norm.weight", 42 | "model.diffusion_model.input_blocks.1.1.norm.bias": "down_blocks.0.attentions.0.norm.bias", 43 | "model.diffusion_model.input_blocks.1.1.proj_in.weight": "down_blocks.0.attentions.0.proj_in.weight", 44 | "model.diffusion_model.input_blocks.1.1.proj_in.bias": "down_blocks.0.attentions.0.proj_in.bias", 45 | "model.diffusion_model.input_blocks.1.1.transformer_blocks.0.attn1.to_q.weight": "down_blocks.0.attentions.0.transformer_blocks.0.attn1.to_q.weight", 46 | "model.diffusion_model.input_blocks.1.1.transformer_blocks.0.attn1.to_k.weight": "down_blocks.0.attentions.0.transformer_blocks.0.attn1.to_k.weight", 47 | "model.diffusion_model.input_blocks.1.1.transformer_blocks.0.attn1.to_v.weight": "down_blocks.0.attentions.0.transformer_blocks.0.attn1.to_v.weight", 48 | "model.diffusion_model.input_blocks.1.1.transformer_blocks.0.attn1.to_out.0.weight": "down_blocks.0.attentions.0.transformer_blocks.0.attn1.to_out.0.weight", 49 | "model.diffusion_model.input_blocks.1.1.transformer_blocks.0.attn1.to_out.0.bias": "down_blocks.0.attentions.0.transformer_blocks.0.attn1.to_out.0.bias", 50 | "model.diffusion_model.input_blocks.1.1.transformer_blocks.0.ff.net.0.proj.weight": "down_blocks.0.attentions.0.transformer_blocks.0.ff.net.0.proj.weight", 51 | 
"model.diffusion_model.input_blocks.1.1.transformer_blocks.0.ff.net.0.proj.bias": "down_blocks.0.attentions.0.transformer_blocks.0.ff.net.0.proj.bias", 52 | "model.diffusion_model.input_blocks.1.1.transformer_blocks.0.ff.net.2.weight": "down_blocks.0.attentions.0.transformer_blocks.0.ff.net.2.weight", 53 | "model.diffusion_model.input_blocks.1.1.transformer_blocks.0.ff.net.2.bias": "down_blocks.0.attentions.0.transformer_blocks.0.ff.net.2.bias", 54 | "model.diffusion_model.input_blocks.1.1.transformer_blocks.0.attn2.to_q.weight": "down_blocks.0.attentions.0.transformer_blocks.0.attn2.to_q.weight", 55 | "model.diffusion_model.input_blocks.1.1.transformer_blocks.0.attn2.to_k.weight": "down_blocks.0.attentions.0.transformer_blocks.0.attn2.to_k.weight", 56 | "model.diffusion_model.input_blocks.1.1.transformer_blocks.0.attn2.to_v.weight": "down_blocks.0.attentions.0.transformer_blocks.0.attn2.to_v.weight", 57 | "model.diffusion_model.input_blocks.1.1.transformer_blocks.0.attn2.to_out.0.weight": "down_blocks.0.attentions.0.transformer_blocks.0.attn2.to_out.0.weight", 58 | "model.diffusion_model.input_blocks.1.1.transformer_blocks.0.attn2.to_out.0.bias": "down_blocks.0.attentions.0.transformer_blocks.0.attn2.to_out.0.bias", 59 | "model.diffusion_model.input_blocks.1.1.transformer_blocks.0.norm1.weight": "down_blocks.0.attentions.0.transformer_blocks.0.norm1.weight", 60 | "model.diffusion_model.input_blocks.1.1.transformer_blocks.0.norm1.bias": "down_blocks.0.attentions.0.transformer_blocks.0.norm1.bias", 61 | "model.diffusion_model.input_blocks.1.1.transformer_blocks.0.norm2.weight": "down_blocks.0.attentions.0.transformer_blocks.0.norm2.weight", 62 | "model.diffusion_model.input_blocks.1.1.transformer_blocks.0.norm2.bias": "down_blocks.0.attentions.0.transformer_blocks.0.norm2.bias", 63 | "model.diffusion_model.input_blocks.1.1.transformer_blocks.0.norm3.weight": "down_blocks.0.attentions.0.transformer_blocks.0.norm3.weight", 64 | "model.diffusion_model.input_blocks.1.1.transformer_blocks.0.norm3.bias": "down_blocks.0.attentions.0.transformer_blocks.0.norm3.bias", 65 | "model.diffusion_model.input_blocks.1.1.proj_out.weight": "down_blocks.0.attentions.0.proj_out.weight", 66 | "model.diffusion_model.input_blocks.1.1.proj_out.bias": "down_blocks.0.attentions.0.proj_out.bias", 67 | "model.diffusion_model.input_blocks.2.0.in_layers.0.weight": "down_blocks.0.resnets.1.norm1.weight", 68 | "model.diffusion_model.input_blocks.2.0.in_layers.0.bias": "down_blocks.0.resnets.1.norm1.bias", 69 | "model.diffusion_model.input_blocks.2.0.in_layers.2.weight": "down_blocks.0.resnets.1.conv1.weight", 70 | "model.diffusion_model.input_blocks.2.0.in_layers.2.bias": "down_blocks.0.resnets.1.conv1.bias", 71 | "model.diffusion_model.input_blocks.2.0.emb_layers.1.weight": "down_blocks.0.resnets.1.time_emb_proj.weight", 72 | "model.diffusion_model.input_blocks.2.0.emb_layers.1.bias": "down_blocks.0.resnets.1.time_emb_proj.bias", 73 | "model.diffusion_model.input_blocks.2.0.out_layers.0.weight": "down_blocks.0.resnets.1.norm2.weight", 74 | "model.diffusion_model.input_blocks.2.0.out_layers.0.bias": "down_blocks.0.resnets.1.norm2.bias", 75 | "model.diffusion_model.input_blocks.2.0.out_layers.3.weight": "down_blocks.0.resnets.1.conv2.weight", 76 | "model.diffusion_model.input_blocks.2.0.out_layers.3.bias": "down_blocks.0.resnets.1.conv2.bias", 77 | "model.diffusion_model.input_blocks.2.1.norm.weight": "down_blocks.0.attentions.1.norm.weight", 78 | "model.diffusion_model.input_blocks.2.1.norm.bias": 
"down_blocks.0.attentions.1.norm.bias", 79 | "model.diffusion_model.input_blocks.2.1.proj_in.weight": "down_blocks.0.attentions.1.proj_in.weight", 80 | "model.diffusion_model.input_blocks.2.1.proj_in.bias": "down_blocks.0.attentions.1.proj_in.bias", 81 | "model.diffusion_model.input_blocks.2.1.transformer_blocks.0.attn1.to_q.weight": "down_blocks.0.attentions.1.transformer_blocks.0.attn1.to_q.weight", 82 | "model.diffusion_model.input_blocks.2.1.transformer_blocks.0.attn1.to_k.weight": "down_blocks.0.attentions.1.transformer_blocks.0.attn1.to_k.weight", 83 | "model.diffusion_model.input_blocks.2.1.transformer_blocks.0.attn1.to_v.weight": "down_blocks.0.attentions.1.transformer_blocks.0.attn1.to_v.weight", 84 | "model.diffusion_model.input_blocks.2.1.transformer_blocks.0.attn1.to_out.0.weight": "down_blocks.0.attentions.1.transformer_blocks.0.attn1.to_out.0.weight", 85 | "model.diffusion_model.input_blocks.2.1.transformer_blocks.0.attn1.to_out.0.bias": "down_blocks.0.attentions.1.transformer_blocks.0.attn1.to_out.0.bias", 86 | "model.diffusion_model.input_blocks.2.1.transformer_blocks.0.ff.net.0.proj.weight": "down_blocks.0.attentions.1.transformer_blocks.0.ff.net.0.proj.weight", 87 | "model.diffusion_model.input_blocks.2.1.transformer_blocks.0.ff.net.0.proj.bias": "down_blocks.0.attentions.1.transformer_blocks.0.ff.net.0.proj.bias", 88 | "model.diffusion_model.input_blocks.2.1.transformer_blocks.0.ff.net.2.weight": "down_blocks.0.attentions.1.transformer_blocks.0.ff.net.2.weight", 89 | "model.diffusion_model.input_blocks.2.1.transformer_blocks.0.ff.net.2.bias": "down_blocks.0.attentions.1.transformer_blocks.0.ff.net.2.bias", 90 | "model.diffusion_model.input_blocks.2.1.transformer_blocks.0.attn2.to_q.weight": "down_blocks.0.attentions.1.transformer_blocks.0.attn2.to_q.weight", 91 | "model.diffusion_model.input_blocks.2.1.transformer_blocks.0.attn2.to_k.weight": "down_blocks.0.attentions.1.transformer_blocks.0.attn2.to_k.weight", 92 | "model.diffusion_model.input_blocks.2.1.transformer_blocks.0.attn2.to_v.weight": "down_blocks.0.attentions.1.transformer_blocks.0.attn2.to_v.weight", 93 | "model.diffusion_model.input_blocks.2.1.transformer_blocks.0.attn2.to_out.0.weight": "down_blocks.0.attentions.1.transformer_blocks.0.attn2.to_out.0.weight", 94 | "model.diffusion_model.input_blocks.2.1.transformer_blocks.0.attn2.to_out.0.bias": "down_blocks.0.attentions.1.transformer_blocks.0.attn2.to_out.0.bias", 95 | "model.diffusion_model.input_blocks.2.1.transformer_blocks.0.norm1.weight": "down_blocks.0.attentions.1.transformer_blocks.0.norm1.weight", 96 | "model.diffusion_model.input_blocks.2.1.transformer_blocks.0.norm1.bias": "down_blocks.0.attentions.1.transformer_blocks.0.norm1.bias", 97 | "model.diffusion_model.input_blocks.2.1.transformer_blocks.0.norm2.weight": "down_blocks.0.attentions.1.transformer_blocks.0.norm2.weight", 98 | "model.diffusion_model.input_blocks.2.1.transformer_blocks.0.norm2.bias": "down_blocks.0.attentions.1.transformer_blocks.0.norm2.bias", 99 | "model.diffusion_model.input_blocks.2.1.transformer_blocks.0.norm3.weight": "down_blocks.0.attentions.1.transformer_blocks.0.norm3.weight", 100 | "model.diffusion_model.input_blocks.2.1.transformer_blocks.0.norm3.bias": "down_blocks.0.attentions.1.transformer_blocks.0.norm3.bias", 101 | "model.diffusion_model.input_blocks.2.1.proj_out.weight": "down_blocks.0.attentions.1.proj_out.weight", 102 | "model.diffusion_model.input_blocks.2.1.proj_out.bias": "down_blocks.0.attentions.1.proj_out.bias", 103 | 
"model.diffusion_model.input_blocks.3.0.op.weight": "down_blocks.0.downsamplers.0.conv.weight", 104 | "model.diffusion_model.input_blocks.3.0.op.bias": "down_blocks.0.downsamplers.0.conv.bias", 105 | "model.diffusion_model.input_blocks.4.0.in_layers.0.weight": "down_blocks.1.resnets.0.norm1.weight", 106 | "model.diffusion_model.input_blocks.4.0.in_layers.0.bias": "down_blocks.1.resnets.0.norm1.bias", 107 | "model.diffusion_model.input_blocks.4.0.in_layers.2.weight": "down_blocks.1.resnets.0.conv1.weight", 108 | "model.diffusion_model.input_blocks.4.0.in_layers.2.bias": "down_blocks.1.resnets.0.conv1.bias", 109 | "model.diffusion_model.input_blocks.4.0.emb_layers.1.weight": "down_blocks.1.resnets.0.time_emb_proj.weight", 110 | "model.diffusion_model.input_blocks.4.0.emb_layers.1.bias": "down_blocks.1.resnets.0.time_emb_proj.bias", 111 | "model.diffusion_model.input_blocks.4.0.out_layers.0.weight": "down_blocks.1.resnets.0.norm2.weight", 112 | "model.diffusion_model.input_blocks.4.0.out_layers.0.bias": "down_blocks.1.resnets.0.norm2.bias", 113 | "model.diffusion_model.input_blocks.4.0.out_layers.3.weight": "down_blocks.1.resnets.0.conv2.weight", 114 | "model.diffusion_model.input_blocks.4.0.out_layers.3.bias": "down_blocks.1.resnets.0.conv2.bias", 115 | "model.diffusion_model.input_blocks.4.0.skip_connection.weight": "down_blocks.1.resnets.0.conv_shortcut.weight", 116 | "model.diffusion_model.input_blocks.4.0.skip_connection.bias": "down_blocks.1.resnets.0.conv_shortcut.bias", 117 | "model.diffusion_model.input_blocks.4.1.norm.weight": "down_blocks.1.attentions.0.norm.weight", 118 | "model.diffusion_model.input_blocks.4.1.norm.bias": "down_blocks.1.attentions.0.norm.bias", 119 | "model.diffusion_model.input_blocks.4.1.proj_in.weight": "down_blocks.1.attentions.0.proj_in.weight", 120 | "model.diffusion_model.input_blocks.4.1.proj_in.bias": "down_blocks.1.attentions.0.proj_in.bias", 121 | "model.diffusion_model.input_blocks.4.1.transformer_blocks.0.attn1.to_q.weight": "down_blocks.1.attentions.0.transformer_blocks.0.attn1.to_q.weight", 122 | "model.diffusion_model.input_blocks.4.1.transformer_blocks.0.attn1.to_k.weight": "down_blocks.1.attentions.0.transformer_blocks.0.attn1.to_k.weight", 123 | "model.diffusion_model.input_blocks.4.1.transformer_blocks.0.attn1.to_v.weight": "down_blocks.1.attentions.0.transformer_blocks.0.attn1.to_v.weight", 124 | "model.diffusion_model.input_blocks.4.1.transformer_blocks.0.attn1.to_out.0.weight": "down_blocks.1.attentions.0.transformer_blocks.0.attn1.to_out.0.weight", 125 | "model.diffusion_model.input_blocks.4.1.transformer_blocks.0.attn1.to_out.0.bias": "down_blocks.1.attentions.0.transformer_blocks.0.attn1.to_out.0.bias", 126 | "model.diffusion_model.input_blocks.4.1.transformer_blocks.0.ff.net.0.proj.weight": "down_blocks.1.attentions.0.transformer_blocks.0.ff.net.0.proj.weight", 127 | "model.diffusion_model.input_blocks.4.1.transformer_blocks.0.ff.net.0.proj.bias": "down_blocks.1.attentions.0.transformer_blocks.0.ff.net.0.proj.bias", 128 | "model.diffusion_model.input_blocks.4.1.transformer_blocks.0.ff.net.2.weight": "down_blocks.1.attentions.0.transformer_blocks.0.ff.net.2.weight", 129 | "model.diffusion_model.input_blocks.4.1.transformer_blocks.0.ff.net.2.bias": "down_blocks.1.attentions.0.transformer_blocks.0.ff.net.2.bias", 130 | "model.diffusion_model.input_blocks.4.1.transformer_blocks.0.attn2.to_q.weight": "down_blocks.1.attentions.0.transformer_blocks.0.attn2.to_q.weight", 131 | 
"model.diffusion_model.input_blocks.4.1.transformer_blocks.0.attn2.to_k.weight": "down_blocks.1.attentions.0.transformer_blocks.0.attn2.to_k.weight", 132 | "model.diffusion_model.input_blocks.4.1.transformer_blocks.0.attn2.to_v.weight": "down_blocks.1.attentions.0.transformer_blocks.0.attn2.to_v.weight", 133 | "model.diffusion_model.input_blocks.4.1.transformer_blocks.0.attn2.to_out.0.weight": "down_blocks.1.attentions.0.transformer_blocks.0.attn2.to_out.0.weight", 134 | "model.diffusion_model.input_blocks.4.1.transformer_blocks.0.attn2.to_out.0.bias": "down_blocks.1.attentions.0.transformer_blocks.0.attn2.to_out.0.bias", 135 | "model.diffusion_model.input_blocks.4.1.transformer_blocks.0.norm1.weight": "down_blocks.1.attentions.0.transformer_blocks.0.norm1.weight", 136 | "model.diffusion_model.input_blocks.4.1.transformer_blocks.0.norm1.bias": "down_blocks.1.attentions.0.transformer_blocks.0.norm1.bias", 137 | "model.diffusion_model.input_blocks.4.1.transformer_blocks.0.norm2.weight": "down_blocks.1.attentions.0.transformer_blocks.0.norm2.weight", 138 | "model.diffusion_model.input_blocks.4.1.transformer_blocks.0.norm2.bias": "down_blocks.1.attentions.0.transformer_blocks.0.norm2.bias", 139 | "model.diffusion_model.input_blocks.4.1.transformer_blocks.0.norm3.weight": "down_blocks.1.attentions.0.transformer_blocks.0.norm3.weight", 140 | "model.diffusion_model.input_blocks.4.1.transformer_blocks.0.norm3.bias": "down_blocks.1.attentions.0.transformer_blocks.0.norm3.bias", 141 | "model.diffusion_model.input_blocks.4.1.proj_out.weight": "down_blocks.1.attentions.0.proj_out.weight", 142 | "model.diffusion_model.input_blocks.4.1.proj_out.bias": "down_blocks.1.attentions.0.proj_out.bias", 143 | "model.diffusion_model.input_blocks.5.0.in_layers.0.weight": "down_blocks.1.resnets.1.norm1.weight", 144 | "model.diffusion_model.input_blocks.5.0.in_layers.0.bias": "down_blocks.1.resnets.1.norm1.bias", 145 | "model.diffusion_model.input_blocks.5.0.in_layers.2.weight": "down_blocks.1.resnets.1.conv1.weight", 146 | "model.diffusion_model.input_blocks.5.0.in_layers.2.bias": "down_blocks.1.resnets.1.conv1.bias", 147 | "model.diffusion_model.input_blocks.5.0.emb_layers.1.weight": "down_blocks.1.resnets.1.time_emb_proj.weight", 148 | "model.diffusion_model.input_blocks.5.0.emb_layers.1.bias": "down_blocks.1.resnets.1.time_emb_proj.bias", 149 | "model.diffusion_model.input_blocks.5.0.out_layers.0.weight": "down_blocks.1.resnets.1.norm2.weight", 150 | "model.diffusion_model.input_blocks.5.0.out_layers.0.bias": "down_blocks.1.resnets.1.norm2.bias", 151 | "model.diffusion_model.input_blocks.5.0.out_layers.3.weight": "down_blocks.1.resnets.1.conv2.weight", 152 | "model.diffusion_model.input_blocks.5.0.out_layers.3.bias": "down_blocks.1.resnets.1.conv2.bias", 153 | "model.diffusion_model.input_blocks.5.1.norm.weight": "down_blocks.1.attentions.1.norm.weight", 154 | "model.diffusion_model.input_blocks.5.1.norm.bias": "down_blocks.1.attentions.1.norm.bias", 155 | "model.diffusion_model.input_blocks.5.1.proj_in.weight": "down_blocks.1.attentions.1.proj_in.weight", 156 | "model.diffusion_model.input_blocks.5.1.proj_in.bias": "down_blocks.1.attentions.1.proj_in.bias", 157 | "model.diffusion_model.input_blocks.5.1.transformer_blocks.0.attn1.to_q.weight": "down_blocks.1.attentions.1.transformer_blocks.0.attn1.to_q.weight", 158 | "model.diffusion_model.input_blocks.5.1.transformer_blocks.0.attn1.to_k.weight": "down_blocks.1.attentions.1.transformer_blocks.0.attn1.to_k.weight", 159 | 
"model.diffusion_model.input_blocks.5.1.transformer_blocks.0.attn1.to_v.weight": "down_blocks.1.attentions.1.transformer_blocks.0.attn1.to_v.weight", 160 | "model.diffusion_model.input_blocks.5.1.transformer_blocks.0.attn1.to_out.0.weight": "down_blocks.1.attentions.1.transformer_blocks.0.attn1.to_out.0.weight", 161 | "model.diffusion_model.input_blocks.5.1.transformer_blocks.0.attn1.to_out.0.bias": "down_blocks.1.attentions.1.transformer_blocks.0.attn1.to_out.0.bias", 162 | "model.diffusion_model.input_blocks.5.1.transformer_blocks.0.ff.net.0.proj.weight": "down_blocks.1.attentions.1.transformer_blocks.0.ff.net.0.proj.weight", 163 | "model.diffusion_model.input_blocks.5.1.transformer_blocks.0.ff.net.0.proj.bias": "down_blocks.1.attentions.1.transformer_blocks.0.ff.net.0.proj.bias", 164 | "model.diffusion_model.input_blocks.5.1.transformer_blocks.0.ff.net.2.weight": "down_blocks.1.attentions.1.transformer_blocks.0.ff.net.2.weight", 165 | "model.diffusion_model.input_blocks.5.1.transformer_blocks.0.ff.net.2.bias": "down_blocks.1.attentions.1.transformer_blocks.0.ff.net.2.bias", 166 | "model.diffusion_model.input_blocks.5.1.transformer_blocks.0.attn2.to_q.weight": "down_blocks.1.attentions.1.transformer_blocks.0.attn2.to_q.weight", 167 | "model.diffusion_model.input_blocks.5.1.transformer_blocks.0.attn2.to_k.weight": "down_blocks.1.attentions.1.transformer_blocks.0.attn2.to_k.weight", 168 | "model.diffusion_model.input_blocks.5.1.transformer_blocks.0.attn2.to_v.weight": "down_blocks.1.attentions.1.transformer_blocks.0.attn2.to_v.weight", 169 | "model.diffusion_model.input_blocks.5.1.transformer_blocks.0.attn2.to_out.0.weight": "down_blocks.1.attentions.1.transformer_blocks.0.attn2.to_out.0.weight", 170 | "model.diffusion_model.input_blocks.5.1.transformer_blocks.0.attn2.to_out.0.bias": "down_blocks.1.attentions.1.transformer_blocks.0.attn2.to_out.0.bias", 171 | "model.diffusion_model.input_blocks.5.1.transformer_blocks.0.norm1.weight": "down_blocks.1.attentions.1.transformer_blocks.0.norm1.weight", 172 | "model.diffusion_model.input_blocks.5.1.transformer_blocks.0.norm1.bias": "down_blocks.1.attentions.1.transformer_blocks.0.norm1.bias", 173 | "model.diffusion_model.input_blocks.5.1.transformer_blocks.0.norm2.weight": "down_blocks.1.attentions.1.transformer_blocks.0.norm2.weight", 174 | "model.diffusion_model.input_blocks.5.1.transformer_blocks.0.norm2.bias": "down_blocks.1.attentions.1.transformer_blocks.0.norm2.bias", 175 | "model.diffusion_model.input_blocks.5.1.transformer_blocks.0.norm3.weight": "down_blocks.1.attentions.1.transformer_blocks.0.norm3.weight", 176 | "model.diffusion_model.input_blocks.5.1.transformer_blocks.0.norm3.bias": "down_blocks.1.attentions.1.transformer_blocks.0.norm3.bias", 177 | "model.diffusion_model.input_blocks.5.1.proj_out.weight": "down_blocks.1.attentions.1.proj_out.weight", 178 | "model.diffusion_model.input_blocks.5.1.proj_out.bias": "down_blocks.1.attentions.1.proj_out.bias", 179 | "model.diffusion_model.input_blocks.6.0.op.weight": "down_blocks.1.downsamplers.0.conv.weight", 180 | "model.diffusion_model.input_blocks.6.0.op.bias": "down_blocks.1.downsamplers.0.conv.bias", 181 | "model.diffusion_model.input_blocks.7.0.in_layers.0.weight": "down_blocks.2.resnets.0.norm1.weight", 182 | "model.diffusion_model.input_blocks.7.0.in_layers.0.bias": "down_blocks.2.resnets.0.norm1.bias", 183 | "model.diffusion_model.input_blocks.7.0.in_layers.2.weight": "down_blocks.2.resnets.0.conv1.weight", 184 | "model.diffusion_model.input_blocks.7.0.in_layers.2.bias": 
"down_blocks.2.resnets.0.conv1.bias", 185 | "model.diffusion_model.input_blocks.7.0.emb_layers.1.weight": "down_blocks.2.resnets.0.time_emb_proj.weight", 186 | "model.diffusion_model.input_blocks.7.0.emb_layers.1.bias": "down_blocks.2.resnets.0.time_emb_proj.bias", 187 | "model.diffusion_model.input_blocks.7.0.out_layers.0.weight": "down_blocks.2.resnets.0.norm2.weight", 188 | "model.diffusion_model.input_blocks.7.0.out_layers.0.bias": "down_blocks.2.resnets.0.norm2.bias", 189 | "model.diffusion_model.input_blocks.7.0.out_layers.3.weight": "down_blocks.2.resnets.0.conv2.weight", 190 | "model.diffusion_model.input_blocks.7.0.out_layers.3.bias": "down_blocks.2.resnets.0.conv2.bias", 191 | "model.diffusion_model.input_blocks.7.0.skip_connection.weight": "down_blocks.2.resnets.0.conv_shortcut.weight", 192 | "model.diffusion_model.input_blocks.7.0.skip_connection.bias": "down_blocks.2.resnets.0.conv_shortcut.bias", 193 | "model.diffusion_model.input_blocks.7.1.norm.weight": "down_blocks.2.attentions.0.norm.weight", 194 | "model.diffusion_model.input_blocks.7.1.norm.bias": "down_blocks.2.attentions.0.norm.bias", 195 | "model.diffusion_model.input_blocks.7.1.proj_in.weight": "down_blocks.2.attentions.0.proj_in.weight", 196 | "model.diffusion_model.input_blocks.7.1.proj_in.bias": "down_blocks.2.attentions.0.proj_in.bias", 197 | "model.diffusion_model.input_blocks.7.1.transformer_blocks.0.attn1.to_q.weight": "down_blocks.2.attentions.0.transformer_blocks.0.attn1.to_q.weight", 198 | "model.diffusion_model.input_blocks.7.1.transformer_blocks.0.attn1.to_k.weight": "down_blocks.2.attentions.0.transformer_blocks.0.attn1.to_k.weight", 199 | "model.diffusion_model.input_blocks.7.1.transformer_blocks.0.attn1.to_v.weight": "down_blocks.2.attentions.0.transformer_blocks.0.attn1.to_v.weight", 200 | "model.diffusion_model.input_blocks.7.1.transformer_blocks.0.attn1.to_out.0.weight": "down_blocks.2.attentions.0.transformer_blocks.0.attn1.to_out.0.weight", 201 | "model.diffusion_model.input_blocks.7.1.transformer_blocks.0.attn1.to_out.0.bias": "down_blocks.2.attentions.0.transformer_blocks.0.attn1.to_out.0.bias", 202 | "model.diffusion_model.input_blocks.7.1.transformer_blocks.0.ff.net.0.proj.weight": "down_blocks.2.attentions.0.transformer_blocks.0.ff.net.0.proj.weight", 203 | "model.diffusion_model.input_blocks.7.1.transformer_blocks.0.ff.net.0.proj.bias": "down_blocks.2.attentions.0.transformer_blocks.0.ff.net.0.proj.bias", 204 | "model.diffusion_model.input_blocks.7.1.transformer_blocks.0.ff.net.2.weight": "down_blocks.2.attentions.0.transformer_blocks.0.ff.net.2.weight", 205 | "model.diffusion_model.input_blocks.7.1.transformer_blocks.0.ff.net.2.bias": "down_blocks.2.attentions.0.transformer_blocks.0.ff.net.2.bias", 206 | "model.diffusion_model.input_blocks.7.1.transformer_blocks.0.attn2.to_q.weight": "down_blocks.2.attentions.0.transformer_blocks.0.attn2.to_q.weight", 207 | "model.diffusion_model.input_blocks.7.1.transformer_blocks.0.attn2.to_k.weight": "down_blocks.2.attentions.0.transformer_blocks.0.attn2.to_k.weight", 208 | "model.diffusion_model.input_blocks.7.1.transformer_blocks.0.attn2.to_v.weight": "down_blocks.2.attentions.0.transformer_blocks.0.attn2.to_v.weight", 209 | "model.diffusion_model.input_blocks.7.1.transformer_blocks.0.attn2.to_out.0.weight": "down_blocks.2.attentions.0.transformer_blocks.0.attn2.to_out.0.weight", 210 | "model.diffusion_model.input_blocks.7.1.transformer_blocks.0.attn2.to_out.0.bias": "down_blocks.2.attentions.0.transformer_blocks.0.attn2.to_out.0.bias", 211 | 
"model.diffusion_model.input_blocks.7.1.transformer_blocks.0.norm1.weight": "down_blocks.2.attentions.0.transformer_blocks.0.norm1.weight", 212 | "model.diffusion_model.input_blocks.7.1.transformer_blocks.0.norm1.bias": "down_blocks.2.attentions.0.transformer_blocks.0.norm1.bias", 213 | "model.diffusion_model.input_blocks.7.1.transformer_blocks.0.norm2.weight": "down_blocks.2.attentions.0.transformer_blocks.0.norm2.weight", 214 | "model.diffusion_model.input_blocks.7.1.transformer_blocks.0.norm2.bias": "down_blocks.2.attentions.0.transformer_blocks.0.norm2.bias", 215 | "model.diffusion_model.input_blocks.7.1.transformer_blocks.0.norm3.weight": "down_blocks.2.attentions.0.transformer_blocks.0.norm3.weight", 216 | "model.diffusion_model.input_blocks.7.1.transformer_blocks.0.norm3.bias": "down_blocks.2.attentions.0.transformer_blocks.0.norm3.bias", 217 | "model.diffusion_model.input_blocks.7.1.proj_out.weight": "down_blocks.2.attentions.0.proj_out.weight", 218 | "model.diffusion_model.input_blocks.7.1.proj_out.bias": "down_blocks.2.attentions.0.proj_out.bias", 219 | "model.diffusion_model.input_blocks.8.0.in_layers.0.weight": "down_blocks.2.resnets.1.norm1.weight", 220 | "model.diffusion_model.input_blocks.8.0.in_layers.0.bias": "down_blocks.2.resnets.1.norm1.bias", 221 | "model.diffusion_model.input_blocks.8.0.in_layers.2.weight": "down_blocks.2.resnets.1.conv1.weight", 222 | "model.diffusion_model.input_blocks.8.0.in_layers.2.bias": "down_blocks.2.resnets.1.conv1.bias", 223 | "model.diffusion_model.input_blocks.8.0.emb_layers.1.weight": "down_blocks.2.resnets.1.time_emb_proj.weight", 224 | "model.diffusion_model.input_blocks.8.0.emb_layers.1.bias": "down_blocks.2.resnets.1.time_emb_proj.bias", 225 | "model.diffusion_model.input_blocks.8.0.out_layers.0.weight": "down_blocks.2.resnets.1.norm2.weight", 226 | "model.diffusion_model.input_blocks.8.0.out_layers.0.bias": "down_blocks.2.resnets.1.norm2.bias", 227 | "model.diffusion_model.input_blocks.8.0.out_layers.3.weight": "down_blocks.2.resnets.1.conv2.weight", 228 | "model.diffusion_model.input_blocks.8.0.out_layers.3.bias": "down_blocks.2.resnets.1.conv2.bias", 229 | "model.diffusion_model.input_blocks.8.1.norm.weight": "down_blocks.2.attentions.1.norm.weight", 230 | "model.diffusion_model.input_blocks.8.1.norm.bias": "down_blocks.2.attentions.1.norm.bias", 231 | "model.diffusion_model.input_blocks.8.1.proj_in.weight": "down_blocks.2.attentions.1.proj_in.weight", 232 | "model.diffusion_model.input_blocks.8.1.proj_in.bias": "down_blocks.2.attentions.1.proj_in.bias", 233 | "model.diffusion_model.input_blocks.8.1.transformer_blocks.0.attn1.to_q.weight": "down_blocks.2.attentions.1.transformer_blocks.0.attn1.to_q.weight", 234 | "model.diffusion_model.input_blocks.8.1.transformer_blocks.0.attn1.to_k.weight": "down_blocks.2.attentions.1.transformer_blocks.0.attn1.to_k.weight", 235 | "model.diffusion_model.input_blocks.8.1.transformer_blocks.0.attn1.to_v.weight": "down_blocks.2.attentions.1.transformer_blocks.0.attn1.to_v.weight", 236 | "model.diffusion_model.input_blocks.8.1.transformer_blocks.0.attn1.to_out.0.weight": "down_blocks.2.attentions.1.transformer_blocks.0.attn1.to_out.0.weight", 237 | "model.diffusion_model.input_blocks.8.1.transformer_blocks.0.attn1.to_out.0.bias": "down_blocks.2.attentions.1.transformer_blocks.0.attn1.to_out.0.bias", 238 | "model.diffusion_model.input_blocks.8.1.transformer_blocks.0.ff.net.0.proj.weight": "down_blocks.2.attentions.1.transformer_blocks.0.ff.net.0.proj.weight", 239 | 
"model.diffusion_model.input_blocks.8.1.transformer_blocks.0.ff.net.0.proj.bias": "down_blocks.2.attentions.1.transformer_blocks.0.ff.net.0.proj.bias", 240 | "model.diffusion_model.input_blocks.8.1.transformer_blocks.0.ff.net.2.weight": "down_blocks.2.attentions.1.transformer_blocks.0.ff.net.2.weight", 241 | "model.diffusion_model.input_blocks.8.1.transformer_blocks.0.ff.net.2.bias": "down_blocks.2.attentions.1.transformer_blocks.0.ff.net.2.bias", 242 | "model.diffusion_model.input_blocks.8.1.transformer_blocks.0.attn2.to_q.weight": "down_blocks.2.attentions.1.transformer_blocks.0.attn2.to_q.weight", 243 | "model.diffusion_model.input_blocks.8.1.transformer_blocks.0.attn2.to_k.weight": "down_blocks.2.attentions.1.transformer_blocks.0.attn2.to_k.weight", 244 | "model.diffusion_model.input_blocks.8.1.transformer_blocks.0.attn2.to_v.weight": "down_blocks.2.attentions.1.transformer_blocks.0.attn2.to_v.weight", 245 | "model.diffusion_model.input_blocks.8.1.transformer_blocks.0.attn2.to_out.0.weight": "down_blocks.2.attentions.1.transformer_blocks.0.attn2.to_out.0.weight", 246 | "model.diffusion_model.input_blocks.8.1.transformer_blocks.0.attn2.to_out.0.bias": "down_blocks.2.attentions.1.transformer_blocks.0.attn2.to_out.0.bias", 247 | "model.diffusion_model.input_blocks.8.1.transformer_blocks.0.norm1.weight": "down_blocks.2.attentions.1.transformer_blocks.0.norm1.weight", 248 | "model.diffusion_model.input_blocks.8.1.transformer_blocks.0.norm1.bias": "down_blocks.2.attentions.1.transformer_blocks.0.norm1.bias", 249 | "model.diffusion_model.input_blocks.8.1.transformer_blocks.0.norm2.weight": "down_blocks.2.attentions.1.transformer_blocks.0.norm2.weight", 250 | "model.diffusion_model.input_blocks.8.1.transformer_blocks.0.norm2.bias": "down_blocks.2.attentions.1.transformer_blocks.0.norm2.bias", 251 | "model.diffusion_model.input_blocks.8.1.transformer_blocks.0.norm3.weight": "down_blocks.2.attentions.1.transformer_blocks.0.norm3.weight", 252 | "model.diffusion_model.input_blocks.8.1.transformer_blocks.0.norm3.bias": "down_blocks.2.attentions.1.transformer_blocks.0.norm3.bias", 253 | "model.diffusion_model.input_blocks.8.1.proj_out.weight": "down_blocks.2.attentions.1.proj_out.weight", 254 | "model.diffusion_model.input_blocks.8.1.proj_out.bias": "down_blocks.2.attentions.1.proj_out.bias", 255 | "model.diffusion_model.input_blocks.9.0.op.weight": "down_blocks.2.downsamplers.0.conv.weight", 256 | "model.diffusion_model.input_blocks.9.0.op.bias": "down_blocks.2.downsamplers.0.conv.bias", 257 | "model.diffusion_model.input_blocks.10.0.in_layers.0.weight": "down_blocks.3.resnets.0.norm1.weight", 258 | "model.diffusion_model.input_blocks.10.0.in_layers.0.bias": "down_blocks.3.resnets.0.norm1.bias", 259 | "model.diffusion_model.input_blocks.10.0.in_layers.2.weight": "down_blocks.3.resnets.0.conv1.weight", 260 | "model.diffusion_model.input_blocks.10.0.in_layers.2.bias": "down_blocks.3.resnets.0.conv1.bias", 261 | "model.diffusion_model.input_blocks.10.0.emb_layers.1.weight": "down_blocks.3.resnets.0.time_emb_proj.weight", 262 | "model.diffusion_model.input_blocks.10.0.emb_layers.1.bias": "down_blocks.3.resnets.0.time_emb_proj.bias", 263 | "model.diffusion_model.input_blocks.10.0.out_layers.0.weight": "down_blocks.3.resnets.0.norm2.weight", 264 | "model.diffusion_model.input_blocks.10.0.out_layers.0.bias": "down_blocks.3.resnets.0.norm2.bias", 265 | "model.diffusion_model.input_blocks.10.0.out_layers.3.weight": "down_blocks.3.resnets.0.conv2.weight", 266 | 
"model.diffusion_model.input_blocks.10.0.out_layers.3.bias": "down_blocks.3.resnets.0.conv2.bias", 267 | "model.diffusion_model.input_blocks.11.0.in_layers.0.weight": "down_blocks.3.resnets.1.norm1.weight", 268 | "model.diffusion_model.input_blocks.11.0.in_layers.0.bias": "down_blocks.3.resnets.1.norm1.bias", 269 | "model.diffusion_model.input_blocks.11.0.in_layers.2.weight": "down_blocks.3.resnets.1.conv1.weight", 270 | "model.diffusion_model.input_blocks.11.0.in_layers.2.bias": "down_blocks.3.resnets.1.conv1.bias", 271 | "model.diffusion_model.input_blocks.11.0.emb_layers.1.weight": "down_blocks.3.resnets.1.time_emb_proj.weight", 272 | "model.diffusion_model.input_blocks.11.0.emb_layers.1.bias": "down_blocks.3.resnets.1.time_emb_proj.bias", 273 | "model.diffusion_model.input_blocks.11.0.out_layers.0.weight": "down_blocks.3.resnets.1.norm2.weight", 274 | "model.diffusion_model.input_blocks.11.0.out_layers.0.bias": "down_blocks.3.resnets.1.norm2.bias", 275 | "model.diffusion_model.input_blocks.11.0.out_layers.3.weight": "down_blocks.3.resnets.1.conv2.weight", 276 | "model.diffusion_model.input_blocks.11.0.out_layers.3.bias": "down_blocks.3.resnets.1.conv2.bias", 277 | "model.diffusion_model.middle_block.0.in_layers.0.weight": "mid_block.resnets.0.norm1.weight", 278 | "model.diffusion_model.middle_block.0.in_layers.0.bias": "mid_block.resnets.0.norm1.bias", 279 | "model.diffusion_model.middle_block.0.in_layers.2.weight": "mid_block.resnets.0.conv1.weight", 280 | "model.diffusion_model.middle_block.0.in_layers.2.bias": "mid_block.resnets.0.conv1.bias", 281 | "model.diffusion_model.middle_block.0.emb_layers.1.weight": "mid_block.resnets.0.time_emb_proj.weight", 282 | "model.diffusion_model.middle_block.0.emb_layers.1.bias": "mid_block.resnets.0.time_emb_proj.bias", 283 | "model.diffusion_model.middle_block.0.out_layers.0.weight": "mid_block.resnets.0.norm2.weight", 284 | "model.diffusion_model.middle_block.0.out_layers.0.bias": "mid_block.resnets.0.norm2.bias", 285 | "model.diffusion_model.middle_block.0.out_layers.3.weight": "mid_block.resnets.0.conv2.weight", 286 | "model.diffusion_model.middle_block.0.out_layers.3.bias": "mid_block.resnets.0.conv2.bias", 287 | "model.diffusion_model.middle_block.2.in_layers.0.weight": "mid_block.resnets.1.norm1.weight", 288 | "model.diffusion_model.middle_block.2.in_layers.0.bias": "mid_block.resnets.1.norm1.bias", 289 | "model.diffusion_model.middle_block.2.in_layers.2.weight": "mid_block.resnets.1.conv1.weight", 290 | "model.diffusion_model.middle_block.2.in_layers.2.bias": "mid_block.resnets.1.conv1.bias", 291 | "model.diffusion_model.middle_block.2.emb_layers.1.weight": "mid_block.resnets.1.time_emb_proj.weight", 292 | "model.diffusion_model.middle_block.2.emb_layers.1.bias": "mid_block.resnets.1.time_emb_proj.bias", 293 | "model.diffusion_model.middle_block.2.out_layers.0.weight": "mid_block.resnets.1.norm2.weight", 294 | "model.diffusion_model.middle_block.2.out_layers.0.bias": "mid_block.resnets.1.norm2.bias", 295 | "model.diffusion_model.middle_block.2.out_layers.3.weight": "mid_block.resnets.1.conv2.weight", 296 | "model.diffusion_model.middle_block.2.out_layers.3.bias": "mid_block.resnets.1.conv2.bias", 297 | "model.diffusion_model.middle_block.1.norm.weight": "mid_block.attentions.0.norm.weight", 298 | "model.diffusion_model.middle_block.1.norm.bias": "mid_block.attentions.0.norm.bias", 299 | "model.diffusion_model.middle_block.1.proj_in.weight": "mid_block.attentions.0.proj_in.weight", 300 | "model.diffusion_model.middle_block.1.proj_in.bias": 
"mid_block.attentions.0.proj_in.bias", 301 | "model.diffusion_model.middle_block.1.transformer_blocks.0.attn1.to_q.weight": "mid_block.attentions.0.transformer_blocks.0.attn1.to_q.weight", 302 | "model.diffusion_model.middle_block.1.transformer_blocks.0.attn1.to_k.weight": "mid_block.attentions.0.transformer_blocks.0.attn1.to_k.weight", 303 | "model.diffusion_model.middle_block.1.transformer_blocks.0.attn1.to_v.weight": "mid_block.attentions.0.transformer_blocks.0.attn1.to_v.weight", 304 | "model.diffusion_model.middle_block.1.transformer_blocks.0.attn1.to_out.0.weight": "mid_block.attentions.0.transformer_blocks.0.attn1.to_out.0.weight", 305 | "model.diffusion_model.middle_block.1.transformer_blocks.0.attn1.to_out.0.bias": "mid_block.attentions.0.transformer_blocks.0.attn1.to_out.0.bias", 306 | "model.diffusion_model.middle_block.1.transformer_blocks.0.ff.net.0.proj.weight": "mid_block.attentions.0.transformer_blocks.0.ff.net.0.proj.weight", 307 | "model.diffusion_model.middle_block.1.transformer_blocks.0.ff.net.0.proj.bias": "mid_block.attentions.0.transformer_blocks.0.ff.net.0.proj.bias", 308 | "model.diffusion_model.middle_block.1.transformer_blocks.0.ff.net.2.weight": "mid_block.attentions.0.transformer_blocks.0.ff.net.2.weight", 309 | "model.diffusion_model.middle_block.1.transformer_blocks.0.ff.net.2.bias": "mid_block.attentions.0.transformer_blocks.0.ff.net.2.bias", 310 | "model.diffusion_model.middle_block.1.transformer_blocks.0.attn2.to_q.weight": "mid_block.attentions.0.transformer_blocks.0.attn2.to_q.weight", 311 | "model.diffusion_model.middle_block.1.transformer_blocks.0.attn2.to_k.weight": "mid_block.attentions.0.transformer_blocks.0.attn2.to_k.weight", 312 | "model.diffusion_model.middle_block.1.transformer_blocks.0.attn2.to_v.weight": "mid_block.attentions.0.transformer_blocks.0.attn2.to_v.weight", 313 | "model.diffusion_model.middle_block.1.transformer_blocks.0.attn2.to_out.0.weight": "mid_block.attentions.0.transformer_blocks.0.attn2.to_out.0.weight", 314 | "model.diffusion_model.middle_block.1.transformer_blocks.0.attn2.to_out.0.bias": "mid_block.attentions.0.transformer_blocks.0.attn2.to_out.0.bias", 315 | "model.diffusion_model.middle_block.1.transformer_blocks.0.norm1.weight": "mid_block.attentions.0.transformer_blocks.0.norm1.weight", 316 | "model.diffusion_model.middle_block.1.transformer_blocks.0.norm1.bias": "mid_block.attentions.0.transformer_blocks.0.norm1.bias", 317 | "model.diffusion_model.middle_block.1.transformer_blocks.0.norm2.weight": "mid_block.attentions.0.transformer_blocks.0.norm2.weight", 318 | "model.diffusion_model.middle_block.1.transformer_blocks.0.norm2.bias": "mid_block.attentions.0.transformer_blocks.0.norm2.bias", 319 | "model.diffusion_model.middle_block.1.transformer_blocks.0.norm3.weight": "mid_block.attentions.0.transformer_blocks.0.norm3.weight", 320 | "model.diffusion_model.middle_block.1.transformer_blocks.0.norm3.bias": "mid_block.attentions.0.transformer_blocks.0.norm3.bias", 321 | "model.diffusion_model.middle_block.1.proj_out.weight": "mid_block.attentions.0.proj_out.weight", 322 | "model.diffusion_model.middle_block.1.proj_out.bias": "mid_block.attentions.0.proj_out.bias", 323 | "model.diffusion_model.output_blocks.0.0.in_layers.0.weight": "up_blocks.0.resnets.0.norm1.weight", 324 | "model.diffusion_model.output_blocks.0.0.in_layers.0.bias": "up_blocks.0.resnets.0.norm1.bias", 325 | "model.diffusion_model.output_blocks.0.0.in_layers.2.weight": "up_blocks.0.resnets.0.conv1.weight", 326 | 
"model.diffusion_model.output_blocks.0.0.in_layers.2.bias": "up_blocks.0.resnets.0.conv1.bias", 327 | "model.diffusion_model.output_blocks.0.0.emb_layers.1.weight": "up_blocks.0.resnets.0.time_emb_proj.weight", 328 | "model.diffusion_model.output_blocks.0.0.emb_layers.1.bias": "up_blocks.0.resnets.0.time_emb_proj.bias", 329 | "model.diffusion_model.output_blocks.0.0.out_layers.0.weight": "up_blocks.0.resnets.0.norm2.weight", 330 | "model.diffusion_model.output_blocks.0.0.out_layers.0.bias": "up_blocks.0.resnets.0.norm2.bias", 331 | "model.diffusion_model.output_blocks.0.0.out_layers.3.weight": "up_blocks.0.resnets.0.conv2.weight", 332 | "model.diffusion_model.output_blocks.0.0.out_layers.3.bias": "up_blocks.0.resnets.0.conv2.bias", 333 | "model.diffusion_model.output_blocks.0.0.skip_connection.weight": "up_blocks.0.resnets.0.conv_shortcut.weight", 334 | "model.diffusion_model.output_blocks.0.0.skip_connection.bias": "up_blocks.0.resnets.0.conv_shortcut.bias", 335 | "model.diffusion_model.output_blocks.1.0.in_layers.0.weight": "up_blocks.0.resnets.1.norm1.weight", 336 | "model.diffusion_model.output_blocks.1.0.in_layers.0.bias": "up_blocks.0.resnets.1.norm1.bias", 337 | "model.diffusion_model.output_blocks.1.0.in_layers.2.weight": "up_blocks.0.resnets.1.conv1.weight", 338 | "model.diffusion_model.output_blocks.1.0.in_layers.2.bias": "up_blocks.0.resnets.1.conv1.bias", 339 | "model.diffusion_model.output_blocks.1.0.emb_layers.1.weight": "up_blocks.0.resnets.1.time_emb_proj.weight", 340 | "model.diffusion_model.output_blocks.1.0.emb_layers.1.bias": "up_blocks.0.resnets.1.time_emb_proj.bias", 341 | "model.diffusion_model.output_blocks.1.0.out_layers.0.weight": "up_blocks.0.resnets.1.norm2.weight", 342 | "model.diffusion_model.output_blocks.1.0.out_layers.0.bias": "up_blocks.0.resnets.1.norm2.bias", 343 | "model.diffusion_model.output_blocks.1.0.out_layers.3.weight": "up_blocks.0.resnets.1.conv2.weight", 344 | "model.diffusion_model.output_blocks.1.0.out_layers.3.bias": "up_blocks.0.resnets.1.conv2.bias", 345 | "model.diffusion_model.output_blocks.1.0.skip_connection.weight": "up_blocks.0.resnets.1.conv_shortcut.weight", 346 | "model.diffusion_model.output_blocks.1.0.skip_connection.bias": "up_blocks.0.resnets.1.conv_shortcut.bias", 347 | "model.diffusion_model.output_blocks.2.0.in_layers.0.weight": "up_blocks.0.resnets.2.norm1.weight", 348 | "model.diffusion_model.output_blocks.2.0.in_layers.0.bias": "up_blocks.0.resnets.2.norm1.bias", 349 | "model.diffusion_model.output_blocks.2.0.in_layers.2.weight": "up_blocks.0.resnets.2.conv1.weight", 350 | "model.diffusion_model.output_blocks.2.0.in_layers.2.bias": "up_blocks.0.resnets.2.conv1.bias", 351 | "model.diffusion_model.output_blocks.2.0.emb_layers.1.weight": "up_blocks.0.resnets.2.time_emb_proj.weight", 352 | "model.diffusion_model.output_blocks.2.0.emb_layers.1.bias": "up_blocks.0.resnets.2.time_emb_proj.bias", 353 | "model.diffusion_model.output_blocks.2.0.out_layers.0.weight": "up_blocks.0.resnets.2.norm2.weight", 354 | "model.diffusion_model.output_blocks.2.0.out_layers.0.bias": "up_blocks.0.resnets.2.norm2.bias", 355 | "model.diffusion_model.output_blocks.2.0.out_layers.3.weight": "up_blocks.0.resnets.2.conv2.weight", 356 | "model.diffusion_model.output_blocks.2.0.out_layers.3.bias": "up_blocks.0.resnets.2.conv2.bias", 357 | "model.diffusion_model.output_blocks.2.0.skip_connection.weight": "up_blocks.0.resnets.2.conv_shortcut.weight", 358 | "model.diffusion_model.output_blocks.2.0.skip_connection.bias": 
"up_blocks.0.resnets.2.conv_shortcut.bias", 359 | "model.diffusion_model.output_blocks.2.1.conv.weight": "up_blocks.0.upsamplers.0.conv.weight", 360 | "model.diffusion_model.output_blocks.2.1.conv.bias": "up_blocks.0.upsamplers.0.conv.bias", 361 | "model.diffusion_model.output_blocks.3.0.in_layers.0.weight": "up_blocks.1.resnets.0.norm1.weight", 362 | "model.diffusion_model.output_blocks.3.0.in_layers.0.bias": "up_blocks.1.resnets.0.norm1.bias", 363 | "model.diffusion_model.output_blocks.3.0.in_layers.2.weight": "up_blocks.1.resnets.0.conv1.weight", 364 | "model.diffusion_model.output_blocks.3.0.in_layers.2.bias": "up_blocks.1.resnets.0.conv1.bias", 365 | "model.diffusion_model.output_blocks.3.0.emb_layers.1.weight": "up_blocks.1.resnets.0.time_emb_proj.weight", 366 | "model.diffusion_model.output_blocks.3.0.emb_layers.1.bias": "up_blocks.1.resnets.0.time_emb_proj.bias", 367 | "model.diffusion_model.output_blocks.3.0.out_layers.0.weight": "up_blocks.1.resnets.0.norm2.weight", 368 | "model.diffusion_model.output_blocks.3.0.out_layers.0.bias": "up_blocks.1.resnets.0.norm2.bias", 369 | "model.diffusion_model.output_blocks.3.0.out_layers.3.weight": "up_blocks.1.resnets.0.conv2.weight", 370 | "model.diffusion_model.output_blocks.3.0.out_layers.3.bias": "up_blocks.1.resnets.0.conv2.bias", 371 | "model.diffusion_model.output_blocks.3.0.skip_connection.weight": "up_blocks.1.resnets.0.conv_shortcut.weight", 372 | "model.diffusion_model.output_blocks.3.0.skip_connection.bias": "up_blocks.1.resnets.0.conv_shortcut.bias", 373 | "model.diffusion_model.output_blocks.3.1.norm.weight": "up_blocks.1.attentions.0.norm.weight", 374 | "model.diffusion_model.output_blocks.3.1.norm.bias": "up_blocks.1.attentions.0.norm.bias", 375 | "model.diffusion_model.output_blocks.3.1.proj_in.weight": "up_blocks.1.attentions.0.proj_in.weight", 376 | "model.diffusion_model.output_blocks.3.1.proj_in.bias": "up_blocks.1.attentions.0.proj_in.bias", 377 | "model.diffusion_model.output_blocks.3.1.transformer_blocks.0.attn1.to_q.weight": "up_blocks.1.attentions.0.transformer_blocks.0.attn1.to_q.weight", 378 | "model.diffusion_model.output_blocks.3.1.transformer_blocks.0.attn1.to_k.weight": "up_blocks.1.attentions.0.transformer_blocks.0.attn1.to_k.weight", 379 | "model.diffusion_model.output_blocks.3.1.transformer_blocks.0.attn1.to_v.weight": "up_blocks.1.attentions.0.transformer_blocks.0.attn1.to_v.weight", 380 | "model.diffusion_model.output_blocks.3.1.transformer_blocks.0.attn1.to_out.0.weight": "up_blocks.1.attentions.0.transformer_blocks.0.attn1.to_out.0.weight", 381 | "model.diffusion_model.output_blocks.3.1.transformer_blocks.0.attn1.to_out.0.bias": "up_blocks.1.attentions.0.transformer_blocks.0.attn1.to_out.0.bias", 382 | "model.diffusion_model.output_blocks.3.1.transformer_blocks.0.ff.net.0.proj.weight": "up_blocks.1.attentions.0.transformer_blocks.0.ff.net.0.proj.weight", 383 | "model.diffusion_model.output_blocks.3.1.transformer_blocks.0.ff.net.0.proj.bias": "up_blocks.1.attentions.0.transformer_blocks.0.ff.net.0.proj.bias", 384 | "model.diffusion_model.output_blocks.3.1.transformer_blocks.0.ff.net.2.weight": "up_blocks.1.attentions.0.transformer_blocks.0.ff.net.2.weight", 385 | "model.diffusion_model.output_blocks.3.1.transformer_blocks.0.ff.net.2.bias": "up_blocks.1.attentions.0.transformer_blocks.0.ff.net.2.bias", 386 | "model.diffusion_model.output_blocks.3.1.transformer_blocks.0.attn2.to_q.weight": "up_blocks.1.attentions.0.transformer_blocks.0.attn2.to_q.weight", 387 | 
"model.diffusion_model.output_blocks.3.1.transformer_blocks.0.attn2.to_k.weight": "up_blocks.1.attentions.0.transformer_blocks.0.attn2.to_k.weight", 388 | "model.diffusion_model.output_blocks.3.1.transformer_blocks.0.attn2.to_v.weight": "up_blocks.1.attentions.0.transformer_blocks.0.attn2.to_v.weight", 389 | "model.diffusion_model.output_blocks.3.1.transformer_blocks.0.attn2.to_out.0.weight": "up_blocks.1.attentions.0.transformer_blocks.0.attn2.to_out.0.weight", 390 | "model.diffusion_model.output_blocks.3.1.transformer_blocks.0.attn2.to_out.0.bias": "up_blocks.1.attentions.0.transformer_blocks.0.attn2.to_out.0.bias", 391 | "model.diffusion_model.output_blocks.3.1.transformer_blocks.0.norm1.weight": "up_blocks.1.attentions.0.transformer_blocks.0.norm1.weight", 392 | "model.diffusion_model.output_blocks.3.1.transformer_blocks.0.norm1.bias": "up_blocks.1.attentions.0.transformer_blocks.0.norm1.bias", 393 | "model.diffusion_model.output_blocks.3.1.transformer_blocks.0.norm2.weight": "up_blocks.1.attentions.0.transformer_blocks.0.norm2.weight", 394 | "model.diffusion_model.output_blocks.3.1.transformer_blocks.0.norm2.bias": "up_blocks.1.attentions.0.transformer_blocks.0.norm2.bias", 395 | "model.diffusion_model.output_blocks.3.1.transformer_blocks.0.norm3.weight": "up_blocks.1.attentions.0.transformer_blocks.0.norm3.weight", 396 | "model.diffusion_model.output_blocks.3.1.transformer_blocks.0.norm3.bias": "up_blocks.1.attentions.0.transformer_blocks.0.norm3.bias", 397 | "model.diffusion_model.output_blocks.3.1.proj_out.weight": "up_blocks.1.attentions.0.proj_out.weight", 398 | "model.diffusion_model.output_blocks.3.1.proj_out.bias": "up_blocks.1.attentions.0.proj_out.bias", 399 | "model.diffusion_model.output_blocks.4.0.in_layers.0.weight": "up_blocks.1.resnets.1.norm1.weight", 400 | "model.diffusion_model.output_blocks.4.0.in_layers.0.bias": "up_blocks.1.resnets.1.norm1.bias", 401 | "model.diffusion_model.output_blocks.4.0.in_layers.2.weight": "up_blocks.1.resnets.1.conv1.weight", 402 | "model.diffusion_model.output_blocks.4.0.in_layers.2.bias": "up_blocks.1.resnets.1.conv1.bias", 403 | "model.diffusion_model.output_blocks.4.0.emb_layers.1.weight": "up_blocks.1.resnets.1.time_emb_proj.weight", 404 | "model.diffusion_model.output_blocks.4.0.emb_layers.1.bias": "up_blocks.1.resnets.1.time_emb_proj.bias", 405 | "model.diffusion_model.output_blocks.4.0.out_layers.0.weight": "up_blocks.1.resnets.1.norm2.weight", 406 | "model.diffusion_model.output_blocks.4.0.out_layers.0.bias": "up_blocks.1.resnets.1.norm2.bias", 407 | "model.diffusion_model.output_blocks.4.0.out_layers.3.weight": "up_blocks.1.resnets.1.conv2.weight", 408 | "model.diffusion_model.output_blocks.4.0.out_layers.3.bias": "up_blocks.1.resnets.1.conv2.bias", 409 | "model.diffusion_model.output_blocks.4.0.skip_connection.weight": "up_blocks.1.resnets.1.conv_shortcut.weight", 410 | "model.diffusion_model.output_blocks.4.0.skip_connection.bias": "up_blocks.1.resnets.1.conv_shortcut.bias", 411 | "model.diffusion_model.output_blocks.4.1.norm.weight": "up_blocks.1.attentions.1.norm.weight", 412 | "model.diffusion_model.output_blocks.4.1.norm.bias": "up_blocks.1.attentions.1.norm.bias", 413 | "model.diffusion_model.output_blocks.4.1.proj_in.weight": "up_blocks.1.attentions.1.proj_in.weight", 414 | "model.diffusion_model.output_blocks.4.1.proj_in.bias": "up_blocks.1.attentions.1.proj_in.bias", 415 | "model.diffusion_model.output_blocks.4.1.transformer_blocks.0.attn1.to_q.weight": "up_blocks.1.attentions.1.transformer_blocks.0.attn1.to_q.weight", 
416 | "model.diffusion_model.output_blocks.4.1.transformer_blocks.0.attn1.to_k.weight": "up_blocks.1.attentions.1.transformer_blocks.0.attn1.to_k.weight", 417 | "model.diffusion_model.output_blocks.4.1.transformer_blocks.0.attn1.to_v.weight": "up_blocks.1.attentions.1.transformer_blocks.0.attn1.to_v.weight", 418 | "model.diffusion_model.output_blocks.4.1.transformer_blocks.0.attn1.to_out.0.weight": "up_blocks.1.attentions.1.transformer_blocks.0.attn1.to_out.0.weight", 419 | "model.diffusion_model.output_blocks.4.1.transformer_blocks.0.attn1.to_out.0.bias": "up_blocks.1.attentions.1.transformer_blocks.0.attn1.to_out.0.bias", 420 | "model.diffusion_model.output_blocks.4.1.transformer_blocks.0.ff.net.0.proj.weight": "up_blocks.1.attentions.1.transformer_blocks.0.ff.net.0.proj.weight", 421 | "model.diffusion_model.output_blocks.4.1.transformer_blocks.0.ff.net.0.proj.bias": "up_blocks.1.attentions.1.transformer_blocks.0.ff.net.0.proj.bias", 422 | "model.diffusion_model.output_blocks.4.1.transformer_blocks.0.ff.net.2.weight": "up_blocks.1.attentions.1.transformer_blocks.0.ff.net.2.weight", 423 | "model.diffusion_model.output_blocks.4.1.transformer_blocks.0.ff.net.2.bias": "up_blocks.1.attentions.1.transformer_blocks.0.ff.net.2.bias", 424 | "model.diffusion_model.output_blocks.4.1.transformer_blocks.0.attn2.to_q.weight": "up_blocks.1.attentions.1.transformer_blocks.0.attn2.to_q.weight", 425 | "model.diffusion_model.output_blocks.4.1.transformer_blocks.0.attn2.to_k.weight": "up_blocks.1.attentions.1.transformer_blocks.0.attn2.to_k.weight", 426 | "model.diffusion_model.output_blocks.4.1.transformer_blocks.0.attn2.to_v.weight": "up_blocks.1.attentions.1.transformer_blocks.0.attn2.to_v.weight", 427 | "model.diffusion_model.output_blocks.4.1.transformer_blocks.0.attn2.to_out.0.weight": "up_blocks.1.attentions.1.transformer_blocks.0.attn2.to_out.0.weight", 428 | "model.diffusion_model.output_blocks.4.1.transformer_blocks.0.attn2.to_out.0.bias": "up_blocks.1.attentions.1.transformer_blocks.0.attn2.to_out.0.bias", 429 | "model.diffusion_model.output_blocks.4.1.transformer_blocks.0.norm1.weight": "up_blocks.1.attentions.1.transformer_blocks.0.norm1.weight", 430 | "model.diffusion_model.output_blocks.4.1.transformer_blocks.0.norm1.bias": "up_blocks.1.attentions.1.transformer_blocks.0.norm1.bias", 431 | "model.diffusion_model.output_blocks.4.1.transformer_blocks.0.norm2.weight": "up_blocks.1.attentions.1.transformer_blocks.0.norm2.weight", 432 | "model.diffusion_model.output_blocks.4.1.transformer_blocks.0.norm2.bias": "up_blocks.1.attentions.1.transformer_blocks.0.norm2.bias", 433 | "model.diffusion_model.output_blocks.4.1.transformer_blocks.0.norm3.weight": "up_blocks.1.attentions.1.transformer_blocks.0.norm3.weight", 434 | "model.diffusion_model.output_blocks.4.1.transformer_blocks.0.norm3.bias": "up_blocks.1.attentions.1.transformer_blocks.0.norm3.bias", 435 | "model.diffusion_model.output_blocks.4.1.proj_out.weight": "up_blocks.1.attentions.1.proj_out.weight", 436 | "model.diffusion_model.output_blocks.4.1.proj_out.bias": "up_blocks.1.attentions.1.proj_out.bias", 437 | "model.diffusion_model.output_blocks.5.0.in_layers.0.weight": "up_blocks.1.resnets.2.norm1.weight", 438 | "model.diffusion_model.output_blocks.5.0.in_layers.0.bias": "up_blocks.1.resnets.2.norm1.bias", 439 | "model.diffusion_model.output_blocks.5.0.in_layers.2.weight": "up_blocks.1.resnets.2.conv1.weight", 440 | "model.diffusion_model.output_blocks.5.0.in_layers.2.bias": "up_blocks.1.resnets.2.conv1.bias", 441 | 
"model.diffusion_model.output_blocks.5.0.emb_layers.1.weight": "up_blocks.1.resnets.2.time_emb_proj.weight", 442 | "model.diffusion_model.output_blocks.5.0.emb_layers.1.bias": "up_blocks.1.resnets.2.time_emb_proj.bias", 443 | "model.diffusion_model.output_blocks.5.0.out_layers.0.weight": "up_blocks.1.resnets.2.norm2.weight", 444 | "model.diffusion_model.output_blocks.5.0.out_layers.0.bias": "up_blocks.1.resnets.2.norm2.bias", 445 | "model.diffusion_model.output_blocks.5.0.out_layers.3.weight": "up_blocks.1.resnets.2.conv2.weight", 446 | "model.diffusion_model.output_blocks.5.0.out_layers.3.bias": "up_blocks.1.resnets.2.conv2.bias", 447 | "model.diffusion_model.output_blocks.5.0.skip_connection.weight": "up_blocks.1.resnets.2.conv_shortcut.weight", 448 | "model.diffusion_model.output_blocks.5.0.skip_connection.bias": "up_blocks.1.resnets.2.conv_shortcut.bias", 449 | "model.diffusion_model.output_blocks.5.2.conv.weight": "up_blocks.1.upsamplers.0.conv.weight", 450 | "model.diffusion_model.output_blocks.5.2.conv.bias": "up_blocks.1.upsamplers.0.conv.bias", 451 | "model.diffusion_model.output_blocks.5.1.norm.weight": "up_blocks.1.attentions.2.norm.weight", 452 | "model.diffusion_model.output_blocks.5.1.norm.bias": "up_blocks.1.attentions.2.norm.bias", 453 | "model.diffusion_model.output_blocks.5.1.proj_in.weight": "up_blocks.1.attentions.2.proj_in.weight", 454 | "model.diffusion_model.output_blocks.5.1.proj_in.bias": "up_blocks.1.attentions.2.proj_in.bias", 455 | "model.diffusion_model.output_blocks.5.1.transformer_blocks.0.attn1.to_q.weight": "up_blocks.1.attentions.2.transformer_blocks.0.attn1.to_q.weight", 456 | "model.diffusion_model.output_blocks.5.1.transformer_blocks.0.attn1.to_k.weight": "up_blocks.1.attentions.2.transformer_blocks.0.attn1.to_k.weight", 457 | "model.diffusion_model.output_blocks.5.1.transformer_blocks.0.attn1.to_v.weight": "up_blocks.1.attentions.2.transformer_blocks.0.attn1.to_v.weight", 458 | "model.diffusion_model.output_blocks.5.1.transformer_blocks.0.attn1.to_out.0.weight": "up_blocks.1.attentions.2.transformer_blocks.0.attn1.to_out.0.weight", 459 | "model.diffusion_model.output_blocks.5.1.transformer_blocks.0.attn1.to_out.0.bias": "up_blocks.1.attentions.2.transformer_blocks.0.attn1.to_out.0.bias", 460 | "model.diffusion_model.output_blocks.5.1.transformer_blocks.0.ff.net.0.proj.weight": "up_blocks.1.attentions.2.transformer_blocks.0.ff.net.0.proj.weight", 461 | "model.diffusion_model.output_blocks.5.1.transformer_blocks.0.ff.net.0.proj.bias": "up_blocks.1.attentions.2.transformer_blocks.0.ff.net.0.proj.bias", 462 | "model.diffusion_model.output_blocks.5.1.transformer_blocks.0.ff.net.2.weight": "up_blocks.1.attentions.2.transformer_blocks.0.ff.net.2.weight", 463 | "model.diffusion_model.output_blocks.5.1.transformer_blocks.0.ff.net.2.bias": "up_blocks.1.attentions.2.transformer_blocks.0.ff.net.2.bias", 464 | "model.diffusion_model.output_blocks.5.1.transformer_blocks.0.attn2.to_q.weight": "up_blocks.1.attentions.2.transformer_blocks.0.attn2.to_q.weight", 465 | "model.diffusion_model.output_blocks.5.1.transformer_blocks.0.attn2.to_k.weight": "up_blocks.1.attentions.2.transformer_blocks.0.attn2.to_k.weight", 466 | "model.diffusion_model.output_blocks.5.1.transformer_blocks.0.attn2.to_v.weight": "up_blocks.1.attentions.2.transformer_blocks.0.attn2.to_v.weight", 467 | "model.diffusion_model.output_blocks.5.1.transformer_blocks.0.attn2.to_out.0.weight": "up_blocks.1.attentions.2.transformer_blocks.0.attn2.to_out.0.weight", 468 | 
"model.diffusion_model.output_blocks.5.1.transformer_blocks.0.attn2.to_out.0.bias": "up_blocks.1.attentions.2.transformer_blocks.0.attn2.to_out.0.bias", 469 | "model.diffusion_model.output_blocks.5.1.transformer_blocks.0.norm1.weight": "up_blocks.1.attentions.2.transformer_blocks.0.norm1.weight", 470 | "model.diffusion_model.output_blocks.5.1.transformer_blocks.0.norm1.bias": "up_blocks.1.attentions.2.transformer_blocks.0.norm1.bias", 471 | "model.diffusion_model.output_blocks.5.1.transformer_blocks.0.norm2.weight": "up_blocks.1.attentions.2.transformer_blocks.0.norm2.weight", 472 | "model.diffusion_model.output_blocks.5.1.transformer_blocks.0.norm2.bias": "up_blocks.1.attentions.2.transformer_blocks.0.norm2.bias", 473 | "model.diffusion_model.output_blocks.5.1.transformer_blocks.0.norm3.weight": "up_blocks.1.attentions.2.transformer_blocks.0.norm3.weight", 474 | "model.diffusion_model.output_blocks.5.1.transformer_blocks.0.norm3.bias": "up_blocks.1.attentions.2.transformer_blocks.0.norm3.bias", 475 | "model.diffusion_model.output_blocks.5.1.proj_out.weight": "up_blocks.1.attentions.2.proj_out.weight", 476 | "model.diffusion_model.output_blocks.5.1.proj_out.bias": "up_blocks.1.attentions.2.proj_out.bias", 477 | "model.diffusion_model.output_blocks.6.0.in_layers.0.weight": "up_blocks.2.resnets.0.norm1.weight", 478 | "model.diffusion_model.output_blocks.6.0.in_layers.0.bias": "up_blocks.2.resnets.0.norm1.bias", 479 | "model.diffusion_model.output_blocks.6.0.in_layers.2.weight": "up_blocks.2.resnets.0.conv1.weight", 480 | "model.diffusion_model.output_blocks.6.0.in_layers.2.bias": "up_blocks.2.resnets.0.conv1.bias", 481 | "model.diffusion_model.output_blocks.6.0.emb_layers.1.weight": "up_blocks.2.resnets.0.time_emb_proj.weight", 482 | "model.diffusion_model.output_blocks.6.0.emb_layers.1.bias": "up_blocks.2.resnets.0.time_emb_proj.bias", 483 | "model.diffusion_model.output_blocks.6.0.out_layers.0.weight": "up_blocks.2.resnets.0.norm2.weight", 484 | "model.diffusion_model.output_blocks.6.0.out_layers.0.bias": "up_blocks.2.resnets.0.norm2.bias", 485 | "model.diffusion_model.output_blocks.6.0.out_layers.3.weight": "up_blocks.2.resnets.0.conv2.weight", 486 | "model.diffusion_model.output_blocks.6.0.out_layers.3.bias": "up_blocks.2.resnets.0.conv2.bias", 487 | "model.diffusion_model.output_blocks.6.0.skip_connection.weight": "up_blocks.2.resnets.0.conv_shortcut.weight", 488 | "model.diffusion_model.output_blocks.6.0.skip_connection.bias": "up_blocks.2.resnets.0.conv_shortcut.bias", 489 | "model.diffusion_model.output_blocks.6.1.norm.weight": "up_blocks.2.attentions.0.norm.weight", 490 | "model.diffusion_model.output_blocks.6.1.norm.bias": "up_blocks.2.attentions.0.norm.bias", 491 | "model.diffusion_model.output_blocks.6.1.proj_in.weight": "up_blocks.2.attentions.0.proj_in.weight", 492 | "model.diffusion_model.output_blocks.6.1.proj_in.bias": "up_blocks.2.attentions.0.proj_in.bias", 493 | "model.diffusion_model.output_blocks.6.1.transformer_blocks.0.attn1.to_q.weight": "up_blocks.2.attentions.0.transformer_blocks.0.attn1.to_q.weight", 494 | "model.diffusion_model.output_blocks.6.1.transformer_blocks.0.attn1.to_k.weight": "up_blocks.2.attentions.0.transformer_blocks.0.attn1.to_k.weight", 495 | "model.diffusion_model.output_blocks.6.1.transformer_blocks.0.attn1.to_v.weight": "up_blocks.2.attentions.0.transformer_blocks.0.attn1.to_v.weight", 496 | "model.diffusion_model.output_blocks.6.1.transformer_blocks.0.attn1.to_out.0.weight": "up_blocks.2.attentions.0.transformer_blocks.0.attn1.to_out.0.weight", 
497 | "model.diffusion_model.output_blocks.6.1.transformer_blocks.0.attn1.to_out.0.bias": "up_blocks.2.attentions.0.transformer_blocks.0.attn1.to_out.0.bias", 498 | "model.diffusion_model.output_blocks.6.1.transformer_blocks.0.ff.net.0.proj.weight": "up_blocks.2.attentions.0.transformer_blocks.0.ff.net.0.proj.weight", 499 | "model.diffusion_model.output_blocks.6.1.transformer_blocks.0.ff.net.0.proj.bias": "up_blocks.2.attentions.0.transformer_blocks.0.ff.net.0.proj.bias", 500 | "model.diffusion_model.output_blocks.6.1.transformer_blocks.0.ff.net.2.weight": "up_blocks.2.attentions.0.transformer_blocks.0.ff.net.2.weight", 501 | "model.diffusion_model.output_blocks.6.1.transformer_blocks.0.ff.net.2.bias": "up_blocks.2.attentions.0.transformer_blocks.0.ff.net.2.bias", 502 | "model.diffusion_model.output_blocks.6.1.transformer_blocks.0.attn2.to_q.weight": "up_blocks.2.attentions.0.transformer_blocks.0.attn2.to_q.weight", 503 | "model.diffusion_model.output_blocks.6.1.transformer_blocks.0.attn2.to_k.weight": "up_blocks.2.attentions.0.transformer_blocks.0.attn2.to_k.weight", 504 | "model.diffusion_model.output_blocks.6.1.transformer_blocks.0.attn2.to_v.weight": "up_blocks.2.attentions.0.transformer_blocks.0.attn2.to_v.weight", 505 | "model.diffusion_model.output_blocks.6.1.transformer_blocks.0.attn2.to_out.0.weight": "up_blocks.2.attentions.0.transformer_blocks.0.attn2.to_out.0.weight", 506 | "model.diffusion_model.output_blocks.6.1.transformer_blocks.0.attn2.to_out.0.bias": "up_blocks.2.attentions.0.transformer_blocks.0.attn2.to_out.0.bias", 507 | "model.diffusion_model.output_blocks.6.1.transformer_blocks.0.norm1.weight": "up_blocks.2.attentions.0.transformer_blocks.0.norm1.weight", 508 | "model.diffusion_model.output_blocks.6.1.transformer_blocks.0.norm1.bias": "up_blocks.2.attentions.0.transformer_blocks.0.norm1.bias", 509 | "model.diffusion_model.output_blocks.6.1.transformer_blocks.0.norm2.weight": "up_blocks.2.attentions.0.transformer_blocks.0.norm2.weight", 510 | "model.diffusion_model.output_blocks.6.1.transformer_blocks.0.norm2.bias": "up_blocks.2.attentions.0.transformer_blocks.0.norm2.bias", 511 | "model.diffusion_model.output_blocks.6.1.transformer_blocks.0.norm3.weight": "up_blocks.2.attentions.0.transformer_blocks.0.norm3.weight", 512 | "model.diffusion_model.output_blocks.6.1.transformer_blocks.0.norm3.bias": "up_blocks.2.attentions.0.transformer_blocks.0.norm3.bias", 513 | "model.diffusion_model.output_blocks.6.1.proj_out.weight": "up_blocks.2.attentions.0.proj_out.weight", 514 | "model.diffusion_model.output_blocks.6.1.proj_out.bias": "up_blocks.2.attentions.0.proj_out.bias", 515 | "model.diffusion_model.output_blocks.7.0.in_layers.0.weight": "up_blocks.2.resnets.1.norm1.weight", 516 | "model.diffusion_model.output_blocks.7.0.in_layers.0.bias": "up_blocks.2.resnets.1.norm1.bias", 517 | "model.diffusion_model.output_blocks.7.0.in_layers.2.weight": "up_blocks.2.resnets.1.conv1.weight", 518 | "model.diffusion_model.output_blocks.7.0.in_layers.2.bias": "up_blocks.2.resnets.1.conv1.bias", 519 | "model.diffusion_model.output_blocks.7.0.emb_layers.1.weight": "up_blocks.2.resnets.1.time_emb_proj.weight", 520 | "model.diffusion_model.output_blocks.7.0.emb_layers.1.bias": "up_blocks.2.resnets.1.time_emb_proj.bias", 521 | "model.diffusion_model.output_blocks.7.0.out_layers.0.weight": "up_blocks.2.resnets.1.norm2.weight", 522 | "model.diffusion_model.output_blocks.7.0.out_layers.0.bias": "up_blocks.2.resnets.1.norm2.bias", 523 | "model.diffusion_model.output_blocks.7.0.out_layers.3.weight": 
"up_blocks.2.resnets.1.conv2.weight", 524 | "model.diffusion_model.output_blocks.7.0.out_layers.3.bias": "up_blocks.2.resnets.1.conv2.bias", 525 | "model.diffusion_model.output_blocks.7.0.skip_connection.weight": "up_blocks.2.resnets.1.conv_shortcut.weight", 526 | "model.diffusion_model.output_blocks.7.0.skip_connection.bias": "up_blocks.2.resnets.1.conv_shortcut.bias", 527 | "model.diffusion_model.output_blocks.7.1.norm.weight": "up_blocks.2.attentions.1.norm.weight", 528 | "model.diffusion_model.output_blocks.7.1.norm.bias": "up_blocks.2.attentions.1.norm.bias", 529 | "model.diffusion_model.output_blocks.7.1.proj_in.weight": "up_blocks.2.attentions.1.proj_in.weight", 530 | "model.diffusion_model.output_blocks.7.1.proj_in.bias": "up_blocks.2.attentions.1.proj_in.bias", 531 | "model.diffusion_model.output_blocks.7.1.transformer_blocks.0.attn1.to_q.weight": "up_blocks.2.attentions.1.transformer_blocks.0.attn1.to_q.weight", 532 | "model.diffusion_model.output_blocks.7.1.transformer_blocks.0.attn1.to_k.weight": "up_blocks.2.attentions.1.transformer_blocks.0.attn1.to_k.weight", 533 | "model.diffusion_model.output_blocks.7.1.transformer_blocks.0.attn1.to_v.weight": "up_blocks.2.attentions.1.transformer_blocks.0.attn1.to_v.weight", 534 | "model.diffusion_model.output_blocks.7.1.transformer_blocks.0.attn1.to_out.0.weight": "up_blocks.2.attentions.1.transformer_blocks.0.attn1.to_out.0.weight", 535 | "model.diffusion_model.output_blocks.7.1.transformer_blocks.0.attn1.to_out.0.bias": "up_blocks.2.attentions.1.transformer_blocks.0.attn1.to_out.0.bias", 536 | "model.diffusion_model.output_blocks.7.1.transformer_blocks.0.ff.net.0.proj.weight": "up_blocks.2.attentions.1.transformer_blocks.0.ff.net.0.proj.weight", 537 | "model.diffusion_model.output_blocks.7.1.transformer_blocks.0.ff.net.0.proj.bias": "up_blocks.2.attentions.1.transformer_blocks.0.ff.net.0.proj.bias", 538 | "model.diffusion_model.output_blocks.7.1.transformer_blocks.0.ff.net.2.weight": "up_blocks.2.attentions.1.transformer_blocks.0.ff.net.2.weight", 539 | "model.diffusion_model.output_blocks.7.1.transformer_blocks.0.ff.net.2.bias": "up_blocks.2.attentions.1.transformer_blocks.0.ff.net.2.bias", 540 | "model.diffusion_model.output_blocks.7.1.transformer_blocks.0.attn2.to_q.weight": "up_blocks.2.attentions.1.transformer_blocks.0.attn2.to_q.weight", 541 | "model.diffusion_model.output_blocks.7.1.transformer_blocks.0.attn2.to_k.weight": "up_blocks.2.attentions.1.transformer_blocks.0.attn2.to_k.weight", 542 | "model.diffusion_model.output_blocks.7.1.transformer_blocks.0.attn2.to_v.weight": "up_blocks.2.attentions.1.transformer_blocks.0.attn2.to_v.weight", 543 | "model.diffusion_model.output_blocks.7.1.transformer_blocks.0.attn2.to_out.0.weight": "up_blocks.2.attentions.1.transformer_blocks.0.attn2.to_out.0.weight", 544 | "model.diffusion_model.output_blocks.7.1.transformer_blocks.0.attn2.to_out.0.bias": "up_blocks.2.attentions.1.transformer_blocks.0.attn2.to_out.0.bias", 545 | "model.diffusion_model.output_blocks.7.1.transformer_blocks.0.norm1.weight": "up_blocks.2.attentions.1.transformer_blocks.0.norm1.weight", 546 | "model.diffusion_model.output_blocks.7.1.transformer_blocks.0.norm1.bias": "up_blocks.2.attentions.1.transformer_blocks.0.norm1.bias", 547 | "model.diffusion_model.output_blocks.7.1.transformer_blocks.0.norm2.weight": "up_blocks.2.attentions.1.transformer_blocks.0.norm2.weight", 548 | "model.diffusion_model.output_blocks.7.1.transformer_blocks.0.norm2.bias": "up_blocks.2.attentions.1.transformer_blocks.0.norm2.bias", 549 | 
"model.diffusion_model.output_blocks.7.1.transformer_blocks.0.norm3.weight": "up_blocks.2.attentions.1.transformer_blocks.0.norm3.weight", 550 | "model.diffusion_model.output_blocks.7.1.transformer_blocks.0.norm3.bias": "up_blocks.2.attentions.1.transformer_blocks.0.norm3.bias", 551 | "model.diffusion_model.output_blocks.7.1.proj_out.weight": "up_blocks.2.attentions.1.proj_out.weight", 552 | "model.diffusion_model.output_blocks.7.1.proj_out.bias": "up_blocks.2.attentions.1.proj_out.bias", 553 | "model.diffusion_model.output_blocks.8.0.in_layers.0.weight": "up_blocks.2.resnets.2.norm1.weight", 554 | "model.diffusion_model.output_blocks.8.0.in_layers.0.bias": "up_blocks.2.resnets.2.norm1.bias", 555 | "model.diffusion_model.output_blocks.8.0.in_layers.2.weight": "up_blocks.2.resnets.2.conv1.weight", 556 | "model.diffusion_model.output_blocks.8.0.in_layers.2.bias": "up_blocks.2.resnets.2.conv1.bias", 557 | "model.diffusion_model.output_blocks.8.0.emb_layers.1.weight": "up_blocks.2.resnets.2.time_emb_proj.weight", 558 | "model.diffusion_model.output_blocks.8.0.emb_layers.1.bias": "up_blocks.2.resnets.2.time_emb_proj.bias", 559 | "model.diffusion_model.output_blocks.8.0.out_layers.0.weight": "up_blocks.2.resnets.2.norm2.weight", 560 | "model.diffusion_model.output_blocks.8.0.out_layers.0.bias": "up_blocks.2.resnets.2.norm2.bias", 561 | "model.diffusion_model.output_blocks.8.0.out_layers.3.weight": "up_blocks.2.resnets.2.conv2.weight", 562 | "model.diffusion_model.output_blocks.8.0.out_layers.3.bias": "up_blocks.2.resnets.2.conv2.bias", 563 | "model.diffusion_model.output_blocks.8.0.skip_connection.weight": "up_blocks.2.resnets.2.conv_shortcut.weight", 564 | "model.diffusion_model.output_blocks.8.0.skip_connection.bias": "up_blocks.2.resnets.2.conv_shortcut.bias", 565 | "model.diffusion_model.output_blocks.8.2.conv.weight": "up_blocks.2.upsamplers.0.conv.weight", 566 | "model.diffusion_model.output_blocks.8.2.conv.bias": "up_blocks.2.upsamplers.0.conv.bias", 567 | "model.diffusion_model.output_blocks.8.1.norm.weight": "up_blocks.2.attentions.2.norm.weight", 568 | "model.diffusion_model.output_blocks.8.1.norm.bias": "up_blocks.2.attentions.2.norm.bias", 569 | "model.diffusion_model.output_blocks.8.1.proj_in.weight": "up_blocks.2.attentions.2.proj_in.weight", 570 | "model.diffusion_model.output_blocks.8.1.proj_in.bias": "up_blocks.2.attentions.2.proj_in.bias", 571 | "model.diffusion_model.output_blocks.8.1.transformer_blocks.0.attn1.to_q.weight": "up_blocks.2.attentions.2.transformer_blocks.0.attn1.to_q.weight", 572 | "model.diffusion_model.output_blocks.8.1.transformer_blocks.0.attn1.to_k.weight": "up_blocks.2.attentions.2.transformer_blocks.0.attn1.to_k.weight", 573 | "model.diffusion_model.output_blocks.8.1.transformer_blocks.0.attn1.to_v.weight": "up_blocks.2.attentions.2.transformer_blocks.0.attn1.to_v.weight", 574 | "model.diffusion_model.output_blocks.8.1.transformer_blocks.0.attn1.to_out.0.weight": "up_blocks.2.attentions.2.transformer_blocks.0.attn1.to_out.0.weight", 575 | "model.diffusion_model.output_blocks.8.1.transformer_blocks.0.attn1.to_out.0.bias": "up_blocks.2.attentions.2.transformer_blocks.0.attn1.to_out.0.bias", 576 | "model.diffusion_model.output_blocks.8.1.transformer_blocks.0.ff.net.0.proj.weight": "up_blocks.2.attentions.2.transformer_blocks.0.ff.net.0.proj.weight", 577 | "model.diffusion_model.output_blocks.8.1.transformer_blocks.0.ff.net.0.proj.bias": "up_blocks.2.attentions.2.transformer_blocks.0.ff.net.0.proj.bias", 578 | 
"model.diffusion_model.output_blocks.8.1.transformer_blocks.0.ff.net.2.weight": "up_blocks.2.attentions.2.transformer_blocks.0.ff.net.2.weight", 579 | "model.diffusion_model.output_blocks.8.1.transformer_blocks.0.ff.net.2.bias": "up_blocks.2.attentions.2.transformer_blocks.0.ff.net.2.bias", 580 | "model.diffusion_model.output_blocks.8.1.transformer_blocks.0.attn2.to_q.weight": "up_blocks.2.attentions.2.transformer_blocks.0.attn2.to_q.weight", 581 | "model.diffusion_model.output_blocks.8.1.transformer_blocks.0.attn2.to_k.weight": "up_blocks.2.attentions.2.transformer_blocks.0.attn2.to_k.weight", 582 | "model.diffusion_model.output_blocks.8.1.transformer_blocks.0.attn2.to_v.weight": "up_blocks.2.attentions.2.transformer_blocks.0.attn2.to_v.weight", 583 | "model.diffusion_model.output_blocks.8.1.transformer_blocks.0.attn2.to_out.0.weight": "up_blocks.2.attentions.2.transformer_blocks.0.attn2.to_out.0.weight", 584 | "model.diffusion_model.output_blocks.8.1.transformer_blocks.0.attn2.to_out.0.bias": "up_blocks.2.attentions.2.transformer_blocks.0.attn2.to_out.0.bias", 585 | "model.diffusion_model.output_blocks.8.1.transformer_blocks.0.norm1.weight": "up_blocks.2.attentions.2.transformer_blocks.0.norm1.weight", 586 | "model.diffusion_model.output_blocks.8.1.transformer_blocks.0.norm1.bias": "up_blocks.2.attentions.2.transformer_blocks.0.norm1.bias", 587 | "model.diffusion_model.output_blocks.8.1.transformer_blocks.0.norm2.weight": "up_blocks.2.attentions.2.transformer_blocks.0.norm2.weight", 588 | "model.diffusion_model.output_blocks.8.1.transformer_blocks.0.norm2.bias": "up_blocks.2.attentions.2.transformer_blocks.0.norm2.bias", 589 | "model.diffusion_model.output_blocks.8.1.transformer_blocks.0.norm3.weight": "up_blocks.2.attentions.2.transformer_blocks.0.norm3.weight", 590 | "model.diffusion_model.output_blocks.8.1.transformer_blocks.0.norm3.bias": "up_blocks.2.attentions.2.transformer_blocks.0.norm3.bias", 591 | "model.diffusion_model.output_blocks.8.1.proj_out.weight": "up_blocks.2.attentions.2.proj_out.weight", 592 | "model.diffusion_model.output_blocks.8.1.proj_out.bias": "up_blocks.2.attentions.2.proj_out.bias", 593 | "model.diffusion_model.output_blocks.9.0.in_layers.0.weight": "up_blocks.3.resnets.0.norm1.weight", 594 | "model.diffusion_model.output_blocks.9.0.in_layers.0.bias": "up_blocks.3.resnets.0.norm1.bias", 595 | "model.diffusion_model.output_blocks.9.0.in_layers.2.weight": "up_blocks.3.resnets.0.conv1.weight", 596 | "model.diffusion_model.output_blocks.9.0.in_layers.2.bias": "up_blocks.3.resnets.0.conv1.bias", 597 | "model.diffusion_model.output_blocks.9.0.emb_layers.1.weight": "up_blocks.3.resnets.0.time_emb_proj.weight", 598 | "model.diffusion_model.output_blocks.9.0.emb_layers.1.bias": "up_blocks.3.resnets.0.time_emb_proj.bias", 599 | "model.diffusion_model.output_blocks.9.0.out_layers.0.weight": "up_blocks.3.resnets.0.norm2.weight", 600 | "model.diffusion_model.output_blocks.9.0.out_layers.0.bias": "up_blocks.3.resnets.0.norm2.bias", 601 | "model.diffusion_model.output_blocks.9.0.out_layers.3.weight": "up_blocks.3.resnets.0.conv2.weight", 602 | "model.diffusion_model.output_blocks.9.0.out_layers.3.bias": "up_blocks.3.resnets.0.conv2.bias", 603 | "model.diffusion_model.output_blocks.9.0.skip_connection.weight": "up_blocks.3.resnets.0.conv_shortcut.weight", 604 | "model.diffusion_model.output_blocks.9.0.skip_connection.bias": "up_blocks.3.resnets.0.conv_shortcut.bias", 605 | "model.diffusion_model.output_blocks.9.1.norm.weight": "up_blocks.3.attentions.0.norm.weight", 606 | 
"model.diffusion_model.output_blocks.9.1.norm.bias": "up_blocks.3.attentions.0.norm.bias", 607 | "model.diffusion_model.output_blocks.9.1.proj_in.weight": "up_blocks.3.attentions.0.proj_in.weight", 608 | "model.diffusion_model.output_blocks.9.1.proj_in.bias": "up_blocks.3.attentions.0.proj_in.bias", 609 | "model.diffusion_model.output_blocks.9.1.transformer_blocks.0.attn1.to_q.weight": "up_blocks.3.attentions.0.transformer_blocks.0.attn1.to_q.weight", 610 | "model.diffusion_model.output_blocks.9.1.transformer_blocks.0.attn1.to_k.weight": "up_blocks.3.attentions.0.transformer_blocks.0.attn1.to_k.weight", 611 | "model.diffusion_model.output_blocks.9.1.transformer_blocks.0.attn1.to_v.weight": "up_blocks.3.attentions.0.transformer_blocks.0.attn1.to_v.weight", 612 | "model.diffusion_model.output_blocks.9.1.transformer_blocks.0.attn1.to_out.0.weight": "up_blocks.3.attentions.0.transformer_blocks.0.attn1.to_out.0.weight", 613 | "model.diffusion_model.output_blocks.9.1.transformer_blocks.0.attn1.to_out.0.bias": "up_blocks.3.attentions.0.transformer_blocks.0.attn1.to_out.0.bias", 614 | "model.diffusion_model.output_blocks.9.1.transformer_blocks.0.ff.net.0.proj.weight": "up_blocks.3.attentions.0.transformer_blocks.0.ff.net.0.proj.weight", 615 | "model.diffusion_model.output_blocks.9.1.transformer_blocks.0.ff.net.0.proj.bias": "up_blocks.3.attentions.0.transformer_blocks.0.ff.net.0.proj.bias", 616 | "model.diffusion_model.output_blocks.9.1.transformer_blocks.0.ff.net.2.weight": "up_blocks.3.attentions.0.transformer_blocks.0.ff.net.2.weight", 617 | "model.diffusion_model.output_blocks.9.1.transformer_blocks.0.ff.net.2.bias": "up_blocks.3.attentions.0.transformer_blocks.0.ff.net.2.bias", 618 | "model.diffusion_model.output_blocks.9.1.transformer_blocks.0.attn2.to_q.weight": "up_blocks.3.attentions.0.transformer_blocks.0.attn2.to_q.weight", 619 | "model.diffusion_model.output_blocks.9.1.transformer_blocks.0.attn2.to_k.weight": "up_blocks.3.attentions.0.transformer_blocks.0.attn2.to_k.weight", 620 | "model.diffusion_model.output_blocks.9.1.transformer_blocks.0.attn2.to_v.weight": "up_blocks.3.attentions.0.transformer_blocks.0.attn2.to_v.weight", 621 | "model.diffusion_model.output_blocks.9.1.transformer_blocks.0.attn2.to_out.0.weight": "up_blocks.3.attentions.0.transformer_blocks.0.attn2.to_out.0.weight", 622 | "model.diffusion_model.output_blocks.9.1.transformer_blocks.0.attn2.to_out.0.bias": "up_blocks.3.attentions.0.transformer_blocks.0.attn2.to_out.0.bias", 623 | "model.diffusion_model.output_blocks.9.1.transformer_blocks.0.norm1.weight": "up_blocks.3.attentions.0.transformer_blocks.0.norm1.weight", 624 | "model.diffusion_model.output_blocks.9.1.transformer_blocks.0.norm1.bias": "up_blocks.3.attentions.0.transformer_blocks.0.norm1.bias", 625 | "model.diffusion_model.output_blocks.9.1.transformer_blocks.0.norm2.weight": "up_blocks.3.attentions.0.transformer_blocks.0.norm2.weight", 626 | "model.diffusion_model.output_blocks.9.1.transformer_blocks.0.norm2.bias": "up_blocks.3.attentions.0.transformer_blocks.0.norm2.bias", 627 | "model.diffusion_model.output_blocks.9.1.transformer_blocks.0.norm3.weight": "up_blocks.3.attentions.0.transformer_blocks.0.norm3.weight", 628 | "model.diffusion_model.output_blocks.9.1.transformer_blocks.0.norm3.bias": "up_blocks.3.attentions.0.transformer_blocks.0.norm3.bias", 629 | "model.diffusion_model.output_blocks.9.1.proj_out.weight": "up_blocks.3.attentions.0.proj_out.weight", 630 | "model.diffusion_model.output_blocks.9.1.proj_out.bias": 
"up_blocks.3.attentions.0.proj_out.bias", 631 | "model.diffusion_model.output_blocks.10.0.in_layers.0.weight": "up_blocks.3.resnets.1.norm1.weight", 632 | "model.diffusion_model.output_blocks.10.0.in_layers.0.bias": "up_blocks.3.resnets.1.norm1.bias", 633 | "model.diffusion_model.output_blocks.10.0.in_layers.2.weight": "up_blocks.3.resnets.1.conv1.weight", 634 | "model.diffusion_model.output_blocks.10.0.in_layers.2.bias": "up_blocks.3.resnets.1.conv1.bias", 635 | "model.diffusion_model.output_blocks.10.0.emb_layers.1.weight": "up_blocks.3.resnets.1.time_emb_proj.weight", 636 | "model.diffusion_model.output_blocks.10.0.emb_layers.1.bias": "up_blocks.3.resnets.1.time_emb_proj.bias", 637 | "model.diffusion_model.output_blocks.10.0.out_layers.0.weight": "up_blocks.3.resnets.1.norm2.weight", 638 | "model.diffusion_model.output_blocks.10.0.out_layers.0.bias": "up_blocks.3.resnets.1.norm2.bias", 639 | "model.diffusion_model.output_blocks.10.0.out_layers.3.weight": "up_blocks.3.resnets.1.conv2.weight", 640 | "model.diffusion_model.output_blocks.10.0.out_layers.3.bias": "up_blocks.3.resnets.1.conv2.bias", 641 | "model.diffusion_model.output_blocks.10.0.skip_connection.weight": "up_blocks.3.resnets.1.conv_shortcut.weight", 642 | "model.diffusion_model.output_blocks.10.0.skip_connection.bias": "up_blocks.3.resnets.1.conv_shortcut.bias", 643 | "model.diffusion_model.output_blocks.10.1.norm.weight": "up_blocks.3.attentions.1.norm.weight", 644 | "model.diffusion_model.output_blocks.10.1.norm.bias": "up_blocks.3.attentions.1.norm.bias", 645 | "model.diffusion_model.output_blocks.10.1.proj_in.weight": "up_blocks.3.attentions.1.proj_in.weight", 646 | "model.diffusion_model.output_blocks.10.1.proj_in.bias": "up_blocks.3.attentions.1.proj_in.bias", 647 | "model.diffusion_model.output_blocks.10.1.transformer_blocks.0.attn1.to_q.weight": "up_blocks.3.attentions.1.transformer_blocks.0.attn1.to_q.weight", 648 | "model.diffusion_model.output_blocks.10.1.transformer_blocks.0.attn1.to_k.weight": "up_blocks.3.attentions.1.transformer_blocks.0.attn1.to_k.weight", 649 | "model.diffusion_model.output_blocks.10.1.transformer_blocks.0.attn1.to_v.weight": "up_blocks.3.attentions.1.transformer_blocks.0.attn1.to_v.weight", 650 | "model.diffusion_model.output_blocks.10.1.transformer_blocks.0.attn1.to_out.0.weight": "up_blocks.3.attentions.1.transformer_blocks.0.attn1.to_out.0.weight", 651 | "model.diffusion_model.output_blocks.10.1.transformer_blocks.0.attn1.to_out.0.bias": "up_blocks.3.attentions.1.transformer_blocks.0.attn1.to_out.0.bias", 652 | "model.diffusion_model.output_blocks.10.1.transformer_blocks.0.ff.net.0.proj.weight": "up_blocks.3.attentions.1.transformer_blocks.0.ff.net.0.proj.weight", 653 | "model.diffusion_model.output_blocks.10.1.transformer_blocks.0.ff.net.0.proj.bias": "up_blocks.3.attentions.1.transformer_blocks.0.ff.net.0.proj.bias", 654 | "model.diffusion_model.output_blocks.10.1.transformer_blocks.0.ff.net.2.weight": "up_blocks.3.attentions.1.transformer_blocks.0.ff.net.2.weight", 655 | "model.diffusion_model.output_blocks.10.1.transformer_blocks.0.ff.net.2.bias": "up_blocks.3.attentions.1.transformer_blocks.0.ff.net.2.bias", 656 | "model.diffusion_model.output_blocks.10.1.transformer_blocks.0.attn2.to_q.weight": "up_blocks.3.attentions.1.transformer_blocks.0.attn2.to_q.weight", 657 | "model.diffusion_model.output_blocks.10.1.transformer_blocks.0.attn2.to_k.weight": "up_blocks.3.attentions.1.transformer_blocks.0.attn2.to_k.weight", 658 | 
"model.diffusion_model.output_blocks.10.1.transformer_blocks.0.attn2.to_v.weight": "up_blocks.3.attentions.1.transformer_blocks.0.attn2.to_v.weight", 659 | "model.diffusion_model.output_blocks.10.1.transformer_blocks.0.attn2.to_out.0.weight": "up_blocks.3.attentions.1.transformer_blocks.0.attn2.to_out.0.weight", 660 | "model.diffusion_model.output_blocks.10.1.transformer_blocks.0.attn2.to_out.0.bias": "up_blocks.3.attentions.1.transformer_blocks.0.attn2.to_out.0.bias", 661 | "model.diffusion_model.output_blocks.10.1.transformer_blocks.0.norm1.weight": "up_blocks.3.attentions.1.transformer_blocks.0.norm1.weight", 662 | "model.diffusion_model.output_blocks.10.1.transformer_blocks.0.norm1.bias": "up_blocks.3.attentions.1.transformer_blocks.0.norm1.bias", 663 | "model.diffusion_model.output_blocks.10.1.transformer_blocks.0.norm2.weight": "up_blocks.3.attentions.1.transformer_blocks.0.norm2.weight", 664 | "model.diffusion_model.output_blocks.10.1.transformer_blocks.0.norm2.bias": "up_blocks.3.attentions.1.transformer_blocks.0.norm2.bias", 665 | "model.diffusion_model.output_blocks.10.1.transformer_blocks.0.norm3.weight": "up_blocks.3.attentions.1.transformer_blocks.0.norm3.weight", 666 | "model.diffusion_model.output_blocks.10.1.transformer_blocks.0.norm3.bias": "up_blocks.3.attentions.1.transformer_blocks.0.norm3.bias", 667 | "model.diffusion_model.output_blocks.10.1.proj_out.weight": "up_blocks.3.attentions.1.proj_out.weight", 668 | "model.diffusion_model.output_blocks.10.1.proj_out.bias": "up_blocks.3.attentions.1.proj_out.bias", 669 | "model.diffusion_model.output_blocks.11.0.in_layers.0.weight": "up_blocks.3.resnets.2.norm1.weight", 670 | "model.diffusion_model.output_blocks.11.0.in_layers.0.bias": "up_blocks.3.resnets.2.norm1.bias", 671 | "model.diffusion_model.output_blocks.11.0.in_layers.2.weight": "up_blocks.3.resnets.2.conv1.weight", 672 | "model.diffusion_model.output_blocks.11.0.in_layers.2.bias": "up_blocks.3.resnets.2.conv1.bias", 673 | "model.diffusion_model.output_blocks.11.0.emb_layers.1.weight": "up_blocks.3.resnets.2.time_emb_proj.weight", 674 | "model.diffusion_model.output_blocks.11.0.emb_layers.1.bias": "up_blocks.3.resnets.2.time_emb_proj.bias", 675 | "model.diffusion_model.output_blocks.11.0.out_layers.0.weight": "up_blocks.3.resnets.2.norm2.weight", 676 | "model.diffusion_model.output_blocks.11.0.out_layers.0.bias": "up_blocks.3.resnets.2.norm2.bias", 677 | "model.diffusion_model.output_blocks.11.0.out_layers.3.weight": "up_blocks.3.resnets.2.conv2.weight", 678 | "model.diffusion_model.output_blocks.11.0.out_layers.3.bias": "up_blocks.3.resnets.2.conv2.bias", 679 | "model.diffusion_model.output_blocks.11.0.skip_connection.weight": "up_blocks.3.resnets.2.conv_shortcut.weight", 680 | "model.diffusion_model.output_blocks.11.0.skip_connection.bias": "up_blocks.3.resnets.2.conv_shortcut.bias", 681 | "model.diffusion_model.output_blocks.11.1.norm.weight": "up_blocks.3.attentions.2.norm.weight", 682 | "model.diffusion_model.output_blocks.11.1.norm.bias": "up_blocks.3.attentions.2.norm.bias", 683 | "model.diffusion_model.output_blocks.11.1.proj_in.weight": "up_blocks.3.attentions.2.proj_in.weight", 684 | "model.diffusion_model.output_blocks.11.1.proj_in.bias": "up_blocks.3.attentions.2.proj_in.bias", 685 | "model.diffusion_model.output_blocks.11.1.transformer_blocks.0.attn1.to_q.weight": "up_blocks.3.attentions.2.transformer_blocks.0.attn1.to_q.weight", 686 | "model.diffusion_model.output_blocks.11.1.transformer_blocks.0.attn1.to_k.weight": 
"up_blocks.3.attentions.2.transformer_blocks.0.attn1.to_k.weight", 687 | "model.diffusion_model.output_blocks.11.1.transformer_blocks.0.attn1.to_v.weight": "up_blocks.3.attentions.2.transformer_blocks.0.attn1.to_v.weight", 688 | "model.diffusion_model.output_blocks.11.1.transformer_blocks.0.attn1.to_out.0.weight": "up_blocks.3.attentions.2.transformer_blocks.0.attn1.to_out.0.weight", 689 | "model.diffusion_model.output_blocks.11.1.transformer_blocks.0.attn1.to_out.0.bias": "up_blocks.3.attentions.2.transformer_blocks.0.attn1.to_out.0.bias", 690 | "model.diffusion_model.output_blocks.11.1.transformer_blocks.0.ff.net.0.proj.weight": "up_blocks.3.attentions.2.transformer_blocks.0.ff.net.0.proj.weight", 691 | "model.diffusion_model.output_blocks.11.1.transformer_blocks.0.ff.net.0.proj.bias": "up_blocks.3.attentions.2.transformer_blocks.0.ff.net.0.proj.bias", 692 | "model.diffusion_model.output_blocks.11.1.transformer_blocks.0.ff.net.2.weight": "up_blocks.3.attentions.2.transformer_blocks.0.ff.net.2.weight", 693 | "model.diffusion_model.output_blocks.11.1.transformer_blocks.0.ff.net.2.bias": "up_blocks.3.attentions.2.transformer_blocks.0.ff.net.2.bias", 694 | "model.diffusion_model.output_blocks.11.1.transformer_blocks.0.attn2.to_q.weight": "up_blocks.3.attentions.2.transformer_blocks.0.attn2.to_q.weight", 695 | "model.diffusion_model.output_blocks.11.1.transformer_blocks.0.attn2.to_k.weight": "up_blocks.3.attentions.2.transformer_blocks.0.attn2.to_k.weight", 696 | "model.diffusion_model.output_blocks.11.1.transformer_blocks.0.attn2.to_v.weight": "up_blocks.3.attentions.2.transformer_blocks.0.attn2.to_v.weight", 697 | "model.diffusion_model.output_blocks.11.1.transformer_blocks.0.attn2.to_out.0.weight": "up_blocks.3.attentions.2.transformer_blocks.0.attn2.to_out.0.weight", 698 | "model.diffusion_model.output_blocks.11.1.transformer_blocks.0.attn2.to_out.0.bias": "up_blocks.3.attentions.2.transformer_blocks.0.attn2.to_out.0.bias", 699 | "model.diffusion_model.output_blocks.11.1.transformer_blocks.0.norm1.weight": "up_blocks.3.attentions.2.transformer_blocks.0.norm1.weight", 700 | "model.diffusion_model.output_blocks.11.1.transformer_blocks.0.norm1.bias": "up_blocks.3.attentions.2.transformer_blocks.0.norm1.bias", 701 | "model.diffusion_model.output_blocks.11.1.transformer_blocks.0.norm2.weight": "up_blocks.3.attentions.2.transformer_blocks.0.norm2.weight", 702 | "model.diffusion_model.output_blocks.11.1.transformer_blocks.0.norm2.bias": "up_blocks.3.attentions.2.transformer_blocks.0.norm2.bias", 703 | "model.diffusion_model.output_blocks.11.1.transformer_blocks.0.norm3.weight": "up_blocks.3.attentions.2.transformer_blocks.0.norm3.weight", 704 | "model.diffusion_model.output_blocks.11.1.transformer_blocks.0.norm3.bias": "up_blocks.3.attentions.2.transformer_blocks.0.norm3.bias", 705 | "model.diffusion_model.output_blocks.11.1.proj_out.weight": "up_blocks.3.attentions.2.proj_out.weight", 706 | "model.diffusion_model.output_blocks.11.1.proj_out.bias": "up_blocks.3.attentions.2.proj_out.bias" 707 | } 708 | 709 | 710 | class StableDiffusionPipelineStripped(DiffusionPipeline): 711 | def __init__(self, unet=None, *args, **kwargs): 712 | print("got unet", len(args), kwargs.keys()) 713 | self.unet = unet 714 | 715 | 716 | def convert_diff_to_sd(diffusers_model_path: str, base_ckpt_path: str, output_ckpt_path: str, 717 | overwrite=False, huggingface_use_auth_token=None): 718 | if not overwrite and os.path.exists(output_ckpt_path): 719 | raise ValueError("output_ckpt_path exists already", output_ckpt_path) 
720 | 
721 |     print(f"loading diff model from {diffusers_model_path!r}")
722 |     try:
723 |         diff_pipe = StableDiffusionPipeline.from_pretrained(diffusers_model_path,
724 |                                                             use_auth_token=huggingface_use_auth_token)
725 |     except Exception as ex:
726 |         if "required positional arguments" in str(ex):  # stripped saves cannot build the full pipeline
727 |             print("load error, trying loading stripped version", ex)
728 |             diff_pipe = StableDiffusionPipelineStripped.from_pretrained(diffusers_model_path,
729 |                                                                         use_auth_token=huggingface_use_auth_token)
730 |         else:
731 |             raise
732 | 
733 |     print("loading diff model done!")
734 | 
735 |     diff_pipe_unet_sd = diff_pipe.unet.state_dict()
736 |     print("diff_pipe_unet_sd done")
737 | 
738 |     print(f"loading sd ckpt from {base_ckpt_path!r}")
739 |     org_model = torch.load(base_ckpt_path, map_location="cpu")  # load on CPU so conversion works without a GPU
740 |     org_sd = org_model["state_dict"]
741 |     print("loading sd ckpt done!")
742 | 
743 |     for ckpt_key, diff_key in KeyMap.items():  # overwrite the original checkpoint's UNet tensors with the converted diffusers weights
744 |         org_sd[ckpt_key] = diff_pipe_unet_sd[diff_key]
745 | 
746 |     print(f"saving converted unet to {output_ckpt_path!r}")
747 |     torch.save(org_model, output_ckpt_path)
748 |     print("done")
749 | 
750 | 
751 | if __name__ == "__main__":
752 |     def setup():
753 |         parser = argparse.ArgumentParser()
754 |         parser.add_argument("diffusers_model")
755 |         # parser.add_argument("config_path")
756 |         parser.add_argument("base_ckpt_path")
757 |         parser.add_argument("output_ckpt_path")
758 |         parser.add_argument("-u", "--use_huggingface_auth_token")
759 |         parser.add_argument("-o", "--overwrite", action="store_true")
760 |         args = parser.parse_args()
761 |         convert_diff_to_sd(args.diffusers_model, args.base_ckpt_path, args.output_ckpt_path,
762 |                            overwrite=args.overwrite, huggingface_use_auth_token=args.use_huggingface_auth_token)
763 | 
764 | 
765 |     setup()
766 | 
--------------------------------------------------------------------------------
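
The KeyMap table above is mechanical: every input_blocks/output_blocks index corresponds to a (block, layer) pair under the diffusers naming, and the per-layer renames (in_layers.0 to norm1, in_layers.2 to conv1, emb_layers.1 to time_emb_proj, out_layers.0 to norm2, out_layers.3 to conv2, skip_connection to conv_shortcut) repeat verbatim. As a sketch of that pattern, not code from this repo, and assuming the standard SD 1.x layout of two resnets per down block and three per up block, the resnet rows could be generated rather than hand-written:

# Sketch only: rebuilds the resnet rows of a KeyMap-style table for the
# SD 1.x UNet. num_res_blocks=2 matches the table above; ldm_keys filters
# out entries a given checkpoint does not actually contain.
RESNET_RENAMES = {
    "in_layers.0": "norm1",
    "in_layers.2": "conv1",
    "emb_layers.1": "time_emb_proj",
    "out_layers.0": "norm2",
    "out_layers.3": "conv2",
    "skip_connection": "conv_shortcut",
}

def resnet_keymap(ldm_keys, num_res_blocks=2):
    keymap = {}
    per_block = num_res_blocks + 1  # resnet slots plus one down/upsampler slot
    for i in range(1, 12):  # input_blocks.0 is conv_in; 3, 6, 9 are downsamplers
        if i % per_block == 0:
            continue
        block, layer = (i - 1) // per_block, (i - 1) % per_block
        for old, new in RESNET_RENAMES.items():
            for p in ("weight", "bias"):
                keymap[f"model.diffusion_model.input_blocks.{i}.0.{old}.{p}"] = (
                    f"down_blocks.{block}.resnets.{layer}.{new}.{p}"
                )
    for i in range(12):  # output_blocks: three resnets per up block
        block, layer = i // per_block, i % per_block
        for old, new in RESNET_RENAMES.items():
            for p in ("weight", "bias"):
                keymap[f"model.diffusion_model.output_blocks.{i}.0.{old}.{p}"] = (
                    f"up_blocks.{block}.resnets.{layer}.{new}.{p}"
                )
    return {k: v for k, v in keymap.items() if k in ldm_keys}

Filtering against the checkpoint's real keys in the last line matters because skip_connection only exists where a resnet changes channel width; compare input_blocks.7.0, which has one, with input_blocks.8.0, which does not.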
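
For completeness, a hypothetical invocation of the converter (the module and function names are the ones defined above; all paths are invented for illustration):

from convert_diffusers_to_sd import convert_diff_to_sd

convert_diff_to_sd(
    "path/to/diffusers-model",   # folder produced by diffusers save_pretrained
    "path/to/sd-v1-base.ckpt",   # original checkpoint whose UNet weights get replaced
    "path/to/converted.ckpt",
    overwrite=True,              # required if converted.ckpt already exists
)

Per the argparse setup, the equivalent command line is: python convert_diffusers_to_sd.py path/to/diffusers-model path/to/sd-v1-base.ckpt path/to/converted.ckpt -o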