├── README.md
├── __init__.py
├── diffusers_helper
│   ├── bucket_tools.py
│   ├── clip_vision.py
│   ├── dit_common.py
│   ├── gradio
│   │   ├── __pycache__
│   │   │   └── progress_bar.cpython-310.pyc
│   │   └── progress_bar.py
│   ├── hf_login.py
│   ├── hunyuan.py
│   ├── k_diffusion
│   │   ├── __pycache__
│   │   │   ├── uni_pc_fm.cpython-310.pyc
│   │   │   ├── uni_pc_fm.cpython-312.pyc
│   │   │   ├── wrapper.cpython-310.pyc
│   │   │   └── wrapper.cpython-312.pyc
│   │   ├── uni_pc_fm.py
│   │   └── wrapper.py
│   ├── memory.py
│   ├── models
│   │   ├── __pycache__
│   │   │   ├── hunyuan_video_packed.cpython-310.pyc
│   │   │   └── hunyuan_video_packed.cpython-312.pyc
│   │   └── hunyuan_video_packed.py
│   ├── pipelines
│   │   ├── __pycache__
│   │   │   ├── k_diffusion_hunyuan.cpython-310.pyc
│   │   │   └── k_diffusion_hunyuan.cpython-312.pyc
│   │   └── k_diffusion_hunyuan.py
│   ├── thread_utils.py
│   └── utils.py
├── examples
│   ├── FramePack_endimage.json
│   └── FramePack_regular.json
├── nodes.py
└── requirements.txt
/README.md:
--------------------------------------------------------------------------------
1 | # FramePack for ComfyUI
2 |
3 | **20250506 Update:** Added support for `FramePack_F1`.
4 | - **Download F1 Workflow (English)**: [https://www.runninghub.ai/post/1919141028262252546](https://www.runninghub.ai/post/1919141028262252546)
5 | - **Download F1 Workflow (Chinese)**: [https://www.runninghub.cn/post/1919141028262252546](https://www.runninghub.cn/post/1919141028262252546)
6 |
7 | **20250421 Update:** Added support for first/last-frame image-to-video generation, contributed by
8 | [TTPlanetPig](https://github.com/TTPlanetPig) (https://github.com/lllyasviel/FramePack/pull/167).
9 |
10 | ## Online Access
11 | You can access RunningHub online to use this plugin and models for free:
12 | ### English Version
13 | - **Run & Download Workflow**:
14 | [https://www.runninghub.ai/post/1912930457355517954](https://www.runninghub.ai/post/1912930457355517954)
15 | ### Chinese Version
16 | - **Run & Download Workflow**:
17 | [https://www.runninghub.cn/post/1912930457355517954](https://www.runninghub.cn/post/1912930457355517954)
18 |
19 | ## Features
20 | This is a simple implementation of https://github.com/lllyasviel/FramePack. Its main advantages are:
21 | - Better automatic adaptation for 24GB GPUs, enabling higher resolution processing whenever possible.
22 | - The entire workflow requires no parameter adjustments, making it extremely user-friendly.
23 |
24 |
25 |
26 |
27 | # Model Download Guide
28 |
29 | ## Choose a Download Method (Pick One)
30 |
31 | 1. **Download via Cloud Storage (for users in China)**
32 | - [T8 model pack (Quark drive)](https://pan.quark.cn/s/9669ce6c7356)
33 | 2. **One-Click Download with Python Script**
34 | ```python
35 | from huggingface_hub import snapshot_download
36 |
37 | # Download HunyuanVideo model
38 | snapshot_download(
39 | repo_id="hunyuanvideo-community/HunyuanVideo",
40 | local_dir="HunyuanVideo",
41 | ignore_patterns=["transformer/*", "*.git*", "*.log*", "*.md"],
42 | local_dir_use_symlinks=False
43 | )
44 |
45 | # Download flux_redux_bfl model
46 | snapshot_download(
47 | repo_id="lllyasviel/flux_redux_bfl",
48 | local_dir="flux_redux_bfl",
49 | ignore_patterns=["*.git*", "*.log*", "*.md"],
50 | local_dir_use_symlinks=False
51 | )
52 |
53 | # Download FramePackI2V_HY model
54 | snapshot_download(
55 | repo_id="lllyasviel/FramePackI2V_HY",
56 | local_dir="FramePackI2V_HY",
57 | ignore_patterns=["*.git*", "*.log*", "*.md"],
58 | local_dir_use_symlinks=False
59 | )
60 |
61 | # Download FramePackF1_HY model
62 | snapshot_download(
63 | repo_id="lllyasviel/FramePack_F1_I2V_HY_20250503",
64 | local_dir="FramePackF1_HY",
65 | ignore_patterns=["transformer/*", "*.git*", "*.log*", "*.md"],
66 | local_dir_use_symlinks=False
67 | )
68 | ```
69 | 3. **Manual Download**
70 | - HunyuanVideo: [HuggingFace Link](https://huggingface.co/hunyuanvideo-community/HunyuanVideo/tree/main)
71 | - Flux Redux BFL: [HuggingFace Link](https://huggingface.co/lllyasviel/flux_redux_bfl/tree/main)
72 | - FramePackI2V: [HuggingFace Link](https://huggingface.co/lllyasviel/FramePackI2V_HY/tree/main)
73 | - FramePackF1_HY: [HuggingFace Link](https://huggingface.co/lllyasviel/FramePack_F1_I2V_HY_20250503/tree/main)
74 |
75 | 4. **File Structure After Download**
76 | ```
77 | comfyui/models/
78 | FramePackF1_HY
79 | ├── config.json
80 | ├── diffusion_pytorch_model-00001-of-00003.safetensors
81 | ├── diffusion_pytorch_model-00002-of-00003.safetensors
82 | ├── diffusion_pytorch_model-00003-of-00003.safetensors
83 | ├── diffusion_pytorch_model.safetensors.index.json
84 | └── down.py
85 | FramePackI2V_HY
86 | ├── config.json
87 | ├── diffusion_pytorch_model-00001-of-00003.safetensors
88 | ├── diffusion_pytorch_model-00002-of-00003.safetensors
89 | ├── diffusion_pytorch_model-00003-of-00003.safetensors
90 | └── diffusion_pytorch_model.safetensors.index.json
91 | flux_redux_bfl
92 | ├── feature_extractor
93 | │ └── preprocessor_config.json
94 | ├── image_embedder
95 | │ ├── config.json
96 | │ └── diffusion_pytorch_model.safetensors
97 | ├── image_encoder
98 | │ ├── config.json
99 | │ └── model.safetensors
100 | └── model_index.json
101 | HunyuanVideo
102 | ├── config.json
103 | ├── model_index.json
104 | ├── scheduler
105 | │ └── scheduler_config.json
106 | ├── text_encoder
107 | │ ├── config.json
108 | │ ├── model-00001-of-00004.safetensors
109 | │ ├── model-00002-of-00004.safetensors
110 | │ ├── model-00003-of-00004.safetensors
111 | │ ├── model-00004-of-00004.safetensors
112 | │ └── model.safetensors.index.json
113 | ├── text_encoder_2
114 | │ ├── config.json
115 | │ └── model.safetensors
116 | ├── tokenizer
117 | │ ├── special_tokens_map.json
118 | │ ├── tokenizer.json
119 | │ └── tokenizer_config.json
120 | ├── tokenizer_2
121 | │ ├── merges.txt
122 | │ ├── special_tokens_map.json
123 | │ ├── tokenizer_config.json
124 | │ └── vocab.json
125 | └── vae
126 | ├── config.json
127 | └── diffusion_pytorch_model.safetensors
128 | ```
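
After downloading, you can optionally sanity-check the layout with a short script. This is a minimal sketch; adjust `models_root` to your actual ComfyUI install path.

```python
import os

# Check that the four model folders landed under ComfyUI's models directory.
models_root = "comfyui/models"  # adjust to your ComfyUI install path
for name in ["HunyuanVideo", "flux_redux_bfl", "FramePackI2V_HY", "FramePackF1_HY"]:
    path = os.path.join(models_root, name)
    print(f"{name}: {'OK' if os.path.isdir(path) else 'MISSING'} ({path})")
```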
129 |
130 |
131 |
--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
1 | from .nodes import NODE_CLASS_MAPPINGS
2 | NODE_DISPLAY_NAME_MAPPINGS = {k:v.TITLE for k,v in NODE_CLASS_MAPPINGS.items()}
3 | __all__ = ['NODE_CLASS_MAPPINGS', 'NODE_DISPLAY_NAME_MAPPINGS']
--------------------------------------------------------------------------------
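
The display names above are built from a `TITLE` attribute on each node class, so `nodes.py` is expected to register classes that carry one. A minimal hypothetical sketch of that contract (the class, category, and input names below are illustrative, not the plugin's actual nodes):

```python
# Hypothetical ComfyUI node illustrating the contract __init__.py relies on:
# a class with a TITLE attribute, registered in NODE_CLASS_MAPPINGS in nodes.py.
class ExampleFramePackNode:
    TITLE = "Example FramePack Node"   # consumed as v.TITLE in __init__.py
    CATEGORY = "Runninghub/FramePack"  # illustrative category
    RETURN_TYPES = ("IMAGE",)
    FUNCTION = "run"

    @classmethod
    def INPUT_TYPES(cls):
        return {"required": {"image": ("IMAGE",)}}

    def run(self, image):
        return (image,)


NODE_CLASS_MAPPINGS = {"ExampleFramePackNode": ExampleFramePackNode}
```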
/diffusers_helper/bucket_tools.py:
--------------------------------------------------------------------------------
1 | bucket_options = {
2 | 640: [
3 | (416, 960),
4 | (448, 864),
5 | (480, 832),
6 | (512, 768),
7 | (544, 704),
8 | (576, 672),
9 | (608, 640),
10 | (640, 608),
11 | (672, 576),
12 | (704, 544),
13 | (768, 512),
14 | (832, 480),
15 | (864, 448),
16 | (960, 416),
17 | ],
18 | }
19 |
20 |
21 | def find_nearest_bucket(h, w, resolution=640):
22 | min_metric = float('inf')
23 | best_bucket = None
24 | for (bucket_h, bucket_w) in bucket_options[resolution]:
25 | metric = abs(h * bucket_w - w * bucket_h)
26 | if metric <= min_metric:
27 | min_metric = metric
28 | best_bucket = (bucket_h, bucket_w)
29 | return best_bucket
30 |
31 |
--------------------------------------------------------------------------------
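
A quick usage sketch of `find_nearest_bucket`: the metric `abs(h * bucket_w - w * bucket_h)` is zero exactly when the aspect ratios match, so the function returns the 640-area bucket closest to the input's aspect ratio (the `<=` comparison means later, wider buckets win ties). This assumes the plugin root is importable as `diffusers_helper`.

```python
from diffusers_helper.bucket_tools import find_nearest_bucket

# A 1280x720 frame (h=720, w=1280, 16:9 landscape) maps to the closest bucket.
bucket_h, bucket_w = find_nearest_bucket(h=720, w=1280, resolution=640)
print(bucket_h, bucket_w)  # (480, 832): aspect 0.577, nearest to 720/1280 = 0.5625
```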
/diffusers_helper/clip_vision.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 |
4 | def hf_clip_vision_encode(image, feature_extractor, image_encoder):
5 | assert isinstance(image, np.ndarray)
6 | assert image.ndim == 3 and image.shape[2] == 3
7 | assert image.dtype == np.uint8
8 |
9 | preprocessed = feature_extractor.preprocess(images=image, return_tensors="pt").to(device=image_encoder.device, dtype=image_encoder.dtype)
10 | image_encoder_output = image_encoder(**preprocessed)
11 |
12 | return image_encoder_output
13 |
--------------------------------------------------------------------------------
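
A hedged usage sketch, assuming the `flux_redux_bfl` folder from the download guide above and the SigLIP classes from `transformers` (a CUDA device is assumed; the plugin's own loading code lives in `nodes.py`):

```python
import numpy as np
import torch
from transformers import SiglipImageProcessor, SiglipVisionModel

from diffusers_helper.clip_vision import hf_clip_vision_encode

# Assumption: flux_redux_bfl was downloaded locally as described in the README.
feature_extractor = SiglipImageProcessor.from_pretrained("flux_redux_bfl", subfolder="feature_extractor")
image_encoder = SiglipVisionModel.from_pretrained(
    "flux_redux_bfl", subfolder="image_encoder", torch_dtype=torch.float16
).cuda()

image = np.zeros((480, 832, 3), dtype=np.uint8)  # HWC uint8, as the asserts require
out = hf_clip_vision_encode(image, feature_extractor, image_encoder)
print(out.last_hidden_state.shape)  # per-patch image features used as conditioning
```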
/diffusers_helper/dit_common.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import accelerate.accelerator
3 |
4 | from diffusers.models.normalization import RMSNorm, LayerNorm, FP32LayerNorm, AdaLayerNormContinuous
5 |
6 |
7 | accelerate.accelerator.convert_outputs_to_fp32 = lambda x: x
8 |
9 |
10 | def LayerNorm_forward(self, x):
11 | return torch.nn.functional.layer_norm(x, self.normalized_shape, self.weight, self.bias, self.eps).to(x)
12 |
13 |
14 | LayerNorm.forward = LayerNorm_forward
15 | torch.nn.LayerNorm.forward = LayerNorm_forward
16 |
17 |
18 | def FP32LayerNorm_forward(self, x):
19 | origin_dtype = x.dtype
20 | return torch.nn.functional.layer_norm(
21 | x.float(),
22 | self.normalized_shape,
23 | self.weight.float() if self.weight is not None else None,
24 | self.bias.float() if self.bias is not None else None,
25 | self.eps,
26 | ).to(origin_dtype)
27 |
28 |
29 | FP32LayerNorm.forward = FP32LayerNorm_forward
30 |
31 |
32 | def RMSNorm_forward(self, hidden_states):
33 | input_dtype = hidden_states.dtype
34 | variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True)
35 | hidden_states = hidden_states * torch.rsqrt(variance + self.eps)
36 |
37 | if self.weight is None:
38 | return hidden_states.to(input_dtype)
39 |
40 | return hidden_states.to(input_dtype) * self.weight.to(input_dtype)
41 |
42 |
43 | RMSNorm.forward = RMSNorm_forward
44 |
45 |
46 | def AdaLayerNormContinuous_forward(self, x, conditioning_embedding):
47 | emb = self.linear(self.silu(conditioning_embedding))
48 | scale, shift = emb.chunk(2, dim=1)
49 | x = self.norm(x) * (1 + scale)[:, None, :] + shift[:, None, :]
50 | return x
51 |
52 |
53 | AdaLayerNormContinuous.forward = AdaLayerNormContinuous_forward
54 |
--------------------------------------------------------------------------------
/diffusers_helper/gradio/__pycache__/progress_bar.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HM-RunningHub/ComfyUI_RH_FramePack/ad48591fcdf6af01489305a339e25fe6fcff8869/diffusers_helper/gradio/__pycache__/progress_bar.cpython-310.pyc
--------------------------------------------------------------------------------
/diffusers_helper/gradio/progress_bar.py:
--------------------------------------------------------------------------------
1 | progress_html = '''
2 | <div class="loader-container">
3 |   <div class="loader"></div>
4 |   <div class="progress-container">
5 |     <progress value="*number*" max="100"></progress>
6 |   </div>
7 |   <span>*text*</span>
8 | </div>
9 | '''
10 |
11 | css = '''
12 | .loader-container {
13 | display: flex; /* Use flex to align items horizontally */
14 | align-items: center; /* Center items vertically within the container */
15 | white-space: nowrap; /* Prevent line breaks within the container */
16 | }
17 |
18 | .loader {
19 | border: 8px solid #f3f3f3; /* Light grey */
20 | border-top: 8px solid #3498db; /* Blue */
21 | border-radius: 50%;
22 | width: 30px;
23 | height: 30px;
24 | animation: spin 2s linear infinite;
25 | }
26 |
27 | @keyframes spin {
28 | 0% { transform: rotate(0deg); }
29 | 100% { transform: rotate(360deg); }
30 | }
31 |
32 | /* Style the progress bar */
33 | progress {
34 | appearance: none; /* Remove default styling */
35 | height: 20px; /* Set the height of the progress bar */
36 | border-radius: 5px; /* Round the corners of the progress bar */
37 | background-color: #f3f3f3; /* Light grey background */
38 | width: 100%;
39 | vertical-align: middle !important;
40 | }
41 |
42 | /* Style the progress bar container */
43 | .progress-container {
44 | margin-left: 20px;
45 | margin-right: 20px;
46 | flex-grow: 1; /* Allow the progress container to take up remaining space */
47 | }
48 |
49 | /* Set the color of the progress bar fill */
50 | progress::-webkit-progress-value {
51 | background-color: #3498db; /* Blue color for the fill */
52 | }
53 |
54 | progress::-moz-progress-bar {
55 | background-color: #3498db; /* Blue color for the fill in Firefox */
56 | }
57 |
58 | /* Style the text on the progress bar */
59 | progress::after {
60 | content: attr(value '%'); /* Display the progress value followed by '%' */
61 | position: absolute;
62 | top: 50%;
63 | left: 50%;
64 | transform: translate(-50%, -50%);
65 | color: white; /* Set text color */
66 | font-size: 14px; /* Set font size */
67 | }
68 |
69 | /* Style other texts */
70 | .loader-container > span {
71 | margin-left: 5px; /* Add spacing between the progress bar and the text */
72 | }
73 |
74 | .no-generating-animation > .generating {
75 | display: none !important;
76 | }
77 |
78 | '''
79 |
80 |
81 | def make_progress_bar_html(number, text):
82 | return progress_html.replace('*number*', str(number)).replace('*text*', text)
83 |
84 |
85 | def make_progress_bar_css():
86 | return css
87 |
--------------------------------------------------------------------------------
/diffusers_helper/hf_login.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 |
4 | def login(token):
5 | from huggingface_hub import login
6 | import time
7 |
8 | while True:
9 | try:
10 | login(token)
11 | print('HF login ok.')
12 | break
13 | except Exception as e:
14 | print(f'HF login failed: {e}. Retrying')
15 | time.sleep(0.5)
16 |
17 |
18 | hf_token = os.environ.get('HF_TOKEN', None)
19 |
20 | if hf_token is not None:
21 | login(hf_token)
22 |
--------------------------------------------------------------------------------
/diffusers_helper/hunyuan.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 | from diffusers.pipelines.hunyuan_video.pipeline_hunyuan_video import DEFAULT_PROMPT_TEMPLATE
4 | from diffusers_helper.utils import crop_or_pad_yield_mask
5 |
6 |
7 | @torch.no_grad()
8 | def encode_prompt_conds(prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2, max_length=256):
9 | assert isinstance(prompt, str)
10 |
11 | prompt = [prompt]
12 |
13 | # LLAMA
14 |
15 | prompt_llama = [DEFAULT_PROMPT_TEMPLATE["template"].format(p) for p in prompt]
16 | crop_start = DEFAULT_PROMPT_TEMPLATE["crop_start"]
17 |
18 | llama_inputs = tokenizer(
19 | prompt_llama,
20 | padding="max_length",
21 | max_length=max_length + crop_start,
22 | truncation=True,
23 | return_tensors="pt",
24 | return_length=False,
25 | return_overflowing_tokens=False,
26 | return_attention_mask=True,
27 | )
28 |
29 | llama_input_ids = llama_inputs.input_ids.to(text_encoder.device)
30 | llama_attention_mask = llama_inputs.attention_mask.to(text_encoder.device)
31 | llama_attention_length = int(llama_attention_mask.sum())
32 |
33 | llama_outputs = text_encoder(
34 | input_ids=llama_input_ids,
35 | attention_mask=llama_attention_mask,
36 | output_hidden_states=True,
37 | )
38 |
39 | llama_vec = llama_outputs.hidden_states[-3][:, crop_start:llama_attention_length]
40 | # llama_vec_remaining = llama_outputs.hidden_states[-3][:, llama_attention_length:]
41 | llama_attention_mask = llama_attention_mask[:, crop_start:llama_attention_length]
42 |
43 | assert torch.all(llama_attention_mask.bool())
44 |
45 | # CLIP
46 |
47 | clip_l_input_ids = tokenizer_2(
48 | prompt,
49 | padding="max_length",
50 | max_length=77,
51 | truncation=True,
52 | return_overflowing_tokens=False,
53 | return_length=False,
54 | return_tensors="pt",
55 | ).input_ids
56 | clip_l_pooler = text_encoder_2(clip_l_input_ids.to(text_encoder_2.device), output_hidden_states=False).pooler_output
57 |
58 | return llama_vec, clip_l_pooler
59 |
60 |
61 | @torch.no_grad()
62 | def vae_decode_fake(latents):
63 | latent_rgb_factors = [
64 | [-0.0395, -0.0331, 0.0445],
65 | [0.0696, 0.0795, 0.0518],
66 | [0.0135, -0.0945, -0.0282],
67 | [0.0108, -0.0250, -0.0765],
68 | [-0.0209, 0.0032, 0.0224],
69 | [-0.0804, -0.0254, -0.0639],
70 | [-0.0991, 0.0271, -0.0669],
71 | [-0.0646, -0.0422, -0.0400],
72 | [-0.0696, -0.0595, -0.0894],
73 | [-0.0799, -0.0208, -0.0375],
74 | [0.1166, 0.1627, 0.0962],
75 | [0.1165, 0.0432, 0.0407],
76 | [-0.2315, -0.1920, -0.1355],
77 | [-0.0270, 0.0401, -0.0821],
78 | [-0.0616, -0.0997, -0.0727],
79 | [0.0249, -0.0469, -0.1703]
80 | ] # From comfyui
81 |
82 | latent_rgb_factors_bias = [0.0259, -0.0192, -0.0761]
83 |
84 | weight = torch.tensor(latent_rgb_factors, device=latents.device, dtype=latents.dtype).transpose(0, 1)[:, :, None, None, None]
85 | bias = torch.tensor(latent_rgb_factors_bias, device=latents.device, dtype=latents.dtype)
86 |
87 | images = torch.nn.functional.conv3d(latents, weight, bias=bias, stride=1, padding=0, dilation=1, groups=1)
88 | images = images.clamp(0.0, 1.0)
89 |
90 | return images
91 |
92 |
93 | @torch.no_grad()
94 | def vae_decode(latents, vae, image_mode=False):
95 | latents = latents / vae.config.scaling_factor
96 |
97 | if not image_mode:
98 | image = vae.decode(latents.to(device=vae.device, dtype=vae.dtype)).sample
99 | else:
100 | latents = latents.to(device=vae.device, dtype=vae.dtype).unbind(2)
101 | image = [vae.decode(l.unsqueeze(2)).sample for l in latents]
102 | image = torch.cat(image, dim=2)
103 |
104 | return image
105 |
106 |
107 | @torch.no_grad()
108 | def vae_encode(image, vae):
109 | latents = vae.encode(image.to(device=vae.device, dtype=vae.dtype)).latent_dist.sample()
110 | latents = latents * vae.config.scaling_factor
111 | return latents
112 |
--------------------------------------------------------------------------------
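
A hedged sketch of how these helpers compose. The folder layout follows the README download guide; the encoder/tokenizer/VAE classes are assumptions based on the HunyuanVideo diffusers release, and a CUDA device with enough memory is assumed.

```python
import torch
from diffusers import AutoencoderKLHunyuanVideo
from transformers import CLIPTextModel, CLIPTokenizer, LlamaModel, LlamaTokenizerFast

from diffusers_helper.hunyuan import encode_prompt_conds, vae_decode, vae_encode

# Assumption: "HunyuanVideo" is the locally downloaded folder from the README.
text_encoder = LlamaModel.from_pretrained("HunyuanVideo", subfolder="text_encoder", torch_dtype=torch.float16).cuda()
text_encoder_2 = CLIPTextModel.from_pretrained("HunyuanVideo", subfolder="text_encoder_2", torch_dtype=torch.float16).cuda()
tokenizer = LlamaTokenizerFast.from_pretrained("HunyuanVideo", subfolder="tokenizer")
tokenizer_2 = CLIPTokenizer.from_pretrained("HunyuanVideo", subfolder="tokenizer_2")
vae = AutoencoderKLHunyuanVideo.from_pretrained("HunyuanVideo", subfolder="vae", torch_dtype=torch.float16).cuda()

# Text conditioning: Llama hidden states (cropped past the template) plus the CLIP pooled vector.
llama_vec, clip_pooler = encode_prompt_conds("a cat walking on grass", text_encoder, text_encoder_2, tokenizer, tokenizer_2)

# VAE round trip on a dummy clip: B C T H W in [-1, 1].
frames = torch.zeros(1, 3, 1, 480, 832, dtype=torch.float16, device="cuda")
latents = vae_encode(frames, vae)
recon = vae_decode(latents, vae)
print(llama_vec.shape, clip_pooler.shape, latents.shape, recon.shape)
```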
/diffusers_helper/k_diffusion/__pycache__/uni_pc_fm.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HM-RunningHub/ComfyUI_RH_FramePack/ad48591fcdf6af01489305a339e25fe6fcff8869/diffusers_helper/k_diffusion/__pycache__/uni_pc_fm.cpython-310.pyc
--------------------------------------------------------------------------------
/diffusers_helper/k_diffusion/__pycache__/uni_pc_fm.cpython-312.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HM-RunningHub/ComfyUI_RH_FramePack/ad48591fcdf6af01489305a339e25fe6fcff8869/diffusers_helper/k_diffusion/__pycache__/uni_pc_fm.cpython-312.pyc
--------------------------------------------------------------------------------
/diffusers_helper/k_diffusion/__pycache__/wrapper.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HM-RunningHub/ComfyUI_RH_FramePack/ad48591fcdf6af01489305a339e25fe6fcff8869/diffusers_helper/k_diffusion/__pycache__/wrapper.cpython-310.pyc
--------------------------------------------------------------------------------
/diffusers_helper/k_diffusion/__pycache__/wrapper.cpython-312.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HM-RunningHub/ComfyUI_RH_FramePack/ad48591fcdf6af01489305a339e25fe6fcff8869/diffusers_helper/k_diffusion/__pycache__/wrapper.cpython-312.pyc
--------------------------------------------------------------------------------
/diffusers_helper/k_diffusion/uni_pc_fm.py:
--------------------------------------------------------------------------------
1 | # Better Flow Matching UniPC by Lvmin Zhang
2 | # (c) 2025
3 | # CC BY-SA 4.0
4 | # Attribution-ShareAlike 4.0 International Licence
5 |
6 |
7 | import torch
8 |
9 | from tqdm.auto import trange
10 |
11 |
12 | def expand_dims(v, dims):
13 | return v[(...,) + (None,) * (dims - 1)]
14 |
15 |
16 | class FlowMatchUniPC:
17 | def __init__(self, model, extra_args, variant='bh1'):
18 | self.model = model
19 | self.variant = variant
20 | self.extra_args = extra_args
21 |
22 | def model_fn(self, x, t):
23 | return self.model(x, t, **self.extra_args)
24 |
25 | def update_fn(self, x, model_prev_list, t_prev_list, t, order):
26 | assert order <= len(model_prev_list)
27 | dims = x.dim()
28 |
29 | t_prev_0 = t_prev_list[-1]
30 | lambda_prev_0 = - torch.log(t_prev_0)
31 | lambda_t = - torch.log(t)
32 | model_prev_0 = model_prev_list[-1]
33 |
34 | h = lambda_t - lambda_prev_0
35 |
36 | rks = []
37 | D1s = []
38 | for i in range(1, order):
39 | t_prev_i = t_prev_list[-(i + 1)]
40 | model_prev_i = model_prev_list[-(i + 1)]
41 | lambda_prev_i = - torch.log(t_prev_i)
42 | rk = ((lambda_prev_i - lambda_prev_0) / h)[0]
43 | rks.append(rk)
44 | D1s.append((model_prev_i - model_prev_0) / rk)
45 |
46 | rks.append(1.)
47 | rks = torch.tensor(rks, device=x.device)
48 |
49 | R = []
50 | b = []
51 |
52 | hh = -h[0]
53 | h_phi_1 = torch.expm1(hh)
54 | h_phi_k = h_phi_1 / hh - 1
55 |
56 | factorial_i = 1
57 |
58 | if self.variant == 'bh1':
59 | B_h = hh
60 | elif self.variant == 'bh2':
61 | B_h = torch.expm1(hh)
62 | else:
63 | raise NotImplementedError('Bad variant!')
64 |
65 | for i in range(1, order + 1):
66 | R.append(torch.pow(rks, i - 1))
67 | b.append(h_phi_k * factorial_i / B_h)
68 | factorial_i *= (i + 1)
69 | h_phi_k = h_phi_k / hh - 1 / factorial_i
70 |
71 | R = torch.stack(R)
72 | b = torch.tensor(b, device=x.device)
73 |
74 | use_predictor = len(D1s) > 0
75 |
76 | if use_predictor:
77 | D1s = torch.stack(D1s, dim=1)
78 | if order == 2:
79 | rhos_p = torch.tensor([0.5], device=b.device)
80 | else:
81 | rhos_p = torch.linalg.solve(R[:-1, :-1], b[:-1])
82 | else:
83 | D1s = None
84 | rhos_p = None
85 |
86 | if order == 1:
87 | rhos_c = torch.tensor([0.5], device=b.device)
88 | else:
89 | rhos_c = torch.linalg.solve(R, b)
90 |
91 | x_t_ = expand_dims(t / t_prev_0, dims) * x - expand_dims(h_phi_1, dims) * model_prev_0
92 |
93 | if use_predictor:
94 | pred_res = torch.tensordot(D1s, rhos_p, dims=([1], [0]))
95 | else:
96 | pred_res = 0
97 |
98 | x_t = x_t_ - expand_dims(B_h, dims) * pred_res
99 | model_t = self.model_fn(x_t, t)
100 |
101 | if D1s is not None:
102 | corr_res = torch.tensordot(D1s, rhos_c[:-1], dims=([1], [0]))
103 | else:
104 | corr_res = 0
105 |
106 | D1_t = (model_t - model_prev_0)
107 | x_t = x_t_ - expand_dims(B_h, dims) * (corr_res + rhos_c[-1] * D1_t)
108 |
109 | return x_t, model_t
110 |
111 | def sample(self, x, sigmas, callback=None, disable_pbar=False):
112 | order = min(3, len(sigmas) - 2)
113 | model_prev_list, t_prev_list = [], []
114 | for i in trange(len(sigmas) - 1, disable=disable_pbar):
115 | vec_t = sigmas[i].expand(x.shape[0])
116 |
117 | if i == 0:
118 | model_prev_list = [self.model_fn(x, vec_t)]
119 | t_prev_list = [vec_t]
120 | elif i < order:
121 | init_order = i
122 | x, model_x = self.update_fn(x, model_prev_list, t_prev_list, vec_t, init_order)
123 | model_prev_list.append(model_x)
124 | t_prev_list.append(vec_t)
125 | else:
126 | x, model_x = self.update_fn(x, model_prev_list, t_prev_list, vec_t, order)
127 | model_prev_list.append(model_x)
128 | t_prev_list.append(vec_t)
129 |
130 | model_prev_list = model_prev_list[-order:]
131 | t_prev_list = t_prev_list[-order:]
132 |
133 | if callback is not None:
134 | callback({'x': x, 'i': i, 'denoised': model_prev_list[-1]})
135 |
136 | return model_prev_list[-1]
137 |
138 |
139 | def sample_unipc(model, noise, sigmas, extra_args=None, callback=None, disable=False, variant='bh1'):
140 | assert variant in ['bh1', 'bh2']
141 | return FlowMatchUniPC(model, extra_args=extra_args, variant=variant).sample(noise, sigmas=sigmas, callback=callback, disable_pbar=disable)
142 |
--------------------------------------------------------------------------------
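
For illustration only, `sample_unipc` can be exercised with a toy velocity model and a linearly spaced sigma schedule; the real model function and schedule come from the wrapper and pipeline modules. This assumes the plugin root is importable as `diffusers_helper`.

```python
import torch

from diffusers_helper.k_diffusion.uni_pc_fm import sample_unipc


def toy_model(x, t, **kwargs):
    # Trivial stand-in for the flow-matching model: just echo the input.
    return x


noise = torch.randn(1, 4, 8, 8)
sigmas = torch.linspace(1.0, 0.0, 26)  # 25 steps from pure noise toward data
out = sample_unipc(toy_model, noise, sigmas, extra_args={}, disable=True)
print(out.shape)  # same shape as the input latents: torch.Size([1, 4, 8, 8])
```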
/diffusers_helper/k_diffusion/wrapper.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 |
4 | def append_dims(x, target_dims):
5 | return x[(...,) + (None,) * (target_dims - x.ndim)]
6 |
7 |
8 | def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=1.0):
9 | if guidance_rescale == 0:
10 | return noise_cfg
11 |
12 | std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True)
13 | std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True)
14 | noise_pred_rescaled = noise_cfg * (std_text / std_cfg)
15 | noise_cfg = guidance_rescale * noise_pred_rescaled + (1.0 - guidance_rescale) * noise_cfg
16 | return noise_cfg
17 |
18 |
19 | def fm_wrapper(transformer, t_scale=1000.0):
20 | def k_model(x, sigma, **extra_args):
21 | dtype = extra_args['dtype']
22 | cfg_scale = extra_args['cfg_scale']
23 | cfg_rescale = extra_args['cfg_rescale']
24 | concat_latent = extra_args['concat_latent']
25 |
26 | original_dtype = x.dtype
27 | sigma = sigma.float()
28 |
29 | x = x.to(dtype)
30 | timestep = (sigma * t_scale).to(dtype)
31 |
32 | if concat_latent is None:
33 | hidden_states = x
34 | else:
35 | hidden_states = torch.cat([x, concat_latent.to(x)], dim=1)
36 |
37 | pred_positive = transformer(hidden_states=hidden_states, timestep=timestep, return_dict=False, **extra_args['positive'])[0].float()
38 |
39 | if cfg_scale == 1.0:
40 | pred_negative = torch.zeros_like(pred_positive)
41 | else:
42 | pred_negative = transformer(hidden_states=hidden_states, timestep=timestep, return_dict=False, **extra_args['negative'])[0].float()
43 |
44 | pred_cfg = pred_negative + cfg_scale * (pred_positive - pred_negative)
45 | pred = rescale_noise_cfg(pred_cfg, pred_positive, guidance_rescale=cfg_rescale)
46 |
47 | x0 = x.float() - pred.float() * append_dims(sigma, x.ndim)
48 |
49 | return x0.to(dtype=original_dtype)
50 |
51 | return k_model
52 |
--------------------------------------------------------------------------------
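
A small runnable illustration of `rescale_noise_cfg`, which pulls the CFG output's per-sample standard deviation back toward that of the text-conditioned prediction (`guidance_rescale` interpolates between the raw and rescaled results):

```python
import torch

from diffusers_helper.k_diffusion.wrapper import rescale_noise_cfg

noise_pred_text = torch.randn(2, 4, 8, 8)
noise_cfg = 5.0 * noise_pred_text  # over-amplified, as strong CFG tends to produce
rescaled = rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.7)
print(noise_cfg.std().item(), rescaled.std().item())  # the rescaled std sits much closer to ~1
```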
/diffusers_helper/memory.py:
--------------------------------------------------------------------------------
1 | # By lllyasviel
2 |
3 |
4 | import torch
5 |
6 |
7 | #cpu = torch.device('cpu')
8 | #gpu = torch.device(f'cuda:{torch.cuda.current_device()}')
9 | cpu = 'cpu'
10 | gpu = 'cuda:0'
11 | gpu_complete_modules = []
12 |
13 |
14 | class DynamicSwapInstaller:
15 | @staticmethod
16 | def _install_module(module: torch.nn.Module, **kwargs):
17 | original_class = module.__class__
18 | module.__dict__['forge_backup_original_class'] = original_class
19 |
20 | def hacked_get_attr(self, name: str):
21 | if '_parameters' in self.__dict__:
22 | _parameters = self.__dict__['_parameters']
23 | if name in _parameters:
24 | p = _parameters[name]
25 | if p is None:
26 | return None
27 | if p.__class__ == torch.nn.Parameter:
28 | return torch.nn.Parameter(p.to(**kwargs), requires_grad=p.requires_grad)
29 | else:
30 | return p.to(**kwargs)
31 | if '_buffers' in self.__dict__:
32 | _buffers = self.__dict__['_buffers']
33 | if name in _buffers:
34 | return _buffers[name].to(**kwargs)
35 | return super(original_class, self).__getattr__(name)
36 |
37 | module.__class__ = type('DynamicSwap_' + original_class.__name__, (original_class,), {
38 | '__getattr__': hacked_get_attr,
39 | })
40 |
41 | return
42 |
43 | @staticmethod
44 | def _uninstall_module(module: torch.nn.Module):
45 | if 'forge_backup_original_class' in module.__dict__:
46 | module.__class__ = module.__dict__.pop('forge_backup_original_class')
47 | return
48 |
49 | @staticmethod
50 | def install_model(model: torch.nn.Module, **kwargs):
51 | for m in model.modules():
52 | DynamicSwapInstaller._install_module(m, **kwargs)
53 | return
54 |
55 | @staticmethod
56 | def uninstall_model(model: torch.nn.Module):
57 | for m in model.modules():
58 | DynamicSwapInstaller._uninstall_module(m)
59 | return
60 |
61 |
62 | def fake_diffusers_current_device(model: torch.nn.Module, target_device: torch.device):
63 | if hasattr(model, 'scale_shift_table'):
64 | model.scale_shift_table.data = model.scale_shift_table.data.to(target_device)
65 | return
66 |
67 | for k, p in model.named_modules():
68 | if hasattr(p, 'weight'):
69 | p.to(target_device)
70 | return
71 |
72 |
73 | def get_cuda_free_memory_gb(device=None):
74 | if device is None:
75 | device = gpu
76 |
77 | memory_stats = torch.cuda.memory_stats(device)
78 | bytes_active = memory_stats['active_bytes.all.current']
79 | bytes_reserved = memory_stats['reserved_bytes.all.current']
80 | bytes_free_cuda, _ = torch.cuda.mem_get_info(device)
81 | bytes_inactive_reserved = bytes_reserved - bytes_active
82 | bytes_total_available = bytes_free_cuda + bytes_inactive_reserved
83 | return bytes_total_available / (1024 ** 3)
84 |
85 |
86 | def move_model_to_device_with_memory_preservation(model, target_device, preserved_memory_gb=0):
87 | print(f'Moving {model.__class__.__name__} to {target_device} with preserved memory: {preserved_memory_gb} GB')
88 |
89 | for m in model.modules():
90 | if get_cuda_free_memory_gb(target_device) <= preserved_memory_gb:
91 | torch.cuda.empty_cache()
92 | return
93 |
94 | if hasattr(m, 'weight'):
95 | m.to(device=target_device)
96 |
97 | model.to(device=target_device)
98 | torch.cuda.empty_cache()
99 | return
100 |
101 |
102 | def offload_model_from_device_for_memory_preservation(model, target_device, preserved_memory_gb=0):
103 | print(f'Offloading {model.__class__.__name__} from {target_device} to preserve memory: {preserved_memory_gb} GB')
104 |
105 | for m in model.modules():
106 | if get_cuda_free_memory_gb(target_device) >= preserved_memory_gb:
107 | torch.cuda.empty_cache()
108 | return
109 |
110 | if hasattr(m, 'weight'):
111 | m.to(device=cpu)
112 |
113 | model.to(device=cpu)
114 | torch.cuda.empty_cache()
115 | return
116 |
117 |
118 | def unload_complete_models(*args):
119 | for m in gpu_complete_modules + list(args):
120 | m.to(device=cpu)
121 | print(f'Unloaded {m.__class__.__name__} as complete.')
122 |
123 | gpu_complete_modules.clear()
124 | torch.cuda.empty_cache()
125 | return
126 |
127 |
128 | def load_model_as_complete(model, target_device, unload=True):
129 | if unload:
130 | unload_complete_models()
131 |
132 | model.to(device=target_device)
133 | print(f'Loaded {model.__class__.__name__} to {target_device} as complete.')
134 |
135 | gpu_complete_modules.append(model)
136 | return
137 |
--------------------------------------------------------------------------------
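
A hedged sketch of the intended usage, exercised here with toy `nn.Linear` stand-ins instead of the real transformer and VAE (a CUDA device is required; the plugin's nodes do the actual orchestration):

```python
import torch

from diffusers_helper.memory import (
    DynamicSwapInstaller, get_cuda_free_memory_gb, gpu,
    load_model_as_complete, unload_complete_models,
)

# Toy stand-ins for the real transformer / VAE, just to exercise the helpers.
big_model = torch.nn.Sequential(torch.nn.Linear(64, 64), torch.nn.Linear(64, 64))
small_model = torch.nn.Linear(64, 64)

x = torch.randn(1, 64, device=gpu)  # also initializes the CUDA context
print(f'Free VRAM: {get_cuda_free_memory_gb(gpu):.1f} GB')

# Keep weights on the CPU but copy them to the GPU on attribute access.
DynamicSwapInstaller.install_model(big_model, device=gpu)
y = big_model(x)  # parameters are streamed to cuda:0 as each layer touches them

# Small models are moved whole and tracked so they can be unloaded together later.
load_model_as_complete(small_model, target_device=gpu)
unload_complete_models()
DynamicSwapInstaller.uninstall_model(big_model)
```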
/diffusers_helper/models/__pycache__/hunyuan_video_packed.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HM-RunningHub/ComfyUI_RH_FramePack/ad48591fcdf6af01489305a339e25fe6fcff8869/diffusers_helper/models/__pycache__/hunyuan_video_packed.cpython-310.pyc
--------------------------------------------------------------------------------
/diffusers_helper/models/__pycache__/hunyuan_video_packed.cpython-312.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HM-RunningHub/ComfyUI_RH_FramePack/ad48591fcdf6af01489305a339e25fe6fcff8869/diffusers_helper/models/__pycache__/hunyuan_video_packed.cpython-312.pyc
--------------------------------------------------------------------------------
/diffusers_helper/models/hunyuan_video_packed.py:
--------------------------------------------------------------------------------
1 | from typing import Any, Dict, List, Optional, Tuple, Union
2 |
3 | import torch
4 | import einops
5 | import torch.nn as nn
6 | import numpy as np
7 |
8 | from diffusers.loaders import FromOriginalModelMixin
9 | from diffusers.configuration_utils import ConfigMixin, register_to_config
10 | from diffusers.loaders import PeftAdapterMixin
11 | from diffusers.utils import logging
12 | from diffusers.models.attention import FeedForward
13 | from diffusers.models.attention_processor import Attention
14 | from diffusers.models.embeddings import TimestepEmbedding, Timesteps, PixArtAlphaTextProjection
15 | from diffusers.models.modeling_outputs import Transformer2DModelOutput
16 | from diffusers.models.modeling_utils import ModelMixin
17 | from diffusers_helper.dit_common import LayerNorm
18 | from diffusers_helper.utils import zero_module
19 |
20 |
21 | enabled_backends = []
22 |
23 | if torch.backends.cuda.flash_sdp_enabled():
24 | enabled_backends.append("flash")
25 | if torch.backends.cuda.math_sdp_enabled():
26 | enabled_backends.append("math")
27 | if torch.backends.cuda.mem_efficient_sdp_enabled():
28 | enabled_backends.append("mem_efficient")
29 | if torch.backends.cuda.cudnn_sdp_enabled():
30 | enabled_backends.append("cudnn")
31 |
32 | print("Currently enabled native sdp backends:", enabled_backends)
33 |
34 | try:
35 | # raise NotImplementedError
36 | from xformers.ops import memory_efficient_attention as xformers_attn_func
37 | print('Xformers is installed!')
38 | except:
39 | print('Xformers is not installed!')
40 | xformers_attn_func = None
41 |
42 | try:
43 | # raise NotImplementedError
44 | from flash_attn import flash_attn_varlen_func, flash_attn_func
45 | print('Flash Attn is installed!')
46 | except:
47 | print('Flash Attn is not installed!')
48 | flash_attn_varlen_func = None
49 | flash_attn_func = None
50 |
51 | try:
52 | # raise NotImplementedError
53 | from sageattention import sageattn_varlen, sageattn
54 | print('Sage Attn is installed!')
55 | except:
56 | print('Sage Attn is not installed!')
57 | sageattn_varlen = None
58 | sageattn = None
59 |
60 |
61 | logger = logging.get_logger(__name__) # pylint: disable=invalid-name
62 |
63 |
64 | def pad_for_3d_conv(x, kernel_size):
65 | b, c, t, h, w = x.shape
66 | pt, ph, pw = kernel_size
67 | pad_t = (pt - (t % pt)) % pt
68 | pad_h = (ph - (h % ph)) % ph
69 | pad_w = (pw - (w % pw)) % pw
70 | return torch.nn.functional.pad(x, (0, pad_w, 0, pad_h, 0, pad_t), mode='replicate')
71 |
72 |
73 | def center_down_sample_3d(x, kernel_size):
74 | # pt, ph, pw = kernel_size
75 | # cp = (pt * ph * pw) // 2
76 | # xp = einops.rearrange(x, 'b c (t pt) (h ph) (w pw) -> (pt ph pw) b c t h w', pt=pt, ph=ph, pw=pw)
77 | # xc = xp[cp]
78 | # return xc
79 | return torch.nn.functional.avg_pool3d(x, kernel_size, stride=kernel_size)
80 |
81 |
82 | def get_cu_seqlens(text_mask, img_len):
83 | batch_size = text_mask.shape[0]
84 | text_len = text_mask.sum(dim=1)
85 | max_len = text_mask.shape[1] + img_len
86 |
87 | cu_seqlens = torch.zeros([2 * batch_size + 1], dtype=torch.int32, device="cuda")
88 |
89 | for i in range(batch_size):
90 | s = text_len[i] + img_len
91 | s1 = i * max_len + s
92 | s2 = (i + 1) * max_len
93 | cu_seqlens[2 * i + 1] = s1
94 | cu_seqlens[2 * i + 2] = s2
95 |
96 | return cu_seqlens
97 |
98 |
99 | def apply_rotary_emb_transposed(x, freqs_cis):
100 | cos, sin = freqs_cis.unsqueeze(-2).chunk(2, dim=-1)
101 | x_real, x_imag = x.unflatten(-1, (-1, 2)).unbind(-1)
102 | x_rotated = torch.stack([-x_imag, x_real], dim=-1).flatten(3)
103 | out = x.float() * cos + x_rotated.float() * sin
104 | out = out.to(x)
105 | return out
106 |
107 |
108 | def attn_varlen_func(q, k, v, cu_seqlens_q, cu_seqlens_kv, max_seqlen_q, max_seqlen_kv):
109 | if cu_seqlens_q is None and cu_seqlens_kv is None and max_seqlen_q is None and max_seqlen_kv is None:
110 | if sageattn is not None:
111 | x = sageattn(q, k, v, tensor_layout='NHD')
112 | return x
113 |
114 | if flash_attn_func is not None:
115 | x = flash_attn_func(q, k, v)
116 | return x
117 |
118 | if xformers_attn_func is not None:
119 | x = xformers_attn_func(q, k, v)
120 | return x
121 |
122 | x = torch.nn.functional.scaled_dot_product_attention(q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2)).transpose(1, 2)
123 | return x
124 |
125 | batch_size = q.shape[0]
126 | q = q.view(q.shape[0] * q.shape[1], *q.shape[2:])
127 | k = k.view(k.shape[0] * k.shape[1], *k.shape[2:])
128 | v = v.view(v.shape[0] * v.shape[1], *v.shape[2:])
129 | if sageattn_varlen is not None:
130 | x = sageattn_varlen(q, k, v, cu_seqlens_q, cu_seqlens_kv, max_seqlen_q, max_seqlen_kv)
131 | elif flash_attn_varlen_func is not None:
132 | x = flash_attn_varlen_func(q, k, v, cu_seqlens_q, cu_seqlens_kv, max_seqlen_q, max_seqlen_kv)
133 | else:
134 | raise NotImplementedError('No Attn Installed!')
135 | x = x.view(batch_size, max_seqlen_q, *x.shape[2:])
136 | return x
137 |
138 |
139 | class HunyuanAttnProcessorFlashAttnDouble:
140 | def __call__(self, attn, hidden_states, encoder_hidden_states, attention_mask, image_rotary_emb):
141 | cu_seqlens_q, cu_seqlens_kv, max_seqlen_q, max_seqlen_kv = attention_mask
142 |
143 | query = attn.to_q(hidden_states)
144 | key = attn.to_k(hidden_states)
145 | value = attn.to_v(hidden_states)
146 |
147 | query = query.unflatten(2, (attn.heads, -1))
148 | key = key.unflatten(2, (attn.heads, -1))
149 | value = value.unflatten(2, (attn.heads, -1))
150 |
151 | query = attn.norm_q(query)
152 | key = attn.norm_k(key)
153 |
154 | query = apply_rotary_emb_transposed(query, image_rotary_emb)
155 | key = apply_rotary_emb_transposed(key, image_rotary_emb)
156 |
157 | encoder_query = attn.add_q_proj(encoder_hidden_states)
158 | encoder_key = attn.add_k_proj(encoder_hidden_states)
159 | encoder_value = attn.add_v_proj(encoder_hidden_states)
160 |
161 | encoder_query = encoder_query.unflatten(2, (attn.heads, -1))
162 | encoder_key = encoder_key.unflatten(2, (attn.heads, -1))
163 | encoder_value = encoder_value.unflatten(2, (attn.heads, -1))
164 |
165 | encoder_query = attn.norm_added_q(encoder_query)
166 | encoder_key = attn.norm_added_k(encoder_key)
167 |
168 | query = torch.cat([query, encoder_query], dim=1)
169 | key = torch.cat([key, encoder_key], dim=1)
170 | value = torch.cat([value, encoder_value], dim=1)
171 |
172 | hidden_states = attn_varlen_func(query, key, value, cu_seqlens_q, cu_seqlens_kv, max_seqlen_q, max_seqlen_kv)
173 | hidden_states = hidden_states.flatten(-2)
174 |
175 | txt_length = encoder_hidden_states.shape[1]
176 | hidden_states, encoder_hidden_states = hidden_states[:, :-txt_length], hidden_states[:, -txt_length:]
177 |
178 | hidden_states = attn.to_out[0](hidden_states)
179 | hidden_states = attn.to_out[1](hidden_states)
180 | encoder_hidden_states = attn.to_add_out(encoder_hidden_states)
181 |
182 | return hidden_states, encoder_hidden_states
183 |
184 |
185 | class HunyuanAttnProcessorFlashAttnSingle:
186 | def __call__(self, attn, hidden_states, encoder_hidden_states, attention_mask, image_rotary_emb):
187 | cu_seqlens_q, cu_seqlens_kv, max_seqlen_q, max_seqlen_kv = attention_mask
188 |
189 | hidden_states = torch.cat([hidden_states, encoder_hidden_states], dim=1)
190 |
191 | query = attn.to_q(hidden_states)
192 | key = attn.to_k(hidden_states)
193 | value = attn.to_v(hidden_states)
194 |
195 | query = query.unflatten(2, (attn.heads, -1))
196 | key = key.unflatten(2, (attn.heads, -1))
197 | value = value.unflatten(2, (attn.heads, -1))
198 |
199 | query = attn.norm_q(query)
200 | key = attn.norm_k(key)
201 |
202 | txt_length = encoder_hidden_states.shape[1]
203 |
204 | query = torch.cat([apply_rotary_emb_transposed(query[:, :-txt_length], image_rotary_emb), query[:, -txt_length:]], dim=1)
205 | key = torch.cat([apply_rotary_emb_transposed(key[:, :-txt_length], image_rotary_emb), key[:, -txt_length:]], dim=1)
206 |
207 | hidden_states = attn_varlen_func(query, key, value, cu_seqlens_q, cu_seqlens_kv, max_seqlen_q, max_seqlen_kv)
208 | hidden_states = hidden_states.flatten(-2)
209 |
210 | hidden_states, encoder_hidden_states = hidden_states[:, :-txt_length], hidden_states[:, -txt_length:]
211 |
212 | return hidden_states, encoder_hidden_states
213 |
214 |
215 | class CombinedTimestepGuidanceTextProjEmbeddings(nn.Module):
216 | def __init__(self, embedding_dim, pooled_projection_dim):
217 | super().__init__()
218 |
219 | self.time_proj = Timesteps(num_channels=256, flip_sin_to_cos=True, downscale_freq_shift=0)
220 | self.timestep_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=embedding_dim)
221 | self.guidance_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=embedding_dim)
222 | self.text_embedder = PixArtAlphaTextProjection(pooled_projection_dim, embedding_dim, act_fn="silu")
223 |
224 | def forward(self, timestep, guidance, pooled_projection):
225 | timesteps_proj = self.time_proj(timestep)
226 | timesteps_emb = self.timestep_embedder(timesteps_proj.to(dtype=pooled_projection.dtype))
227 |
228 | guidance_proj = self.time_proj(guidance)
229 | guidance_emb = self.guidance_embedder(guidance_proj.to(dtype=pooled_projection.dtype))
230 |
231 | time_guidance_emb = timesteps_emb + guidance_emb
232 |
233 | pooled_projections = self.text_embedder(pooled_projection)
234 | conditioning = time_guidance_emb + pooled_projections
235 |
236 | return conditioning
237 |
238 |
239 | class CombinedTimestepTextProjEmbeddings(nn.Module):
240 | def __init__(self, embedding_dim, pooled_projection_dim):
241 | super().__init__()
242 |
243 | self.time_proj = Timesteps(num_channels=256, flip_sin_to_cos=True, downscale_freq_shift=0)
244 | self.timestep_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=embedding_dim)
245 | self.text_embedder = PixArtAlphaTextProjection(pooled_projection_dim, embedding_dim, act_fn="silu")
246 |
247 | def forward(self, timestep, pooled_projection):
248 | timesteps_proj = self.time_proj(timestep)
249 | timesteps_emb = self.timestep_embedder(timesteps_proj.to(dtype=pooled_projection.dtype))
250 |
251 | pooled_projections = self.text_embedder(pooled_projection)
252 |
253 | conditioning = timesteps_emb + pooled_projections
254 |
255 | return conditioning
256 |
257 |
258 | class HunyuanVideoAdaNorm(nn.Module):
259 | def __init__(self, in_features: int, out_features: Optional[int] = None) -> None:
260 | super().__init__()
261 |
262 | out_features = out_features or 2 * in_features
263 | self.linear = nn.Linear(in_features, out_features)
264 | self.nonlinearity = nn.SiLU()
265 |
266 | def forward(
267 | self, temb: torch.Tensor
268 | ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
269 | temb = self.linear(self.nonlinearity(temb))
270 | gate_msa, gate_mlp = temb.chunk(2, dim=-1)
271 | gate_msa, gate_mlp = gate_msa.unsqueeze(1), gate_mlp.unsqueeze(1)
272 | return gate_msa, gate_mlp
273 |
274 |
275 | class HunyuanVideoIndividualTokenRefinerBlock(nn.Module):
276 | def __init__(
277 | self,
278 | num_attention_heads: int,
279 | attention_head_dim: int,
280 | mlp_width_ratio: str = 4.0,
281 | mlp_drop_rate: float = 0.0,
282 | attention_bias: bool = True,
283 | ) -> None:
284 | super().__init__()
285 |
286 | hidden_size = num_attention_heads * attention_head_dim
287 |
288 | self.norm1 = LayerNorm(hidden_size, elementwise_affine=True, eps=1e-6)
289 | self.attn = Attention(
290 | query_dim=hidden_size,
291 | cross_attention_dim=None,
292 | heads=num_attention_heads,
293 | dim_head=attention_head_dim,
294 | bias=attention_bias,
295 | )
296 |
297 | self.norm2 = LayerNorm(hidden_size, elementwise_affine=True, eps=1e-6)
298 | self.ff = FeedForward(hidden_size, mult=mlp_width_ratio, activation_fn="linear-silu", dropout=mlp_drop_rate)
299 |
300 | self.norm_out = HunyuanVideoAdaNorm(hidden_size, 2 * hidden_size)
301 |
302 | def forward(
303 | self,
304 | hidden_states: torch.Tensor,
305 | temb: torch.Tensor,
306 | attention_mask: Optional[torch.Tensor] = None,
307 | ) -> torch.Tensor:
308 | norm_hidden_states = self.norm1(hidden_states)
309 |
310 | attn_output = self.attn(
311 | hidden_states=norm_hidden_states,
312 | encoder_hidden_states=None,
313 | attention_mask=attention_mask,
314 | )
315 |
316 | gate_msa, gate_mlp = self.norm_out(temb)
317 | hidden_states = hidden_states + attn_output * gate_msa
318 |
319 | ff_output = self.ff(self.norm2(hidden_states))
320 | hidden_states = hidden_states + ff_output * gate_mlp
321 |
322 | return hidden_states
323 |
324 |
325 | class HunyuanVideoIndividualTokenRefiner(nn.Module):
326 | def __init__(
327 | self,
328 | num_attention_heads: int,
329 | attention_head_dim: int,
330 | num_layers: int,
331 | mlp_width_ratio: float = 4.0,
332 | mlp_drop_rate: float = 0.0,
333 | attention_bias: bool = True,
334 | ) -> None:
335 | super().__init__()
336 |
337 | self.refiner_blocks = nn.ModuleList(
338 | [
339 | HunyuanVideoIndividualTokenRefinerBlock(
340 | num_attention_heads=num_attention_heads,
341 | attention_head_dim=attention_head_dim,
342 | mlp_width_ratio=mlp_width_ratio,
343 | mlp_drop_rate=mlp_drop_rate,
344 | attention_bias=attention_bias,
345 | )
346 | for _ in range(num_layers)
347 | ]
348 | )
349 |
350 | def forward(
351 | self,
352 | hidden_states: torch.Tensor,
353 | temb: torch.Tensor,
354 | attention_mask: Optional[torch.Tensor] = None,
355 | ) -> None:
356 | self_attn_mask = None
357 | if attention_mask is not None:
358 | batch_size = attention_mask.shape[0]
359 | seq_len = attention_mask.shape[1]
360 | attention_mask = attention_mask.to(hidden_states.device).bool()
361 | self_attn_mask_1 = attention_mask.view(batch_size, 1, 1, seq_len).repeat(1, 1, seq_len, 1)
362 | self_attn_mask_2 = self_attn_mask_1.transpose(2, 3)
363 | self_attn_mask = (self_attn_mask_1 & self_attn_mask_2).bool()
364 | self_attn_mask[:, :, :, 0] = True
365 |
366 | for block in self.refiner_blocks:
367 | hidden_states = block(hidden_states, temb, self_attn_mask)
368 |
369 | return hidden_states
370 |
371 |
372 | class HunyuanVideoTokenRefiner(nn.Module):
373 | def __init__(
374 | self,
375 | in_channels: int,
376 | num_attention_heads: int,
377 | attention_head_dim: int,
378 | num_layers: int,
379 | mlp_ratio: float = 4.0,
380 | mlp_drop_rate: float = 0.0,
381 | attention_bias: bool = True,
382 | ) -> None:
383 | super().__init__()
384 |
385 | hidden_size = num_attention_heads * attention_head_dim
386 |
387 | self.time_text_embed = CombinedTimestepTextProjEmbeddings(
388 | embedding_dim=hidden_size, pooled_projection_dim=in_channels
389 | )
390 | self.proj_in = nn.Linear(in_channels, hidden_size, bias=True)
391 | self.token_refiner = HunyuanVideoIndividualTokenRefiner(
392 | num_attention_heads=num_attention_heads,
393 | attention_head_dim=attention_head_dim,
394 | num_layers=num_layers,
395 | mlp_width_ratio=mlp_ratio,
396 | mlp_drop_rate=mlp_drop_rate,
397 | attention_bias=attention_bias,
398 | )
399 |
400 | def forward(
401 | self,
402 | hidden_states: torch.Tensor,
403 | timestep: torch.LongTensor,
404 | attention_mask: Optional[torch.LongTensor] = None,
405 | ) -> torch.Tensor:
406 | if attention_mask is None:
407 | pooled_projections = hidden_states.mean(dim=1)
408 | else:
409 | original_dtype = hidden_states.dtype
410 | mask_float = attention_mask.float().unsqueeze(-1)
411 | pooled_projections = (hidden_states * mask_float).sum(dim=1) / mask_float.sum(dim=1)
412 | pooled_projections = pooled_projections.to(original_dtype)
413 |
414 | temb = self.time_text_embed(timestep, pooled_projections)
415 | hidden_states = self.proj_in(hidden_states)
416 | hidden_states = self.token_refiner(hidden_states, temb, attention_mask)
417 |
418 | return hidden_states
419 |
420 |
421 | class HunyuanVideoRotaryPosEmbed(nn.Module):
422 | def __init__(self, rope_dim, theta):
423 | super().__init__()
424 | self.DT, self.DY, self.DX = rope_dim
425 | self.theta = theta
426 |
427 | @torch.no_grad()
428 | def get_frequency(self, dim, pos):
429 | T, H, W = pos.shape
430 | freqs = 1.0 / (self.theta ** (torch.arange(0, dim, 2, dtype=torch.float32, device=pos.device)[: (dim // 2)] / dim))
431 | freqs = torch.outer(freqs, pos.reshape(-1)).unflatten(-1, (T, H, W)).repeat_interleave(2, dim=0)
432 | return freqs.cos(), freqs.sin()
433 |
434 | @torch.no_grad()
435 | def forward_inner(self, frame_indices, height, width, device):
436 | GT, GY, GX = torch.meshgrid(
437 | frame_indices.to(device=device, dtype=torch.float32),
438 | torch.arange(0, height, device=device, dtype=torch.float32),
439 | torch.arange(0, width, device=device, dtype=torch.float32),
440 | indexing="ij"
441 | )
442 |
443 | FCT, FST = self.get_frequency(self.DT, GT)
444 | FCY, FSY = self.get_frequency(self.DY, GY)
445 | FCX, FSX = self.get_frequency(self.DX, GX)
446 |
447 | result = torch.cat([FCT, FCY, FCX, FST, FSY, FSX], dim=0)
448 |
449 | return result.to(device)
450 |
451 | @torch.no_grad()
452 | def forward(self, frame_indices, height, width, device):
453 | frame_indices = frame_indices.unbind(0)
454 | results = [self.forward_inner(f, height, width, device) for f in frame_indices]
455 | results = torch.stack(results, dim=0)
456 | return results
457 |
458 |
459 | class AdaLayerNormZero(nn.Module):
460 | def __init__(self, embedding_dim: int, norm_type="layer_norm", bias=True):
461 | super().__init__()
462 | self.silu = nn.SiLU()
463 | self.linear = nn.Linear(embedding_dim, 6 * embedding_dim, bias=bias)
464 | if norm_type == "layer_norm":
465 | self.norm = LayerNorm(embedding_dim, elementwise_affine=False, eps=1e-6)
466 | else:
467 | raise ValueError(f"unknown norm_type {norm_type}")
468 |
469 | def forward(
470 | self,
471 | x: torch.Tensor,
472 | emb: Optional[torch.Tensor] = None,
473 | ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
474 | emb = emb.unsqueeze(-2)
475 | emb = self.linear(self.silu(emb))
476 | shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = emb.chunk(6, dim=-1)
477 | x = self.norm(x) * (1 + scale_msa) + shift_msa
478 | return x, gate_msa, shift_mlp, scale_mlp, gate_mlp
479 |
480 |
481 | class AdaLayerNormZeroSingle(nn.Module):
482 | def __init__(self, embedding_dim: int, norm_type="layer_norm", bias=True):
483 | super().__init__()
484 |
485 | self.silu = nn.SiLU()
486 | self.linear = nn.Linear(embedding_dim, 3 * embedding_dim, bias=bias)
487 | if norm_type == "layer_norm":
488 | self.norm = LayerNorm(embedding_dim, elementwise_affine=False, eps=1e-6)
489 | else:
490 | raise ValueError(f"unknown norm_type {norm_type}")
491 |
492 | def forward(
493 | self,
494 | x: torch.Tensor,
495 | emb: Optional[torch.Tensor] = None,
496 | ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
497 | emb = emb.unsqueeze(-2)
498 | emb = self.linear(self.silu(emb))
499 | shift_msa, scale_msa, gate_msa = emb.chunk(3, dim=-1)
500 | x = self.norm(x) * (1 + scale_msa) + shift_msa
501 | return x, gate_msa
502 |
503 |
504 | class AdaLayerNormContinuous(nn.Module):
505 | def __init__(
506 | self,
507 | embedding_dim: int,
508 | conditioning_embedding_dim: int,
509 | elementwise_affine=True,
510 | eps=1e-5,
511 | bias=True,
512 | norm_type="layer_norm",
513 | ):
514 | super().__init__()
515 | self.silu = nn.SiLU()
516 | self.linear = nn.Linear(conditioning_embedding_dim, embedding_dim * 2, bias=bias)
517 | if norm_type == "layer_norm":
518 | self.norm = LayerNorm(embedding_dim, eps, elementwise_affine, bias)
519 | else:
520 | raise ValueError(f"unknown norm_type {norm_type}")
521 |
522 | def forward(self, x: torch.Tensor, emb: torch.Tensor) -> torch.Tensor:
523 | emb = emb.unsqueeze(-2)
524 | emb = self.linear(self.silu(emb))
525 | scale, shift = emb.chunk(2, dim=-1)
526 | x = self.norm(x) * (1 + scale) + shift
527 | return x
528 |
529 |
530 | class HunyuanVideoSingleTransformerBlock(nn.Module):
531 | def __init__(
532 | self,
533 | num_attention_heads: int,
534 | attention_head_dim: int,
535 | mlp_ratio: float = 4.0,
536 | qk_norm: str = "rms_norm",
537 | ) -> None:
538 | super().__init__()
539 |
540 | hidden_size = num_attention_heads * attention_head_dim
541 | mlp_dim = int(hidden_size * mlp_ratio)
542 |
543 | self.attn = Attention(
544 | query_dim=hidden_size,
545 | cross_attention_dim=None,
546 | dim_head=attention_head_dim,
547 | heads=num_attention_heads,
548 | out_dim=hidden_size,
549 | bias=True,
550 | processor=HunyuanAttnProcessorFlashAttnSingle(),
551 | qk_norm=qk_norm,
552 | eps=1e-6,
553 | pre_only=True,
554 | )
555 |
556 | self.norm = AdaLayerNormZeroSingle(hidden_size, norm_type="layer_norm")
557 | self.proj_mlp = nn.Linear(hidden_size, mlp_dim)
558 | self.act_mlp = nn.GELU(approximate="tanh")
559 | self.proj_out = nn.Linear(hidden_size + mlp_dim, hidden_size)
560 |
561 | def forward(
562 | self,
563 | hidden_states: torch.Tensor,
564 | encoder_hidden_states: torch.Tensor,
565 | temb: torch.Tensor,
566 | attention_mask: Optional[torch.Tensor] = None,
567 | image_rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
568 | ) -> torch.Tensor:
569 | text_seq_length = encoder_hidden_states.shape[1]
570 | hidden_states = torch.cat([hidden_states, encoder_hidden_states], dim=1)
571 |
572 | residual = hidden_states
573 |
574 | # 1. Input normalization
575 | norm_hidden_states, gate = self.norm(hidden_states, emb=temb)
576 | mlp_hidden_states = self.act_mlp(self.proj_mlp(norm_hidden_states))
577 |
578 | norm_hidden_states, norm_encoder_hidden_states = (
579 | norm_hidden_states[:, :-text_seq_length, :],
580 | norm_hidden_states[:, -text_seq_length:, :],
581 | )
582 |
583 | # 2. Attention
584 | attn_output, context_attn_output = self.attn(
585 | hidden_states=norm_hidden_states,
586 | encoder_hidden_states=norm_encoder_hidden_states,
587 | attention_mask=attention_mask,
588 | image_rotary_emb=image_rotary_emb,
589 | )
590 | attn_output = torch.cat([attn_output, context_attn_output], dim=1)
591 |
592 | # 3. Modulation and residual connection
593 | hidden_states = torch.cat([attn_output, mlp_hidden_states], dim=2)
594 | hidden_states = gate * self.proj_out(hidden_states)
595 | hidden_states = hidden_states + residual
596 |
597 | hidden_states, encoder_hidden_states = (
598 | hidden_states[:, :-text_seq_length, :],
599 | hidden_states[:, -text_seq_length:, :],
600 | )
601 | return hidden_states, encoder_hidden_states
602 |
603 |
604 | class HunyuanVideoTransformerBlock(nn.Module):
605 | def __init__(
606 | self,
607 | num_attention_heads: int,
608 | attention_head_dim: int,
609 | mlp_ratio: float,
610 | qk_norm: str = "rms_norm",
611 | ) -> None:
612 | super().__init__()
613 |
614 | hidden_size = num_attention_heads * attention_head_dim
615 |
616 | self.norm1 = AdaLayerNormZero(hidden_size, norm_type="layer_norm")
617 | self.norm1_context = AdaLayerNormZero(hidden_size, norm_type="layer_norm")
618 |
619 | self.attn = Attention(
620 | query_dim=hidden_size,
621 | cross_attention_dim=None,
622 | added_kv_proj_dim=hidden_size,
623 | dim_head=attention_head_dim,
624 | heads=num_attention_heads,
625 | out_dim=hidden_size,
626 | context_pre_only=False,
627 | bias=True,
628 | processor=HunyuanAttnProcessorFlashAttnDouble(),
629 | qk_norm=qk_norm,
630 | eps=1e-6,
631 | )
632 |
633 | self.norm2 = LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
634 | self.ff = FeedForward(hidden_size, mult=mlp_ratio, activation_fn="gelu-approximate")
635 |
636 | self.norm2_context = LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
637 | self.ff_context = FeedForward(hidden_size, mult=mlp_ratio, activation_fn="gelu-approximate")
638 |
639 | def forward(
640 | self,
641 | hidden_states: torch.Tensor,
642 | encoder_hidden_states: torch.Tensor,
643 | temb: torch.Tensor,
644 | attention_mask: Optional[torch.Tensor] = None,
645 | freqs_cis: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
646 | ) -> Tuple[torch.Tensor, torch.Tensor]:
647 | # 1. Input normalization
648 | norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1(hidden_states, emb=temb)
649 | norm_encoder_hidden_states, c_gate_msa, c_shift_mlp, c_scale_mlp, c_gate_mlp = self.norm1_context(encoder_hidden_states, emb=temb)
650 |
651 | # 2. Joint attention
652 | attn_output, context_attn_output = self.attn(
653 | hidden_states=norm_hidden_states,
654 | encoder_hidden_states=norm_encoder_hidden_states,
655 | attention_mask=attention_mask,
656 | image_rotary_emb=freqs_cis,
657 | )
658 |
659 | # 3. Modulation and residual connection
660 | hidden_states = hidden_states + attn_output * gate_msa
661 | encoder_hidden_states = encoder_hidden_states + context_attn_output * c_gate_msa
662 |
663 | norm_hidden_states = self.norm2(hidden_states)
664 | norm_encoder_hidden_states = self.norm2_context(encoder_hidden_states)
665 |
666 | norm_hidden_states = norm_hidden_states * (1 + scale_mlp) + shift_mlp
667 | norm_encoder_hidden_states = norm_encoder_hidden_states * (1 + c_scale_mlp) + c_shift_mlp
668 |
669 | # 4. Feed-forward
670 | ff_output = self.ff(norm_hidden_states)
671 | context_ff_output = self.ff_context(norm_encoder_hidden_states)
672 |
673 | hidden_states = hidden_states + gate_mlp * ff_output
674 | encoder_hidden_states = encoder_hidden_states + c_gate_mlp * context_ff_output
675 |
676 | return hidden_states, encoder_hidden_states
677 |
678 |
679 | class ClipVisionProjection(nn.Module):
680 | def __init__(self, in_channels, out_channels):
681 | super().__init__()
682 | self.up = nn.Linear(in_channels, out_channels * 3)
683 | self.down = nn.Linear(out_channels * 3, out_channels)
684 |
685 | def forward(self, x):
686 | projected_x = self.down(nn.functional.silu(self.up(x)))
687 | return projected_x
688 |
689 |
690 | class HunyuanVideoPatchEmbed(nn.Module):
691 | def __init__(self, patch_size, in_chans, embed_dim):
692 | super().__init__()
693 | self.proj = nn.Conv3d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)
694 |
695 |
696 | class HunyuanVideoPatchEmbedForCleanLatents(nn.Module):
697 | def __init__(self, inner_dim):
698 | super().__init__()
699 | self.proj = nn.Conv3d(16, inner_dim, kernel_size=(1, 2, 2), stride=(1, 2, 2))
700 | self.proj_2x = nn.Conv3d(16, inner_dim, kernel_size=(2, 4, 4), stride=(2, 4, 4))
701 | self.proj_4x = nn.Conv3d(16, inner_dim, kernel_size=(4, 8, 8), stride=(4, 8, 8))
702 |
703 | @torch.no_grad()
704 | def initialize_weight_from_another_conv3d(self, another_layer):
705 | weight = another_layer.weight.detach().clone()
706 | bias = another_layer.bias.detach().clone()
707 |
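# The 2x and 4x projections reuse the base 1x kernel: each weight is tiled across the
# larger (t, h, w) patch and divided by the number of copies (2*2*2 = 8 for proj_2x,
# 4*4*4 = 64 for proj_4x), so a constant input yields the same activation scale at
# every downsampling level.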
708 | sd = {
709 | 'proj.weight': weight.clone(),
710 | 'proj.bias': bias.clone(),
711 | 'proj_2x.weight': einops.repeat(weight, 'b c t h w -> b c (t tk) (h hk) (w wk)', tk=2, hk=2, wk=2) / 8.0,
712 | 'proj_2x.bias': bias.clone(),
713 | 'proj_4x.weight': einops.repeat(weight, 'b c t h w -> b c (t tk) (h hk) (w wk)', tk=4, hk=4, wk=4) / 64.0,
714 | 'proj_4x.bias': bias.clone(),
715 | }
716 |
717 | sd = {k: v.clone() for k, v in sd.items()}
718 |
719 | self.load_state_dict(sd)
720 | return
721 |
722 |
723 | class HunyuanVideoTransformer3DModelPacked(ModelMixin, ConfigMixin, PeftAdapterMixin, FromOriginalModelMixin):
724 | @register_to_config
725 | def __init__(
726 | self,
727 | in_channels: int = 16,
728 | out_channels: int = 16,
729 | num_attention_heads: int = 24,
730 | attention_head_dim: int = 128,
731 | num_layers: int = 20,
732 | num_single_layers: int = 40,
733 | num_refiner_layers: int = 2,
734 | mlp_ratio: float = 4.0,
735 | patch_size: int = 2,
736 | patch_size_t: int = 1,
737 | qk_norm: str = "rms_norm",
738 | guidance_embeds: bool = True,
739 | text_embed_dim: int = 4096,
740 | pooled_projection_dim: int = 768,
741 | rope_theta: float = 256.0,
742 | rope_axes_dim: Tuple[int] = (16, 56, 56),
743 | has_image_proj=False,
744 | image_proj_dim=1152,
745 | has_clean_x_embedder=False,
746 | ) -> None:
747 | super().__init__()
748 |
749 | inner_dim = num_attention_heads * attention_head_dim
750 | out_channels = out_channels or in_channels
751 |
752 | # 1. Latent and condition embedders
753 | self.x_embedder = HunyuanVideoPatchEmbed((patch_size_t, patch_size, patch_size), in_channels, inner_dim)
754 | self.context_embedder = HunyuanVideoTokenRefiner(
755 | text_embed_dim, num_attention_heads, attention_head_dim, num_layers=num_refiner_layers
756 | )
757 | self.time_text_embed = CombinedTimestepGuidanceTextProjEmbeddings(inner_dim, pooled_projection_dim)
758 |
759 | self.clean_x_embedder = None
760 | self.image_projection = None
761 |
762 | # 2. RoPE
763 | self.rope = HunyuanVideoRotaryPosEmbed(rope_axes_dim, rope_theta)
764 |
765 | # 3. Dual stream transformer blocks
766 | self.transformer_blocks = nn.ModuleList(
767 | [
768 | HunyuanVideoTransformerBlock(
769 | num_attention_heads, attention_head_dim, mlp_ratio=mlp_ratio, qk_norm=qk_norm
770 | )
771 | for _ in range(num_layers)
772 | ]
773 | )
774 |
775 | # 4. Single stream transformer blocks
776 | self.single_transformer_blocks = nn.ModuleList(
777 | [
778 | HunyuanVideoSingleTransformerBlock(
779 | num_attention_heads, attention_head_dim, mlp_ratio=mlp_ratio, qk_norm=qk_norm
780 | )
781 | for _ in range(num_single_layers)
782 | ]
783 | )
784 |
785 | # 5. Output projection
786 | self.norm_out = AdaLayerNormContinuous(inner_dim, inner_dim, elementwise_affine=False, eps=1e-6)
787 | self.proj_out = nn.Linear(inner_dim, patch_size_t * patch_size * patch_size * out_channels)
788 |
789 | self.inner_dim = inner_dim
790 | self.use_gradient_checkpointing = False
791 | self.enable_teacache = False
792 |
793 | if has_image_proj:
794 | self.install_image_projection(image_proj_dim)
795 |
796 | if has_clean_x_embedder:
797 | self.install_clean_x_embedder()
798 |
799 | self.high_quality_fp32_output_for_inference = False
800 |
801 | def install_image_projection(self, in_channels):
802 | self.image_projection = ClipVisionProjection(in_channels=in_channels, out_channels=self.inner_dim)
803 | self.config['has_image_proj'] = True
804 | self.config['image_proj_dim'] = in_channels
805 |
806 | def install_clean_x_embedder(self):
807 | self.clean_x_embedder = HunyuanVideoPatchEmbedForCleanLatents(self.inner_dim)
808 | self.config['has_clean_x_embedder'] = True
809 |
810 | def enable_gradient_checkpointing(self):
811 | self.use_gradient_checkpointing = True
812 | print('self.use_gradient_checkpointing = True')
813 |
814 | def disable_gradient_checkpointing(self):
815 | self.use_gradient_checkpointing = False
816 | print('self.use_gradient_checkpointing = False')
817 |
818 | def initialize_teacache(self, enable_teacache=True, num_steps=25, rel_l1_thresh=0.15):
819 | self.enable_teacache = enable_teacache
820 | self.cnt = 0
821 | self.num_steps = num_steps
822 | self.rel_l1_thresh = rel_l1_thresh # 0.1 for 1.6x speedup, 0.15 for 2.1x speedup
823 | self.accumulated_rel_l1_distance = 0
824 | self.previous_modulated_input = None
825 | self.previous_residual = None
826 | self.teacache_rescale_func = np.poly1d([7.33226126e+02, -4.01131952e+02, 6.75869174e+01, -3.14987800e+00, 9.61237896e-02])
827 |
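# TeaCache sketch: forward() tracks the relative L1 change of the first transformer
# block's modulated input, maps it through teacache_rescale_func, and accumulates it;
# while the accumulated value stays below rel_l1_thresh, all transformer blocks are
# skipped and previous_residual is reused instead. num_steps should match the sampler's
# num_inference_steps so the internal counter resets after each denoising pass, e.g.
#   transformer.initialize_teacache(enable_teacache=True, num_steps=25, rel_l1_thresh=0.15)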
828 | def gradient_checkpointing_method(self, block, *args):
829 | if self.use_gradient_checkpointing:
830 | result = torch.utils.checkpoint.checkpoint(block, *args, use_reentrant=False)
831 | else:
832 | result = block(*args)
833 | return result
834 |
835 | def process_input_hidden_states(
836 | self,
837 | latents, latent_indices=None,
838 | clean_latents=None, clean_latent_indices=None,
839 | clean_latents_2x=None, clean_latent_2x_indices=None,
840 | clean_latents_4x=None, clean_latent_4x_indices=None
841 | ):
842 | hidden_states = self.gradient_checkpointing_method(self.x_embedder.proj, latents)
843 | B, C, T, H, W = hidden_states.shape
844 |
845 | if latent_indices is None:
846 | latent_indices = torch.arange(0, T).unsqueeze(0).expand(B, -1)
847 |
848 | hidden_states = hidden_states.flatten(2).transpose(1, 2)
849 |
850 | rope_freqs = self.rope(frame_indices=latent_indices, height=H, width=W, device=hidden_states.device)
851 | rope_freqs = rope_freqs.flatten(2).transpose(1, 2)
852 |
853 | if clean_latents is not None and clean_latent_indices is not None:
854 | clean_latents = clean_latents.to(hidden_states)
855 | clean_latents = self.gradient_checkpointing_method(self.clean_x_embedder.proj, clean_latents)
856 | clean_latents = clean_latents.flatten(2).transpose(1, 2)
857 |
858 | clean_latent_rope_freqs = self.rope(frame_indices=clean_latent_indices, height=H, width=W, device=clean_latents.device)
859 | clean_latent_rope_freqs = clean_latent_rope_freqs.flatten(2).transpose(1, 2)
860 |
861 | hidden_states = torch.cat([clean_latents, hidden_states], dim=1)
862 | rope_freqs = torch.cat([clean_latent_rope_freqs, rope_freqs], dim=1)
863 |
864 | if clean_latents_2x is not None and clean_latent_2x_indices is not None:
865 | clean_latents_2x = clean_latents_2x.to(hidden_states)
866 | clean_latents_2x = pad_for_3d_conv(clean_latents_2x, (2, 4, 4))
867 | clean_latents_2x = self.gradient_checkpointing_method(self.clean_x_embedder.proj_2x, clean_latents_2x)
868 | clean_latents_2x = clean_latents_2x.flatten(2).transpose(1, 2)
869 |
870 | clean_latent_2x_rope_freqs = self.rope(frame_indices=clean_latent_2x_indices, height=H, width=W, device=clean_latents_2x.device)
871 | clean_latent_2x_rope_freqs = pad_for_3d_conv(clean_latent_2x_rope_freqs, (2, 2, 2))
872 | clean_latent_2x_rope_freqs = center_down_sample_3d(clean_latent_2x_rope_freqs, (2, 2, 2))
873 | clean_latent_2x_rope_freqs = clean_latent_2x_rope_freqs.flatten(2).transpose(1, 2)
874 |
875 | hidden_states = torch.cat([clean_latents_2x, hidden_states], dim=1)
876 | rope_freqs = torch.cat([clean_latent_2x_rope_freqs, rope_freqs], dim=1)
877 |
878 | if clean_latents_4x is not None and clean_latent_4x_indices is not None:
879 | clean_latents_4x = clean_latents_4x.to(hidden_states)
880 | clean_latents_4x = pad_for_3d_conv(clean_latents_4x, (4, 8, 8))
881 | clean_latents_4x = self.gradient_checkpointing_method(self.clean_x_embedder.proj_4x, clean_latents_4x)
882 | clean_latents_4x = clean_latents_4x.flatten(2).transpose(1, 2)
883 |
884 | clean_latent_4x_rope_freqs = self.rope(frame_indices=clean_latent_4x_indices, height=H, width=W, device=clean_latents_4x.device)
885 | clean_latent_4x_rope_freqs = pad_for_3d_conv(clean_latent_4x_rope_freqs, (4, 4, 4))
886 | clean_latent_4x_rope_freqs = center_down_sample_3d(clean_latent_4x_rope_freqs, (4, 4, 4))
887 | clean_latent_4x_rope_freqs = clean_latent_4x_rope_freqs.flatten(2).transpose(1, 2)
888 |
889 | hidden_states = torch.cat([clean_latents_4x, hidden_states], dim=1)
890 | rope_freqs = torch.cat([clean_latent_4x_rope_freqs, rope_freqs], dim=1)
891 |
892 | return hidden_states, rope_freqs
893 |
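# Note on token layout: the concatenations above place the coarsest context first, i.e.
# [clean_latents_4x | clean_latents_2x | clean_latents | noisy latents], with matching
# RoPE frequencies; forward() later recovers only the denoised tokens by slicing the
# last original_context_length positions.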
894 | def forward(
895 | self,
896 | hidden_states, timestep, encoder_hidden_states, encoder_attention_mask, pooled_projections, guidance,
897 | latent_indices=None,
898 | clean_latents=None, clean_latent_indices=None,
899 | clean_latents_2x=None, clean_latent_2x_indices=None,
900 | clean_latents_4x=None, clean_latent_4x_indices=None,
901 | image_embeddings=None,
902 | attention_kwargs=None, return_dict=True
903 | ):
904 |
905 | if attention_kwargs is None:
906 | attention_kwargs = {}
907 |
908 | batch_size, num_channels, num_frames, height, width = hidden_states.shape
909 | p, p_t = self.config['patch_size'], self.config['patch_size_t']
910 | post_patch_num_frames = num_frames // p_t
911 | post_patch_height = height // p
912 | post_patch_width = width // p
913 | original_context_length = post_patch_num_frames * post_patch_height * post_patch_width
914 |
915 | hidden_states, rope_freqs = self.process_input_hidden_states(hidden_states, latent_indices, clean_latents, clean_latent_indices, clean_latents_2x, clean_latent_2x_indices, clean_latents_4x, clean_latent_4x_indices)
916 |
917 | temb = self.gradient_checkpointing_method(self.time_text_embed, timestep, guidance, pooled_projections)
918 | encoder_hidden_states = self.gradient_checkpointing_method(self.context_embedder, encoder_hidden_states, timestep, encoder_attention_mask)
919 |
920 | if self.image_projection is not None:
921 | assert image_embeddings is not None, 'You must use image embeddings!'
922 | extra_encoder_hidden_states = self.gradient_checkpointing_method(self.image_projection, image_embeddings)
923 | extra_attention_mask = torch.ones((batch_size, extra_encoder_hidden_states.shape[1]), dtype=encoder_attention_mask.dtype, device=encoder_attention_mask.device)
924 |
925 | # must cat before (not after) encoder_hidden_states, due to attn masking
926 | encoder_hidden_states = torch.cat([extra_encoder_hidden_states, encoder_hidden_states], dim=1)
927 | encoder_attention_mask = torch.cat([extra_attention_mask, encoder_attention_mask], dim=1)
928 |
929 | with torch.no_grad():
930 | if batch_size == 1:
931 | # When batch size is 1, we do not need any masks or var-len attention functions, since cropping the padded text tokens is mathematically equivalent to masking them
932 | # If the cropped and masked paths ever disagree, the masked implementation is wrong; this cropped path is the reference behavior.
933 | text_len = encoder_attention_mask.sum().item()
934 | encoder_hidden_states = encoder_hidden_states[:, :text_len]
935 | attention_mask = None, None, None, None
936 | else:
937 | img_seq_len = hidden_states.shape[1]
938 | txt_seq_len = encoder_hidden_states.shape[1]
939 |
940 | cu_seqlens_q = get_cu_seqlens(encoder_attention_mask, img_seq_len)
941 | cu_seqlens_kv = cu_seqlens_q
942 | max_seqlen_q = img_seq_len + txt_seq_len
943 | max_seqlen_kv = max_seqlen_q
944 |
945 | attention_mask = cu_seqlens_q, cu_seqlens_kv, max_seqlen_q, max_seqlen_kv
946 |
947 | if self.enable_teacache:
948 | modulated_inp = self.transformer_blocks[0].norm1(hidden_states, emb=temb)[0]
949 |
950 | if self.cnt == 0 or self.cnt == self.num_steps-1:
951 | should_calc = True
952 | self.accumulated_rel_l1_distance = 0
953 | else:
954 | curr_rel_l1 = ((modulated_inp - self.previous_modulated_input).abs().mean() / self.previous_modulated_input.abs().mean()).cpu().item()
955 | self.accumulated_rel_l1_distance += self.teacache_rescale_func(curr_rel_l1)
956 | should_calc = self.accumulated_rel_l1_distance >= self.rel_l1_thresh
957 |
958 | if should_calc:
959 | self.accumulated_rel_l1_distance = 0
960 |
961 | self.previous_modulated_input = modulated_inp
962 | self.cnt += 1
963 |
964 | if self.cnt == self.num_steps:
965 | self.cnt = 0
966 |
967 | if not should_calc:
968 | hidden_states = hidden_states + self.previous_residual
969 | else:
970 | ori_hidden_states = hidden_states.clone()
971 |
972 | for block_id, block in enumerate(self.transformer_blocks):
973 | hidden_states, encoder_hidden_states = self.gradient_checkpointing_method(
974 | block,
975 | hidden_states,
976 | encoder_hidden_states,
977 | temb,
978 | attention_mask,
979 | rope_freqs
980 | )
981 |
982 | for block_id, block in enumerate(self.single_transformer_blocks):
983 | hidden_states, encoder_hidden_states = self.gradient_checkpointing_method(
984 | block,
985 | hidden_states,
986 | encoder_hidden_states,
987 | temb,
988 | attention_mask,
989 | rope_freqs
990 | )
991 |
992 | self.previous_residual = hidden_states - ori_hidden_states
993 | else:
994 | for block_id, block in enumerate(self.transformer_blocks):
995 | hidden_states, encoder_hidden_states = self.gradient_checkpointing_method(
996 | block,
997 | hidden_states,
998 | encoder_hidden_states,
999 | temb,
1000 | attention_mask,
1001 | rope_freqs
1002 | )
1003 |
1004 | for block_id, block in enumerate(self.single_transformer_blocks):
1005 | hidden_states, encoder_hidden_states = self.gradient_checkpointing_method(
1006 | block,
1007 | hidden_states,
1008 | encoder_hidden_states,
1009 | temb,
1010 | attention_mask,
1011 | rope_freqs
1012 | )
1013 |
1014 | hidden_states = self.gradient_checkpointing_method(self.norm_out, hidden_states, temb)
1015 |
1016 | hidden_states = hidden_states[:, -original_context_length:, :]
1017 |
1018 | if self.high_quality_fp32_output_for_inference:
1019 | hidden_states = hidden_states.to(dtype=torch.float32)
1020 | if self.proj_out.weight.dtype != torch.float32:
1021 | self.proj_out.to(dtype=torch.float32)
1022 |
1023 | hidden_states = self.gradient_checkpointing_method(self.proj_out, hidden_states)
1024 |
1025 | hidden_states = einops.rearrange(hidden_states, 'b (t h w) (c pt ph pw) -> b c (t pt) (h ph) (w pw)',
1026 | t=post_patch_num_frames, h=post_patch_height, w=post_patch_width,
1027 | pt=p_t, ph=p, pw=p)
1028 |
1029 | if return_dict:
1030 | return Transformer2DModelOutput(sample=hidden_states)
1031 |
1032 | return hidden_states,
1033 |
--------------------------------------------------------------------------------
/diffusers_helper/pipelines/__pycache__/k_diffusion_hunyuan.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HM-RunningHub/ComfyUI_RH_FramePack/ad48591fcdf6af01489305a339e25fe6fcff8869/diffusers_helper/pipelines/__pycache__/k_diffusion_hunyuan.cpython-310.pyc
--------------------------------------------------------------------------------
/diffusers_helper/pipelines/__pycache__/k_diffusion_hunyuan.cpython-312.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HM-RunningHub/ComfyUI_RH_FramePack/ad48591fcdf6af01489305a339e25fe6fcff8869/diffusers_helper/pipelines/__pycache__/k_diffusion_hunyuan.cpython-312.pyc
--------------------------------------------------------------------------------
/diffusers_helper/pipelines/k_diffusion_hunyuan.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import math
3 |
4 | from diffusers_helper.k_diffusion.uni_pc_fm import sample_unipc
5 | from diffusers_helper.k_diffusion.wrapper import fm_wrapper
6 | from diffusers_helper.utils import repeat_to_batch_size
7 |
8 |
9 | def flux_time_shift(t, mu=1.15, sigma=1.0):
10 | return math.exp(mu) / (math.exp(mu) + (1 / t - 1) ** sigma)
11 |
12 |
13 | def calculate_flux_mu(context_length, x1=256, y1=0.5, x2=4096, y2=1.15, exp_max=7.0):
14 | k = (y2 - y1) / (x2 - x1)
15 | b = y1 - k * x1
16 | mu = k * context_length + b
17 | mu = min(mu, math.log(exp_max))
18 | return mu
19 |
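# Worked example: with the defaults, mu interpolates linearly from 0.5 at a context
# length of 256 tokens to 1.15 at 4096 tokens, clamped at log(exp_max) = log(7) ~= 1.946
# for very long contexts. flux_time_shift(t, mu) then warps the sigma schedule: mu = 0
# leaves it unchanged, while larger mu pushes sigmas toward 1 (more steps at high noise).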
20 |
21 | def get_flux_sigmas_from_mu(n, mu):
22 | sigmas = torch.linspace(1, 0, steps=n + 1)
23 | sigmas = flux_time_shift(sigmas, mu=mu)
24 | return sigmas
25 |
26 |
27 | @torch.inference_mode()
28 | def sample_hunyuan(
29 | transformer,
30 | sampler='unipc',
31 | initial_latent=None,
32 | concat_latent=None,
33 | strength=1.0,
34 | width=512,
35 | height=512,
36 | frames=16,
37 | real_guidance_scale=1.0,
38 | distilled_guidance_scale=6.0,
39 | guidance_rescale=0.0,
40 | shift=None,
41 | num_inference_steps=25,
42 | batch_size=None,
43 | generator=None,
44 | prompt_embeds=None,
45 | prompt_embeds_mask=None,
46 | prompt_poolers=None,
47 | negative_prompt_embeds=None,
48 | negative_prompt_embeds_mask=None,
49 | negative_prompt_poolers=None,
50 | dtype=torch.bfloat16,
51 | device=None,
52 | negative_kwargs=None,
53 | callback=None,
54 | **kwargs,
55 | ):
56 | device = device or transformer.device
57 |
58 | if batch_size is None:
59 | batch_size = int(prompt_embeds.shape[0])
60 |
61 | latents = torch.randn((batch_size, 16, (frames + 3) // 4, height // 8, width // 8), generator=generator, device=generator.device if generator is not None else device).to(device=device, dtype=torch.float32)
62 |
63 | B, C, T, H, W = latents.shape
64 | seq_length = T * H * W // 4
65 |
66 | if shift is None:
67 | mu = calculate_flux_mu(seq_length, exp_max=7.0)
68 | else:
69 | mu = math.log(shift)
70 |
71 | sigmas = get_flux_sigmas_from_mu(num_inference_steps, mu).to(device)
72 |
73 | k_model = fm_wrapper(transformer)
74 |
75 | if initial_latent is not None:
76 | sigmas = sigmas * strength
77 | first_sigma = sigmas[0].to(device=device, dtype=torch.float32)
78 | initial_latent = initial_latent.to(device=device, dtype=torch.float32)
79 | latents = initial_latent.float() * (1.0 - first_sigma) + latents.float() * first_sigma
80 |
81 | if concat_latent is not None:
82 | concat_latent = concat_latent.to(latents)
83 |
84 | distilled_guidance = torch.tensor([distilled_guidance_scale * 1000.0] * batch_size).to(device=device, dtype=dtype)
85 |
86 | prompt_embeds = repeat_to_batch_size(prompt_embeds, batch_size)
87 | prompt_embeds_mask = repeat_to_batch_size(prompt_embeds_mask, batch_size)
88 | prompt_poolers = repeat_to_batch_size(prompt_poolers, batch_size)
89 | negative_prompt_embeds = repeat_to_batch_size(negative_prompt_embeds, batch_size)
90 | negative_prompt_embeds_mask = repeat_to_batch_size(negative_prompt_embeds_mask, batch_size)
91 | negative_prompt_poolers = repeat_to_batch_size(negative_prompt_poolers, batch_size)
92 | concat_latent = repeat_to_batch_size(concat_latent, batch_size)
93 |
94 | sampler_kwargs = dict(
95 | dtype=dtype,
96 | cfg_scale=real_guidance_scale,
97 | cfg_rescale=guidance_rescale,
98 | concat_latent=concat_latent,
99 | positive=dict(
100 | pooled_projections=prompt_poolers,
101 | encoder_hidden_states=prompt_embeds,
102 | encoder_attention_mask=prompt_embeds_mask,
103 | guidance=distilled_guidance,
104 | **kwargs,
105 | ),
106 | negative=dict(
107 | pooled_projections=negative_prompt_poolers,
108 | encoder_hidden_states=negative_prompt_embeds,
109 | encoder_attention_mask=negative_prompt_embeds_mask,
110 | guidance=distilled_guidance,
111 | **(kwargs if negative_kwargs is None else {**kwargs, **negative_kwargs}),
112 | )
113 | )
114 |
115 | if sampler == 'unipc':
116 | results = sample_unipc(k_model, latents, sigmas, extra_args=sampler_kwargs, disable=False, callback=callback)
117 | else:
118 | raise NotImplementedError(f'Sampler {sampler} is not supported.')
119 |
120 | return results
121 |
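# Usage sketch (illustrative; the tensor names are assumptions): the prompt tensors are
# the outputs of encode_prompt_conds / crop_or_pad_yield_mask, and FramePack conditioning
# (image_embeddings, clean_latents, latent_indices, ...) is forwarded through **kwargs to
# HunyuanVideoTransformer3DModelPacked.forward:
#
#   frames = sample_hunyuan(
#       transformer, width=640, height=640, frames=33, num_inference_steps=25,
#       real_guidance_scale=1.0, distilled_guidance_scale=10.0,
#       generator=torch.Generator('cuda').manual_seed(0),
#       prompt_embeds=llama_vec, prompt_embeds_mask=llama_attention_mask, prompt_poolers=clip_l_pooler,
#       negative_prompt_embeds=llama_vec_n, negative_prompt_embeds_mask=llama_attention_mask_n,
#       negative_prompt_poolers=clip_l_pooler_n,
#       image_embeddings=image_encoder_last_hidden_state,
#   )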
--------------------------------------------------------------------------------
/diffusers_helper/thread_utils.py:
--------------------------------------------------------------------------------
1 | import time
2 |
3 | from threading import Thread, Lock
4 |
5 |
6 | class Listener:
7 | task_queue = []
8 | lock = Lock()
9 | thread = None
10 |
11 | @classmethod
12 | def _process_tasks(cls):
13 | while True:
14 | task = None
15 | with cls.lock:
16 | if cls.task_queue:
17 | task = cls.task_queue.pop(0)
18 |
19 | if task is None:
20 | time.sleep(0.001)
21 | continue
22 |
23 | func, args, kwargs = task
24 | try:
25 | func(*args, **kwargs)
26 | except Exception as e:
27 | print(f"Error in listener thread: {e}")
28 |
29 | @classmethod
30 | def add_task(cls, func, *args, **kwargs):
31 | with cls.lock:
32 | cls.task_queue.append((func, args, kwargs))
33 |
34 | if cls.thread is None:
35 | cls.thread = Thread(target=cls._process_tasks, daemon=True)
36 | cls.thread.start()
37 |
38 |
39 | def async_run(func, *args, **kwargs):
40 | Listener.add_task(func, *args, **kwargs)
41 |
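# Usage sketch: async_run(fn, *args, **kwargs) appends the call to a shared queue that a
# single daemon worker thread drains, so queued tasks execute sequentially in FIFO order.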
42 |
43 | class FIFOQueue:
44 | def __init__(self):
45 | self.queue = []
46 | self.lock = Lock()
47 |
48 | def push(self, item):
49 | with self.lock:
50 | self.queue.append(item)
51 |
52 | def pop(self):
53 | with self.lock:
54 | if self.queue:
55 | return self.queue.pop(0)
56 | return None
57 |
58 | def top(self):
59 | with self.lock:
60 | if self.queue:
61 | return self.queue[0]
62 | return None
63 |
64 | def next(self):
65 | while True:
66 | with self.lock:
67 | if self.queue:
68 | return self.queue.pop(0)
69 |
70 | time.sleep(0.001)
71 |
72 |
73 | class AsyncStream:
74 | def __init__(self):
75 | self.input_queue = FIFOQueue()
76 | self.output_queue = FIFOQueue()
77 |
--------------------------------------------------------------------------------
/diffusers_helper/utils.py:
--------------------------------------------------------------------------------
1 | import os
2 | import cv2
3 | import json
4 | import random
5 | import glob
6 | import torch
7 | import einops
8 | import numpy as np
9 | import datetime
10 | import torchvision
11 |
12 | import safetensors.torch as sf
13 | from PIL import Image
14 |
15 |
16 | def min_resize(x, m):
17 | if x.shape[0] < x.shape[1]:
18 | s0 = m
19 | s1 = int(float(m) / float(x.shape[0]) * float(x.shape[1]))
20 | else:
21 | s0 = int(float(m) / float(x.shape[1]) * float(x.shape[0]))
22 | s1 = m
23 | new_max = max(s1, s0)
24 | raw_max = max(x.shape[0], x.shape[1])
25 | if new_max < raw_max:
26 | interpolation = cv2.INTER_AREA
27 | else:
28 | interpolation = cv2.INTER_LANCZOS4
29 | y = cv2.resize(x, (s1, s0), interpolation=interpolation)
30 | return y
31 |
32 |
33 | def d_resize(x, y):
34 | H, W, C = y.shape
35 | new_min = min(H, W)
36 | raw_min = min(x.shape[0], x.shape[1])
37 | if new_min < raw_min:
38 | interpolation = cv2.INTER_AREA
39 | else:
40 | interpolation = cv2.INTER_LANCZOS4
41 | y = cv2.resize(x, (W, H), interpolation=interpolation)
42 | return y
43 |
44 |
45 | def resize_and_center_crop(image, target_width, target_height):
46 | if target_height == image.shape[0] and target_width == image.shape[1]:
47 | return image
48 |
49 | pil_image = Image.fromarray(image)
50 | original_width, original_height = pil_image.size
51 | scale_factor = max(target_width / original_width, target_height / original_height)
52 | resized_width = int(round(original_width * scale_factor))
53 | resized_height = int(round(original_height * scale_factor))
54 | resized_image = pil_image.resize((resized_width, resized_height), Image.LANCZOS)
55 | left = (resized_width - target_width) / 2
56 | top = (resized_height - target_height) / 2
57 | right = (resized_width + target_width) / 2
58 | bottom = (resized_height + target_height) / 2
59 | cropped_image = resized_image.crop((left, top, right, bottom))
60 | return np.array(cropped_image)
61 |
62 |
63 | def resize_and_center_crop_pytorch(image, target_width, target_height):
64 | B, C, H, W = image.shape
65 |
66 | if H == target_height and W == target_width:
67 | return image
68 |
69 | scale_factor = max(target_width / W, target_height / H)
70 | resized_width = int(round(W * scale_factor))
71 | resized_height = int(round(H * scale_factor))
72 |
73 | resized = torch.nn.functional.interpolate(image, size=(resized_height, resized_width), mode='bilinear', align_corners=False)
74 |
75 | top = (resized_height - target_height) // 2
76 | left = (resized_width - target_width) // 2
77 | cropped = resized[:, :, top:top + target_height, left:left + target_width]
78 |
79 | return cropped
80 |
81 |
82 | def resize_without_crop(image, target_width, target_height):
83 | if target_height == image.shape[0] and target_width == image.shape[1]:
84 | return image
85 |
86 | pil_image = Image.fromarray(image)
87 | resized_image = pil_image.resize((target_width, target_height), Image.LANCZOS)
88 | return np.array(resized_image)
89 |
90 |
91 | def just_crop(image, w, h):
92 | if h == image.shape[0] and w == image.shape[1]:
93 | return image
94 |
95 | original_height, original_width = image.shape[:2]
96 | k = min(original_height / h, original_width / w)
97 | new_width = int(round(w * k))
98 | new_height = int(round(h * k))
99 | x_start = (original_width - new_width) // 2
100 | y_start = (original_height - new_height) // 2
101 | cropped_image = image[y_start:y_start + new_height, x_start:x_start + new_width]
102 | return cropped_image
103 |
104 |
105 | def write_to_json(data, file_path):
106 | temp_file_path = file_path + ".tmp"
107 | with open(temp_file_path, 'wt', encoding='utf-8') as temp_file:
108 | json.dump(data, temp_file, indent=4)
109 | os.replace(temp_file_path, file_path)
110 | return
111 |
112 |
113 | def read_from_json(file_path):
114 | with open(file_path, 'rt', encoding='utf-8') as file:
115 | data = json.load(file)
116 | return data
117 |
118 |
119 | def get_active_parameters(m):
120 | return {k: v for k, v in m.named_parameters() if v.requires_grad}
121 |
122 |
123 | def cast_training_params(m, dtype=torch.float32):
124 | result = {}
125 | for n, param in m.named_parameters():
126 | if param.requires_grad:
127 | param.data = param.to(dtype)
128 | result[n] = param
129 | return result
130 |
131 |
132 | def separate_lora_AB(parameters, B_patterns=None):
133 | parameters_normal = {}
134 | parameters_B = {}
135 |
136 | if B_patterns is None:
137 | B_patterns = ['.lora_B.', '__zero__']
138 |
139 | for k, v in parameters.items():
140 | if any(B_pattern in k for B_pattern in B_patterns):
141 | parameters_B[k] = v
142 | else:
143 | parameters_normal[k] = v
144 |
145 | return parameters_normal, parameters_B
146 |
147 |
148 | def set_attr_recursive(obj, attr, value):
149 | attrs = attr.split(".")
150 | for name in attrs[:-1]:
151 | obj = getattr(obj, name)
152 | setattr(obj, attrs[-1], value)
153 | return
154 |
155 |
156 | def print_tensor_list_size(tensors):
157 | total_size = 0
158 | total_elements = 0
159 |
160 | if isinstance(tensors, dict):
161 | tensors = tensors.values()
162 |
163 | for tensor in tensors:
164 | total_size += tensor.nelement() * tensor.element_size()
165 | total_elements += tensor.nelement()
166 |
167 | total_size_MB = total_size / (1024 ** 2)
168 | total_elements_B = total_elements / 1e9
169 |
170 | print(f"Total number of tensors: {len(tensors)}")
171 | print(f"Total size of tensors: {total_size_MB:.2f} MB")
172 | print(f"Total number of parameters: {total_elements_B:.3f} billion")
173 | return
174 |
175 |
176 | @torch.no_grad()
177 | def batch_mixture(a, b=None, probability_a=0.5, mask_a=None):
178 | batch_size = a.size(0)
179 |
180 | if b is None:
181 | b = torch.zeros_like(a)
182 |
183 | if mask_a is None:
184 | mask_a = torch.rand(batch_size) < probability_a
185 |
186 | mask_a = mask_a.to(a.device)
187 | mask_a = mask_a.reshape((batch_size,) + (1,) * (a.dim() - 1))
188 | result = torch.where(mask_a, a, b)
189 | return result
190 |
191 |
192 | @torch.no_grad()
193 | def zero_module(module):
194 | for p in module.parameters():
195 | p.detach().zero_()
196 | return module
197 |
198 |
199 | @torch.no_grad()
200 | def supress_lower_channels(m, k, alpha=0.01):
201 | data = m.weight.data.clone()
202 |
203 | assert int(data.shape[1]) >= k
204 |
205 | data[:, :k] = data[:, :k] * alpha
206 | m.weight.data = data.contiguous().clone()
207 | return m
208 |
209 |
210 | def freeze_module(m):
211 | if not hasattr(m, '_forward_inside_frozen_module'):
212 | m._forward_inside_frozen_module = m.forward
213 | m.requires_grad_(False)
214 | m.forward = torch.no_grad()(m.forward)
215 | return m
216 |
217 |
218 | def get_latest_safetensors(folder_path):
219 | safetensors_files = glob.glob(os.path.join(folder_path, '*.safetensors'))
220 |
221 | if not safetensors_files:
222 | raise ValueError('No file to resume!')
223 |
224 | latest_file = max(safetensors_files, key=os.path.getmtime)
225 | latest_file = os.path.abspath(os.path.realpath(latest_file))
226 | return latest_file
227 |
228 |
229 | def generate_random_prompt_from_tags(tags_str, min_length=3, max_length=32):
230 | tags = tags_str.split(', ')
231 | tags = random.sample(tags, k=min(random.randint(min_length, max_length), len(tags)))
232 | prompt = ', '.join(tags)
233 | return prompt
234 |
235 |
236 | def interpolate_numbers(a, b, n, round_to_int=False, gamma=1.0):
237 | numbers = a + (b - a) * (np.linspace(0, 1, n) ** gamma)
238 | if round_to_int:
239 | numbers = np.round(numbers).astype(int)
240 | return numbers.tolist()
241 |
242 |
243 | def uniform_random_by_intervals(inclusive, exclusive, n, round_to_int=False):
244 | edges = np.linspace(0, 1, n + 1)
245 | points = np.random.uniform(edges[:-1], edges[1:])
246 | numbers = inclusive + (exclusive - inclusive) * points
247 | if round_to_int:
248 | numbers = np.round(numbers).astype(int)
249 | return numbers.tolist()
250 |
251 |
252 | def soft_append_bcthw(history, current, overlap=0):
253 | if overlap <= 0:
254 | return torch.cat([history, current], dim=2)
255 |
256 | assert history.shape[2] >= overlap, f"History length ({history.shape[2]}) must be >= overlap ({overlap})"
257 | assert current.shape[2] >= overlap, f"Current length ({current.shape[2]}) must be >= overlap ({overlap})"
258 |
259 | weights = torch.linspace(1, 0, overlap, dtype=history.dtype, device=history.device).view(1, 1, -1, 1, 1)
260 | blended = weights * history[:, :, -overlap:] + (1 - weights) * current[:, :, :overlap]
261 | output = torch.cat([history[:, :, :-overlap], blended, current[:, :, overlap:]], dim=2)
262 |
263 | return output.to(history)
264 |
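# Worked example: with overlap=3 the blend weights are torch.linspace(1, 0, 3) = [1.0, 0.5, 0.0],
# so the last 3 frames of `history` crossfade linearly into the first 3 frames of `current`
# along the time dimension before the remaining frames are concatenated.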
265 |
266 | def save_bcthw_as_mp4(x, output_filename, fps=10):
267 | b, c, t, h, w = x.shape
268 |
269 | per_row = b
270 | for p in [6, 5, 4, 3, 2]:
271 | if b % p == 0:
272 | per_row = p
273 | break
274 |
275 | os.makedirs(os.path.dirname(os.path.abspath(os.path.realpath(output_filename))), exist_ok=True)
276 | x = torch.clamp(x.float(), -1., 1.) * 127.5 + 127.5
277 | x = x.detach().cpu().to(torch.uint8)
278 | x = einops.rearrange(x, '(m n) c t h w -> t (m h) (n w) c', n=per_row)
279 | torchvision.io.write_video(output_filename, x, fps=fps, video_codec='h264', options={'crf': '0'})
280 | return x
281 |
282 |
283 | def save_bcthw_as_png(x, output_filename):
284 | os.makedirs(os.path.dirname(os.path.abspath(os.path.realpath(output_filename))), exist_ok=True)
285 | x = torch.clamp(x.float(), -1., 1.) * 127.5 + 127.5
286 | x = x.detach().cpu().to(torch.uint8)
287 | x = einops.rearrange(x, 'b c t h w -> c (b h) (t w)')
288 | torchvision.io.write_png(x, output_filename)
289 | return output_filename
290 |
291 |
292 | def save_bchw_as_png(x, output_filename):
293 | os.makedirs(os.path.dirname(os.path.abspath(os.path.realpath(output_filename))), exist_ok=True)
294 | x = torch.clamp(x.float(), -1., 1.) * 127.5 + 127.5
295 | x = x.detach().cpu().to(torch.uint8)
296 | x = einops.rearrange(x, 'b c h w -> c h (b w)')
297 | torchvision.io.write_png(x, output_filename)
298 | return output_filename
299 |
300 |
301 | def add_tensors_with_padding(tensor1, tensor2):
302 | if tensor1.shape == tensor2.shape:
303 | return tensor1 + tensor2
304 |
305 | shape1 = tensor1.shape
306 | shape2 = tensor2.shape
307 |
308 | new_shape = tuple(max(s1, s2) for s1, s2 in zip(shape1, shape2))
309 |
310 | padded_tensor1 = torch.zeros(new_shape, dtype=tensor1.dtype, device=tensor1.device)
311 | padded_tensor2 = torch.zeros(new_shape, dtype=tensor2.dtype, device=tensor2.device)
312 |
313 | padded_tensor1[tuple(slice(0, s) for s in shape1)] = tensor1
314 | padded_tensor2[tuple(slice(0, s) for s in shape2)] = tensor2
315 |
316 | result = padded_tensor1 + padded_tensor2
317 | return result
318 |
319 |
320 | def print_free_mem():
321 | torch.cuda.empty_cache()
322 | free_mem, total_mem = torch.cuda.mem_get_info(0)
323 | free_mem_mb = free_mem / (1024 ** 2)
324 | total_mem_mb = total_mem / (1024 ** 2)
325 | print(f"Free memory: {free_mem_mb:.2f} MB")
326 | print(f"Total memory: {total_mem_mb:.2f} MB")
327 | return
328 |
329 |
330 | def print_gpu_parameters(device, state_dict, log_count=1):
331 | summary = {"device": device, "keys_count": len(state_dict)}
332 |
333 | logged_params = {}
334 | for i, (key, tensor) in enumerate(state_dict.items()):
335 | if i >= log_count:
336 | break
337 | logged_params[key] = tensor.flatten()[:3].tolist()
338 |
339 | summary["params"] = logged_params
340 |
341 | print(str(summary))
342 | return
343 |
344 |
345 | def visualize_txt_as_img(width, height, text, font_path='font/DejaVuSans.ttf', size=18):
346 | from PIL import Image, ImageDraw, ImageFont
347 |
348 | txt = Image.new("RGB", (width, height), color="white")
349 | draw = ImageDraw.Draw(txt)
350 | font = ImageFont.truetype(font_path, size=size)
351 |
352 | if text == '':
353 | return np.array(txt)
354 |
355 | # Split text into lines that fit within the image width
356 | lines = []
357 | words = text.split()
358 | current_line = words[0]
359 |
360 | for word in words[1:]:
361 | line_with_word = f"{current_line} {word}"
362 | if draw.textbbox((0, 0), line_with_word, font=font)[2] <= width:
363 | current_line = line_with_word
364 | else:
365 | lines.append(current_line)
366 | current_line = word
367 |
368 | lines.append(current_line)
369 |
370 | # Draw the text line by line
371 | y = 0
372 | line_height = draw.textbbox((0, 0), "A", font=font)[3]
373 |
374 | for line in lines:
375 | if y + line_height > height:
376 | break # stop drawing if the next line will be outside the image
377 | draw.text((0, y), line, fill="black", font=font)
378 | y += line_height
379 |
380 | return np.array(txt)
381 |
382 |
383 | def blue_mark(x):
384 | x = x.copy()
385 | c = x[:, :, 2]
386 | b = cv2.blur(c, (9, 9))
387 | x[:, :, 2] = ((c - b) * 16.0 + b).clip(-1, 1)
388 | return x
389 |
390 |
391 | def green_mark(x):
392 | x = x.copy()
393 | x[:, :, 2] = -1
394 | x[:, :, 0] = -1
395 | return x
396 |
397 |
398 | def frame_mark(x):
399 | x = x.copy()
400 | x[:64] = -1
401 | x[-64:] = -1
402 | x[:, :8] = 1
403 | x[:, -8:] = 1
404 | return x
405 |
406 |
407 | @torch.inference_mode()
408 | def pytorch2numpy(imgs):
409 | results = []
410 | for x in imgs:
411 | y = x.movedim(0, -1)
412 | y = y * 127.5 + 127.5
413 | y = y.detach().float().cpu().numpy().clip(0, 255).astype(np.uint8)
414 | results.append(y)
415 | return results
416 |
417 |
418 | @torch.inference_mode()
419 | def numpy2pytorch(imgs):
420 | h = torch.from_numpy(np.stack(imgs, axis=0)).float() / 127.5 - 1.0
421 | h = h.movedim(-1, 1)
422 | return h
423 |
424 |
425 | @torch.no_grad()
426 | def duplicate_prefix_to_suffix(x, count, zero_out=False):
427 | if zero_out:
428 | return torch.cat([x, torch.zeros_like(x[:count])], dim=0)
429 | else:
430 | return torch.cat([x, x[:count]], dim=0)
431 |
432 |
433 | def weighted_mse(a, b, weight):
434 | return torch.mean(weight.float() * (a.float() - b.float()) ** 2)
435 |
436 |
437 | def clamped_linear_interpolation(x, x_min, y_min, x_max, y_max, sigma=1.0):
438 | x = (x - x_min) / (x_max - x_min)
439 | x = max(0.0, min(x, 1.0))
440 | x = x ** sigma
441 | return y_min + x * (y_max - y_min)
442 |
443 |
444 | def expand_to_dims(x, target_dims):
445 | return x.view(*x.shape, *([1] * max(0, target_dims - x.dim())))
446 |
447 |
448 | def repeat_to_batch_size(tensor: torch.Tensor, batch_size: int):
449 | if tensor is None:
450 | return None
451 |
452 | first_dim = tensor.shape[0]
453 |
454 | if first_dim == batch_size:
455 | return tensor
456 |
457 | if batch_size % first_dim != 0:
458 | raise ValueError(f"Cannot evenly repeat first dim {first_dim} to match batch_size {batch_size}.")
459 |
460 | repeat_times = batch_size // first_dim
461 |
462 | return tensor.repeat(repeat_times, *[1] * (tensor.dim() - 1))
463 |
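# Example: a (1, 77, 768) prompt embedding with batch_size=2 becomes (2, 77, 768); a first
# dimension that does not evenly divide batch_size (e.g. 2 -> 3) raises ValueError.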
464 |
465 | def dim5(x):
466 | return expand_to_dims(x, 5)
467 |
468 |
469 | def dim4(x):
470 | return expand_to_dims(x, 4)
471 |
472 |
473 | def dim3(x):
474 | return expand_to_dims(x, 3)
475 |
476 |
477 | def crop_or_pad_yield_mask(x, length):
478 | B, F, C = x.shape
479 | device = x.device
480 | dtype = x.dtype
481 |
482 | if F < length:
483 | y = torch.zeros((B, length, C), dtype=dtype, device=device)
484 | mask = torch.zeros((B, length), dtype=torch.bool, device=device)
485 | y[:, :F, :] = x
486 | mask[:, :F] = True
487 | return y, mask
488 |
489 | return x[:, :length, :], torch.ones((B, length), dtype=torch.bool, device=device)
490 |
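# Example: x of shape (B, 60, C) with length=512 is zero-padded to (B, 512, C) with a boolean
# mask that is True only for the first 60 positions; if F >= length, x is cropped and the
# mask is all True.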
491 |
492 | def extend_dim(x, dim, minimal_length, zero_pad=False):
493 | original_length = int(x.shape[dim])
494 |
495 | if original_length >= minimal_length:
496 | return x
497 |
498 | if zero_pad:
499 | padding_shape = list(x.shape)
500 | padding_shape[dim] = minimal_length - original_length
501 | padding = torch.zeros(padding_shape, dtype=x.dtype, device=x.device)
502 | else:
503 | idx = (slice(None),) * dim + (slice(-1, None),) + (slice(None),) * (len(x.shape) - dim - 1)
504 | last_element = x[idx]
505 | padding = last_element.repeat_interleave(minimal_length - original_length, dim=dim)
506 |
507 | return torch.cat([x, padding], dim=dim)
508 |
509 |
510 | def lazy_positional_encoding(t, repeats=None):
511 | if not isinstance(t, list):
512 | t = [t]
513 |
514 | from diffusers.models.embeddings import get_timestep_embedding
515 |
516 | te = torch.tensor(t)
517 | te = get_timestep_embedding(timesteps=te, embedding_dim=256, flip_sin_to_cos=True, downscale_freq_shift=0.0, scale=1.0)
518 |
519 | if repeats is None:
520 | return te
521 |
522 | te = te[:, None, :].expand(-1, repeats, -1)
523 |
524 | return te
525 |
526 |
527 | def state_dict_offset_merge(A, B, C=None):
528 | result = {}
529 | keys = A.keys()
530 |
531 | for key in keys:
532 | A_value = A[key]
533 | B_value = B[key].to(A_value)
534 |
535 | if C is None:
536 | result[key] = A_value + B_value
537 | else:
538 | C_value = C[key].to(A_value)
539 | result[key] = A_value + B_value - C_value
540 |
541 | return result
542 |
543 |
544 | def state_dict_weighted_merge(state_dicts, weights):
545 | if len(state_dicts) != len(weights):
546 | raise ValueError("Number of state dictionaries must match number of weights")
547 |
548 | if not state_dicts:
549 | return {}
550 |
551 | total_weight = sum(weights)
552 |
553 | if total_weight == 0:
554 | raise ValueError("Sum of weights cannot be zero")
555 |
556 | normalized_weights = [w / total_weight for w in weights]
557 |
558 | keys = state_dicts[0].keys()
559 | result = {}
560 |
561 | for key in keys:
562 | result[key] = state_dicts[0][key] * normalized_weights[0]
563 |
564 | for i in range(1, len(state_dicts)):
565 | state_dict_value = state_dicts[i][key].to(result[key])
566 | result[key] += state_dict_value * normalized_weights[i]
567 |
568 | return result
569 |
570 |
571 | def group_files_by_folder(all_files):
572 | grouped_files = {}
573 |
574 | for file in all_files:
575 | folder_name = os.path.basename(os.path.dirname(file))
576 | if folder_name not in grouped_files:
577 | grouped_files[folder_name] = []
578 | grouped_files[folder_name].append(file)
579 |
580 | list_of_lists = list(grouped_files.values())
581 | return list_of_lists
582 |
583 |
584 | def generate_timestamp():
585 | now = datetime.datetime.now()
586 | timestamp = now.strftime('%y%m%d_%H%M%S')
587 | milliseconds = f"{int(now.microsecond / 1000):03d}"
588 | random_number = random.randint(0, 9999)
589 | return f"{timestamp}_{milliseconds}_{random_number}"
590 |
591 |
592 | def write_PIL_image_with_png_info(image, metadata, path):
593 | from PIL.PngImagePlugin import PngInfo
594 |
595 | png_info = PngInfo()
596 | for key, value in metadata.items():
597 | png_info.add_text(key, value)
598 |
599 | image.save(path, "PNG", pnginfo=png_info)
600 | return image
601 |
602 |
603 | def torch_safe_save(content, path):
604 | torch.save(content, path + '_tmp')
605 | os.replace(path + '_tmp', path)
606 | return path
607 |
608 |
609 | def move_optimizer_to_device(optimizer, device):
610 | for state in optimizer.state.values():
611 | for k, v in state.items():
612 | if isinstance(v, torch.Tensor):
613 | state[k] = v.to(device)
614 |
--------------------------------------------------------------------------------
/examples/FramePack_endimage.json:
--------------------------------------------------------------------------------
1 | {
2 | "last_node_id": 18,
3 | "last_link_id": 24,
4 | "nodes": [
5 | {
6 | "id": 15,
7 | "type": "LoadImage",
8 | "pos": [
9 | 1632.1324462890625,
10 | 756.5489501953125
11 | ],
12 | "size": [
13 | 467.89154052734375,
14 | 535.3126220703125
15 | ],
16 | "flags": {},
17 | "order": 0,
18 | "mode": 0,
19 | "inputs": [],
20 | "outputs": [
21 | {
22 | "name": "IMAGE",
23 | "label": "IMAGE",
24 | "type": "IMAGE",
25 | "links": [
26 | 20
27 | ],
28 | "slot_index": 0
29 | },
30 | {
31 | "name": "MASK",
32 | "label": "MASK",
33 | "type": "MASK"
34 | }
35 | ],
36 | "properties": {
37 | "cnr_id": "comfy-core",
38 | "ver": "0.3.28",
39 | "Node name for S&R": "LoadImage"
40 | },
41 | "widgets_values": [
42 | "ComfyUI_temp_ibecr_00005_lqaex_1744979262.png",
43 | "image"
44 | ]
45 | },
46 | {
47 | "id": 2,
48 | "type": "LoadImage",
49 | "pos": [
50 | 1617.58544921875,
51 | 152.19729614257812
52 | ],
53 | "size": [
54 | 467.89154052734375,
55 | 535.3126220703125
56 | ],
57 | "flags": {},
58 | "order": 1,
59 | "mode": 0,
60 | "inputs": [],
61 | "outputs": [
62 | {
63 | "name": "IMAGE",
64 | "label": "IMAGE",
65 | "type": "IMAGE",
66 | "links": [
67 | 21
68 | ]
69 | },
70 | {
71 | "name": "MASK",
72 | "label": "MASK",
73 | "type": "MASK",
74 | "slot_index": 1
75 | }
76 | ],
77 | "properties": {
78 | "cnr_id": "comfy-core",
79 | "ver": "0.3.28",
80 | "Node name for S&R": "LoadImage"
81 | },
82 | "widgets_values": [
83 | "ComfyUI_temp_ibecr_00009_kzhcr_1744979266.png",
84 | "image"
85 | ]
86 | },
87 | {
88 | "id": 18,
89 | "type": "PreviewImage",
90 | "pos": [
91 | 2159.90087890625,
92 | 805.84716796875
93 | ],
94 | "size": [
95 | 591.8181762695312,
96 | 519.8636474609375
97 | ],
98 | "flags": {},
99 | "order": 3,
100 | "mode": 0,
101 | "inputs": [
102 | {
103 | "name": "images",
104 | "label": "images",
105 | "type": "IMAGE",
106 | "link": 24
107 | }
108 | ],
109 | "outputs": [],
110 | "properties": {
111 | "cnr_id": "comfy-core",
112 | "ver": "0.3.28",
113 | "Node name for S&R": "PreviewImage"
114 | },
115 | "widgets_values": []
116 | },
117 | {
118 | "id": 17,
119 | "type": "RunningHub_FramePack",
120 | "pos": [
121 | 2207.928466796875,
122 | 465.5581970214844
123 | ],
124 | "size": [
125 | 400,
126 | 252
127 | ],
128 | "flags": {},
129 | "order": 2,
130 | "mode": 0,
131 | "inputs": [
132 | {
133 | "name": "ref_image",
134 | "label": "ref_image",
135 | "type": "IMAGE",
136 | "link": 21
137 | },
138 | {
139 | "name": "end_image",
140 | "label": "end_image",
141 | "type": "IMAGE",
142 | "shape": 7,
143 | "link": 20
144 | }
145 | ],
146 | "outputs": [
147 | {
148 | "name": "frames",
149 | "label": "frames",
150 | "type": "IMAGE",
151 | "links": [
152 | 22,
153 | 24
154 | ],
155 | "slot_index": 0
156 | },
157 | {
158 | "name": "fps",
159 | "label": "fps",
160 | "type": "FLOAT",
161 | "links": [
162 | 23
163 | ],
164 | "slot_index": 1
165 | }
166 | ],
167 | "properties": {
168 | "aux_id": "HM-RunningHub/ComfyUI_RH_FramePack",
169 | "ver": "c688eb1533f8984a5ea5d2db08496ebb6da0a602",
170 | "Node name for S&R": "RunningHub_FramePack"
171 | },
172 | "widgets_values": [
173 | "Advanced video dynamic shots\n\n",
174 | 3,
175 | 932,
176 | "randomize",
177 | 25,
178 | true,
179 | 1,
180 | [
181 | false,
182 | true
183 | ]
184 | ]
185 | },
186 | {
187 | "id": 3,
188 | "type": "VHS_VideoCombine",
189 | "pos": [
190 | 2928.890625,
191 | 427.95306396484375
192 | ],
193 | "size": [
194 | 472.8837890625,
195 | 820.8837890625
196 | ],
197 | "flags": {},
198 | "order": 4,
199 | "mode": 0,
200 | "inputs": [
201 | {
202 | "name": "images",
203 | "label": "images",
204 | "type": "IMAGE",
205 | "link": 22
206 | },
207 | {
208 | "name": "audio",
209 | "label": "audio",
210 | "type": "AUDIO",
211 | "shape": 7
212 | },
213 | {
214 | "name": "meta_batch",
215 | "label": "meta_batch",
216 | "type": "VHS_BatchManager",
217 | "shape": 7
218 | },
219 | {
220 | "name": "vae",
221 | "label": "vae",
222 | "type": "VAE",
223 | "shape": 7
224 | },
225 | {
226 | "name": "frame_rate",
227 | "label": "frame_rate",
228 | "type": "FLOAT",
229 | "widget": {
230 | "name": "frame_rate"
231 | },
232 | "link": 23
233 | }
234 | ],
235 | "outputs": [
236 | {
237 | "name": "Filenames",
238 | "label": "Filenames",
239 | "type": "VHS_FILENAMES"
240 | }
241 | ],
242 | "properties": {
243 | "cnr_id": "comfyui-videohelpersuite",
244 | "ver": "df55f01d1df2f7bf5cc772294bc2e6d8bab22d66",
245 | "Node name for S&R": "VHS_VideoCombine"
246 | },
247 | "widgets_values": {
248 | "frame_rate": 8,
249 | "loop_count": 0,
250 | "filename_prefix": "AnimateDiff",
251 | "format": "video/h264-mp4",
252 | "pix_fmt": "yuv420p",
253 | "crf": 19,
254 | "save_metadata": true,
255 | "trim_to_audio": false,
256 | "pingpong": false,
257 | "save_output": true,
258 | "videopreview": {
259 | "paused": false,
260 | "hidden": false,
261 | "params": {
262 | "filename": "AnimateDiff_00004.mp4",
263 | "workflow": "AnimateDiff_00004.png",
264 | "fullpath": "D:\\ComfyUI_windows_portable\\ComfyUI\\output\\AnimateDiff_00004.mp4",
265 | "format": "video/h264-mp4",
266 | "subfolder": "",
267 | "type": "output",
268 | "frame_rate": 30
269 | },
270 | "muted": false
271 | }
272 | }
273 | }
274 | ],
275 | "links": [
276 | [
277 | 20,
278 | 15,
279 | 0,
280 | 17,
281 | 1,
282 | "IMAGE"
283 | ],
284 | [
285 | 21,
286 | 2,
287 | 0,
288 | 17,
289 | 0,
290 | "IMAGE"
291 | ],
292 | [
293 | 22,
294 | 17,
295 | 0,
296 | 3,
297 | 0,
298 | "IMAGE"
299 | ],
300 | [
301 | 23,
302 | 17,
303 | 1,
304 | 3,
305 | 4,
306 | "FLOAT"
307 | ],
308 | [
309 | 24,
310 | 17,
311 | 0,
312 | 18,
313 | 0,
314 | "IMAGE"
315 | ]
316 | ],
317 | "groups": [],
318 | "config": {},
319 | "extra": {
320 | "ds": {
321 | "scale": 0.8264462809917354,
322 | "offset": [
323 | -1427.551752071999,
324 | -102.40491658569128
325 | ]
326 | },
327 | "ue_links": [],
328 | "0246.VERSION": [
329 | 0,
330 | 0,
331 | 4
332 | ],
333 | "VHS_latentpreview": false,
334 | "VHS_latentpreviewrate": 0,
335 | "VHS_MetadataImage": true,
336 | "VHS_KeepIntermediate": true
337 | },
338 | "version": 0.4
339 | }
--------------------------------------------------------------------------------
/examples/FramePack_regular.json:
--------------------------------------------------------------------------------
1 | {
2 | "last_node_id": 16,
3 | "last_link_id": 19,
4 | "nodes": [
5 | {
6 | "id": 7,
7 | "type": "SeargePromptText",
8 | "pos": [
9 | 1632.283203125,
10 | 799.6035766601562
11 | ],
12 | "size": [
13 | 400,
14 | 200
15 | ],
16 | "flags": {},
17 | "order": 0,
18 | "mode": 0,
19 | "inputs": [],
20 | "outputs": [
21 | {
22 | "name": "prompt",
23 | "label": "prompt",
24 | "type": "STRING",
25 | "links": [
26 | 16
27 | ],
28 | "slot_index": 0
29 | }
30 | ],
31 | "properties": {
32 | "cnr_id": "SeargeSDXL",
33 | "ver": "2eb5edbc712329d77d1a2f5f1e6c5e64397a4a83",
34 | "Node name for S&R": "SeargePromptText"
35 | },
36 | "widgets_values": [
37 | "A woman with a loving smile, making a heart gesture toward the camera with both hands",
38 | [
39 | false,
40 | true
41 | ]
42 | ]
43 | },
44 | {
45 | "id": 15,
46 | "type": "RunningHub_FramePack",
47 | "pos": [
48 | 2313.555419921875,
49 | 505.303955078125
50 | ],
51 | "size": [
52 | 400,
53 | 252
54 | ],
55 | "flags": {},
56 | "order": 2,
57 | "mode": 0,
58 | "inputs": [
59 | {
60 | "name": "ref_image",
61 | "label": "ref_image",
62 | "type": "IMAGE",
63 | "link": 15
64 | },
65 | {
66 | "name": "end_image",
67 | "label": "end_image",
68 | "type": "IMAGE",
69 | "shape": 7,
70 | "link": null
71 | },
72 | {
73 | "name": "prompt",
74 | "label": "prompt",
75 | "type": "STRING",
76 | "widget": {
77 | "name": "prompt"
78 | },
79 | "link": 16
80 | }
81 | ],
82 | "outputs": [
83 | {
84 | "name": "frames",
85 | "label": "frames",
86 | "type": "IMAGE",
87 | "links": [
88 | 17
89 | ],
90 | "slot_index": 0
91 | },
92 | {
93 | "name": "fps",
94 | "label": "fps",
95 | "type": "FLOAT",
96 | "links": [
97 | 18
98 | ],
99 | "slot_index": 1
100 | }
101 | ],
102 | "properties": {
103 | "aux_id": "HM-RunningHub/ComfyUI_RH_FramePack",
104 | "ver": "c688eb1533f8984a5ea5d2db08496ebb6da0a602",
105 | "Node name for S&R": "RunningHub_FramePack"
106 | },
107 | "widgets_values": [
108 | "",
109 | 5,
110 | 1378,
111 | "randomize",
112 | 25,
113 | true,
114 | 1.2,
115 | [
116 | false,
117 | true
118 | ]
119 | ]
120 | },
121 | {
122 | "id": 16,
123 | "type": "VHS_VideoCombine",
124 | "pos": [
125 | 2928.94970703125,
126 | 180.33053588867188
127 | ],
128 | "size": [
129 | 419.164794921875,
130 | 803.4525146484375
131 | ],
132 | "flags": {},
133 | "order": 3,
134 | "mode": 0,
135 | "inputs": [
136 | {
137 | "name": "images",
138 | "label": "images",
139 | "type": "IMAGE",
140 | "link": 17
141 | },
142 | {
143 | "name": "audio",
144 | "label": "audio",
145 | "type": "AUDIO",
146 | "shape": 7,
147 | "link": null
148 | },
149 | {
150 | "name": "meta_batch",
151 | "label": "meta_batch",
152 | "type": "VHS_BatchManager",
153 | "shape": 7,
154 | "link": null
155 | },
156 | {
157 | "name": "vae",
158 | "label": "vae",
159 | "type": "VAE",
160 | "shape": 7,
161 | "link": null
162 | },
163 | {
164 | "name": "frame_rate",
165 | "label": "frame_rate",
166 | "type": "FLOAT",
167 | "widget": {
168 | "name": "frame_rate"
169 | },
170 | "link": 18
171 | }
172 | ],
173 | "outputs": [
174 | {
175 | "name": "Filenames",
176 | "label": "Filenames",
177 | "type": "VHS_FILENAMES"
178 | }
179 | ],
180 | "properties": {
181 | "cnr_id": "comfyui-videohelpersuite",
182 | "ver": "df55f01d1df2f7bf5cc772294bc2e6d8bab22d66",
183 | "Node name for S&R": "VHS_VideoCombine"
184 | },
185 | "widgets_values": {
186 | "frame_rate": 8,
187 | "loop_count": 0,
188 | "filename_prefix": "AnimateDiff",
189 | "format": "video/h264-mp4",
190 | "pix_fmt": "yuv420p",
191 | "crf": 19,
192 | "save_metadata": true,
193 | "trim_to_audio": false,
194 | "pingpong": false,
195 | "save_output": true,
196 | "videopreview": {
197 | "paused": false,
198 | "hidden": false,
199 | "params": {
200 | "filename": "AnimateDiff_00009.mp4",
201 | "workflow": "AnimateDiff_00009.png",
202 | "fullpath": "D:\\ComfyUI_windows_portable\\ComfyUI\\output\\AnimateDiff_00009.mp4",
203 | "format": "video/h264-mp4",
204 | "subfolder": "",
205 | "type": "output",
206 | "frame_rate": 30
207 | },
208 | "muted": false
209 | }
210 | }
211 | },
212 | {
213 | "id": 2,
214 | "type": "LoadImage",
215 | "pos": [
216 | 1614.83544921875,
217 | 164.57229614257812
218 | ],
219 | "size": [
220 | 467.89154052734375,
221 | 535.3126220703125
222 | ],
223 | "flags": {},
224 | "order": 1,
225 | "mode": 0,
226 | "inputs": [],
227 | "outputs": [
228 | {
229 | "name": "IMAGE",
230 | "label": "IMAGE",
231 | "type": "IMAGE",
232 | "links": [
233 | 15
234 | ],
235 | "slot_index": 0
236 | },
237 | {
238 | "name": "MASK",
239 | "label": "MASK",
240 | "type": "MASK"
241 | }
242 | ],
243 | "properties": {
244 | "cnr_id": "comfy-core",
245 | "ver": "0.3.28",
246 | "Node name for S&R": "LoadImage"
247 | },
248 | "widgets_values": [
249 | "DONGYUJIE.png",
250 | "image"
251 | ]
252 | }
253 | ],
254 | "links": [
255 | [
256 | 15,
257 | 2,
258 | 0,
259 | 15,
260 | 0,
261 | "IMAGE"
262 | ],
263 | [
264 | 16,
265 | 7,
266 | 0,
267 | 15,
268 | 2,
269 | "STRING"
270 | ],
271 | [
272 | 17,
273 | 15,
274 | 0,
275 | 16,
276 | 0,
277 | "IMAGE"
278 | ],
279 | [
280 | 18,
281 | 15,
282 | 1,
283 | 16,
284 | 4,
285 | "FLOAT"
286 | ]
287 | ],
288 | "groups": [],
289 | "config": {},
290 | "extra": {
291 | "ds": {
292 | "scale": 0.9090909090909091,
293 | "offset": [
294 | -1563.0115557309155,
295 | -19.100634686877555
296 | ]
297 | },
298 | "ue_links": [],
299 | "0246.VERSION": [
300 | 0,
301 | 0,
302 | 4
303 | ],
304 | "VHS_latentpreview": false,
305 | "VHS_latentpreviewrate": 0,
306 | "VHS_MetadataImage": true,
307 | "VHS_KeepIntermediate": true
308 | },
309 | "version": 0.4
310 | }
--------------------------------------------------------------------------------
/nodes.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | current_dir = os.path.dirname(os.path.abspath(__file__))
4 | sys.path.insert(0, current_dir)
5 |
6 | import torch
7 | import traceback
8 | import einops
9 | import safetensors.torch as sf
10 | import numpy as np
11 | import argparse
12 | import math
13 | import time
14 |
15 | from PIL import Image
16 | from diffusers import AutoencoderKLHunyuanVideo
17 | from transformers import LlamaModel, CLIPTextModel, LlamaTokenizerFast, CLIPTokenizer
18 | from diffusers_helper.hunyuan import encode_prompt_conds, vae_decode, vae_encode, vae_decode_fake
19 | from diffusers_helper.utils import save_bcthw_as_mp4, crop_or_pad_yield_mask, soft_append_bcthw, resize_and_center_crop, state_dict_weighted_merge, state_dict_offset_merge, generate_timestamp
20 | from diffusers_helper.models.hunyuan_video_packed import HunyuanVideoTransformer3DModelPacked
21 | from diffusers_helper.pipelines.k_diffusion_hunyuan import sample_hunyuan
22 | from diffusers_helper.memory import cpu, gpu, get_cuda_free_memory_gb, move_model_to_device_with_memory_preservation, offload_model_from_device_for_memory_preservation, fake_diffusers_current_device, DynamicSwapInstaller, unload_complete_models, load_model_as_complete
23 | from transformers import SiglipImageProcessor, SiglipVisionModel
24 | from diffusers_helper.clip_vision import hf_clip_vision_encode
25 | from diffusers_helper.bucket_tools import find_nearest_bucket
26 | import hashlib
27 | import random
28 | import string
29 | import torchvision
30 | from torchvision.transforms.functional import to_pil_image
31 | import comfy.utils
32 |
33 | from PIL import Image
34 | import folder_paths
35 |
36 | class Kiki_FramePack:
37 | @classmethod
38 | def INPUT_TYPES(s):
39 | return {
40 | "required": {
41 | "ref_image": ("IMAGE", ),
42 | "prompt": ("STRING", {"multiline": True}),
43 | # "n_prompt": ("STRING", {"multiline": True}),
44 | "total_second_length": ("INT", {"default": 5, "min": 1, "max": 120, "step": 1}),
45 | "seed": ("INT", {"default": 3407}),
46 | "steps": ("INT", {"default": 25, "min": 1, "max": 100, "step": 1}),
47 | "use_teacache": ("BOOLEAN", {"default": True}),
48 | "upscale": ("FLOAT", {"default": 1.2, "min": 0.1, "max": 2.0, "step": 0.1, "description": "Resolution scaling factor. 1.0 = original size, >1.0 = upscale, <1.0 = downscale"}),
49 | },
50 | "optional": {
51 | "end_image": ("IMAGE", ),
52 | },
53 | }
54 |
55 | RETURN_TYPES = ("IMAGE", "FLOAT")
56 | RETURN_NAMES = ("frames", "fps")
57 | CATEGORY = "Runninghub/FramePack"
58 | FUNCTION = "run"
59 |
60 | TITLE = 'RunningHub FramePack'
61 | OUTPUT_NODE = True
62 |
63 | def __init__(self):
64 | self.high_vram = False
65 | self.frames = None
66 | self.fps = None
67 |
68 | hunyuan_root = os.path.join(folder_paths.models_dir, 'HunyuanVideo')
69 | flux_redux_bfl_root = os.path.join(folder_paths.models_dir, 'flux_redux_bfl')
70 | framePackI2V_root = os.path.join(folder_paths.models_dir, 'FramePackI2V_HY')
71 |
72 | self.text_encoder = LlamaModel.from_pretrained(hunyuan_root, subfolder='text_encoder', torch_dtype=torch.float16).cpu()
73 | self.text_encoder_2 = CLIPTextModel.from_pretrained(hunyuan_root, subfolder='text_encoder_2', torch_dtype=torch.float16).cpu()
74 | self.tokenizer = LlamaTokenizerFast.from_pretrained(hunyuan_root, subfolder='tokenizer')
75 | self.tokenizer_2 = CLIPTokenizer.from_pretrained(hunyuan_root, subfolder='tokenizer_2')
76 | self.vae = AutoencoderKLHunyuanVideo.from_pretrained(hunyuan_root, subfolder='vae', torch_dtype=torch.float16).cpu()
77 |
78 | self.feature_extractor = SiglipImageProcessor.from_pretrained(flux_redux_bfl_root, subfolder='feature_extractor')
79 | self.image_encoder = SiglipVisionModel.from_pretrained(flux_redux_bfl_root, subfolder='image_encoder', torch_dtype=torch.float16).cpu()
80 |
81 | self.transformer = HunyuanVideoTransformer3DModelPacked.from_pretrained(framePackI2V_root, torch_dtype=torch.bfloat16).cpu()
82 |
83 | self.vae.eval()
84 | self.text_encoder.eval()
85 | self.text_encoder_2.eval()
86 | self.image_encoder.eval()
87 | self.transformer.eval()
88 |
89 | if not self.high_vram:
90 | self.vae.enable_slicing()
91 | self.vae.enable_tiling()
92 |
93 | self.transformer.high_quality_fp32_output_for_inference = True
94 | print('transformer.high_quality_fp32_output_for_inference = True')
95 |
96 | self.transformer.to(dtype=torch.bfloat16)
97 | self.vae.to(dtype=torch.float16)
98 | self.image_encoder.to(dtype=torch.float16)
99 | self.text_encoder.to(dtype=torch.float16)
100 | self.text_encoder_2.to(dtype=torch.float16)
101 |
102 | self.vae.requires_grad_(False)
103 | self.text_encoder.requires_grad_(False)
104 | self.text_encoder_2.requires_grad_(False)
105 | self.image_encoder.requires_grad_(False)
106 | self.transformer.requires_grad_(False)
107 |
108 | if not self.high_vram:
109 |             # DynamicSwapInstaller plays the same role as Hugging Face's enable_sequential_cpu_offload, but is roughly 3x faster
110 | DynamicSwapInstaller.install_model(self.transformer, device=gpu)
111 | DynamicSwapInstaller.install_model(self.text_encoder, device=gpu)
112 |
113 | def strict_align(self, h, w, scale):
114 | raw_h = h * scale
115 | raw_w = w * scale
116 |
117 | aligned_h = int(round(raw_h / 64)) * 64
118 | aligned_w = int(round(raw_w / 64)) * 64
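        # Illustrative example (hypothetical numbers): h=416, w=640, scale=1.2 gives 499.2 x 768, which rounds to 512 x 768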
119 |
120 |         assert (aligned_h % 64 == 0) and (aligned_w % 64 == 0), "Height and width must be multiples of 64"
121 |         assert (aligned_h//8) % 8 == 0 and (aligned_w//8) % 8 == 0, "Latent dimensions must be multiples of 8"
122 | return aligned_h, aligned_w
123 |
124 | def preprocess_image(self, image):
125 | if image is None:
126 | return None
127 | image_np = 255. * image[0].cpu().numpy()
128 | image = Image.fromarray(np.clip(image_np, 0, 255).astype(np.uint8)).convert("RGB")
129 | input_image = np.array(image)
130 | return input_image
131 |
132 | def run(self, **kwargs):
133 | try:
134 | image = kwargs['ref_image']
135 | end_image = kwargs.get('end_image', None) # Use get with None as default
136 | image_np = self.preprocess_image(image)
137 | end_image_np = self.preprocess_image(end_image) if end_image is not None else None
138 | prompt = kwargs['prompt']
139 | seed = kwargs['seed']
140 | total_second_length = kwargs['total_second_length']
141 | steps = kwargs['steps']
142 | use_teacache = kwargs['use_teacache']
143 | upscale = kwargs['upscale']
144 | random_str = ''.join(random.choices(string.ascii_letters + string.digits, k=16))
145 | video_path = os.path.join(folder_paths.get_output_directory(), f'{random_str}.mp4')
146 |
147 |             self.pbar = comfy.utils.ProgressBar(steps * total_second_length)  # approximate total; the sampling callback advances it once per step
148 |
149 | self.exec(input_image=image_np, end_image=end_image_np, prompt=prompt, seed=seed, total_second_length=total_second_length, video_path=video_path, steps=steps, use_teacache=use_teacache, scale=upscale)
150 |
151 | if os.path.exists(video_path):
152 | self.fps = self.get_fps_with_torchvision(video_path)
153 | self.frames = self.extract_frames_as_pil(video_path)
154 |                 print(f'Video saved: {video_path} | FPS: {self.fps} | Frames: {len(self.frames)}')
155 | else:
156 | self.frames = []
157 | self.fps = 0.0
158 | except Exception as e:
159 | print(f"Error in run: {str(e)}")
160 | traceback.print_exc()
161 | self.frames = []
162 | self.fps = 0.0
163 |
164 | return (self.frames, self.fps)
165 |
166 | @torch.no_grad()
167 | def exec(self, input_image, video_path,
168 | end_image=None,
169 | prompt="The girl dances gracefully, with clear movements, full of charm.",
170 | n_prompt="",
171 | seed=31337,
172 | total_second_length=5,
173 | latent_window_size=9,
174 | steps=25,
175 | cfg=1,
176 | gs=32,
177 | rs=0,
178 | gpu_memory_preservation=6,
179 | use_teacache=True,
180 | scale=1.0):
181 |
182 | total_latent_sections = (total_second_length * 30) / (latent_window_size * 4)
183 | total_latent_sections = int(max(round(total_latent_sections), 1))
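        # Each section yields latent_window_size * 4 - 3 pixel frames (33 with the default window of 9),
        # so at 30 fps one section covers roughly 1.1 seconds of output video.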
184 |
185 | try:
186 | # Clean GPU
187 | if not self.high_vram:
188 | unload_complete_models(
189 | self.text_encoder, self.text_encoder_2, self.image_encoder, self.vae, self.transformer
190 | )
191 |
192 | # Text encoding
193 | print('Text encoding')
194 |
195 | if not self.high_vram:
196 | fake_diffusers_current_device(self.text_encoder, gpu)
197 | load_model_as_complete(self.text_encoder_2, target_device=gpu)
198 |
199 | llama_vec, clip_l_pooler = encode_prompt_conds(prompt, self.text_encoder, self.text_encoder_2, self.tokenizer, self.tokenizer_2)
200 |
201 | if cfg == 1:
202 | llama_vec_n, clip_l_pooler_n = torch.zeros_like(llama_vec), torch.zeros_like(clip_l_pooler)
203 | else:
204 | llama_vec_n, clip_l_pooler_n = encode_prompt_conds(n_prompt, self.text_encoder, self.text_encoder_2, self.tokenizer, self.tokenizer_2)
205 |
206 | llama_vec, llama_attention_mask = crop_or_pad_yield_mask(llama_vec, length=512)
207 | llama_vec_n, llama_attention_mask_n = crop_or_pad_yield_mask(llama_vec_n, length=512)
208 |
209 | # Processing input image (start frame)
210 | print('Processing start frame ...')
211 |
212 | H, W, C = input_image.shape
213 | height, width = find_nearest_bucket(H, W, resolution=640)
214 | print(f"Resized height: {height}, Resized width: {width}")
215 |
216 | height, width = self.strict_align(height, width, scale)
217 | print(f"After Resized height: {height}, Resized width: {width}")
218 |
219 | input_image_np = resize_and_center_crop(input_image, target_width=width, target_height=height)
220 | input_image_pt = torch.from_numpy(input_image_np).float() / 127.5 - 1
221 | input_image_pt = input_image_pt.permute(2, 0, 1)[None, :, None]
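            # input_image_pt is now (1, C, 1, H, W) -- batch, channels, time, height, width -- in [-1, 1]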
222 |
223 | # Processing end image if provided
224 | has_end_image = end_image is not None
225 | end_image_np = None
226 | end_image_pt = None
227 |
228 | if has_end_image:
229 | print('Processing end frame ...')
230 | H_end, W_end, C_end = end_image.shape
231 | end_image_np = resize_and_center_crop(end_image, target_width=width, target_height=height)
232 | end_image_pt = torch.from_numpy(end_image_np).float() / 127.5 - 1
233 | end_image_pt = end_image_pt.permute(2, 0, 1)[None, :, None]
234 |
235 | # VAE encoding
236 | print('VAE encoding ...')
237 |
238 | if not self.high_vram:
239 | load_model_as_complete(self.vae, target_device=gpu)
240 |
241 | start_latent = vae_encode(input_image_pt, self.vae)
242 | end_latent = None
243 | if has_end_image:
244 | end_latent = vae_encode(end_image_pt, self.vae)
245 |
246 | # CLIP Vision
247 | print('CLIP Vision encoding ...')
248 |
249 | if not self.high_vram:
250 | load_model_as_complete(self.image_encoder, target_device=gpu)
251 |
252 | # Start image encoding
253 | image_encoder_output = hf_clip_vision_encode(input_image_np, self.feature_extractor, self.image_encoder)
254 | image_encoder_last_hidden_state = image_encoder_output.last_hidden_state
255 |
256 | # End image encoding if available
257 | if has_end_image:
258 | end_image_encoder_output = hf_clip_vision_encode(end_image_np, self.feature_extractor, self.image_encoder)
259 | end_image_encoder_last_hidden_state = end_image_encoder_output.last_hidden_state
260 |                 # Average the start and end image embeddings, matching the upstream first/last-frame implementation
261 | image_encoder_last_hidden_state = (image_encoder_last_hidden_state + end_image_encoder_last_hidden_state) / 2
262 |
263 | # Dtype
264 | llama_vec = llama_vec.to(self.transformer.dtype)
265 | llama_vec_n = llama_vec_n.to(self.transformer.dtype)
266 | clip_l_pooler = clip_l_pooler.to(self.transformer.dtype)
267 | clip_l_pooler_n = clip_l_pooler_n.to(self.transformer.dtype)
268 | image_encoder_last_hidden_state = image_encoder_last_hidden_state.to(self.transformer.dtype)
269 |
270 | print('Start Sample')
271 |
272 | rnd = torch.Generator("cpu").manual_seed(seed)
273 | num_frames = latent_window_size * 4 - 3
274 |
275 | history_latents = torch.zeros(size=(1, 16, 1 + 2 + 16, height // 8, width // 8), dtype=torch.float32).cpu()
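            # The history buffer holds 1 + 2 + 16 context frames (1x, 2x, 4x temporal compression);
            # newly generated sections are prepended because this variant generates backwards in time.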
276 | history_pixels = None
277 | total_generated_latent_frames = 0
278 |
279 | latent_paddings = list(reversed(range(total_latent_sections)))
280 |
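            # Following the upstream FramePack demo: for longer videos the padding schedule is flattened to
            # [3, 2, ..., 2, 1, 0]; the demo notes that duplicating the middle values looks better than
            # expanding the full reversed range, while keeping the special first/last sections.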
281 | if total_latent_sections > 4:
282 | latent_paddings = [3] + [2] * (total_latent_sections - 3) + [1, 0]
283 |
284 | for i, latent_padding in enumerate(latent_paddings):
285 | is_last_section = latent_padding == 0
286 |                 is_first_section = latent_padding == latent_paddings[0]  # the first value in latent_paddings marks the first section
287 | latent_padding_size = latent_padding * latent_window_size
288 |
289 | print(f'latent_padding_size = {latent_padding_size}, is_last_section = {is_last_section}, is_first_section = {is_first_section}')
290 |
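                # Conditioning layout for this section (reverse/anti-drifting order): 1 slot for the start
                # latent, `latent_padding_size` blank slots, the `latent_window_size` frames being denoised,
                # then 1 + 2 + 16 history-context slots at 1x, 2x and 4x temporal compression.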
291 | indices = torch.arange(0, sum([1, latent_padding_size, latent_window_size, 1, 2, 16])).unsqueeze(0)
292 | clean_latent_indices_pre, blank_indices, latent_indices, clean_latent_indices_post, clean_latent_2x_indices, clean_latent_4x_indices = indices.split([1, latent_padding_size, latent_window_size, 1, 2, 16], dim=1)
293 | clean_latent_indices = torch.cat([clean_latent_indices_pre, clean_latent_indices_post], dim=1)
294 |
295 | # Always use start_latent for the first position (exactly like in the original code)
296 | clean_latents_pre = start_latent.to(history_latents)
297 |
298 | # For the second position, use history
299 | clean_latents_post, clean_latents_2x, clean_latents_4x = history_latents[:, :, :1 + 2 + 16, :, :].split([1, 2, 16], dim=2)
300 |
301 | # Create clean_latents first
302 | clean_latents = torch.cat([clean_latents_pre, clean_latents_post], dim=2)
303 |
304 | # Then if we have end_image and this is the first section, override clean_latents_post with end_latent
305 | if has_end_image and is_first_section:
306 | clean_latents_post = end_latent.to(history_latents)
307 | clean_latents = torch.cat([clean_latents_pre, clean_latents_post], dim=2)
308 |
309 | if not self.high_vram:
310 | unload_complete_models()
311 | move_model_to_device_with_memory_preservation(self.transformer, target_device=gpu, preserved_memory_gb=gpu_memory_preservation)
312 |
313 | if use_teacache:
314 | self.transformer.initialize_teacache(enable_teacache=True, num_steps=steps)
315 | else:
316 | self.transformer.initialize_teacache(enable_teacache=False)
317 |
318 | def callback(d):
319 | self.update(1)
320 | return
321 |
322 | generated_latents = sample_hunyuan(
323 | transformer=self.transformer,
324 | sampler='unipc',
325 | width=width,
326 | height=height,
327 | frames=num_frames,
328 | real_guidance_scale=cfg,
329 | distilled_guidance_scale=gs,
330 | guidance_rescale=rs,
331 | num_inference_steps=steps,
332 | generator=rnd,
333 | prompt_embeds=llama_vec,
334 | prompt_embeds_mask=llama_attention_mask,
335 | prompt_poolers=clip_l_pooler,
336 | negative_prompt_embeds=llama_vec_n,
337 | negative_prompt_embeds_mask=llama_attention_mask_n,
338 | negative_prompt_poolers=clip_l_pooler_n,
339 | device=gpu,
340 | dtype=torch.bfloat16,
341 | image_embeddings=image_encoder_last_hidden_state,
342 | latent_indices=latent_indices,
343 | clean_latents=clean_latents,
344 | clean_latent_indices=clean_latent_indices,
345 | clean_latents_2x=clean_latents_2x,
346 | clean_latent_2x_indices=clean_latent_2x_indices,
347 | clean_latents_4x=clean_latents_4x,
348 | clean_latent_4x_indices=clean_latent_4x_indices,
349 | callback=callback,
350 | )
351 |
352 | # For the last section, add start_latent back to the beginning - just like in the original
353 | if is_last_section:
354 | generated_latents = torch.cat([start_latent.to(generated_latents), generated_latents], dim=2)
355 |
356 | # Accumulate generated frames
357 | total_generated_latent_frames += int(generated_latents.shape[2])
358 | history_latents = torch.cat([generated_latents.to(history_latents), history_latents], dim=2)
359 |
360 | if not self.high_vram:
361 | offload_model_from_device_for_memory_preservation(self.transformer, target_device=gpu, preserved_memory_gb=8)
362 | load_model_as_complete(self.vae, target_device=gpu)
363 |
364 | # Only decode up to the total number of frames we've generated
365 | real_history_latents = history_latents[:, :, :total_generated_latent_frames, :, :]
366 |
367 | # Decode latents to pixels
368 | if history_pixels is None:
369 | history_pixels = vae_decode(real_history_latents, self.vae).cpu()
370 | else:
371 | # For appending new frames to existing ones
372 | section_latent_frames = (latent_window_size * 2 + 1) if is_last_section else (latent_window_size * 2)
373 | overlapped_frames = latent_window_size * 4 - 3
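                    # soft_append_bcthw blends the two clips over the overlapped frames to avoid a visible seam between sections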
374 |
375 | current_pixels = vae_decode(real_history_latents[:, :, :section_latent_frames], self.vae).cpu()
376 | history_pixels = soft_append_bcthw(current_pixels, history_pixels, overlapped_frames)
377 |
378 | if not self.high_vram:
379 | unload_complete_models()
380 |
381 | # If this is the last section, save the video
382 | if is_last_section:
383 | save_bcthw_as_mp4(history_pixels, video_path, fps=30)
384 | break
385 |
386 | except Exception as e:
387 | print(f"Error in exec: {str(e)}")
388 | traceback.print_exc()
389 | finally:
390 | unload_complete_models()
391 |
392 | def update(self, in_progress):
393 | self.pbar.update(in_progress)
394 |
395 | def extract_frames_as_pil(self, video_path):
396 | video, _, _ = torchvision.io.read_video(video_path, pts_unit='sec') # (T, H, W, C)
397 |         # Despite the name, this returns per-frame float tensors in [0, 1]; the PIL round-trip is not needed
398 |         frames = [frame.float() / 255.0 for frame in video]
399 | return frames
400 |
401 | def get_fps_with_torchvision(self, video_path):
402 | _, _, info = torchvision.io.read_video(video_path, pts_unit='sec')
403 | return info['video_fps']
404 |
405 | # --- Start of Kiki_FramePack_F1 Class ---
406 | class Kiki_FramePack_F1:
407 | @classmethod
408 | def INPUT_TYPES(s):
409 | return {
410 | "required": {
411 | "ref_image": ("IMAGE", ),
412 | "prompt": ("STRING", {"multiline": True}),
413 | "total_second_length": ("INT", {"default": 5, "min": 1, "max": 120, "step": 1}),
414 | "fps": ("INT", {"default": 30, "min": 1, "max": 60, "step": 1}),
415 | "seed": ("INT", {"default": 3407}),
416 | "steps": ("INT", {"default": 25, "min": 1, "max": 100, "step": 1}),
417 | "gs": ("FLOAT", {"default": 10.0, "min": 1.0, "max": 32.0, "step": 0.1, "round": 0.01, "label": "Distilled CFG Scale"}),
418 | "use_teacache": ("BOOLEAN", {"default": True}),
419 | "upscale": ("FLOAT", {"default": 1.0, "min": 0.1, "max": 2.0, "step": 0.1, "description": "Resolution scaling factor."}),
420 | },
421 | "optional": {
422 | "n_prompt": ("STRING", {"multiline": True, "default": ""}),
423 | }
424 | }
425 |
426 | RETURN_TYPES = ("IMAGE", "FLOAT")
427 | RETURN_NAMES = ("frames", "fps")
428 | CATEGORY = "Runninghub/FramePack"
429 | FUNCTION = "run_f1"
430 |
431 | TITLE = 'RunningHub FramePack F1'
432 | OUTPUT_NODE = True
433 |
434 | def __init__(self):
435 | self.high_vram = False
436 | self.frames = None
437 | self.fps = None
438 |
439 | hunyuan_root = os.path.join(folder_paths.models_dir, 'HunyuanVideo')
440 | flux_redux_bfl_root = os.path.join(folder_paths.models_dir, 'flux_redux_bfl')
441 | framePackF1_root = os.path.join(folder_paths.models_dir, 'FramePackF1_HY')
442 |
443 | if not os.path.isdir(framePackF1_root):
444 | print(f"Warning: FramePack F1 model directory not found at {framePackF1_root}")
445 |
446 | self.text_encoder = LlamaModel.from_pretrained(hunyuan_root, subfolder='text_encoder', torch_dtype=torch.float16).cpu()
447 | self.text_encoder_2 = CLIPTextModel.from_pretrained(hunyuan_root, subfolder='text_encoder_2', torch_dtype=torch.float16).cpu()
448 | self.tokenizer = LlamaTokenizerFast.from_pretrained(hunyuan_root, subfolder='tokenizer')
449 | self.tokenizer_2 = CLIPTokenizer.from_pretrained(hunyuan_root, subfolder='tokenizer_2')
450 | self.vae = AutoencoderKLHunyuanVideo.from_pretrained(hunyuan_root, subfolder='vae', torch_dtype=torch.float16).cpu()
451 |
452 | self.feature_extractor = SiglipImageProcessor.from_pretrained(flux_redux_bfl_root, subfolder='feature_extractor')
453 | self.image_encoder = SiglipVisionModel.from_pretrained(flux_redux_bfl_root, subfolder='image_encoder', torch_dtype=torch.float16).cpu()
454 |
455 | try:
456 | self.transformer_f1 = HunyuanVideoTransformer3DModelPacked.from_pretrained(framePackF1_root, torch_dtype=torch.bfloat16).cpu()
457 | except Exception as e:
458 | print(f"Error loading FramePack F1 transformer model from {framePackF1_root}: {e}")
459 | print("Please ensure the F1 model weights (e.g., transformer.safetensors) are correctly placed in the directory.")
460 | self.transformer_f1 = None
461 |
462 | self.vae.eval()
463 | self.text_encoder.eval()
464 | self.text_encoder_2.eval()
465 | self.image_encoder.eval()
466 | if self.transformer_f1:
467 | self.transformer_f1.eval()
468 |
469 | if not self.high_vram:
470 | self.vae.enable_slicing()
471 | self.vae.enable_tiling()
472 |
473 | if self.transformer_f1:
474 | self.transformer_f1.high_quality_fp32_output_for_inference = True
475 | print('F1 transformer.high_quality_fp32_output_for_inference = True')
476 |
477 | self.transformer_f1.to(dtype=torch.bfloat16)
478 |
479 | self.transformer_f1.requires_grad_(False)
480 |
481 | if not self.high_vram:
482 | DynamicSwapInstaller.install_model(self.transformer_f1, device=gpu)
483 |
484 | self.vae.to(dtype=torch.float16)
485 | self.image_encoder.to(dtype=torch.float16)
486 | self.text_encoder.to(dtype=torch.float16)
487 | self.text_encoder_2.to(dtype=torch.float16)
488 | self.vae.requires_grad_(False)
489 | self.text_encoder.requires_grad_(False)
490 | self.text_encoder_2.requires_grad_(False)
491 | self.image_encoder.requires_grad_(False)
492 |
493 | if not self.high_vram:
494 | DynamicSwapInstaller.install_model(self.text_encoder, device=gpu)
495 |
496 | def strict_align(self, h, w, scale):
497 | raw_h = h * scale
498 | raw_w = w * scale
499 | aligned_h = int(round(raw_h / 64)) * 64
500 | aligned_w = int(round(raw_w / 64)) * 64
501 |         assert (aligned_h % 64 == 0) and (aligned_w % 64 == 0), "Height and width must be multiples of 64"
502 |         assert (aligned_h//8) % 8 == 0 and (aligned_w//8) % 8 == 0, "Latent dimensions must be multiples of 8"
503 | return aligned_h, aligned_w
504 |
505 | def preprocess_image(self, image):
506 | if image is None: return None
507 | if image.dim() == 4 and image.shape[0] == 1:
508 | img_tensor = image[0]
509 | else:
510 | img_tensor = image
511 | print(f"Warning: Unexpected input image tensor shape: {image.shape}. Assuming HWC.")
512 |
513 | image_np = 255. * img_tensor.cpu().numpy()
514 | image = Image.fromarray(np.clip(image_np, 0, 255).astype(np.uint8)).convert("RGB")
515 | input_image = np.array(image)
516 | return input_image
517 |
518 | def run_f1(self, **kwargs):
519 | if not self.transformer_f1:
520 | print("Error: Kiki_FramePack_F1 cannot run because the transformer model failed to load.")
521 | return (torch.empty((0, 1, 1, 3), dtype=torch.float32), 0.0)
522 |
523 | try:
524 | image = kwargs['ref_image']
525 | image_np = self.preprocess_image(image)
526 | prompt = kwargs['prompt']
527 | n_prompt = kwargs.get('n_prompt', "")
528 | seed = kwargs['seed']
529 | total_second_length = kwargs['total_second_length']
530 | fps = kwargs['fps']
531 | steps = kwargs['steps']
532 | gs = kwargs['gs']
533 | use_teacache = kwargs['use_teacache']
534 | upscale = kwargs['upscale']
535 | cfg = 1.0
536 | rs = 0.0
537 | latent_window_size = 9
538 |
539 | random_str = ''.join(random.choices(string.ascii_letters + string.digits, k=16))
540 | video_path = os.path.join(folder_paths.get_output_directory(), f'{random_str}_f1.mp4')
541 |
542 | # --- Initialize Progress Bar (Aligned with demo's section calc) ---
543 | # Use demo's calculation for total_latent_sections, assuming 30fps basis for consistency
544 | total_latent_sections = int(max(round((total_second_length * 30) / (latent_window_size * 4)), 1))
545 | total_progress_steps = total_latent_sections * steps
546 | self.pbar = comfy.utils.ProgressBar(total_progress_steps)
547 |
548 | # Call exec_f1, passing latent_window_size as well
549 | self.exec_f1(input_image=image_np, prompt=prompt, n_prompt=n_prompt, seed=seed,
550 | total_second_length=total_second_length, video_path=video_path, fps=fps,
551 | steps=steps, gs=gs, cfg=cfg, rs=rs, latent_window_size=latent_window_size, # Pass latent_window_size
552 | use_teacache=use_teacache, scale=upscale,
553 | gpu_memory_preservation=6)
554 |
555 | if os.path.exists(video_path):
556 | self.fps = float(fps)
557 | self.frames = self.extract_frames_to_tensor(video_path)
558 | print(f'F1 Video saved: {video_path} | FPS: {self.fps} | Frames: {self.frames.shape[0] if self.frames is not None else 0}')
559 | else:
560 | self.frames = torch.empty((0, 1, 1, 3), dtype=torch.float32)
561 | self.fps = 0.0
562 | print(f'F1 Video generation failed or file not found: {video_path}')
563 |
564 | except Exception as e:
565 | print(f"Error in run_f1: {str(e)}")
566 | traceback.print_exc()
567 | self.frames = torch.empty((0, 1, 1, 3), dtype=torch.float32)
568 | self.fps = 0.0
569 |
570 | return (self.frames, self.fps)
571 |
572 | @torch.no_grad()
573 | def exec_f1(self, input_image, video_path,
574 | prompt, n_prompt, seed, total_second_length, fps,
575 | steps, gs, cfg, rs, latent_window_size, # Receive latent_window_size
576 | use_teacache, scale,
577 | gpu_memory_preservation=6):
578 |
579 | print("--- Starting Kiki_FramePack_F1 exec_f1 (Aligned with Demo Logic) ---")
580 | print(f"Params: seed={seed}, length={total_second_length}s@{fps}fps, steps={steps}, gs={gs}, cfg={cfg}, rs={rs}, lws={latent_window_size}")
581 |
582 | vae_time_stride = 4
583 |
584 | # --- Use Demo's total_latent_sections calculation ---
585 | total_latent_sections = int(max(round((total_second_length * 30) / (latent_window_size * 4)), 1))
586 | print(f"Total generation sections (Demo calc): {total_latent_sections}")
587 |
588 | # --- Calculate target frames needed (still useful for trimming) ---
589 | target_pixel_frames = int(round(total_second_length * fps))
590 |
591 | try:
592 | # --- 1. Initialization & Setup ---
593 | torch.manual_seed(seed)
594 | rnd = torch.Generator("cpu").manual_seed(seed)
595 |
596 |             # Clean GPU before encoding (same unload pattern as the regular FramePack node above)
            if not self.high_vram:
                unload_complete_models(
                    self.text_encoder, self.text_encoder_2, self.image_encoder, self.vae, self.transformer_f1
                )
597 |
598 | # --- 2. Encoding Inputs ---
599 | print('Encoding text prompts...')
600 | if not self.high_vram:
601 | fake_diffusers_current_device(self.text_encoder, gpu)
602 | load_model_as_complete(self.text_encoder_2, target_device=gpu)
603 | llama_vec, clip_l_pooler = encode_prompt_conds(prompt, self.text_encoder, self.text_encoder_2, self.tokenizer, self.tokenizer_2)
604 | llama_vec_n, clip_l_pooler_n = encode_prompt_conds(n_prompt, self.text_encoder, self.text_encoder_2, self.tokenizer, self.tokenizer_2)
605 | llama_vec, llama_attention_mask = crop_or_pad_yield_mask(llama_vec, length=512)
606 | llama_vec_n, llama_attention_mask_n = crop_or_pad_yield_mask(llama_vec_n, length=512)
607 |
608 | print('Processing reference image...')
609 | H, W, C = input_image.shape
610 | if scale == 1.0:
611 | height, width = find_nearest_bucket(H, W, resolution=640)
612 | height, width = self.strict_align(height, width, 1.0)
613 | else:
614 | height, width = self.strict_align(H, W, scale)
615 | print(f"Target dimensions: {width}x{height}")
616 | input_image_np = resize_and_center_crop(input_image, target_width=width, target_height=height)
617 | input_image_pt = torch.from_numpy(input_image_np).float() / 127.5 - 1
618 | input_image_pt = input_image_pt.permute(2, 0, 1)[None, :, None]
619 |
620 | print('VAE encoding reference image...')
621 | if not self.high_vram: load_model_as_complete(self.vae, target_device=gpu)
622 | start_latent = vae_encode(input_image_pt.to(self.vae.device, dtype=self.vae.dtype), self.vae)
623 | print(f"Start latent shape: {start_latent.shape}")
624 |
625 | print('CLIP Vision encoding reference image...')
626 | if not self.high_vram: load_model_as_complete(self.image_encoder, target_device=gpu)
627 | image_encoder_output = hf_clip_vision_encode(input_image_np, self.feature_extractor, self.image_encoder.to(gpu))
628 | image_embeddings = image_encoder_output.last_hidden_state
629 |
630 | transformer_dtype = self.transformer_f1.dtype
631 | start_latent = start_latent.to(transformer_dtype).cpu()
632 |
633 | # --- 3. Diffusion Loop (Aligned with Demo) ---
634 | print(f'Starting diffusion loop for {total_latent_sections} sections...')
635 |
636 | latent_channels = start_latent.shape[1]
637 | latent_height = start_latent.shape[-2]
638 | latent_width = start_latent.shape[-1]
639 | history_context_size = 16 + 2 + 1
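            # F1 generates forward in time: each section is conditioned on the last 1 + 2 + 16 latent frames
            # of history (4x, 2x and 1x temporal compression) plus the start latent.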
640 |
641 | # --- Initialize history_latents like demo ---
642 | # Start with zeros matching context size
643 |             history_latents = torch.zeros(size=(1, latent_channels, history_context_size, latent_height, latent_width), dtype=torch.float32).cpu()  # float32 on CPU, matching the demo
644 | # Immediately add start_latent
645 | history_latents = torch.cat([history_latents, start_latent.to(history_latents.dtype)], dim=2)
646 | total_generated_latent_frames = 1 # Account for start_latent
647 | history_pixels = None
648 |
649 |             # Progress bar callback: map the per-section step index to an absolute progress value
650 | current_section_step = 0
651 | total_progress_steps = total_latent_sections * steps
652 | def callback_f1(d):
653 |                 # d['i'] is the current denoising step within this section
654 | nonlocal current_section_step
655 | step_in_section = d['i']
656 | current_total_step = current_section_step * steps + step_in_section + 1
657 | if hasattr(self, 'pbar') and self.pbar:
658 | self.pbar.update_absolute(current_total_step, total_progress_steps)
659 |
660 |             # Pixel frames produced per latent window section (demo formula: latent_window_size * 4 - 3)
661 | frames_per_latent_window = latent_window_size * 4 - 3
662 |
663 | for section_index in range(total_latent_sections):
664 | section_start_time = time.time()
665 | print(f'Generating section {section_index + 1} / {total_latent_sections}')
666 | current_section_step = section_index
667 |
668 |                 # Load the F1 transformer for this section (mirrors the regular node's per-section loading)
                if not self.high_vram:
                    unload_complete_models()
                    move_model_to_device_with_memory_preservation(self.transformer_f1, target_device=gpu, preserved_memory_gb=gpu_memory_preservation)
669 |
670 | # --- Prepare context and indices (same as before, uses history_latents) ---
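                # F1 index layout: 1 slot for the start latent, 16 + 2 + 1 history slots (4x, 2x, 1x
                # compression), followed by the latent_window_size frames to denoise.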
671 | indices = torch.arange(0, sum([1, 16, 2, 1, latent_window_size])).unsqueeze(0)
672 | clean_latent_indices_start, clean_latent_4x_indices, clean_latent_2x_indices, clean_latent_1x_indices, latent_indices = indices.split([1, 16, 2, 1, latent_window_size], dim=1)
673 | clean_latent_indices = torch.cat([clean_latent_indices_start, clean_latent_1x_indices], dim=1)
674 |
675 | # Get history context from the *end* of the current history_latents
676 | # No padding needed here because history starts with context + start_latent
677 | history_context = history_latents[:, :, -history_context_size:, :, :]
678 | clean_latents_4x, clean_latents_2x, clean_latents_1x = history_context.split([16, 2, 1], dim=2)
679 | clean_latents = torch.cat([start_latent.cpu(), clean_latents_1x.cpu()], dim=2)
680 |
681 | # --- Prepare sample_kwargs (same as before) ---
682 | sample_kwargs = dict(
683 | transformer=self.transformer_f1,
684 | sampler='unipc',
685 | width=width,
686 | height=height,
687 | frames=frames_per_latent_window, # Use demo's frame count
688 | real_guidance_scale=cfg,
689 | distilled_guidance_scale=gs,
690 | guidance_rescale=rs,
691 | num_inference_steps=steps,
692 | generator=rnd,
693 |                     # Positive prompt embeddings and attention mask (embeddings cast to the transformer's dtype)
694 | prompt_embeds=llama_vec.to(gpu, dtype=transformer_dtype),
695 | prompt_embeds_mask=llama_attention_mask.to(gpu), # Mask dtype usually okay
696 |                     # Pooled and negative prompt conditioning (cast to the transformer's dtype where needed)
697 | prompt_poolers=clip_l_pooler.to(gpu, dtype=transformer_dtype),
698 | negative_prompt_embeds=llama_vec_n.to(gpu, dtype=transformer_dtype),
699 | negative_prompt_embeds_mask=llama_attention_mask_n.to(gpu), # Mask dtype usually okay
700 | negative_prompt_poolers=clip_l_pooler_n.to(gpu, dtype=transformer_dtype),
701 | device=gpu, # Device is already GPU
702 | dtype=transformer_dtype, # Explicitly passing transformer's dtype
703 | image_embeddings=image_embeddings.to(gpu, dtype=transformer_dtype),
704 | latent_indices=latent_indices.to(gpu), # Indices dtype usually okay
705 | clean_latents=clean_latents.to(gpu, dtype=transformer_dtype), # Ensure correct dtype
706 | clean_latent_indices=clean_latent_indices.to(gpu), # Indices dtype usually okay
707 | clean_latents_2x=clean_latents_2x.to(gpu, dtype=transformer_dtype), # Ensure correct dtype
708 | clean_latent_2x_indices=clean_latent_2x_indices.to(gpu), # Indices dtype usually okay
709 | clean_latents_4x=clean_latents_4x.to(gpu, dtype=transformer_dtype), # Ensure correct dtype
710 | clean_latent_4x_indices=clean_latent_4x_indices.to(gpu), # Indices dtype usually okay
711 | callback=callback_f1,
712 | )
713 |
714 |                 # Initialize TeaCache for this section (enabled or disabled by the node input)
715 | if hasattr(self.transformer_f1, 'initialize_teacache'):
716 | self.transformer_f1.initialize_teacache(enable_teacache=use_teacache, num_steps=steps)
717 |
718 | # --- Call sample_hunyuan ---
719 | generated_latents = sample_hunyuan(**sample_kwargs)
720 |
721 | generated_latents = generated_latents.to(cpu, dtype=torch.float32)
722 | print(f" Sampled latent section shape: {generated_latents.shape}")
723 |
724 | # --- Update history_latents (Aligned with Demo: Always append) ---
725 | total_generated_latent_frames += int(generated_latents.shape[2])
726 | history_latents = torch.cat([history_latents, generated_latents.to(history_latents.dtype)], dim=2)
727 |
728 | # --- Decode and append pixels (Aligned with Demo) ---
729 | if not self.high_vram:
730 | offload_model_from_device_for_memory_preservation(self.transformer_f1, target_device=gpu, preserved_memory_gb=8)
731 | load_model_as_complete(self.vae, target_device=gpu)
732 | else:
733 | if self.vae.device != gpu: self.vae.to(gpu)
734 |
735 | # Calculate the slice of history to decode based on total generated frames
736 | real_history_latents = history_latents[:, :, -total_generated_latent_frames:, :, :] # Use actual generated frames
737 |
738 | if history_pixels is None:
739 | # First time: decode the current relevant history
740 | history_pixels = vae_decode(real_history_latents.to(gpu, dtype=self.vae.dtype), self.vae).cpu()
741 | print(f" Decoded initial pixels. Shape: {history_pixels.shape}")
742 | else:
743 | # Subsequent times: decode only the part needed for smooth append
744 | section_latent_frames = latent_window_size * 2
745 | overlapped_frames = latent_window_size * 4 - 3 # Use demo's overlap calculation
746 |
747 | # Decode the relevant tail end of the history latents
748 | current_latents_to_decode = real_history_latents[:, :, -section_latent_frames:, :, :]
749 | current_pixels = vae_decode(current_latents_to_decode.to(gpu, dtype=self.vae.dtype), self.vae).cpu()
750 |
751 | # Append smoothly using demo's overlap value
752 | history_pixels = soft_append_bcthw(history_pixels, current_pixels, overlapped_frames)
753 | print(f" Appended pixels. New history shape: {history_pixels.shape}")
754 |
755 |                 # Unload the VAE between sections in low-VRAM mode
756 | if not self.high_vram:
757 | unload_complete_models(self.vae)
758 |
759 | section_end_time = time.time()
760 | print(f" Section {section_index + 1} took {section_end_time - section_start_time:.2f} seconds.")
761 |
762 | # --- 4. Final Saving (Aligned with Demo, keeping variable fps) ---
763 | print('Saving final video...')
764 | if history_pixels is None or history_pixels.shape[2] == 0:
765 | raise ValueError("No pixel frames were generated or decoded.")
766 |
767 | if history_pixels.shape[2] > target_pixel_frames:
768 | print(f"Trimming final video from {history_pixels.shape[2]} to {target_pixel_frames} frames.")
769 | history_pixels = history_pixels[:,:,:target_pixel_frames,:,:]
770 |
771 | save_bcthw_as_mp4(
772 | history_pixels,
773 | video_path,
774 | fps=fps, # Keep user FPS for now
775 |                 # crf is intentionally omitted; pass e.g. crf=18 only if the local save_bcthw_as_mp4 supports it
776 | )
777 | print(f"Final video saved to: {video_path}")
778 |
779 | except Exception as e:
780 | print(f"Error during Kiki_FramePack_F1 execution: {str(e)}")
781 | traceback.print_exc()
782 | if os.path.exists(video_path):
783 | try: os.remove(video_path)
784 | except OSError: pass
785 |             if hasattr(self, 'pbar') and self.pbar and 'total_progress_steps' in locals(): self.pbar.update_absolute(total_progress_steps, total_progress_steps)  # guard: the error may occur before total_progress_steps is set
786 | raise
787 |
788 | finally:
789 | print('Cleaning up models...')
790 | unload_complete_models(
791 | self.text_encoder, self.text_encoder_2, self.image_encoder, self.vae, self.transformer_f1
792 | )
793 | torch.cuda.empty_cache()
794 | print("--- Finished Kiki_FramePack_F1 exec_f1 (Aligned with Demo Logic) ---")
795 |
796 | def extract_frames_to_tensor(self, video_path):
797 | try:
798 | video_tensor, _, metadata = torchvision.io.read_video(video_path, pts_unit='sec', output_format='TCHW')
799 |
800 | video_tensor = video_tensor.permute(0, 2, 3, 1)
801 |
802 | video_tensor = video_tensor.float() / 255.0
803 |
804 | print(f"Extracted video tensor shape: {video_tensor.shape}")
805 | return video_tensor
806 |
807 | except Exception as e:
808 | print(f"Error extracting frames using torchvision.io.read_video: {e}")
809 | traceback.print_exc()
810 | return torch.empty((0, 1, 1, 3), dtype=torch.float32)
811 |
812 | def get_fps_with_torchvision(self, video_path):
813 | try:
814 | _, _, metadata = torchvision.io.read_video(video_path, pts_unit='sec')
815 | fps = metadata.get('video_fps', 30.0)
816 | return float(fps)
817 | except Exception as e:
818 | print(f"Error reading FPS using torchvision.io.read_video: {e}")
819 | traceback.print_exc()
820 | return 30.0
821 |
822 | # NODE CLASS MAPPINGS
823 | NODE_CLASS_MAPPINGS = {
824 | "RunningHub_FramePack": Kiki_FramePack,
825 | "RunningHub_FramePack_F1": Kiki_FramePack_F1
826 | }
827 |
828 | # A dictionary that contains the friendly/humanly readable titles for the nodes
829 | NODE_DISPLAY_NAME_MAPPINGS = {
830 | "RunningHub_FramePack": Kiki_FramePack.TITLE,
831 | "RunningHub_FramePack_F1": Kiki_FramePack_F1.TITLE
832 | }
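# ComfyUI reads NODE_CLASS_MAPPINGS / NODE_DISPLAY_NAME_MAPPINGS when it imports this module at startup,
# which is what makes both nodes appear in the node menu (the CATEGORY attribute above places them under
# "Runninghub/FramePack"). A minimal custom-node package only needs these two dictionaries, e.g.
# (hypothetical names):
#
#     NODE_CLASS_MAPPINGS = {"MyNode": MyNode}
#     NODE_DISPLAY_NAME_MAPPINGS = {"MyNode": "My Node"}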
833 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | torch
2 | torchvision
3 | numpy
4 | Pillow
5 | diffusers>=0.33.1
6 | transformers>=4.46.2
7 | einops
8 | safetensors
9 | accelerate>=1.6.0
10 | scipy>=1.12.0
11 | torchsde>=0.2.6
12 | opencv-python
13 |
--------------------------------------------------------------------------------