├── icon.png ├── README.md ├── reset.js ├── link.js ├── update.js ├── start.js ├── start_f1.js ├── start_k.js ├── pinokio_meta.json ├── install.js ├── torch.js ├── pinokio.js ├── demo_gradio_f1.py └── demo_gradio_k.py /icon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pinokiofactory/Frame-Pack/HEAD/icon.png -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # FramePack 2 | 3 | A pinokio script for https://github.com/lllyasviel/FramePack 4 | 5 | -------------------------------------------------------------------------------- /reset.js: -------------------------------------------------------------------------------- 1 | module.exports = { 2 | run: [{ 3 | method: "fs.rm", 4 | params: { 5 | path: "app" 6 | } 7 | }] 8 | } 9 | -------------------------------------------------------------------------------- /link.js: -------------------------------------------------------------------------------- 1 | module.exports = { 2 | run: [ 3 | { 4 | method: "fs.link", 5 | params: { 6 | venv: "app/env" 7 | } 8 | } 9 | ] 10 | } 11 | -------------------------------------------------------------------------------- /update.js: -------------------------------------------------------------------------------- 1 | module.exports = { 2 | run: [{ 3 | method: "shell.run", 4 | params: { 5 | message: "git pull" 6 | } 7 | }, { 8 | method: "shell.run", 9 | params: { 10 | path: "app", 11 | message: "git pull" 12 | } 13 | }] 14 | } 15 | -------------------------------------------------------------------------------- /start.js: -------------------------------------------------------------------------------- 1 | module.exports = async (kernel) => { 2 | const port = await kernel.port() 3 | 4 | console.log("Starting standard version directly") 5 | const scriptToRun = "demo_gradio.py" 6 | 7 | return { 8 | requires: { 9 | bundle: "ai", 10 | }, 11 | daemon: true, 12 | run: [ 13 | { 14 | method: "shell.run", 15 | params: { 16 | venv: "env", 17 | env: { }, 18 | path: "app", 19 | message: [ 20 | `python ${scriptToRun} --server 127.0.0.1 --port ${port}`, 21 | ], 22 | on: [{ 23 | "event": "/http:\/\/[0-9.:]+/", 24 | "done": true 25 | }] 26 | } 27 | }, 28 | { 29 | method: "local.set", 30 | params: { 31 | url: "{{input.event[0]}}" 32 | } 33 | } 34 | ] 35 | }; 36 | }; 37 | -------------------------------------------------------------------------------- /start_f1.js: -------------------------------------------------------------------------------- 1 | module.exports = async (kernel) => { 2 | const port = await kernel.port() 3 | 4 | console.log("Starting F1 version directly") 5 | const scriptToRun = "demo_gradio_f1.py" 6 | 7 | return { 8 | requires: { 9 | bundle: "ai", 10 | }, 11 | daemon: true, 12 | run: [ 13 | { 14 | method: "shell.run", 15 | params: { 16 | venv: "env", 17 | env: { }, 18 | path: "app", 19 | message: [ 20 | `python ${scriptToRun} --server 127.0.0.1 --port ${port}`, 21 | ], 22 | on: [{ 23 | "event": "/http:\/\/\\S+/", 24 | "done": true 25 | }] 26 | } 27 | }, 28 | { 29 | method: "local.set", 30 | params: { 31 | url: "{{input.event[0]}}" 32 | } 33 | } 34 | ] 35 | }; 36 | 37 | }; 38 | -------------------------------------------------------------------------------- /start_k.js: -------------------------------------------------------------------------------- 1 | module.exports = async (kernel) => { 2 | const port = await 
kernel.port() 3 | 4 | console.log("Starting standard version directly") 5 | const scriptToRun = "demo_gradio_k.py" 6 | 7 | return { 8 | requires: { 9 | bundle: "ai", 10 | }, 11 | daemon: true, 12 | run: [ 13 | { 14 | method: "shell.run", 15 | params: { 16 | venv: "env", 17 | env: { }, 18 | path: "app", 19 | message: [ 20 | `python ${scriptToRun} --server 127.0.0.1 --port ${port}`, 21 | ], 22 | on: [{ 23 | "event": "/http:\/\/[0-9.:]+/", 24 | "done": true 25 | }] 26 | } 27 | }, 28 | { 29 | method: "local.set", 30 | params: { 31 | url: "{{input.event[0]}}" 32 | } 33 | } 34 | ] 35 | }; 36 | }; 37 | -------------------------------------------------------------------------------- /pinokio_meta.json: -------------------------------------------------------------------------------- 1 | { 2 | "posts": [ 3 | "https://x.com/cocktailpeanut/status/1913004350970159464", 4 | "https://x.com/cocktailpeanut/status/1912977698642907614", 5 | "https://x.com/joesparks/status/1912986750282272861", 6 | "https://x.com/matze2001/status/1912989768406679901", 7 | "https://x.com/cocktailpeanut/status/1912998525274853592", 8 | "https://x.com/cocktailpeanut/status/1913003111050018864", 9 | "https://x.com/SUP3RMASS1VE/status/1912837447525822592", 10 | "https://x.com/SUP3RMASS1VE/status/1912842398654210420", 11 | "https://x.com/SUP3RMASS1VE/status/1912975317896921253" 12 | ], 13 | "links": [{ 14 | "title": "SUP3RMASS1VE (Wrote the launcher)", 15 | "links": [{ 16 | "type": "bitcoin", 17 | "value": "1N942jHr6vVuR2KAe2JEf3nN59eR21tpKv" 18 | }, { 19 | "title": "X", 20 | "value": "https://x.com/SUP3RMASS1VE" 21 | }, { 22 | "title": "Github", 23 | "value": "https://github.com/SUP3RMASS1VE" 24 | }, { 25 | "title": "Discord", 26 | "value": "https://discord.gg/mvDcrA57AQ" 27 | }] 28 | }] 29 | } 30 | -------------------------------------------------------------------------------- /install.js: -------------------------------------------------------------------------------- 1 | module.exports = { 2 | requires: { 3 | bundle: "ai", 4 | }, 5 | run: [ 6 | { 7 | method: "shell.run", 8 | params: { 9 | message: [ 10 | "git clone https://github.com/lllyasviel/FramePack app", 11 | ] 12 | } 13 | }, 14 | { 15 | when: "{{platform === 'win32'}}", 16 | method: "shell.run", 17 | params: { 18 | message: [ 19 | "copy /Y demo_gradio_k.py app", 20 | "copy /Y demo_gradio_f1.py app" 21 | ] 22 | }, 23 | }, 24 | { 25 | when: "{{platform !== 'win32'}}", 26 | method: "shell.run", 27 | params: { 28 | message: [ 29 | "cp -f demo_gradio_k.py app", 30 | "cp -f demo_gradio_f1.py app" 31 | ] 32 | }, 33 | }, 34 | { 35 | method: "script.start", 36 | params: { 37 | uri: "torch.js", 38 | params: { 39 | venv: "env", 40 | path: "app", 41 | // xformers: true, 42 | // triton: true, 43 | // sageattention: true 44 | } 45 | } 46 | }, 47 | { 48 | method: "shell.run", 49 | params: { 50 | venv: "env", 51 | path: "app", 52 | message: [ 53 | "uv pip install gradio devicetorch", 54 | "uv pip install -r requirements.txt", 55 | "uv pip install hf_xet" 56 | ] 57 | } 58 | }, 59 | { 60 | method: 'input', 61 | params: { 62 | title: 'Installation completed', 63 | description: 'Click "Start FramePack (Standard)", "Start FramePack (F1)" or "Start FramePack (Key Frame)" in the left menu to launch the app manually' 64 | } 65 | }, 66 | ] 67 | } 68 | -------------------------------------------------------------------------------- /torch.js: -------------------------------------------------------------------------------- 1 | module.exports = { 2 | run: [ 3 | // windows nvidia 4 | { 5 | "when": 
"{{platform === 'win32' && gpu === 'nvidia'}}", 6 | "method": "shell.run", 7 | "params": { 8 | "venv": "{{args && args.venv ? args.venv : null}}", 9 | "path": "{{args && args.path ? args.path : '.'}}", 10 | "message": [ 11 | "uv pip install torch==2.7.0 torchvision==0.22.0 torchaudio==2.7.0 {{args && args.xformers ? 'xformers==0.0.30' : ''}} --index-url https://download.pytorch.org/whl/cu128 --force-reinstall --no-deps", 12 | "uv pip install triton-windows==3.3.1.post19", 13 | "uv pip install https://github.com/woct0rdho/SageAttention/releases/download/v2.1.1-windows/sageattention-2.1.1+cu128torch2.7.0-cp310-cp310-win_amd64.whl", 14 | ] 15 | } 16 | }, 17 | // windows amd 18 | { 19 | "when": "{{platform === 'win32' && gpu === 'amd'}}", 20 | "method": "shell.run", 21 | "params": { 22 | "venv": "{{args && args.venv ? args.venv : null}}", 23 | "path": "{{args && args.path ? args.path : '.'}}", 24 | "message": "uv pip install torch-directml torchaudio torchvision numpy==1.26.4" 25 | } 26 | }, 27 | // windows cpu 28 | { 29 | "when": "{{platform === 'win32' && (gpu !== 'nvidia' && gpu !== 'amd')}}", 30 | "method": "shell.run", 31 | "params": { 32 | "venv": "{{args && args.venv ? args.venv : null}}", 33 | "path": "{{args && args.path ? args.path : '.'}}", 34 | "message": "uv pip install torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 numpy==1.26.4" 35 | } 36 | }, 37 | // mac 38 | { 39 | "when": "{{platform === 'darwin'}}", 40 | "method": "shell.run", 41 | "params": { 42 | "venv": "{{args && args.venv ? args.venv : null}}", 43 | "path": "{{args && args.path ? args.path : '.'}}", 44 | "message": "uv pip install torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1" 45 | } 46 | }, 47 | // linux nvidia 48 | { 49 | "when": "{{platform === 'linux' && gpu === 'nvidia'}}", 50 | "method": "shell.run", 51 | "params": { 52 | "venv": "{{args && args.venv ? args.venv : null}}", 53 | "path": "{{args && args.path ? args.path : '.'}}", 54 | "message": [ 55 | "uv pip install torch==2.7.0 torchvision==0.22.0 torchaudio==2.7.0 {{args && args.xformers ? 'xformers==0.0.30' : ''}} --index-url https://download.pytorch.org/whl/cu128 --force-reinstall --no-deps", 56 | "uv pip install triton", 57 | "uv pip install git+https://github.com/thu-ml/SageAttention.git", 58 | ] 59 | } 60 | }, 61 | // linux rocm (amd) 62 | { 63 | "when": "{{platform === 'linux' && gpu === 'amd'}}", 64 | "method": "shell.run", 65 | "params": { 66 | "venv": "{{args && args.venv ? args.venv : null}}", 67 | "path": "{{args && args.path ? args.path : '.'}}", 68 | "message": "uv pip install torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 --index-url https://download.pytorch.org/whl/rocm6.2" 69 | } 70 | }, 71 | // linux cpu 72 | { 73 | "when": "{{platform === 'linux' && (gpu !== 'amd' && gpu !=='nvidia')}}", 74 | "method": "shell.run", 75 | "params": { 76 | "venv": "{{args && args.venv ? args.venv : null}}", 77 | "path": "{{args && args.path ? args.path : '.'}}", 78 | "message": "uv pip install torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 --index-url https://download.pytorch.org/whl/cpu" 79 | } 80 | } 81 | ] 82 | } 83 | -------------------------------------------------------------------------------- /pinokio.js: -------------------------------------------------------------------------------- 1 | const path = require('path') 2 | module.exports = { 3 | version: "3.7", 4 | title: "FramePack", 5 | description: "[NVIDIA ONLY] Generate Video Progressively. 
FramePack is a next-frame (next-frame-section) prediction neural network structure that generates videos progressively. https://github.com/lllyasviel/FramePack", 6 | icon: "icon.png", 7 | menu: async (kernel, info) => { 8 | let installed = info.exists("app/env") 9 | let running = { 10 | install: info.running("install.js"), 11 | start: info.running("start.js"), 12 | start_f1: info.running("start_f1.js"), 13 | start_k: info.running("start_k.js"), 14 | update: info.running("update.js"), 15 | reset: info.running("reset.js"), 16 | link: info.running("link.js") 17 | } 18 | if (running.install) { 19 | return [{ 20 | default: true, 21 | icon: "fa-solid fa-plug", 22 | text: "Installing", 23 | href: "install.js", 24 | }] 25 | } else if (installed) { 26 | if (running.start || running.start_f1 || running.start_k) { 27 | const script = running.start ? "start.js" : running.start_f1 ? "start_f1.js" : "start_k.js" 28 | let local = info.local(script) 29 | if (local && local.url) { 30 | return [{ 31 | default: true, 32 | icon: "fa-solid fa-rocket", 33 | text: "Open Web UI", 34 | href: local.url, 35 | }, { 36 | icon: 'fa-solid fa-terminal', 37 | text: "Terminal", 38 | href: script, 39 | }] 40 | } else { 41 | return [{ 42 | default: true, 43 | icon: 'fa-solid fa-terminal', 44 | text: "Terminal", 45 | href: script, 46 | }] 47 | } 48 | } else if (running.update) { 49 | return [{ 50 | default: true, 51 | icon: 'fa-solid fa-terminal', 52 | text: "Updating", 53 | href: "update.js", 54 | }] 55 | } else if (running.reset) { 56 | return [{ 57 | default: true, 58 | icon: 'fa-solid fa-terminal', 59 | text: "Resetting", 60 | href: "reset.js", 61 | }] 62 | } else if (running.link) { 63 | return [{ 64 | default: true, 65 | icon: 'fa-solid fa-terminal', 66 | text: "Deduplicating", 67 | href: "link.js", 68 | }] 69 | } else { 70 | return [{ 71 | icon: "fa-solid fa-power-off", 72 | text: "Start FramePack (Standard)", 73 | href: "start.js", 74 | }, { 75 | icon: "fa-solid fa-power-off", 76 | text: "Start FramePack (F1)", 77 | href: "start_f1.js", 78 | }, { 79 | icon: "fa-solid fa-power-off", 80 | text: "Start FramePack (KeyFrame)", 81 | href: "start_k.js", 82 | }, { 83 | icon: "fa-solid fa-plug", 84 | text: "Update", 85 | href: "update.js", 86 | }, { 87 | icon: "fa-solid fa-plug", 88 | text: "Install", 89 | href: "install.js", 90 | }, { 91 | icon: "fa-solid fa-file-zipper", 92 | text: "
<div><strong>Save Disk Space</strong><div>Deduplicates redundant library files</div></div>
", 93 | href: "link.js", 94 | }, { 95 | icon: "fa-regular fa-circle-xmark", 96 | text: "
<div><strong>Reset</strong><div>Revert to pre-install state</div></div>
", 97 | href: "reset.js", 98 | confirm: "Are you sure you wish to reset the app?" 99 | 100 | }] 101 | } 102 | } else { 103 | return [{ 104 | default: true, 105 | icon: "fa-solid fa-plug", 106 | text: "Install", 107 | href: "install.js", 108 | }] 109 | } 110 | } 111 | } 112 | -------------------------------------------------------------------------------- /demo_gradio_f1.py: -------------------------------------------------------------------------------- 1 | from diffusers_helper.hf_login import login 2 | 3 | import os 4 | 5 | os.environ['HF_HOME'] = os.path.abspath(os.path.realpath(os.path.join(os.path.dirname(__file__), './hf_download'))) 6 | 7 | import gradio as gr 8 | import torch 9 | import traceback 10 | import einops 11 | import safetensors.torch as sf 12 | import numpy as np 13 | import argparse 14 | import math 15 | 16 | from PIL import Image 17 | from diffusers import AutoencoderKLHunyuanVideo 18 | from transformers import LlamaModel, CLIPTextModel, LlamaTokenizerFast, CLIPTokenizer 19 | from diffusers_helper.hunyuan import encode_prompt_conds, vae_decode, vae_encode, vae_decode_fake 20 | from diffusers_helper.utils import save_bcthw_as_mp4, crop_or_pad_yield_mask, soft_append_bcthw, resize_and_center_crop, state_dict_weighted_merge, state_dict_offset_merge, generate_timestamp 21 | from diffusers_helper.models.hunyuan_video_packed import HunyuanVideoTransformer3DModelPacked 22 | from diffusers_helper.pipelines.k_diffusion_hunyuan import sample_hunyuan 23 | from diffusers_helper.memory import cpu, gpu, get_cuda_free_memory_gb, move_model_to_device_with_memory_preservation, offload_model_from_device_for_memory_preservation, fake_diffusers_current_device, DynamicSwapInstaller, unload_complete_models, load_model_as_complete 24 | from diffusers_helper.thread_utils import AsyncStream, async_run 25 | from diffusers_helper.gradio.progress_bar import make_progress_bar_css, make_progress_bar_html 26 | from transformers import SiglipImageProcessor, SiglipVisionModel 27 | from diffusers_helper.clip_vision import hf_clip_vision_encode 28 | from diffusers_helper.bucket_tools import find_nearest_bucket 29 | 30 | 31 | parser = argparse.ArgumentParser() 32 | parser.add_argument('--share', action='store_true') 33 | parser.add_argument("--server", type=str, default='0.0.0.0') 34 | parser.add_argument("--port", type=int, required=False) 35 | parser.add_argument("--inbrowser", action='store_true') 36 | args = parser.parse_args() 37 | 38 | # for win desktop probably use --server 127.0.0.1 --inbrowser 39 | # For linux server probably use --server 127.0.0.1 or do not use any cmd flags 40 | 41 | print(args) 42 | 43 | free_mem_gb = get_cuda_free_memory_gb(gpu) 44 | high_vram = free_mem_gb > 60 45 | 46 | print(f'Free VRAM {free_mem_gb} GB') 47 | print(f'High-VRAM Mode: {high_vram}') 48 | 49 | text_encoder = LlamaModel.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='text_encoder', torch_dtype=torch.float16).cpu() 50 | text_encoder_2 = CLIPTextModel.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='text_encoder_2', torch_dtype=torch.float16).cpu() 51 | tokenizer = LlamaTokenizerFast.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='tokenizer') 52 | tokenizer_2 = CLIPTokenizer.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='tokenizer_2') 53 | vae = AutoencoderKLHunyuanVideo.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='vae', torch_dtype=torch.float16).cpu() 54 | 55 | feature_extractor = 
SiglipImageProcessor.from_pretrained("lllyasviel/flux_redux_bfl", subfolder='feature_extractor') 56 | image_encoder = SiglipVisionModel.from_pretrained("lllyasviel/flux_redux_bfl", subfolder='image_encoder', torch_dtype=torch.float16).cpu() 57 | 58 | transformer = HunyuanVideoTransformer3DModelPacked.from_pretrained('lllyasviel/FramePack_F1_I2V_HY_20250503', torch_dtype=torch.bfloat16).cpu() 59 | 60 | vae.eval() 61 | text_encoder.eval() 62 | text_encoder_2.eval() 63 | image_encoder.eval() 64 | transformer.eval() 65 | 66 | if not high_vram: 67 | vae.enable_slicing() 68 | vae.enable_tiling() 69 | 70 | transformer.high_quality_fp32_output_for_inference = True 71 | print('transformer.high_quality_fp32_output_for_inference = True') 72 | 73 | transformer.to(dtype=torch.bfloat16) 74 | vae.to(dtype=torch.float16) 75 | image_encoder.to(dtype=torch.float16) 76 | text_encoder.to(dtype=torch.float16) 77 | text_encoder_2.to(dtype=torch.float16) 78 | 79 | vae.requires_grad_(False) 80 | text_encoder.requires_grad_(False) 81 | text_encoder_2.requires_grad_(False) 82 | image_encoder.requires_grad_(False) 83 | transformer.requires_grad_(False) 84 | 85 | if not high_vram: 86 | # DynamicSwapInstaller is same as huggingface's enable_sequential_offload but 3x faster 87 | DynamicSwapInstaller.install_model(transformer, device=gpu) 88 | DynamicSwapInstaller.install_model(text_encoder, device=gpu) 89 | else: 90 | text_encoder.to(gpu) 91 | text_encoder_2.to(gpu) 92 | image_encoder.to(gpu) 93 | vae.to(gpu) 94 | transformer.to(gpu) 95 | 96 | stream = AsyncStream() 97 | 98 | outputs_folder = './outputs/' 99 | os.makedirs(outputs_folder, exist_ok=True) 100 | 101 | 102 | @torch.no_grad() 103 | def worker(input_image, prompt, n_prompt, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, mp4_crf): 104 | total_latent_sections = (total_second_length * 30) / (latent_window_size * 4) 105 | total_latent_sections = int(max(round(total_latent_sections), 1)) 106 | 107 | job_id = generate_timestamp() 108 | 109 | stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Starting ...')))) 110 | 111 | try: 112 | # Clean GPU 113 | if not high_vram: 114 | unload_complete_models( 115 | text_encoder, text_encoder_2, image_encoder, vae, transformer 116 | ) 117 | 118 | # Text encoding 119 | 120 | stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Text encoding ...')))) 121 | 122 | if not high_vram: 123 | fake_diffusers_current_device(text_encoder, gpu) # since we only encode one text - that is one model move and one encode, offload is same time consumption since it is also one load and one encode. 
124 | load_model_as_complete(text_encoder_2, target_device=gpu) 125 | 126 | llama_vec, clip_l_pooler = encode_prompt_conds(prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2) 127 | 128 | if cfg == 1: 129 | llama_vec_n, clip_l_pooler_n = torch.zeros_like(llama_vec), torch.zeros_like(clip_l_pooler) 130 | else: 131 | llama_vec_n, clip_l_pooler_n = encode_prompt_conds(n_prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2) 132 | 133 | llama_vec, llama_attention_mask = crop_or_pad_yield_mask(llama_vec, length=512) 134 | llama_vec_n, llama_attention_mask_n = crop_or_pad_yield_mask(llama_vec_n, length=512) 135 | 136 | # Processing input image 137 | 138 | stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Image processing ...')))) 139 | 140 | H, W, C = input_image.shape 141 | height, width = find_nearest_bucket(H, W, resolution=640) 142 | input_image_np = resize_and_center_crop(input_image, target_width=width, target_height=height) 143 | 144 | Image.fromarray(input_image_np).save(os.path.join(outputs_folder, f'{job_id}.png')) 145 | 146 | input_image_pt = torch.from_numpy(input_image_np).float() / 127.5 - 1 147 | input_image_pt = input_image_pt.permute(2, 0, 1)[None, :, None] 148 | 149 | # VAE encoding 150 | 151 | stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'VAE encoding ...')))) 152 | 153 | if not high_vram: 154 | load_model_as_complete(vae, target_device=gpu) 155 | 156 | start_latent = vae_encode(input_image_pt, vae) 157 | 158 | # CLIP Vision 159 | 160 | stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'CLIP Vision encoding ...')))) 161 | 162 | if not high_vram: 163 | load_model_as_complete(image_encoder, target_device=gpu) 164 | 165 | image_encoder_output = hf_clip_vision_encode(input_image_np, feature_extractor, image_encoder) 166 | image_encoder_last_hidden_state = image_encoder_output.last_hidden_state 167 | 168 | # Dtype 169 | 170 | llama_vec = llama_vec.to(transformer.dtype) 171 | llama_vec_n = llama_vec_n.to(transformer.dtype) 172 | clip_l_pooler = clip_l_pooler.to(transformer.dtype) 173 | clip_l_pooler_n = clip_l_pooler_n.to(transformer.dtype) 174 | image_encoder_last_hidden_state = image_encoder_last_hidden_state.to(transformer.dtype) 175 | 176 | # Sampling 177 | 178 | stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Start sampling ...')))) 179 | 180 | rnd = torch.Generator("cpu").manual_seed(seed) 181 | 182 | history_latents = torch.zeros(size=(1, 16, 16 + 2 + 1, height // 8, width // 8), dtype=torch.float32).cpu() 183 | history_pixels = None 184 | 185 | history_latents = torch.cat([history_latents, start_latent.to(history_latents)], dim=2) 186 | total_generated_latent_frames = 1 187 | 188 | for section_index in range(total_latent_sections): 189 | if stream.input_queue.top() == 'end': 190 | stream.output_queue.push(('end', None)) 191 | return 192 | 193 | print(f'section_index = {section_index}, total_latent_sections = {total_latent_sections}') 194 | 195 | if not high_vram: 196 | unload_complete_models() 197 | move_model_to_device_with_memory_preservation(transformer, target_device=gpu, preserved_memory_gb=gpu_memory_preservation) 198 | 199 | if use_teacache: 200 | transformer.initialize_teacache(enable_teacache=True, num_steps=steps) 201 | else: 202 | transformer.initialize_teacache(enable_teacache=False) 203 | 204 | def callback(d): 205 | preview = d['denoised'] 206 | preview = vae_decode_fake(preview) 207 | 208 | preview = (preview * 
255.0).detach().cpu().numpy().clip(0, 255).astype(np.uint8) 209 | preview = einops.rearrange(preview, 'b c t h w -> (b h) (t w) c') 210 | 211 | if stream.input_queue.top() == 'end': 212 | stream.output_queue.push(('end', None)) 213 | raise KeyboardInterrupt('User ends the task.') 214 | 215 | current_step = d['i'] + 1 216 | percentage = int(100.0 * current_step / steps) 217 | hint = f'Sampling {current_step}/{steps}' 218 | desc = f'Total generated frames: {int(max(0, total_generated_latent_frames * 4 - 3))}, Video length: {max(0, (total_generated_latent_frames * 4 - 3) / 30) :.2f} seconds (FPS-30). The video is being extended now ...' 219 | stream.output_queue.push(('progress', (preview, desc, make_progress_bar_html(percentage, hint)))) 220 | return 221 | 222 | indices = torch.arange(0, sum([1, 16, 2, 1, latent_window_size])).unsqueeze(0) 223 | clean_latent_indices_start, clean_latent_4x_indices, clean_latent_2x_indices, clean_latent_1x_indices, latent_indices = indices.split([1, 16, 2, 1, latent_window_size], dim=1) 224 | clean_latent_indices = torch.cat([clean_latent_indices_start, clean_latent_1x_indices], dim=1) 225 | 226 | clean_latents_4x, clean_latents_2x, clean_latents_1x = history_latents[:, :, -sum([16, 2, 1]):, :, :].split([16, 2, 1], dim=2) 227 | clean_latents = torch.cat([start_latent.to(history_latents), clean_latents_1x], dim=2) 228 | 229 | generated_latents = sample_hunyuan( 230 | transformer=transformer, 231 | sampler='unipc', 232 | width=width, 233 | height=height, 234 | frames=latent_window_size * 4 - 3, 235 | real_guidance_scale=cfg, 236 | distilled_guidance_scale=gs, 237 | guidance_rescale=rs, 238 | # shift=3.0, 239 | num_inference_steps=steps, 240 | generator=rnd, 241 | prompt_embeds=llama_vec, 242 | prompt_embeds_mask=llama_attention_mask, 243 | prompt_poolers=clip_l_pooler, 244 | negative_prompt_embeds=llama_vec_n, 245 | negative_prompt_embeds_mask=llama_attention_mask_n, 246 | negative_prompt_poolers=clip_l_pooler_n, 247 | device=gpu, 248 | dtype=torch.bfloat16, 249 | image_embeddings=image_encoder_last_hidden_state, 250 | latent_indices=latent_indices, 251 | clean_latents=clean_latents, 252 | clean_latent_indices=clean_latent_indices, 253 | clean_latents_2x=clean_latents_2x, 254 | clean_latent_2x_indices=clean_latent_2x_indices, 255 | clean_latents_4x=clean_latents_4x, 256 | clean_latent_4x_indices=clean_latent_4x_indices, 257 | callback=callback, 258 | ) 259 | 260 | total_generated_latent_frames += int(generated_latents.shape[2]) 261 | history_latents = torch.cat([history_latents, generated_latents.to(history_latents)], dim=2) 262 | 263 | if not high_vram: 264 | offload_model_from_device_for_memory_preservation(transformer, target_device=gpu, preserved_memory_gb=8) 265 | load_model_as_complete(vae, target_device=gpu) 266 | 267 | real_history_latents = history_latents[:, :, -total_generated_latent_frames:, :, :] 268 | 269 | if history_pixels is None: 270 | history_pixels = vae_decode(real_history_latents, vae).cpu() 271 | else: 272 | section_latent_frames = latent_window_size * 2 273 | overlapped_frames = latent_window_size * 4 - 3 274 | 275 | current_pixels = vae_decode(real_history_latents[:, :, -section_latent_frames:], vae).cpu() 276 | history_pixels = soft_append_bcthw(history_pixels, current_pixels, overlapped_frames) 277 | 278 | if not high_vram: 279 | unload_complete_models() 280 | 281 | output_filename = os.path.join(outputs_folder, f'{job_id}_{total_generated_latent_frames}.mp4') 282 | 283 | save_bcthw_as_mp4(history_pixels, output_filename, fps=30) 284 | 
285 | print(f'Decoded. Current latent shape {real_history_latents.shape}; pixel shape {history_pixels.shape}') 286 | 287 | stream.output_queue.push(('file', output_filename)) 288 | except: 289 | traceback.print_exc() 290 | 291 | if not high_vram: 292 | unload_complete_models( 293 | text_encoder, text_encoder_2, image_encoder, vae, transformer 294 | ) 295 | 296 | stream.output_queue.push(('end', None)) 297 | return 298 | 299 | 300 | def process(input_image, prompt, n_prompt, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, mp4_crf): 301 | global stream 302 | assert input_image is not None, 'No input image!' 303 | 304 | yield None, None, '', '', gr.update(interactive=False), gr.update(interactive=True) 305 | 306 | stream = AsyncStream() 307 | 308 | async_run(worker, input_image, prompt, n_prompt, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, mp4_crf) 309 | 310 | output_filename = None 311 | 312 | while True: 313 | flag, data = stream.output_queue.next() 314 | 315 | if flag == 'file': 316 | output_filename = data 317 | yield output_filename, gr.update(), gr.update(), gr.update(), gr.update(interactive=False), gr.update(interactive=True) 318 | 319 | if flag == 'progress': 320 | preview, desc, html = data 321 | yield gr.update(), gr.update(visible=True, value=preview), desc, html, gr.update(interactive=False), gr.update(interactive=True) 322 | 323 | if flag == 'end': 324 | yield output_filename, gr.update(visible=False), gr.update(), '', gr.update(interactive=True), gr.update(interactive=False) 325 | break 326 | 327 | 328 | def end_process(): 329 | stream.input_queue.push('end') 330 | 331 | 332 | quick_prompts = [ 333 | 'The girl dances gracefully, with clear movements, full of charm.', 334 | 'A character doing some simple body movements.', 335 | ] 336 | quick_prompts = [[x] for x in quick_prompts] 337 | 338 | 339 | css = make_progress_bar_css() 340 | block = gr.Blocks(css=css).queue() 341 | with block: 342 | gr.Markdown('# FramePack-F1') 343 | with gr.Row(): 344 | with gr.Column(): 345 | input_image = gr.Image(sources='upload', type="numpy", label="Image", height=320) 346 | prompt = gr.Textbox(label="Prompt", value='') 347 | example_quick_prompts = gr.Dataset(samples=quick_prompts, label='Quick List', samples_per_page=1000, components=[prompt]) 348 | example_quick_prompts.click(lambda x: x[0], inputs=[example_quick_prompts], outputs=prompt, show_progress=False, queue=False) 349 | 350 | with gr.Row(): 351 | start_button = gr.Button(value="Start Generation") 352 | end_button = gr.Button(value="End Generation", interactive=False) 353 | 354 | with gr.Group(): 355 | use_teacache = gr.Checkbox(label='Use TeaCache', value=True, info='Faster speed, but often makes hands and fingers slightly worse.') 356 | 357 | n_prompt = gr.Textbox(label="Negative Prompt", value="", visible=False) # Not used 358 | seed = gr.Number(label="Seed", value=31337, precision=0) 359 | 360 | total_second_length = gr.Slider(label="Total Video Length (Seconds)", minimum=1, maximum=120, value=5, step=0.1) 361 | latent_window_size = gr.Slider(label="Latent Window Size", minimum=1, maximum=33, value=9, step=1, visible=False) # Should not change 362 | steps = gr.Slider(label="Steps", minimum=1, maximum=100, value=25, step=1, info='Changing this value is not recommended.') 363 | 364 | cfg = gr.Slider(label="CFG Scale", minimum=1.0, maximum=32.0, value=1.0, step=0.01, visible=False) # Should not change 365 | gs = 
gr.Slider(label="Distilled CFG Scale", minimum=1.0, maximum=32.0, value=10.0, step=0.01, info='Changing this value is not recommended.') 366 | rs = gr.Slider(label="CFG Re-Scale", minimum=0.0, maximum=1.0, value=0.0, step=0.01, visible=False) # Should not change 367 | 368 | gpu_memory_preservation = gr.Slider(label="GPU Inference Preserved Memory (GB) (larger means slower)", minimum=6, maximum=128, value=6, step=0.1, info="Set this number to a larger value if you encounter OOM. Larger value causes slower speed.") 369 | 370 | mp4_crf = gr.Slider(label="MP4 Compression", minimum=0, maximum=100, value=16, step=1, info="Lower means better quality. 0 is uncompressed. Change to 16 if you get black outputs. ") 371 | 372 | with gr.Column(): 373 | preview_image = gr.Image(label="Next Latents", height=200, visible=False) 374 | result_video = gr.Video(label="Finished Frames", autoplay=True, show_share_button=False, height=512, loop=True) 375 | progress_desc = gr.Markdown('', elem_classes='no-generating-animation') 376 | progress_bar = gr.HTML('', elem_classes='no-generating-animation') 377 | 378 | gr.HTML('
<div style="text-align:center; margin-top:20px;">Share your results and find ideas at the <a href="https://x.com/search?q=framepack&f=live" target="_blank">FramePack Twitter (X) thread</a></div>
') 379 | 380 | ips = [input_image, prompt, n_prompt, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, mp4_crf] 381 | start_button.click(fn=process, inputs=ips, outputs=[result_video, preview_image, progress_desc, progress_bar, start_button, end_button]) 382 | end_button.click(fn=end_process) 383 | 384 | 385 | block.launch( 386 | server_name=args.server, 387 | server_port=args.port, 388 | share=args.share, 389 | inbrowser=args.inbrowser, 390 | ) -------------------------------------------------------------------------------- /demo_gradio_k.py: -------------------------------------------------------------------------------- 1 | from diffusers_helper.hf_login import login 2 | 3 | import os 4 | 5 | os.environ['HF_HOME'] = os.path.abspath(os.path.realpath(os.path.join(os.path.dirname(__file__), './hf_download'))) 6 | 7 | import gradio as gr 8 | import torch 9 | import traceback 10 | import einops 11 | import safetensors.torch as sf 12 | import numpy as np 13 | import argparse 14 | import math 15 | 16 | from PIL import Image 17 | from diffusers import AutoencoderKLHunyuanVideo 18 | from transformers import LlamaModel, CLIPTextModel, LlamaTokenizerFast, CLIPTokenizer 19 | from diffusers_helper.hunyuan import encode_prompt_conds, vae_decode, vae_encode, vae_decode_fake 20 | from diffusers_helper.utils import save_bcthw_as_mp4, crop_or_pad_yield_mask, soft_append_bcthw, resize_and_center_crop, state_dict_weighted_merge, state_dict_offset_merge, generate_timestamp 21 | from diffusers_helper.models.hunyuan_video_packed import HunyuanVideoTransformer3DModelPacked 22 | from diffusers_helper.pipelines.k_diffusion_hunyuan import sample_hunyuan 23 | from diffusers_helper.memory import cpu, gpu, get_cuda_free_memory_gb, move_model_to_device_with_memory_preservation, offload_model_from_device_for_memory_preservation, fake_diffusers_current_device, DynamicSwapInstaller, unload_complete_models, load_model_as_complete 24 | from diffusers_helper.thread_utils import AsyncStream, async_run 25 | from diffusers_helper.gradio.progress_bar import make_progress_bar_css, make_progress_bar_html 26 | from transformers import SiglipImageProcessor, SiglipVisionModel 27 | from diffusers_helper.clip_vision import hf_clip_vision_encode 28 | from diffusers_helper.bucket_tools import find_nearest_bucket 29 | 30 | 31 | parser = argparse.ArgumentParser() 32 | parser.add_argument('--share', action='store_true') 33 | parser.add_argument("--server", type=str, default='0.0.0.0') 34 | parser.add_argument("--port", type=int, required=False) 35 | parser.add_argument("--inbrowser", action='store_true') 36 | args = parser.parse_args() 37 | 38 | # for win desktop probably use --server 127.0.0.1 --inbrowser 39 | # For linux server probably use --server 127.0.0.1 or do not use any cmd flags 40 | print(args) 41 | 42 | free_mem_gb = get_cuda_free_memory_gb(gpu) 43 | high_vram = free_mem_gb > 60 44 | 45 | print(f'Free VRAM {free_mem_gb} GB') 46 | print(f'High-VRAM Mode: {high_vram}') 47 | 48 | text_encoder = LlamaModel.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='text_encoder', torch_dtype=torch.float16).cpu() 49 | text_encoder_2 = CLIPTextModel.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='text_encoder_2', torch_dtype=torch.float16).cpu() 50 | tokenizer = LlamaTokenizerFast.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='tokenizer') 51 | tokenizer_2 = CLIPTokenizer.from_pretrained("hunyuanvideo-community/HunyuanVideo", 
subfolder='tokenizer_2') 52 | vae = AutoencoderKLHunyuanVideo.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='vae', torch_dtype=torch.float16).cpu() 53 | 54 | feature_extractor = SiglipImageProcessor.from_pretrained("lllyasviel/flux_redux_bfl", subfolder='feature_extractor') 55 | image_encoder = SiglipVisionModel.from_pretrained("lllyasviel/flux_redux_bfl", subfolder='image_encoder', torch_dtype=torch.float16).cpu() 56 | 57 | transformer = HunyuanVideoTransformer3DModelPacked.from_pretrained('lllyasviel/FramePackI2V_HY', torch_dtype=torch.bfloat16).cpu() 58 | 59 | vae.eval() 60 | text_encoder.eval() 61 | text_encoder_2.eval() 62 | image_encoder.eval() 63 | transformer.eval() 64 | 65 | if not high_vram: 66 | vae.enable_slicing() 67 | vae.enable_tiling() 68 | 69 | transformer.high_quality_fp32_output_for_inference = True 70 | print('transformer.high_quality_fp32_output_for_inference = True') 71 | 72 | transformer.to(dtype=torch.bfloat16) 73 | vae.to(dtype=torch.float16) 74 | image_encoder.to(dtype=torch.float16) 75 | text_encoder.to(dtype=torch.float16) 76 | text_encoder_2.to(dtype=torch.float16) 77 | 78 | vae.requires_grad_(False) 79 | text_encoder.requires_grad_(False) 80 | text_encoder_2.requires_grad_(False) 81 | image_encoder.requires_grad_(False) 82 | transformer.requires_grad_(False) 83 | 84 | if not high_vram: 85 | # DynamicSwapInstaller is same as huggingface's enable_sequential_offload but 3x faster 86 | DynamicSwapInstaller.install_model(transformer, device=gpu) 87 | DynamicSwapInstaller.install_model(text_encoder, device=gpu) 88 | else: 89 | text_encoder.to(gpu) 90 | text_encoder_2.to(gpu) 91 | image_encoder.to(gpu) 92 | vae.to(gpu) 93 | transformer.to(gpu) 94 | 95 | stream = AsyncStream() 96 | 97 | outputs_folder = './outputs/' 98 | os.makedirs(outputs_folder, exist_ok=True) 99 | 100 | 101 | @torch.no_grad() 102 | def worker(input_image, end_image, prompt, n_prompt, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, mp4_crf): 103 | total_latent_sections = (total_second_length * 30) / (latent_window_size * 4) 104 | total_latent_sections = int(max(round(total_latent_sections), 1)) 105 | 106 | job_id = generate_timestamp() 107 | 108 | stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Starting ...')))) 109 | 110 | try: 111 | # Clean GPU 112 | if not high_vram: 113 | unload_complete_models( 114 | text_encoder, text_encoder_2, image_encoder, vae, transformer 115 | ) 116 | 117 | # Text encoding 118 | 119 | stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Text encoding ...')))) 120 | 121 | if not high_vram: 122 | fake_diffusers_current_device(text_encoder, gpu) # since we only encode one text - that is one model move and one encode, offload is same time consumption since it is also one load and one encode. 
123 | load_model_as_complete(text_encoder_2, target_device=gpu) 124 | 125 | llama_vec, clip_l_pooler = encode_prompt_conds(prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2) 126 | 127 | if cfg == 1: 128 | llama_vec_n, clip_l_pooler_n = torch.zeros_like(llama_vec), torch.zeros_like(clip_l_pooler) 129 | else: 130 | llama_vec_n, clip_l_pooler_n = encode_prompt_conds(n_prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2) 131 | 132 | llama_vec, llama_attention_mask = crop_or_pad_yield_mask(llama_vec, length=512) 133 | llama_vec_n, llama_attention_mask_n = crop_or_pad_yield_mask(llama_vec_n, length=512) 134 | 135 | # Processing input image (start frame) 136 | stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Processing start frame ...')))) 137 | 138 | H, W, C = input_image.shape 139 | height, width = find_nearest_bucket(H, W, resolution=640) 140 | input_image_np = resize_and_center_crop(input_image, target_width=width, target_height=height) 141 | 142 | Image.fromarray(input_image_np).save(os.path.join(outputs_folder, f'{job_id}_start.png')) 143 | 144 | input_image_pt = torch.from_numpy(input_image_np).float() / 127.5 - 1 145 | input_image_pt = input_image_pt.permute(2, 0, 1)[None, :, None] 146 | 147 | # Processing end image (if provided) 148 | has_end_image = end_image is not None 149 | if has_end_image: 150 | stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Processing end frame ...')))) 151 | 152 | H_end, W_end, C_end = end_image.shape 153 | end_image_np = resize_and_center_crop(end_image, target_width=width, target_height=height) 154 | 155 | Image.fromarray(end_image_np).save(os.path.join(outputs_folder, f'{job_id}_end.png')) 156 | 157 | end_image_pt = torch.from_numpy(end_image_np).float() / 127.5 - 1 158 | end_image_pt = end_image_pt.permute(2, 0, 1)[None, :, None] 159 | 160 | # VAE encoding 161 | stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'VAE encoding ...')))) 162 | 163 | if not high_vram: 164 | load_model_as_complete(vae, target_device=gpu) 165 | 166 | start_latent = vae_encode(input_image_pt, vae) 167 | 168 | if has_end_image: 169 | end_latent = vae_encode(end_image_pt, vae) 170 | 171 | # CLIP Vision 172 | stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'CLIP Vision encoding ...')))) 173 | 174 | if not high_vram: 175 | load_model_as_complete(image_encoder, target_device=gpu) 176 | 177 | image_encoder_output = hf_clip_vision_encode(input_image_np, feature_extractor, image_encoder) 178 | image_encoder_last_hidden_state = image_encoder_output.last_hidden_state 179 | 180 | if has_end_image: 181 | end_image_encoder_output = hf_clip_vision_encode(end_image_np, feature_extractor, image_encoder) 182 | end_image_encoder_last_hidden_state = end_image_encoder_output.last_hidden_state 183 | # Combine both image embeddings or use a weighted approach 184 | image_encoder_last_hidden_state = (image_encoder_last_hidden_state + end_image_encoder_last_hidden_state) / 2 185 | 186 | # Dtype 187 | llama_vec = llama_vec.to(transformer.dtype) 188 | llama_vec_n = llama_vec_n.to(transformer.dtype) 189 | clip_l_pooler = clip_l_pooler.to(transformer.dtype) 190 | clip_l_pooler_n = clip_l_pooler_n.to(transformer.dtype) 191 | image_encoder_last_hidden_state = image_encoder_last_hidden_state.to(transformer.dtype) 192 | 193 | # Sampling 194 | stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Start sampling ...')))) 195 | 196 | rnd = 
torch.Generator("cpu").manual_seed(seed) 197 | num_frames = latent_window_size * 4 - 3 198 | 199 | history_latents = torch.zeros(size=(1, 16, 1 + 2 + 16, height // 8, width // 8), dtype=torch.float32).cpu() 200 | history_pixels = None 201 | total_generated_latent_frames = 0 202 | 203 | # 将迭代器转换为列表 204 | latent_paddings = list(reversed(range(total_latent_sections))) 205 | 206 | if total_latent_sections > 4: 207 | # In theory the latent_paddings should follow the above sequence, but it seems that duplicating some 208 | # items looks better than expanding it when total_latent_sections > 4 209 | # One can try to remove below trick and just 210 | # use `latent_paddings = list(reversed(range(total_latent_sections)))` to compare 211 | latent_paddings = [3] + [2] * (total_latent_sections - 3) + [1, 0] 212 | 213 | for latent_padding in latent_paddings: 214 | is_last_section = latent_padding == 0 215 | is_first_section = latent_padding == latent_paddings[0] 216 | latent_padding_size = latent_padding * latent_window_size 217 | 218 | if stream.input_queue.top() == 'end': 219 | stream.output_queue.push(('end', None)) 220 | return 221 | 222 | print(f'latent_padding_size = {latent_padding_size}, is_last_section = {is_last_section}, is_first_section = {is_first_section}') 223 | 224 | indices = torch.arange(0, sum([1, latent_padding_size, latent_window_size, 1, 2, 16])).unsqueeze(0) 225 | clean_latent_indices_pre, blank_indices, latent_indices, clean_latent_indices_post, clean_latent_2x_indices, clean_latent_4x_indices = indices.split([1, latent_padding_size, latent_window_size, 1, 2, 16], dim=1) 226 | clean_latent_indices = torch.cat([clean_latent_indices_pre, clean_latent_indices_post], dim=1) 227 | 228 | clean_latents_pre = start_latent.to(history_latents) 229 | clean_latents_post, clean_latents_2x, clean_latents_4x = history_latents[:, :, :1 + 2 + 16, :, :].split([1, 2, 16], dim=2) 230 | clean_latents = torch.cat([clean_latents_pre, clean_latents_post], dim=2) 231 | 232 | # Use end image latent for the first section if provided 233 | if has_end_image and is_first_section: 234 | clean_latents_post = end_latent.to(history_latents) 235 | clean_latents = torch.cat([clean_latents_pre, clean_latents_post], dim=2) 236 | 237 | if not high_vram: 238 | unload_complete_models() 239 | move_model_to_device_with_memory_preservation(transformer, target_device=gpu, preserved_memory_gb=gpu_memory_preservation) 240 | 241 | if use_teacache: 242 | transformer.initialize_teacache(enable_teacache=True, num_steps=steps) 243 | else: 244 | transformer.initialize_teacache(enable_teacache=False) 245 | 246 | def callback(d): 247 | preview = d['denoised'] 248 | preview = vae_decode_fake(preview) 249 | 250 | preview = (preview * 255.0).detach().cpu().numpy().clip(0, 255).astype(np.uint8) 251 | preview = einops.rearrange(preview, 'b c t h w -> (b h) (t w) c') 252 | 253 | if stream.input_queue.top() == 'end': 254 | stream.output_queue.push(('end', None)) 255 | raise KeyboardInterrupt('User ends the task.') 256 | 257 | current_step = d['i'] + 1 258 | percentage = int(100.0 * current_step / steps) 259 | hint = f'Sampling {current_step}/{steps}' 260 | desc = f'Total generated frames: {int(max(0, total_generated_latent_frames * 4 - 3))}, Video length: {max(0, (total_generated_latent_frames * 4 - 3) / 30) :.2f} seconds (FPS-30). The video is being extended now ...' 
261 | stream.output_queue.push(('progress', (preview, desc, make_progress_bar_html(percentage, hint)))) 262 | return 263 | 264 | generated_latents = sample_hunyuan( 265 | transformer=transformer, 266 | sampler='unipc', 267 | width=width, 268 | height=height, 269 | frames=num_frames, 270 | real_guidance_scale=cfg, 271 | distilled_guidance_scale=gs, 272 | guidance_rescale=rs, 273 | # shift=3.0, 274 | num_inference_steps=steps, 275 | generator=rnd, 276 | prompt_embeds=llama_vec, 277 | prompt_embeds_mask=llama_attention_mask, 278 | prompt_poolers=clip_l_pooler, 279 | negative_prompt_embeds=llama_vec_n, 280 | negative_prompt_embeds_mask=llama_attention_mask_n, 281 | negative_prompt_poolers=clip_l_pooler_n, 282 | device=gpu, 283 | dtype=torch.bfloat16, 284 | image_embeddings=image_encoder_last_hidden_state, 285 | latent_indices=latent_indices, 286 | clean_latents=clean_latents, 287 | clean_latent_indices=clean_latent_indices, 288 | clean_latents_2x=clean_latents_2x, 289 | clean_latent_2x_indices=clean_latent_2x_indices, 290 | clean_latents_4x=clean_latents_4x, 291 | clean_latent_4x_indices=clean_latent_4x_indices, 292 | callback=callback, 293 | ) 294 | 295 | if is_last_section: 296 | generated_latents = torch.cat([start_latent.to(generated_latents), generated_latents], dim=2) 297 | 298 | total_generated_latent_frames += int(generated_latents.shape[2]) 299 | history_latents = torch.cat([generated_latents.to(history_latents), history_latents], dim=2) 300 | 301 | if not high_vram: 302 | offload_model_from_device_for_memory_preservation(transformer, target_device=gpu, preserved_memory_gb=8) 303 | load_model_as_complete(vae, target_device=gpu) 304 | 305 | real_history_latents = history_latents[:, :, :total_generated_latent_frames, :, :] 306 | 307 | if history_pixels is None: 308 | history_pixels = vae_decode(real_history_latents, vae).cpu() 309 | else: 310 | section_latent_frames = (latent_window_size * 2 + 1) if is_last_section else (latent_window_size * 2) 311 | overlapped_frames = latent_window_size * 4 - 3 312 | 313 | current_pixels = vae_decode(real_history_latents[:, :, :section_latent_frames], vae).cpu() 314 | history_pixels = soft_append_bcthw(current_pixels, history_pixels, overlapped_frames) 315 | 316 | if not high_vram: 317 | unload_complete_models() 318 | 319 | output_filename = os.path.join(outputs_folder, f'{job_id}_{total_generated_latent_frames}.mp4') 320 | 321 | save_bcthw_as_mp4(history_pixels, output_filename, fps=30) 322 | 323 | print(f'Decoded. Current latent shape {real_history_latents.shape}; pixel shape {history_pixels.shape}') 324 | 325 | stream.output_queue.push(('file', output_filename)) 326 | 327 | if is_last_section: 328 | break 329 | except: 330 | traceback.print_exc() 331 | 332 | if not high_vram: 333 | unload_complete_models( 334 | text_encoder, text_encoder_2, image_encoder, vae, transformer 335 | ) 336 | 337 | stream.output_queue.push(('end', None)) 338 | return 339 | 340 | 341 | def process(input_image, end_image, prompt, n_prompt, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, mp4_crf): 342 | global stream 343 | assert input_image is not None, 'No input image!' 
344 | 345 | yield None, None, '', '', gr.update(interactive=False), gr.update(interactive=True) 346 | 347 | stream = AsyncStream() 348 | 349 | async_run(worker, input_image, end_image, prompt, n_prompt, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, mp4_crf) 350 | 351 | output_filename = None 352 | 353 | while True: 354 | flag, data = stream.output_queue.next() 355 | 356 | if flag == 'file': 357 | output_filename = data 358 | yield output_filename, gr.update(), gr.update(), gr.update(), gr.update(interactive=False), gr.update(interactive=True) 359 | 360 | if flag == 'progress': 361 | preview, desc, html = data 362 | yield gr.update(), gr.update(visible=True, value=preview), desc, html, gr.update(interactive=False), gr.update(interactive=True) 363 | 364 | if flag == 'end': 365 | yield output_filename, gr.update(visible=False), gr.update(), '', gr.update(interactive=True), gr.update(interactive=False) 366 | break 367 | 368 | 369 | def end_process(): 370 | stream.input_queue.push('end') 371 | 372 | 373 | quick_prompts = [ 374 | 'The girl dances gracefully, with clear movements, full of charm.', 375 | 'A character doing some simple body movements.', 376 | ] 377 | quick_prompts = [[x] for x in quick_prompts] 378 | 379 | 380 | css = make_progress_bar_css() 381 | block = gr.Blocks(css=css).queue() 382 | with block: 383 | gr.Markdown('# FramePack') 384 | with gr.Row(): 385 | with gr.Column(): 386 | with gr.Row(): 387 | with gr.Column(): 388 | input_image = gr.Image(sources='upload', type="numpy", label="Start Frame", height=320) 389 | with gr.Column(): 390 | end_image = gr.Image(sources='upload', type="numpy", label="End Frame (Optional)", height=320) 391 | 392 | prompt = gr.Textbox(label="Prompt", value='') 393 | example_quick_prompts = gr.Dataset(samples=quick_prompts, label='Quick List', samples_per_page=1000, components=[prompt]) 394 | example_quick_prompts.click(lambda x: x[0], inputs=[example_quick_prompts], outputs=prompt, show_progress=False, queue=False) 395 | 396 | with gr.Row(): 397 | start_button = gr.Button(value="Start Generation") 398 | end_button = gr.Button(value="End Generation", interactive=False) 399 | 400 | with gr.Group(): 401 | use_teacache = gr.Checkbox(label='Use TeaCache', value=True, info='Faster speed, but often makes hands and fingers slightly worse.') 402 | 403 | n_prompt = gr.Textbox(label="Negative Prompt", value="", visible=False) # Not used 404 | seed = gr.Number(label="Seed", value=31337, precision=0) 405 | 406 | total_second_length = gr.Slider(label="Total Video Length (Seconds)", minimum=1, maximum=120, value=5, step=0.1) 407 | latent_window_size = gr.Slider(label="Latent Window Size", minimum=1, maximum=33, value=9, step=1, visible=False) # Should not change 408 | steps = gr.Slider(label="Steps", minimum=1, maximum=100, value=25, step=1, info='Changing this value is not recommended.') 409 | 410 | cfg = gr.Slider(label="CFG Scale", minimum=1.0, maximum=32.0, value=1.0, step=0.01, visible=False) # Should not change 411 | gs = gr.Slider(label="Distilled CFG Scale", minimum=1.0, maximum=32.0, value=10.0, step=0.01, info='Changing this value is not recommended.') 412 | rs = gr.Slider(label="CFG Re-Scale", minimum=0.0, maximum=1.0, value=0.0, step=0.01, visible=False) # Should not change 413 | 414 | gpu_memory_preservation = gr.Slider(label="GPU Inference Preserved Memory (GB) (larger means slower)", minimum=6, maximum=128, value=6, step=0.1, info="Set this number to a larger value if you encounter OOM. 
Larger value causes slower speed.") 415 | 416 | mp4_crf = gr.Slider(label="MP4 Compression", minimum=0, maximum=100, value=16, step=1, info="Lower means better quality. 0 is uncompressed. Change to 16 if you get black outputs. ") 417 | 418 | with gr.Column(): 419 | preview_image = gr.Image(label="Next Latents", height=200, visible=False) 420 | result_video = gr.Video(label="Finished Frames", autoplay=True, show_share_button=False, height=512, loop=True) 421 | gr.Markdown('When using only a start frame, the ending actions will be generated before the starting actions due to the inverted sampling. If using both start and end frames, the model will try to create a smooth transition between them.') 422 | progress_desc = gr.Markdown('', elem_classes='no-generating-animation') 423 | progress_bar = gr.HTML('', elem_classes='no-generating-animation') 424 | 425 | gr.HTML('
<div style="text-align:center; margin-top:20px;">Share your results and find ideas at the <a href="https://x.com/search?q=framepack&f=live" target="_blank">FramePack Twitter (X) thread</a></div>
') 426 | 427 | ips = [input_image, end_image, prompt, n_prompt, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, mp4_crf] 428 | start_button.click(fn=process, inputs=ips, outputs=[result_video, preview_image, progress_desc, progress_bar, start_button, end_button]) 429 | end_button.click(fn=end_process) 430 | 431 | 432 | block.launch( 433 | server_name=args.server, 434 | server_port=args.port, 435 | share=args.share, 436 | inbrowser=args.inbrowser, 437 | ) 438 | --------------------------------------------------------------------------------