├── icon.png ├── README.md ├── reset.js ├── link.js ├── update.js ├── start.js ├── start_f1.js ├── start_k.js ├── pinokio_meta.json ├── install.js ├── torch.js ├── pinokio.js ├── demo_gradio_f1.py └── demo_gradio_k.py /icon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pinokiofactory/Frame-Pack/HEAD/icon.png -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # FramePack 2 | 3 | A pinokio script for https://github.com/lllyasviel/FramePack 4 | 5 | -------------------------------------------------------------------------------- /reset.js: -------------------------------------------------------------------------------- 1 | module.exports = { 2 | run: [{ 3 | method: "fs.rm", 4 | params: { 5 | path: "app" 6 | } 7 | }] 8 | } 9 | -------------------------------------------------------------------------------- /link.js: -------------------------------------------------------------------------------- 1 | module.exports = { 2 | run: [ 3 | { 4 | method: "fs.link", 5 | params: { 6 | venv: "app/env" 7 | } 8 | } 9 | ] 10 | } 11 | -------------------------------------------------------------------------------- /update.js: -------------------------------------------------------------------------------- 1 | module.exports = { 2 | run: [{ 3 | method: "shell.run", 4 | params: { 5 | message: "git pull" 6 | } 7 | }, { 8 | method: "shell.run", 9 | params: { 10 | path: "app", 11 | message: "git pull" 12 | } 13 | }] 14 | } 15 | -------------------------------------------------------------------------------- /start.js: -------------------------------------------------------------------------------- 1 | module.exports = async (kernel) => { 2 | const port = await kernel.port() 3 | 4 | console.log("Starting standard version directly") 5 | const scriptToRun = "demo_gradio.py" 6 | 7 | return { 8 | requires: { 9 | bundle: "ai", 10 | }, 11 | daemon: true, 12 | run: [ 13 | { 14 | method: "shell.run", 15 | params: { 16 | venv: "env", 17 | env: { }, 18 | path: "app", 19 | message: [ 20 | `python ${scriptToRun} --server 127.0.0.1 --port ${port}`, 21 | ], 22 | on: [{ 23 | "event": "/http:\/\/[0-9.:]+/", 24 | "done": true 25 | }] 26 | } 27 | }, 28 | { 29 | method: "local.set", 30 | params: { 31 | url: "{{input.event[0]}}" 32 | } 33 | } 34 | ] 35 | }; 36 | }; 37 | -------------------------------------------------------------------------------- /start_f1.js: -------------------------------------------------------------------------------- 1 | module.exports = async (kernel) => { 2 | const port = await kernel.port() 3 | 4 | console.log("Starting F1 version directly") 5 | const scriptToRun = "demo_gradio_f1.py" 6 | 7 | return { 8 | requires: { 9 | bundle: "ai", 10 | }, 11 | daemon: true, 12 | run: [ 13 | { 14 | method: "shell.run", 15 | params: { 16 | venv: "env", 17 | env: { }, 18 | path: "app", 19 | message: [ 20 | `python ${scriptToRun} --server 127.0.0.1 --port ${port}`, 21 | ], 22 | on: [{ 23 | "event": "/http:\/\/\\S+/", 24 | "done": true 25 | }] 26 | } 27 | }, 28 | { 29 | method: "local.set", 30 | params: { 31 | url: "{{input.event[0]}}" 32 | } 33 | } 34 | ] 35 | }; 36 | 37 | }; 38 | -------------------------------------------------------------------------------- /start_k.js: -------------------------------------------------------------------------------- 1 | module.exports = async (kernel) => { 2 | const port = await 
kernel.port() 3 | 4 | console.log("Starting standard version directly") 5 | const scriptToRun = "demo_gradio_k.py" 6 | 7 | return { 8 | requires: { 9 | bundle: "ai", 10 | }, 11 | daemon: true, 12 | run: [ 13 | { 14 | method: "shell.run", 15 | params: { 16 | venv: "env", 17 | env: { }, 18 | path: "app", 19 | message: [ 20 | `python ${scriptToRun} --server 127.0.0.1 --port ${port}`, 21 | ], 22 | on: [{ 23 | "event": "/http:\/\/[0-9.:]+/", 24 | "done": true 25 | }] 26 | } 27 | }, 28 | { 29 | method: "local.set", 30 | params: { 31 | url: "{{input.event[0]}}" 32 | } 33 | } 34 | ] 35 | }; 36 | }; 37 | -------------------------------------------------------------------------------- /pinokio_meta.json: -------------------------------------------------------------------------------- 1 | { 2 | "posts": [ 3 | "https://x.com/cocktailpeanut/status/1913004350970159464", 4 | "https://x.com/cocktailpeanut/status/1912977698642907614", 5 | "https://x.com/joesparks/status/1912986750282272861", 6 | "https://x.com/matze2001/status/1912989768406679901", 7 | "https://x.com/cocktailpeanut/status/1912998525274853592", 8 | "https://x.com/cocktailpeanut/status/1913003111050018864", 9 | "https://x.com/SUP3RMASS1VE/status/1912837447525822592", 10 | "https://x.com/SUP3RMASS1VE/status/1912842398654210420", 11 | "https://x.com/SUP3RMASS1VE/status/1912975317896921253" 12 | ], 13 | "links": [{ 14 | "title": "SUP3RMASS1VE (Wrote the launcher)", 15 | "links": [{ 16 | "type": "bitcoin", 17 | "value": "1N942jHr6vVuR2KAe2JEf3nN59eR21tpKv" 18 | }, { 19 | "title": "X", 20 | "value": "https://x.com/SUP3RMASS1VE" 21 | }, { 22 | "title": "Github", 23 | "value": "https://github.com/SUP3RMASS1VE" 24 | }, { 25 | "title": "Discord", 26 | "value": "https://discord.gg/mvDcrA57AQ" 27 | }] 28 | }] 29 | } 30 | -------------------------------------------------------------------------------- /install.js: -------------------------------------------------------------------------------- 1 | module.exports = { 2 | requires: { 3 | bundle: "ai", 4 | }, 5 | run: [ 6 | { 7 | method: "shell.run", 8 | params: { 9 | message: [ 10 | "git clone https://github.com/lllyasviel/FramePack app", 11 | ] 12 | } 13 | }, 14 | { 15 | when: "{{platform === 'win32'}}", 16 | method: "shell.run", 17 | params: { 18 | message: [ 19 | "copy /Y demo_gradio_k.py app", 20 | "copy /Y demo_gradio_f1.py app" 21 | ] 22 | }, 23 | }, 24 | { 25 | when: "{{platform !== 'win32'}}", 26 | method: "shell.run", 27 | params: { 28 | message: [ 29 | "cp -f demo_gradio_k.py app", 30 | "cp -f demo_gradio_f1.py app" 31 | ] 32 | }, 33 | }, 34 | { 35 | method: "script.start", 36 | params: { 37 | uri: "torch.js", 38 | params: { 39 | venv: "env", 40 | path: "app", 41 | // xformers: true, 42 | // triton: true, 43 | // sageattention: true 44 | } 45 | } 46 | }, 47 | { 48 | method: "shell.run", 49 | params: { 50 | venv: "env", 51 | path: "app", 52 | message: [ 53 | "uv pip install gradio devicetorch", 54 | "uv pip install -r requirements.txt", 55 | "uv pip install hf_xet" 56 | ] 57 | } 58 | }, 59 | { 60 | method: 'input', 61 | params: { 62 | title: 'Installation completed', 63 | description: 'Click "Start FramePack (Standard)", "Start FramePack (F1)" or "Start FramePack (Key Frame)" in the left menu to launch the app manually' 64 | } 65 | }, 66 | ] 67 | } 68 | -------------------------------------------------------------------------------- /torch.js: -------------------------------------------------------------------------------- 1 | module.exports = { 2 | run: [ 3 | // windows nvidia 4 | { 5 | "when": 
"{{platform === 'win32' && gpu === 'nvidia'}}", 6 | "method": "shell.run", 7 | "params": { 8 | "venv": "{{args && args.venv ? args.venv : null}}", 9 | "path": "{{args && args.path ? args.path : '.'}}", 10 | "message": [ 11 | "uv pip install torch==2.7.0 torchvision==0.22.0 torchaudio==2.7.0 {{args && args.xformers ? 'xformers==0.0.30' : ''}} --index-url https://download.pytorch.org/whl/cu128 --force-reinstall --no-deps", 12 | "uv pip install triton-windows==3.3.1.post19", 13 | "uv pip install https://github.com/woct0rdho/SageAttention/releases/download/v2.1.1-windows/sageattention-2.1.1+cu128torch2.7.0-cp310-cp310-win_amd64.whl", 14 | ] 15 | } 16 | }, 17 | // windows amd 18 | { 19 | "when": "{{platform === 'win32' && gpu === 'amd'}}", 20 | "method": "shell.run", 21 | "params": { 22 | "venv": "{{args && args.venv ? args.venv : null}}", 23 | "path": "{{args && args.path ? args.path : '.'}}", 24 | "message": "uv pip install torch-directml torchaudio torchvision numpy==1.26.4" 25 | } 26 | }, 27 | // windows cpu 28 | { 29 | "when": "{{platform === 'win32' && (gpu !== 'nvidia' && gpu !== 'amd')}}", 30 | "method": "shell.run", 31 | "params": { 32 | "venv": "{{args && args.venv ? args.venv : null}}", 33 | "path": "{{args && args.path ? args.path : '.'}}", 34 | "message": "uv pip install torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 numpy==1.26.4" 35 | } 36 | }, 37 | // mac 38 | { 39 | "when": "{{platform === 'darwin'}}", 40 | "method": "shell.run", 41 | "params": { 42 | "venv": "{{args && args.venv ? args.venv : null}}", 43 | "path": "{{args && args.path ? args.path : '.'}}", 44 | "message": "uv pip install torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1" 45 | } 46 | }, 47 | // linux nvidia 48 | { 49 | "when": "{{platform === 'linux' && gpu === 'nvidia'}}", 50 | "method": "shell.run", 51 | "params": { 52 | "venv": "{{args && args.venv ? args.venv : null}}", 53 | "path": "{{args && args.path ? args.path : '.'}}", 54 | "message": [ 55 | "uv pip install torch==2.7.0 torchvision==0.22.0 torchaudio==2.7.0 {{args && args.xformers ? 'xformers==0.0.30' : ''}} --index-url https://download.pytorch.org/whl/cu128 --force-reinstall --no-deps", 56 | "uv pip install triton", 57 | "uv pip install git+https://github.com/thu-ml/SageAttention.git", 58 | ] 59 | } 60 | }, 61 | // linux rocm (amd) 62 | { 63 | "when": "{{platform === 'linux' && gpu === 'amd'}}", 64 | "method": "shell.run", 65 | "params": { 66 | "venv": "{{args && args.venv ? args.venv : null}}", 67 | "path": "{{args && args.path ? args.path : '.'}}", 68 | "message": "uv pip install torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 --index-url https://download.pytorch.org/whl/rocm6.2" 69 | } 70 | }, 71 | // linux cpu 72 | { 73 | "when": "{{platform === 'linux' && (gpu !== 'amd' && gpu !=='nvidia')}}", 74 | "method": "shell.run", 75 | "params": { 76 | "venv": "{{args && args.venv ? args.venv : null}}", 77 | "path": "{{args && args.path ? args.path : '.'}}", 78 | "message": "uv pip install torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 --index-url https://download.pytorch.org/whl/cpu" 79 | } 80 | } 81 | ] 82 | } 83 | -------------------------------------------------------------------------------- /pinokio.js: -------------------------------------------------------------------------------- 1 | const path = require('path') 2 | module.exports = { 3 | version: "3.7", 4 | title: "FramePack", 5 | description: "[NVIDIA ONLY] Generate Video Progressively. 
FramePack is a next-frame (next-frame-section) prediction neural network structure that generates videos progressively. https://github.com/lllyasviel/FramePack", 6 | icon: "icon.png", 7 | menu: async (kernel, info) => { 8 | let installed = info.exists("app/env") 9 | let running = { 10 | install: info.running("install.js"), 11 | start: info.running("start.js"), 12 | start_f1: info.running("start_f1.js"), 13 | start_k: info.running("start_k.js"), 14 | update: info.running("update.js"), 15 | reset: info.running("reset.js"), 16 | link: info.running("link.js") 17 | } 18 | if (running.install) { 19 | return [{ 20 | default: true, 21 | icon: "fa-solid fa-plug", 22 | text: "Installing", 23 | href: "install.js", 24 | }] 25 | } else if (installed) { 26 | if (running.start || running.start_f1 || running.start_k) { 27 | const script = running.start ? "start.js" : running.start_f1 ? "start_f1.js" : "start_k.js" 28 | let local = info.local(script) 29 | if (local && local.url) { 30 | return [{ 31 | default: true, 32 | icon: "fa-solid fa-rocket", 33 | text: "Open Web UI", 34 | href: local.url, 35 | }, { 36 | icon: 'fa-solid fa-terminal', 37 | text: "Terminal", 38 | href: script, 39 | }] 40 | } else { 41 | return [{ 42 | default: true, 43 | icon: 'fa-solid fa-terminal', 44 | text: "Terminal", 45 | href: script, 46 | }] 47 | } 48 | } else if (running.update) { 49 | return [{ 50 | default: true, 51 | icon: 'fa-solid fa-terminal', 52 | text: "Updating", 53 | href: "update.js", 54 | }] 55 | } else if (running.reset) { 56 | return [{ 57 | default: true, 58 | icon: 'fa-solid fa-terminal', 59 | text: "Resetting", 60 | href: "reset.js", 61 | }] 62 | } else if (running.link) { 63 | return [{ 64 | default: true, 65 | icon: 'fa-solid fa-terminal', 66 | text: "Deduplicating", 67 | href: "link.js", 68 | }] 69 | } else { 70 | return [{ 71 | icon: "fa-solid fa-power-off", 72 | text: "Start FramePack (Standard)", 73 | href: "start.js", 74 | }, { 75 | icon: "fa-solid fa-power-off", 76 | text: "Start FramePack (F1)", 77 | href: "start_f1.js", 78 | }, { 79 | icon: "fa-solid fa-power-off", 80 | text: "Start FramePack (KeyFrame)", 81 | href: "start_k.js", 82 | }, { 83 | icon: "fa-solid fa-plug", 84 | text: "Update", 85 | href: "update.js", 86 | }, { 87 | icon: "fa-solid fa-plug", 88 | text: "Install", 89 | href: "install.js", 90 | }, { 91 | icon: "fa-solid fa-file-zipper", 92 | text: "
<div><strong>Save Disk Space</strong><div>Deduplicates redundant library files</div></div>
", 93 | href: "link.js", 94 | }, { 95 | icon: "fa-regular fa-circle-xmark", 96 | text: "
<div><strong>Reset</strong><div>Revert to pre-install state</div></div>
", 97 | href: "reset.js", 98 | confirm: "Are you sure you wish to reset the app?" 99 | 100 | }] 101 | } 102 | } else { 103 | return [{ 104 | default: true, 105 | icon: "fa-solid fa-plug", 106 | text: "Install", 107 | href: "install.js", 108 | }] 109 | } 110 | } 111 | } 112 | -------------------------------------------------------------------------------- /demo_gradio_f1.py: -------------------------------------------------------------------------------- 1 | from diffusers_helper.hf_login import login 2 | 3 | import os 4 | 5 | os.environ['HF_HOME'] = os.path.abspath(os.path.realpath(os.path.join(os.path.dirname(__file__), './hf_download'))) 6 | 7 | import gradio as gr 8 | import torch 9 | import traceback 10 | import einops 11 | import safetensors.torch as sf 12 | import numpy as np 13 | import argparse 14 | import math 15 | 16 | from PIL import Image 17 | from diffusers import AutoencoderKLHunyuanVideo 18 | from transformers import LlamaModel, CLIPTextModel, LlamaTokenizerFast, CLIPTokenizer 19 | from diffusers_helper.hunyuan import encode_prompt_conds, vae_decode, vae_encode, vae_decode_fake 20 | from diffusers_helper.utils import save_bcthw_as_mp4, crop_or_pad_yield_mask, soft_append_bcthw, resize_and_center_crop, state_dict_weighted_merge, state_dict_offset_merge, generate_timestamp 21 | from diffusers_helper.models.hunyuan_video_packed import HunyuanVideoTransformer3DModelPacked 22 | from diffusers_helper.pipelines.k_diffusion_hunyuan import sample_hunyuan 23 | from diffusers_helper.memory import cpu, gpu, get_cuda_free_memory_gb, move_model_to_device_with_memory_preservation, offload_model_from_device_for_memory_preservation, fake_diffusers_current_device, DynamicSwapInstaller, unload_complete_models, load_model_as_complete 24 | from diffusers_helper.thread_utils import AsyncStream, async_run 25 | from diffusers_helper.gradio.progress_bar import make_progress_bar_css, make_progress_bar_html 26 | from transformers import SiglipImageProcessor, SiglipVisionModel 27 | from diffusers_helper.clip_vision import hf_clip_vision_encode 28 | from diffusers_helper.bucket_tools import find_nearest_bucket 29 | 30 | 31 | parser = argparse.ArgumentParser() 32 | parser.add_argument('--share', action='store_true') 33 | parser.add_argument("--server", type=str, default='0.0.0.0') 34 | parser.add_argument("--port", type=int, required=False) 35 | parser.add_argument("--inbrowser", action='store_true') 36 | args = parser.parse_args() 37 | 38 | # for win desktop probably use --server 127.0.0.1 --inbrowser 39 | # For linux server probably use --server 127.0.0.1 or do not use any cmd flags 40 | 41 | print(args) 42 | 43 | free_mem_gb = get_cuda_free_memory_gb(gpu) 44 | high_vram = free_mem_gb > 60 45 | 46 | print(f'Free VRAM {free_mem_gb} GB') 47 | print(f'High-VRAM Mode: {high_vram}') 48 | 49 | text_encoder = LlamaModel.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='text_encoder', torch_dtype=torch.float16).cpu() 50 | text_encoder_2 = CLIPTextModel.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='text_encoder_2', torch_dtype=torch.float16).cpu() 51 | tokenizer = LlamaTokenizerFast.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='tokenizer') 52 | tokenizer_2 = CLIPTokenizer.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='tokenizer_2') 53 | vae = AutoencoderKLHunyuanVideo.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='vae', torch_dtype=torch.float16).cpu() 54 | 55 | feature_extractor = 
SiglipImageProcessor.from_pretrained("lllyasviel/flux_redux_bfl", subfolder='feature_extractor') 56 | image_encoder = SiglipVisionModel.from_pretrained("lllyasviel/flux_redux_bfl", subfolder='image_encoder', torch_dtype=torch.float16).cpu() 57 | 58 | transformer = HunyuanVideoTransformer3DModelPacked.from_pretrained('lllyasviel/FramePack_F1_I2V_HY_20250503', torch_dtype=torch.bfloat16).cpu() 59 | 60 | vae.eval() 61 | text_encoder.eval() 62 | text_encoder_2.eval() 63 | image_encoder.eval() 64 | transformer.eval() 65 | 66 | if not high_vram: 67 | vae.enable_slicing() 68 | vae.enable_tiling() 69 | 70 | transformer.high_quality_fp32_output_for_inference = True 71 | print('transformer.high_quality_fp32_output_for_inference = True') 72 | 73 | transformer.to(dtype=torch.bfloat16) 74 | vae.to(dtype=torch.float16) 75 | image_encoder.to(dtype=torch.float16) 76 | text_encoder.to(dtype=torch.float16) 77 | text_encoder_2.to(dtype=torch.float16) 78 | 79 | vae.requires_grad_(False) 80 | text_encoder.requires_grad_(False) 81 | text_encoder_2.requires_grad_(False) 82 | image_encoder.requires_grad_(False) 83 | transformer.requires_grad_(False) 84 | 85 | if not high_vram: 86 | # DynamicSwapInstaller is same as huggingface's enable_sequential_offload but 3x faster 87 | DynamicSwapInstaller.install_model(transformer, device=gpu) 88 | DynamicSwapInstaller.install_model(text_encoder, device=gpu) 89 | else: 90 | text_encoder.to(gpu) 91 | text_encoder_2.to(gpu) 92 | image_encoder.to(gpu) 93 | vae.to(gpu) 94 | transformer.to(gpu) 95 | 96 | stream = AsyncStream() 97 | 98 | outputs_folder = './outputs/' 99 | os.makedirs(outputs_folder, exist_ok=True) 100 | 101 | 102 | @torch.no_grad() 103 | def worker(input_image, prompt, n_prompt, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, mp4_crf): 104 | total_latent_sections = (total_second_length * 30) / (latent_window_size * 4) 105 | total_latent_sections = int(max(round(total_latent_sections), 1)) 106 | 107 | job_id = generate_timestamp() 108 | 109 | stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Starting ...')))) 110 | 111 | try: 112 | # Clean GPU 113 | if not high_vram: 114 | unload_complete_models( 115 | text_encoder, text_encoder_2, image_encoder, vae, transformer 116 | ) 117 | 118 | # Text encoding 119 | 120 | stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Text encoding ...')))) 121 | 122 | if not high_vram: 123 | fake_diffusers_current_device(text_encoder, gpu) # since we only encode one text - that is one model move and one encode, offload is same time consumption since it is also one load and one encode. 
124 | load_model_as_complete(text_encoder_2, target_device=gpu) 125 | 126 | llama_vec, clip_l_pooler = encode_prompt_conds(prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2) 127 | 128 | if cfg == 1: 129 | llama_vec_n, clip_l_pooler_n = torch.zeros_like(llama_vec), torch.zeros_like(clip_l_pooler) 130 | else: 131 | llama_vec_n, clip_l_pooler_n = encode_prompt_conds(n_prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2) 132 | 133 | llama_vec, llama_attention_mask = crop_or_pad_yield_mask(llama_vec, length=512) 134 | llama_vec_n, llama_attention_mask_n = crop_or_pad_yield_mask(llama_vec_n, length=512) 135 | 136 | # Processing input image 137 | 138 | stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Image processing ...')))) 139 | 140 | H, W, C = input_image.shape 141 | height, width = find_nearest_bucket(H, W, resolution=640) 142 | input_image_np = resize_and_center_crop(input_image, target_width=width, target_height=height) 143 | 144 | Image.fromarray(input_image_np).save(os.path.join(outputs_folder, f'{job_id}.png')) 145 | 146 | input_image_pt = torch.from_numpy(input_image_np).float() / 127.5 - 1 147 | input_image_pt = input_image_pt.permute(2, 0, 1)[None, :, None] 148 | 149 | # VAE encoding 150 | 151 | stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'VAE encoding ...')))) 152 | 153 | if not high_vram: 154 | load_model_as_complete(vae, target_device=gpu) 155 | 156 | start_latent = vae_encode(input_image_pt, vae) 157 | 158 | # CLIP Vision 159 | 160 | stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'CLIP Vision encoding ...')))) 161 | 162 | if not high_vram: 163 | load_model_as_complete(image_encoder, target_device=gpu) 164 | 165 | image_encoder_output = hf_clip_vision_encode(input_image_np, feature_extractor, image_encoder) 166 | image_encoder_last_hidden_state = image_encoder_output.last_hidden_state 167 | 168 | # Dtype 169 | 170 | llama_vec = llama_vec.to(transformer.dtype) 171 | llama_vec_n = llama_vec_n.to(transformer.dtype) 172 | clip_l_pooler = clip_l_pooler.to(transformer.dtype) 173 | clip_l_pooler_n = clip_l_pooler_n.to(transformer.dtype) 174 | image_encoder_last_hidden_state = image_encoder_last_hidden_state.to(transformer.dtype) 175 | 176 | # Sampling 177 | 178 | stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Start sampling ...')))) 179 | 180 | rnd = torch.Generator("cpu").manual_seed(seed) 181 | 182 | history_latents = torch.zeros(size=(1, 16, 16 + 2 + 1, height // 8, width // 8), dtype=torch.float32).cpu() 183 | history_pixels = None 184 | 185 | history_latents = torch.cat([history_latents, start_latent.to(history_latents)], dim=2) 186 | total_generated_latent_frames = 1 187 | 188 | for section_index in range(total_latent_sections): 189 | if stream.input_queue.top() == 'end': 190 | stream.output_queue.push(('end', None)) 191 | return 192 | 193 | print(f'section_index = {section_index}, total_latent_sections = {total_latent_sections}') 194 | 195 | if not high_vram: 196 | unload_complete_models() 197 | move_model_to_device_with_memory_preservation(transformer, target_device=gpu, preserved_memory_gb=gpu_memory_preservation) 198 | 199 | if use_teacache: 200 | transformer.initialize_teacache(enable_teacache=True, num_steps=steps) 201 | else: 202 | transformer.initialize_teacache(enable_teacache=False) 203 | 204 | def callback(d): 205 | preview = d['denoised'] 206 | preview = vae_decode_fake(preview) 207 | 208 | preview = (preview * 
255.0).detach().cpu().numpy().clip(0, 255).astype(np.uint8) 209 | preview = einops.rearrange(preview, 'b c t h w -> (b h) (t w) c') 210 | 211 | if stream.input_queue.top() == 'end': 212 | stream.output_queue.push(('end', None)) 213 | raise KeyboardInterrupt('User ends the task.') 214 | 215 | current_step = d['i'] + 1 216 | percentage = int(100.0 * current_step / steps) 217 | hint = f'Sampling {current_step}/{steps}' 218 | desc = f'Total generated frames: {int(max(0, total_generated_latent_frames * 4 - 3))}, Video length: {max(0, (total_generated_latent_frames * 4 - 3) / 30) :.2f} seconds (FPS-30). The video is being extended now ...' 219 | stream.output_queue.push(('progress', (preview, desc, make_progress_bar_html(percentage, hint)))) 220 | return 221 | 222 | indices = torch.arange(0, sum([1, 16, 2, 1, latent_window_size])).unsqueeze(0) 223 | clean_latent_indices_start, clean_latent_4x_indices, clean_latent_2x_indices, clean_latent_1x_indices, latent_indices = indices.split([1, 16, 2, 1, latent_window_size], dim=1) 224 | clean_latent_indices = torch.cat([clean_latent_indices_start, clean_latent_1x_indices], dim=1) 225 | 226 | clean_latents_4x, clean_latents_2x, clean_latents_1x = history_latents[:, :, -sum([16, 2, 1]):, :, :].split([16, 2, 1], dim=2) 227 | clean_latents = torch.cat([start_latent.to(history_latents), clean_latents_1x], dim=2) 228 | 229 | generated_latents = sample_hunyuan( 230 | transformer=transformer, 231 | sampler='unipc', 232 | width=width, 233 | height=height, 234 | frames=latent_window_size * 4 - 3, 235 | real_guidance_scale=cfg, 236 | distilled_guidance_scale=gs, 237 | guidance_rescale=rs, 238 | # shift=3.0, 239 | num_inference_steps=steps, 240 | generator=rnd, 241 | prompt_embeds=llama_vec, 242 | prompt_embeds_mask=llama_attention_mask, 243 | prompt_poolers=clip_l_pooler, 244 | negative_prompt_embeds=llama_vec_n, 245 | negative_prompt_embeds_mask=llama_attention_mask_n, 246 | negative_prompt_poolers=clip_l_pooler_n, 247 | device=gpu, 248 | dtype=torch.bfloat16, 249 | image_embeddings=image_encoder_last_hidden_state, 250 | latent_indices=latent_indices, 251 | clean_latents=clean_latents, 252 | clean_latent_indices=clean_latent_indices, 253 | clean_latents_2x=clean_latents_2x, 254 | clean_latent_2x_indices=clean_latent_2x_indices, 255 | clean_latents_4x=clean_latents_4x, 256 | clean_latent_4x_indices=clean_latent_4x_indices, 257 | callback=callback, 258 | ) 259 | 260 | total_generated_latent_frames += int(generated_latents.shape[2]) 261 | history_latents = torch.cat([history_latents, generated_latents.to(history_latents)], dim=2) 262 | 263 | if not high_vram: 264 | offload_model_from_device_for_memory_preservation(transformer, target_device=gpu, preserved_memory_gb=8) 265 | load_model_as_complete(vae, target_device=gpu) 266 | 267 | real_history_latents = history_latents[:, :, -total_generated_latent_frames:, :, :] 268 | 269 | if history_pixels is None: 270 | history_pixels = vae_decode(real_history_latents, vae).cpu() 271 | else: 272 | section_latent_frames = latent_window_size * 2 273 | overlapped_frames = latent_window_size * 4 - 3 274 | 275 | current_pixels = vae_decode(real_history_latents[:, :, -section_latent_frames:], vae).cpu() 276 | history_pixels = soft_append_bcthw(history_pixels, current_pixels, overlapped_frames) 277 | 278 | if not high_vram: 279 | unload_complete_models() 280 | 281 | output_filename = os.path.join(outputs_folder, f'{job_id}_{total_generated_latent_frames}.mp4') 282 | 283 | save_bcthw_as_mp4(history_pixels, output_filename, fps=30) 284 | 
285 | print(f'Decoded. Current latent shape {real_history_latents.shape}; pixel shape {history_pixels.shape}') 286 | 287 | stream.output_queue.push(('file', output_filename)) 288 | except: 289 | traceback.print_exc() 290 | 291 | if not high_vram: 292 | unload_complete_models( 293 | text_encoder, text_encoder_2, image_encoder, vae, transformer 294 | ) 295 | 296 | stream.output_queue.push(('end', None)) 297 | return 298 | 299 | 300 | def process(input_image, prompt, n_prompt, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, mp4_crf): 301 | global stream 302 | assert input_image is not None, 'No input image!' 303 | 304 | yield None, None, '', '', gr.update(interactive=False), gr.update(interactive=True) 305 | 306 | stream = AsyncStream() 307 | 308 | async_run(worker, input_image, prompt, n_prompt, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, mp4_crf) 309 | 310 | output_filename = None 311 | 312 | while True: 313 | flag, data = stream.output_queue.next() 314 | 315 | if flag == 'file': 316 | output_filename = data 317 | yield output_filename, gr.update(), gr.update(), gr.update(), gr.update(interactive=False), gr.update(interactive=True) 318 | 319 | if flag == 'progress': 320 | preview, desc, html = data 321 | yield gr.update(), gr.update(visible=True, value=preview), desc, html, gr.update(interactive=False), gr.update(interactive=True) 322 | 323 | if flag == 'end': 324 | yield output_filename, gr.update(visible=False), gr.update(), '', gr.update(interactive=True), gr.update(interactive=False) 325 | break 326 | 327 | 328 | def end_process(): 329 | stream.input_queue.push('end') 330 | 331 | 332 | quick_prompts = [ 333 | 'The girl dances gracefully, with clear movements, full of charm.', 334 | 'A character doing some simple body movements.', 335 | ] 336 | quick_prompts = [[x] for x in quick_prompts] 337 | 338 | 339 | css = make_progress_bar_css() 340 | block = gr.Blocks(css=css).queue() 341 | with block: 342 | gr.Markdown('# FramePack-F1') 343 | with gr.Row(): 344 | with gr.Column(): 345 | input_image = gr.Image(sources='upload', type="numpy", label="Image", height=320) 346 | prompt = gr.Textbox(label="Prompt", value='') 347 | example_quick_prompts = gr.Dataset(samples=quick_prompts, label='Quick List', samples_per_page=1000, components=[prompt]) 348 | example_quick_prompts.click(lambda x: x[0], inputs=[example_quick_prompts], outputs=prompt, show_progress=False, queue=False) 349 | 350 | with gr.Row(): 351 | start_button = gr.Button(value="Start Generation") 352 | end_button = gr.Button(value="End Generation", interactive=False) 353 | 354 | with gr.Group(): 355 | use_teacache = gr.Checkbox(label='Use TeaCache', value=True, info='Faster speed, but often makes hands and fingers slightly worse.') 356 | 357 | n_prompt = gr.Textbox(label="Negative Prompt", value="", visible=False) # Not used 358 | seed = gr.Number(label="Seed", value=31337, precision=0) 359 | 360 | total_second_length = gr.Slider(label="Total Video Length (Seconds)", minimum=1, maximum=120, value=5, step=0.1) 361 | latent_window_size = gr.Slider(label="Latent Window Size", minimum=1, maximum=33, value=9, step=1, visible=False) # Should not change 362 | steps = gr.Slider(label="Steps", minimum=1, maximum=100, value=25, step=1, info='Changing this value is not recommended.') 363 | 364 | cfg = gr.Slider(label="CFG Scale", minimum=1.0, maximum=32.0, value=1.0, step=0.01, visible=False) # Should not change 365 | gs = 
gr.Slider(label="Distilled CFG Scale", minimum=1.0, maximum=32.0, value=10.0, step=0.01, info='Changing this value is not recommended.') 366 | rs = gr.Slider(label="CFG Re-Scale", minimum=0.0, maximum=1.0, value=0.0, step=0.01, visible=False) # Should not change 367 | 368 | gpu_memory_preservation = gr.Slider(label="GPU Inference Preserved Memory (GB) (larger means slower)", minimum=6, maximum=128, value=6, step=0.1, info="Set this number to a larger value if you encounter OOM. Larger value causes slower speed.") 369 | 370 | mp4_crf = gr.Slider(label="MP4 Compression", minimum=0, maximum=100, value=16, step=1, info="Lower means better quality. 0 is uncompressed. Change to 16 if you get black outputs. ") 371 | 372 | with gr.Column(): 373 | preview_image = gr.Image(label="Next Latents", height=200, visible=False) 374 | result_video = gr.Video(label="Finished Frames", autoplay=True, show_share_button=False, height=512, loop=True) 375 | progress_desc = gr.Markdown('', elem_classes='no-generating-animation') 376 | progress_bar = gr.HTML('', elem_classes='no-generating-animation') 377 | 378 | gr.HTML('
<div style="text-align:center; margin-top:20px;">Share your results and find ideas at the <a href="https://x.com/search?q=framepack&f=live" target="_blank">FramePack Twitter (X) thread</a></div>
') 379 | 380 | ips = [input_image, prompt, n_prompt, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, mp4_crf] 381 | start_button.click(fn=process, inputs=ips, outputs=[result_video, preview_image, progress_desc, progress_bar, start_button, end_button]) 382 | end_button.click(fn=end_process) 383 | 384 | 385 | block.launch( 386 | server_name=args.server, 387 | server_port=args.port, 388 | share=args.share, 389 | inbrowser=args.inbrowser, 390 | ) -------------------------------------------------------------------------------- /demo_gradio_k.py: -------------------------------------------------------------------------------- 1 | from diffusers_helper.hf_login import login 2 | 3 | import os 4 | 5 | os.environ['HF_HOME'] = os.path.abspath(os.path.realpath(os.path.join(os.path.dirname(__file__), './hf_download'))) 6 | 7 | import gradio as gr 8 | import torch 9 | import traceback 10 | import einops 11 | import safetensors.torch as sf 12 | import numpy as np 13 | import argparse 14 | import math 15 | 16 | from PIL import Image 17 | from diffusers import AutoencoderKLHunyuanVideo 18 | from transformers import LlamaModel, CLIPTextModel, LlamaTokenizerFast, CLIPTokenizer 19 | from diffusers_helper.hunyuan import encode_prompt_conds, vae_decode, vae_encode, vae_decode_fake 20 | from diffusers_helper.utils import save_bcthw_as_mp4, crop_or_pad_yield_mask, soft_append_bcthw, resize_and_center_crop, state_dict_weighted_merge, state_dict_offset_merge, generate_timestamp 21 | from diffusers_helper.models.hunyuan_video_packed import HunyuanVideoTransformer3DModelPacked 22 | from diffusers_helper.pipelines.k_diffusion_hunyuan import sample_hunyuan 23 | from diffusers_helper.memory import cpu, gpu, get_cuda_free_memory_gb, move_model_to_device_with_memory_preservation, offload_model_from_device_for_memory_preservation, fake_diffusers_current_device, DynamicSwapInstaller, unload_complete_models, load_model_as_complete 24 | from diffusers_helper.thread_utils import AsyncStream, async_run 25 | from diffusers_helper.gradio.progress_bar import make_progress_bar_css, make_progress_bar_html 26 | from transformers import SiglipImageProcessor, SiglipVisionModel 27 | from diffusers_helper.clip_vision import hf_clip_vision_encode 28 | from diffusers_helper.bucket_tools import find_nearest_bucket 29 | 30 | 31 | parser = argparse.ArgumentParser() 32 | parser.add_argument('--share', action='store_true') 33 | parser.add_argument("--server", type=str, default='0.0.0.0') 34 | parser.add_argument("--port", type=int, required=False) 35 | parser.add_argument("--inbrowser", action='store_true') 36 | args = parser.parse_args() 37 | 38 | # for win desktop probably use --server 127.0.0.1 --inbrowser 39 | # For linux server probably use --server 127.0.0.1 or do not use any cmd flags 40 | print(args) 41 | 42 | free_mem_gb = get_cuda_free_memory_gb(gpu) 43 | high_vram = free_mem_gb > 60 44 | 45 | print(f'Free VRAM {free_mem_gb} GB') 46 | print(f'High-VRAM Mode: {high_vram}') 47 | 48 | text_encoder = LlamaModel.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='text_encoder', torch_dtype=torch.float16).cpu() 49 | text_encoder_2 = CLIPTextModel.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='text_encoder_2', torch_dtype=torch.float16).cpu() 50 | tokenizer = LlamaTokenizerFast.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='tokenizer') 51 | tokenizer_2 = CLIPTokenizer.from_pretrained("hunyuanvideo-community/HunyuanVideo", 
subfolder='tokenizer_2') 52 | vae = AutoencoderKLHunyuanVideo.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='vae', torch_dtype=torch.float16).cpu() 53 | 54 | feature_extractor = SiglipImageProcessor.from_pretrained("lllyasviel/flux_redux_bfl", subfolder='feature_extractor') 55 | image_encoder = SiglipVisionModel.from_pretrained("lllyasviel/flux_redux_bfl", subfolder='image_encoder', torch_dtype=torch.float16).cpu() 56 | 57 | transformer = HunyuanVideoTransformer3DModelPacked.from_pretrained('lllyasviel/FramePackI2V_HY', torch_dtype=torch.bfloat16).cpu() 58 | 59 | vae.eval() 60 | text_encoder.eval() 61 | text_encoder_2.eval() 62 | image_encoder.eval() 63 | transformer.eval() 64 | 65 | if not high_vram: 66 | vae.enable_slicing() 67 | vae.enable_tiling() 68 | 69 | transformer.high_quality_fp32_output_for_inference = True 70 | print('transformer.high_quality_fp32_output_for_inference = True') 71 | 72 | transformer.to(dtype=torch.bfloat16) 73 | vae.to(dtype=torch.float16) 74 | image_encoder.to(dtype=torch.float16) 75 | text_encoder.to(dtype=torch.float16) 76 | text_encoder_2.to(dtype=torch.float16) 77 | 78 | vae.requires_grad_(False) 79 | text_encoder.requires_grad_(False) 80 | text_encoder_2.requires_grad_(False) 81 | image_encoder.requires_grad_(False) 82 | transformer.requires_grad_(False) 83 | 84 | if not high_vram: 85 | # DynamicSwapInstaller is same as huggingface's enable_sequential_offload but 3x faster 86 | DynamicSwapInstaller.install_model(transformer, device=gpu) 87 | DynamicSwapInstaller.install_model(text_encoder, device=gpu) 88 | else: 89 | text_encoder.to(gpu) 90 | text_encoder_2.to(gpu) 91 | image_encoder.to(gpu) 92 | vae.to(gpu) 93 | transformer.to(gpu) 94 | 95 | stream = AsyncStream() 96 | 97 | outputs_folder = './outputs/' 98 | os.makedirs(outputs_folder, exist_ok=True) 99 | 100 | 101 | @torch.no_grad() 102 | def worker(input_image, end_image, prompt, n_prompt, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, mp4_crf): 103 | total_latent_sections = (total_second_length * 30) / (latent_window_size * 4) 104 | total_latent_sections = int(max(round(total_latent_sections), 1)) 105 | 106 | job_id = generate_timestamp() 107 | 108 | stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Starting ...')))) 109 | 110 | try: 111 | # Clean GPU 112 | if not high_vram: 113 | unload_complete_models( 114 | text_encoder, text_encoder_2, image_encoder, vae, transformer 115 | ) 116 | 117 | # Text encoding 118 | 119 | stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Text encoding ...')))) 120 | 121 | if not high_vram: 122 | fake_diffusers_current_device(text_encoder, gpu) # since we only encode one text - that is one model move and one encode, offload is same time consumption since it is also one load and one encode. 
123 | load_model_as_complete(text_encoder_2, target_device=gpu) 124 | 125 | llama_vec, clip_l_pooler = encode_prompt_conds(prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2) 126 | 127 | if cfg == 1: 128 | llama_vec_n, clip_l_pooler_n = torch.zeros_like(llama_vec), torch.zeros_like(clip_l_pooler) 129 | else: 130 | llama_vec_n, clip_l_pooler_n = encode_prompt_conds(n_prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2) 131 | 132 | llama_vec, llama_attention_mask = crop_or_pad_yield_mask(llama_vec, length=512) 133 | llama_vec_n, llama_attention_mask_n = crop_or_pad_yield_mask(llama_vec_n, length=512) 134 | 135 | # Processing input image (start frame) 136 | stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Processing start frame ...')))) 137 | 138 | H, W, C = input_image.shape 139 | height, width = find_nearest_bucket(H, W, resolution=640) 140 | input_image_np = resize_and_center_crop(input_image, target_width=width, target_height=height) 141 | 142 | Image.fromarray(input_image_np).save(os.path.join(outputs_folder, f'{job_id}_start.png')) 143 | 144 | input_image_pt = torch.from_numpy(input_image_np).float() / 127.5 - 1 145 | input_image_pt = input_image_pt.permute(2, 0, 1)[None, :, None] 146 | 147 | # Processing end image (if provided) 148 | has_end_image = end_image is not None 149 | if has_end_image: 150 | stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Processing end frame ...')))) 151 | 152 | H_end, W_end, C_end = end_image.shape 153 | end_image_np = resize_and_center_crop(end_image, target_width=width, target_height=height) 154 | 155 | Image.fromarray(end_image_np).save(os.path.join(outputs_folder, f'{job_id}_end.png')) 156 | 157 | end_image_pt = torch.from_numpy(end_image_np).float() / 127.5 - 1 158 | end_image_pt = end_image_pt.permute(2, 0, 1)[None, :, None] 159 | 160 | # VAE encoding 161 | stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'VAE encoding ...')))) 162 | 163 | if not high_vram: 164 | load_model_as_complete(vae, target_device=gpu) 165 | 166 | start_latent = vae_encode(input_image_pt, vae) 167 | 168 | if has_end_image: 169 | end_latent = vae_encode(end_image_pt, vae) 170 | 171 | # CLIP Vision 172 | stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'CLIP Vision encoding ...')))) 173 | 174 | if not high_vram: 175 | load_model_as_complete(image_encoder, target_device=gpu) 176 | 177 | image_encoder_output = hf_clip_vision_encode(input_image_np, feature_extractor, image_encoder) 178 | image_encoder_last_hidden_state = image_encoder_output.last_hidden_state 179 | 180 | if has_end_image: 181 | end_image_encoder_output = hf_clip_vision_encode(end_image_np, feature_extractor, image_encoder) 182 | end_image_encoder_last_hidden_state = end_image_encoder_output.last_hidden_state 183 | # Combine both image embeddings or use a weighted approach 184 | image_encoder_last_hidden_state = (image_encoder_last_hidden_state + end_image_encoder_last_hidden_state) / 2 185 | 186 | # Dtype 187 | llama_vec = llama_vec.to(transformer.dtype) 188 | llama_vec_n = llama_vec_n.to(transformer.dtype) 189 | clip_l_pooler = clip_l_pooler.to(transformer.dtype) 190 | clip_l_pooler_n = clip_l_pooler_n.to(transformer.dtype) 191 | image_encoder_last_hidden_state = image_encoder_last_hidden_state.to(transformer.dtype) 192 | 193 | # Sampling 194 | stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Start sampling ...')))) 195 | 196 | rnd = 
torch.Generator("cpu").manual_seed(seed) 197 | num_frames = latent_window_size * 4 - 3 198 | 199 | history_latents = torch.zeros(size=(1, 16, 1 + 2 + 16, height // 8, width // 8), dtype=torch.float32).cpu() 200 | history_pixels = None 201 | total_generated_latent_frames = 0 202 | 203 | # 将迭代器转换为列表 204 | latent_paddings = list(reversed(range(total_latent_sections))) 205 | 206 | if total_latent_sections > 4: 207 | # In theory the latent_paddings should follow the above sequence, but it seems that duplicating some 208 | # items looks better than expanding it when total_latent_sections > 4 209 | # One can try to remove below trick and just 210 | # use `latent_paddings = list(reversed(range(total_latent_sections)))` to compare 211 | latent_paddings = [3] + [2] * (total_latent_sections - 3) + [1, 0] 212 | 213 | for latent_padding in latent_paddings: 214 | is_last_section = latent_padding == 0 215 | is_first_section = latent_padding == latent_paddings[0] 216 | latent_padding_size = latent_padding * latent_window_size 217 | 218 | if stream.input_queue.top() == 'end': 219 | stream.output_queue.push(('end', None)) 220 | return 221 | 222 | print(f'latent_padding_size = {latent_padding_size}, is_last_section = {is_last_section}, is_first_section = {is_first_section}') 223 | 224 | indices = torch.arange(0, sum([1, latent_padding_size, latent_window_size, 1, 2, 16])).unsqueeze(0) 225 | clean_latent_indices_pre, blank_indices, latent_indices, clean_latent_indices_post, clean_latent_2x_indices, clean_latent_4x_indices = indices.split([1, latent_padding_size, latent_window_size, 1, 2, 16], dim=1) 226 | clean_latent_indices = torch.cat([clean_latent_indices_pre, clean_latent_indices_post], dim=1) 227 | 228 | clean_latents_pre = start_latent.to(history_latents) 229 | clean_latents_post, clean_latents_2x, clean_latents_4x = history_latents[:, :, :1 + 2 + 16, :, :].split([1, 2, 16], dim=2) 230 | clean_latents = torch.cat([clean_latents_pre, clean_latents_post], dim=2) 231 | 232 | # Use end image latent for the first section if provided 233 | if has_end_image and is_first_section: 234 | clean_latents_post = end_latent.to(history_latents) 235 | clean_latents = torch.cat([clean_latents_pre, clean_latents_post], dim=2) 236 | 237 | if not high_vram: 238 | unload_complete_models() 239 | move_model_to_device_with_memory_preservation(transformer, target_device=gpu, preserved_memory_gb=gpu_memory_preservation) 240 | 241 | if use_teacache: 242 | transformer.initialize_teacache(enable_teacache=True, num_steps=steps) 243 | else: 244 | transformer.initialize_teacache(enable_teacache=False) 245 | 246 | def callback(d): 247 | preview = d['denoised'] 248 | preview = vae_decode_fake(preview) 249 | 250 | preview = (preview * 255.0).detach().cpu().numpy().clip(0, 255).astype(np.uint8) 251 | preview = einops.rearrange(preview, 'b c t h w -> (b h) (t w) c') 252 | 253 | if stream.input_queue.top() == 'end': 254 | stream.output_queue.push(('end', None)) 255 | raise KeyboardInterrupt('User ends the task.') 256 | 257 | current_step = d['i'] + 1 258 | percentage = int(100.0 * current_step / steps) 259 | hint = f'Sampling {current_step}/{steps}' 260 | desc = f'Total generated frames: {int(max(0, total_generated_latent_frames * 4 - 3))}, Video length: {max(0, (total_generated_latent_frames * 4 - 3) / 30) :.2f} seconds (FPS-30). The video is being extended now ...' 
261 | stream.output_queue.push(('progress', (preview, desc, make_progress_bar_html(percentage, hint)))) 262 | return 263 | 264 | generated_latents = sample_hunyuan( 265 | transformer=transformer, 266 | sampler='unipc', 267 | width=width, 268 | height=height, 269 | frames=num_frames, 270 | real_guidance_scale=cfg, 271 | distilled_guidance_scale=gs, 272 | guidance_rescale=rs, 273 | # shift=3.0, 274 | num_inference_steps=steps, 275 | generator=rnd, 276 | prompt_embeds=llama_vec, 277 | prompt_embeds_mask=llama_attention_mask, 278 | prompt_poolers=clip_l_pooler, 279 | negative_prompt_embeds=llama_vec_n, 280 | negative_prompt_embeds_mask=llama_attention_mask_n, 281 | negative_prompt_poolers=clip_l_pooler_n, 282 | device=gpu, 283 | dtype=torch.bfloat16, 284 | image_embeddings=image_encoder_last_hidden_state, 285 | latent_indices=latent_indices, 286 | clean_latents=clean_latents, 287 | clean_latent_indices=clean_latent_indices, 288 | clean_latents_2x=clean_latents_2x, 289 | clean_latent_2x_indices=clean_latent_2x_indices, 290 | clean_latents_4x=clean_latents_4x, 291 | clean_latent_4x_indices=clean_latent_4x_indices, 292 | callback=callback, 293 | ) 294 | 295 | if is_last_section: 296 | generated_latents = torch.cat([start_latent.to(generated_latents), generated_latents], dim=2) 297 | 298 | total_generated_latent_frames += int(generated_latents.shape[2]) 299 | history_latents = torch.cat([generated_latents.to(history_latents), history_latents], dim=2) 300 | 301 | if not high_vram: 302 | offload_model_from_device_for_memory_preservation(transformer, target_device=gpu, preserved_memory_gb=8) 303 | load_model_as_complete(vae, target_device=gpu) 304 | 305 | real_history_latents = history_latents[:, :, :total_generated_latent_frames, :, :] 306 | 307 | if history_pixels is None: 308 | history_pixels = vae_decode(real_history_latents, vae).cpu() 309 | else: 310 | section_latent_frames = (latent_window_size * 2 + 1) if is_last_section else (latent_window_size * 2) 311 | overlapped_frames = latent_window_size * 4 - 3 312 | 313 | current_pixels = vae_decode(real_history_latents[:, :, :section_latent_frames], vae).cpu() 314 | history_pixels = soft_append_bcthw(current_pixels, history_pixels, overlapped_frames) 315 | 316 | if not high_vram: 317 | unload_complete_models() 318 | 319 | output_filename = os.path.join(outputs_folder, f'{job_id}_{total_generated_latent_frames}.mp4') 320 | 321 | save_bcthw_as_mp4(history_pixels, output_filename, fps=30) 322 | 323 | print(f'Decoded. Current latent shape {real_history_latents.shape}; pixel shape {history_pixels.shape}') 324 | 325 | stream.output_queue.push(('file', output_filename)) 326 | 327 | if is_last_section: 328 | break 329 | except: 330 | traceback.print_exc() 331 | 332 | if not high_vram: 333 | unload_complete_models( 334 | text_encoder, text_encoder_2, image_encoder, vae, transformer 335 | ) 336 | 337 | stream.output_queue.push(('end', None)) 338 | return 339 | 340 | 341 | def process(input_image, end_image, prompt, n_prompt, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, mp4_crf): 342 | global stream 343 | assert input_image is not None, 'No input image!' 
344 | 345 | yield None, None, '', '', gr.update(interactive=False), gr.update(interactive=True) 346 | 347 | stream = AsyncStream() 348 | 349 | async_run(worker, input_image, end_image, prompt, n_prompt, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, mp4_crf) 350 | 351 | output_filename = None 352 | 353 | while True: 354 | flag, data = stream.output_queue.next() 355 | 356 | if flag == 'file': 357 | output_filename = data 358 | yield output_filename, gr.update(), gr.update(), gr.update(), gr.update(interactive=False), gr.update(interactive=True) 359 | 360 | if flag == 'progress': 361 | preview, desc, html = data 362 | yield gr.update(), gr.update(visible=True, value=preview), desc, html, gr.update(interactive=False), gr.update(interactive=True) 363 | 364 | if flag == 'end': 365 | yield output_filename, gr.update(visible=False), gr.update(), '', gr.update(interactive=True), gr.update(interactive=False) 366 | break 367 | 368 | 369 | def end_process(): 370 | stream.input_queue.push('end') 371 | 372 | 373 | quick_prompts = [ 374 | 'The girl dances gracefully, with clear movements, full of charm.', 375 | 'A character doing some simple body movements.', 376 | ] 377 | quick_prompts = [[x] for x in quick_prompts] 378 | 379 | 380 | css = make_progress_bar_css() 381 | block = gr.Blocks(css=css).queue() 382 | with block: 383 | gr.Markdown('# FramePack') 384 | with gr.Row(): 385 | with gr.Column(): 386 | with gr.Row(): 387 | with gr.Column(): 388 | input_image = gr.Image(sources='upload', type="numpy", label="Start Frame", height=320) 389 | with gr.Column(): 390 | end_image = gr.Image(sources='upload', type="numpy", label="End Frame (Optional)", height=320) 391 | 392 | prompt = gr.Textbox(label="Prompt", value='') 393 | example_quick_prompts = gr.Dataset(samples=quick_prompts, label='Quick List', samples_per_page=1000, components=[prompt]) 394 | example_quick_prompts.click(lambda x: x[0], inputs=[example_quick_prompts], outputs=prompt, show_progress=False, queue=False) 395 | 396 | with gr.Row(): 397 | start_button = gr.Button(value="Start Generation") 398 | end_button = gr.Button(value="End Generation", interactive=False) 399 | 400 | with gr.Group(): 401 | use_teacache = gr.Checkbox(label='Use TeaCache', value=True, info='Faster speed, but often makes hands and fingers slightly worse.') 402 | 403 | n_prompt = gr.Textbox(label="Negative Prompt", value="", visible=False) # Not used 404 | seed = gr.Number(label="Seed", value=31337, precision=0) 405 | 406 | total_second_length = gr.Slider(label="Total Video Length (Seconds)", minimum=1, maximum=120, value=5, step=0.1) 407 | latent_window_size = gr.Slider(label="Latent Window Size", minimum=1, maximum=33, value=9, step=1, visible=False) # Should not change 408 | steps = gr.Slider(label="Steps", minimum=1, maximum=100, value=25, step=1, info='Changing this value is not recommended.') 409 | 410 | cfg = gr.Slider(label="CFG Scale", minimum=1.0, maximum=32.0, value=1.0, step=0.01, visible=False) # Should not change 411 | gs = gr.Slider(label="Distilled CFG Scale", minimum=1.0, maximum=32.0, value=10.0, step=0.01, info='Changing this value is not recommended.') 412 | rs = gr.Slider(label="CFG Re-Scale", minimum=0.0, maximum=1.0, value=0.0, step=0.01, visible=False) # Should not change 413 | 414 | gpu_memory_preservation = gr.Slider(label="GPU Inference Preserved Memory (GB) (larger means slower)", minimum=6, maximum=128, value=6, step=0.1, info="Set this number to a larger value if you encounter OOM. 
Larger value causes slower speed.") 415 | 416 | mp4_crf = gr.Slider(label="MP4 Compression", minimum=0, maximum=100, value=16, step=1, info="Lower means better quality. 0 is uncompressed. Change to 16 if you get black outputs. ") 417 | 418 | with gr.Column(): 419 | preview_image = gr.Image(label="Next Latents", height=200, visible=False) 420 | result_video = gr.Video(label="Finished Frames", autoplay=True, show_share_button=False, height=512, loop=True) 421 | gr.Markdown('When using only a start frame, the ending actions will be generated before the starting actions due to the inverted sampling. If using both start and end frames, the model will try to create a smooth transition between them.') 422 | progress_desc = gr.Markdown('', elem_classes='no-generating-animation') 423 | progress_bar = gr.HTML('', elem_classes='no-generating-animation') 424 | 425 | gr.HTML('
<div style="text-align:center; margin-top:20px;">Share your results and find ideas at the <a href="https://x.com/search?q=framepack&f=live" target="_blank">FramePack Twitter (X) thread</a></div>
') 426 | 427 | ips = [input_image, end_image, prompt, n_prompt, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, mp4_crf] 428 | start_button.click(fn=process, inputs=ips, outputs=[result_video, preview_image, progress_desc, progress_bar, start_button, end_button]) 429 | end_button.click(fn=end_process) 430 | 431 | 432 | block.launch( 433 | server_name=args.server, 434 | server_port=args.port, 435 | share=args.share, 436 | inbrowser=args.inbrowser, 437 | ) 438 | --------------------------------------------------------------------------------