├── icon.png
├── README.md
├── reset.js
├── link.js
├── update.js
├── start.js
├── start_f1.js
├── start_k.js
├── pinokio_meta.json
├── install.js
├── torch.js
├── pinokio.js
├── demo_gradio_f1.py
└── demo_gradio_k.py
/icon.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pinokiofactory/Frame-Pack/HEAD/icon.png
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # FramePack
2 |
3 | A Pinokio script for https://github.com/lllyasviel/FramePack
4 |
5 |
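6 | ## Launch scripts
7 |
8 | The `start.js`, `start_f1.js` and `start_k.js` scripts each run the matching Gradio demo from the `app` folder inside the `env` virtual environment, on a port that Pinokio assigns at runtime. They are roughly equivalent to the commands below (the port value is only illustrative):
9 |
10 | ```sh
11 | python demo_gradio.py --server 127.0.0.1 --port 7860     # Standard (start.js)
12 | python demo_gradio_f1.py --server 127.0.0.1 --port 7860  # F1 (start_f1.js)
13 | python demo_gradio_k.py --server 127.0.0.1 --port 7860   # Key Frame (start_k.js)
14 | ```
15 |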
--------------------------------------------------------------------------------
/reset.js:
--------------------------------------------------------------------------------
1 | module.exports = {
2 | run: [{
3 | method: "fs.rm",
4 | params: {
5 | path: "app"
6 | }
7 | }]
8 | }
9 |
--------------------------------------------------------------------------------
/link.js:
--------------------------------------------------------------------------------
1 | module.exports = {
2 | run: [
3 | {
4 | method: "fs.link",
5 | params: {
6 | venv: "app/env"
7 | }
8 | }
9 | ]
10 | }
11 |
--------------------------------------------------------------------------------
/update.js:
--------------------------------------------------------------------------------
1 | module.exports = {
2 | run: [{
3 | method: "shell.run",
4 | params: {
5 | message: "git pull"
6 | }
7 | }, {
8 | method: "shell.run",
9 | params: {
10 | path: "app",
11 | message: "git pull"
12 | }
13 | }]
14 | }
15 |
--------------------------------------------------------------------------------
/start.js:
--------------------------------------------------------------------------------
1 | module.exports = async (kernel) => {
2 | const port = await kernel.port()
3 |
4 | console.log("Starting standard version directly")
5 | const scriptToRun = "demo_gradio.py"
6 |
7 | return {
8 | requires: {
9 | bundle: "ai",
10 | },
11 | daemon: true,
12 | run: [
13 | {
14 | method: "shell.run",
15 | params: {
16 | venv: "env",
17 | env: { },
18 | path: "app",
19 | message: [
20 | `python ${scriptToRun} --server 127.0.0.1 --port ${port}`,
21 | ],
22 | on: [{
23 | "event": "/http:\/\/[0-9.:]+/",
24 | "done": true
25 | }]
26 | }
27 | },
28 | {
29 | method: "local.set",
30 | params: {
31 | url: "{{input.event[0]}}"
32 | }
33 | }
34 | ]
35 | };
36 | };
37 |
--------------------------------------------------------------------------------
/start_f1.js:
--------------------------------------------------------------------------------
1 | module.exports = async (kernel) => {
2 | const port = await kernel.port()
3 |
4 | console.log("Starting F1 version directly")
5 | const scriptToRun = "demo_gradio_f1.py"
6 |
7 | return {
8 | requires: {
9 | bundle: "ai",
10 | },
11 | daemon: true,
12 | run: [
13 | {
14 | method: "shell.run",
15 | params: {
16 | venv: "env",
17 | env: { },
18 | path: "app",
19 | message: [
20 | `python ${scriptToRun} --server 127.0.0.1 --port ${port}`,
21 | ],
22 | on: [{
23 | "event": "/http:\/\/\\S+/",
24 | "done": true
25 | }]
26 | }
27 | },
28 | {
29 | method: "local.set",
30 | params: {
31 | url: "{{input.event[0]}}"
32 | }
33 | }
34 | ]
35 | };
36 |
37 | };
38 |
--------------------------------------------------------------------------------
/start_k.js:
--------------------------------------------------------------------------------
1 | module.exports = async (kernel) => {
2 | const port = await kernel.port()
3 |
4 |   console.log("Starting Key Frame version directly")
5 | const scriptToRun = "demo_gradio_k.py"
6 |
7 | return {
8 | requires: {
9 | bundle: "ai",
10 | },
11 | daemon: true,
12 | run: [
13 | {
14 | method: "shell.run",
15 | params: {
16 | venv: "env",
17 | env: { },
18 | path: "app",
19 | message: [
20 | `python ${scriptToRun} --server 127.0.0.1 --port ${port}`,
21 | ],
22 | on: [{
23 | "event": "/http:\/\/[0-9.:]+/",
24 | "done": true
25 | }]
26 | }
27 | },
28 | {
29 | method: "local.set",
30 | params: {
31 | url: "{{input.event[0]}}"
32 | }
33 | }
34 | ]
35 | };
36 | };
37 |
--------------------------------------------------------------------------------
/pinokio_meta.json:
--------------------------------------------------------------------------------
1 | {
2 | "posts": [
3 | "https://x.com/cocktailpeanut/status/1913004350970159464",
4 | "https://x.com/cocktailpeanut/status/1912977698642907614",
5 | "https://x.com/joesparks/status/1912986750282272861",
6 | "https://x.com/matze2001/status/1912989768406679901",
7 | "https://x.com/cocktailpeanut/status/1912998525274853592",
8 | "https://x.com/cocktailpeanut/status/1913003111050018864",
9 | "https://x.com/SUP3RMASS1VE/status/1912837447525822592",
10 | "https://x.com/SUP3RMASS1VE/status/1912842398654210420",
11 | "https://x.com/SUP3RMASS1VE/status/1912975317896921253"
12 | ],
13 | "links": [{
14 | "title": "SUP3RMASS1VE (Wrote the launcher)",
15 | "links": [{
16 | "type": "bitcoin",
17 | "value": "1N942jHr6vVuR2KAe2JEf3nN59eR21tpKv"
18 | }, {
19 | "title": "X",
20 | "value": "https://x.com/SUP3RMASS1VE"
21 | }, {
22 | "title": "Github",
23 | "value": "https://github.com/SUP3RMASS1VE"
24 | }, {
25 | "title": "Discord",
26 | "value": "https://discord.gg/mvDcrA57AQ"
27 | }]
28 | }]
29 | }
30 |
--------------------------------------------------------------------------------
/install.js:
--------------------------------------------------------------------------------
1 | module.exports = {
2 | requires: {
3 | bundle: "ai",
4 | },
5 | run: [
6 | {
7 | method: "shell.run",
8 | params: {
9 | message: [
10 | "git clone https://github.com/lllyasviel/FramePack app",
11 | ]
12 | }
13 | },
14 | {
15 | when: "{{platform === 'win32'}}",
16 | method: "shell.run",
17 | params: {
18 | message: [
19 | "copy /Y demo_gradio_k.py app",
20 | "copy /Y demo_gradio_f1.py app"
21 | ]
22 | },
23 | },
24 | {
25 | when: "{{platform !== 'win32'}}",
26 | method: "shell.run",
27 | params: {
28 | message: [
29 | "cp -f demo_gradio_k.py app",
30 | "cp -f demo_gradio_f1.py app"
31 | ]
32 | },
33 | },
34 | {
35 | method: "script.start",
36 | params: {
37 | uri: "torch.js",
38 | params: {
39 | venv: "env",
40 | path: "app",
41 | // xformers: true,
42 | // triton: true,
43 | // sageattention: true
44 | }
45 | }
46 | },
47 | {
48 | method: "shell.run",
49 | params: {
50 | venv: "env",
51 | path: "app",
52 | message: [
53 | "uv pip install gradio devicetorch",
54 | "uv pip install -r requirements.txt",
55 | "uv pip install hf_xet"
56 | ]
57 | }
58 | },
59 | {
60 | method: 'input',
61 | params: {
62 | title: 'Installation completed',
63 | description: 'Click "Start FramePack (Standard)", "Start FramePack (F1)" or "Start FramePack (Key Frame)" in the left menu to launch the app manually'
64 | }
65 | },
66 | ]
67 | }
68 |
--------------------------------------------------------------------------------
/torch.js:
--------------------------------------------------------------------------------
1 | module.exports = {
2 | run: [
3 | // windows nvidia
4 | {
5 | "when": "{{platform === 'win32' && gpu === 'nvidia'}}",
6 | "method": "shell.run",
7 | "params": {
8 | "venv": "{{args && args.venv ? args.venv : null}}",
9 | "path": "{{args && args.path ? args.path : '.'}}",
10 | "message": [
11 | "uv pip install torch==2.7.0 torchvision==0.22.0 torchaudio==2.7.0 {{args && args.xformers ? 'xformers==0.0.30' : ''}} --index-url https://download.pytorch.org/whl/cu128 --force-reinstall --no-deps",
12 | "uv pip install triton-windows==3.3.1.post19",
13 | "uv pip install https://github.com/woct0rdho/SageAttention/releases/download/v2.1.1-windows/sageattention-2.1.1+cu128torch2.7.0-cp310-cp310-win_amd64.whl",
14 | ]
15 | }
16 | },
17 | // windows amd
18 | {
19 | "when": "{{platform === 'win32' && gpu === 'amd'}}",
20 | "method": "shell.run",
21 | "params": {
22 | "venv": "{{args && args.venv ? args.venv : null}}",
23 | "path": "{{args && args.path ? args.path : '.'}}",
24 | "message": "uv pip install torch-directml torchaudio torchvision numpy==1.26.4"
25 | }
26 | },
27 | // windows cpu
28 | {
29 | "when": "{{platform === 'win32' && (gpu !== 'nvidia' && gpu !== 'amd')}}",
30 | "method": "shell.run",
31 | "params": {
32 | "venv": "{{args && args.venv ? args.venv : null}}",
33 | "path": "{{args && args.path ? args.path : '.'}}",
34 | "message": "uv pip install torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 numpy==1.26.4"
35 | }
36 | },
37 | // mac
38 | {
39 | "when": "{{platform === 'darwin'}}",
40 | "method": "shell.run",
41 | "params": {
42 | "venv": "{{args && args.venv ? args.venv : null}}",
43 | "path": "{{args && args.path ? args.path : '.'}}",
44 | "message": "uv pip install torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1"
45 | }
46 | },
47 | // linux nvidia
48 | {
49 | "when": "{{platform === 'linux' && gpu === 'nvidia'}}",
50 | "method": "shell.run",
51 | "params": {
52 | "venv": "{{args && args.venv ? args.venv : null}}",
53 | "path": "{{args && args.path ? args.path : '.'}}",
54 | "message": [
55 | "uv pip install torch==2.7.0 torchvision==0.22.0 torchaudio==2.7.0 {{args && args.xformers ? 'xformers==0.0.30' : ''}} --index-url https://download.pytorch.org/whl/cu128 --force-reinstall --no-deps",
56 | "uv pip install triton",
57 | "uv pip install git+https://github.com/thu-ml/SageAttention.git",
58 | ]
59 | }
60 | },
61 | // linux rocm (amd)
62 | {
63 | "when": "{{platform === 'linux' && gpu === 'amd'}}",
64 | "method": "shell.run",
65 | "params": {
66 | "venv": "{{args && args.venv ? args.venv : null}}",
67 | "path": "{{args && args.path ? args.path : '.'}}",
68 | "message": "uv pip install torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 --index-url https://download.pytorch.org/whl/rocm6.2"
69 | }
70 | },
71 | // linux cpu
72 | {
73 | "when": "{{platform === 'linux' && (gpu !== 'amd' && gpu !=='nvidia')}}",
74 | "method": "shell.run",
75 | "params": {
76 | "venv": "{{args && args.venv ? args.venv : null}}",
77 | "path": "{{args && args.path ? args.path : '.'}}",
78 | "message": "uv pip install torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 --index-url https://download.pytorch.org/whl/cpu"
79 | }
80 | }
81 | ]
82 | }
83 |
--------------------------------------------------------------------------------
/pinokio.js:
--------------------------------------------------------------------------------
1 | const path = require('path')
2 | module.exports = {
3 | version: "3.7",
4 | title: "FramePack",
5 | description: "[NVIDIA ONLY] Generate Video Progressively. FramePack is a next-frame (next-frame-section) prediction neural network structure that generates videos progressively. https://github.com/lllyasviel/FramePack",
6 | icon: "icon.png",
7 | menu: async (kernel, info) => {
8 | let installed = info.exists("app/env")
9 | let running = {
10 | install: info.running("install.js"),
11 | start: info.running("start.js"),
12 | start_f1: info.running("start_f1.js"),
13 | start_k: info.running("start_k.js"),
14 | update: info.running("update.js"),
15 | reset: info.running("reset.js"),
16 | link: info.running("link.js")
17 | }
18 | if (running.install) {
19 | return [{
20 | default: true,
21 | icon: "fa-solid fa-plug",
22 | text: "Installing",
23 | href: "install.js",
24 | }]
25 | } else if (installed) {
26 | if (running.start || running.start_f1 || running.start_k) {
27 | const script = running.start ? "start.js" : running.start_f1 ? "start_f1.js" : "start_k.js"
28 | let local = info.local(script)
29 | if (local && local.url) {
30 | return [{
31 | default: true,
32 | icon: "fa-solid fa-rocket",
33 | text: "Open Web UI",
34 | href: local.url,
35 | }, {
36 | icon: 'fa-solid fa-terminal',
37 | text: "Terminal",
38 | href: script,
39 | }]
40 | } else {
41 | return [{
42 | default: true,
43 | icon: 'fa-solid fa-terminal',
44 | text: "Terminal",
45 | href: script,
46 | }]
47 | }
48 | } else if (running.update) {
49 | return [{
50 | default: true,
51 | icon: 'fa-solid fa-terminal',
52 | text: "Updating",
53 | href: "update.js",
54 | }]
55 | } else if (running.reset) {
56 | return [{
57 | default: true,
58 | icon: 'fa-solid fa-terminal',
59 | text: "Resetting",
60 | href: "reset.js",
61 | }]
62 | } else if (running.link) {
63 | return [{
64 | default: true,
65 | icon: 'fa-solid fa-terminal',
66 | text: "Deduplicating",
67 | href: "link.js",
68 | }]
69 | } else {
70 | return [{
71 | icon: "fa-solid fa-power-off",
72 | text: "Start FramePack (Standard)",
73 | href: "start.js",
74 | }, {
75 | icon: "fa-solid fa-power-off",
76 | text: "Start FramePack (F1)",
77 | href: "start_f1.js",
78 | }, {
79 | icon: "fa-solid fa-power-off",
80 |         text: "Start FramePack (Key Frame)",
81 | href: "start_k.js",
82 | }, {
83 | icon: "fa-solid fa-plug",
84 | text: "Update",
85 | href: "update.js",
86 | }, {
87 | icon: "fa-solid fa-plug",
88 | text: "Install",
89 | href: "install.js",
90 | }, {
91 | icon: "fa-solid fa-file-zipper",
92 |     text: "<div><strong>Save Disk Space</strong><div>Deduplicates redundant library files</div></div>",
93 | href: "link.js",
94 | }, {
95 | icon: "fa-regular fa-circle-xmark",
96 |     text: "<div><strong>Reset</strong><div>Revert to pre-install state</div></div>",
97 | href: "reset.js",
98 | confirm: "Are you sure you wish to reset the app?"
99 |
100 | }]
101 | }
102 | } else {
103 | return [{
104 | default: true,
105 | icon: "fa-solid fa-plug",
106 | text: "Install",
107 | href: "install.js",
108 | }]
109 | }
110 | }
111 | }
112 |
--------------------------------------------------------------------------------
/demo_gradio_f1.py:
--------------------------------------------------------------------------------
1 | from diffusers_helper.hf_login import login
2 |
3 | import os
4 |
5 | os.environ['HF_HOME'] = os.path.abspath(os.path.realpath(os.path.join(os.path.dirname(__file__), './hf_download')))
6 |
7 | import gradio as gr
8 | import torch
9 | import traceback
10 | import einops
11 | import safetensors.torch as sf
12 | import numpy as np
13 | import argparse
14 | import math
15 |
16 | from PIL import Image
17 | from diffusers import AutoencoderKLHunyuanVideo
18 | from transformers import LlamaModel, CLIPTextModel, LlamaTokenizerFast, CLIPTokenizer
19 | from diffusers_helper.hunyuan import encode_prompt_conds, vae_decode, vae_encode, vae_decode_fake
20 | from diffusers_helper.utils import save_bcthw_as_mp4, crop_or_pad_yield_mask, soft_append_bcthw, resize_and_center_crop, state_dict_weighted_merge, state_dict_offset_merge, generate_timestamp
21 | from diffusers_helper.models.hunyuan_video_packed import HunyuanVideoTransformer3DModelPacked
22 | from diffusers_helper.pipelines.k_diffusion_hunyuan import sample_hunyuan
23 | from diffusers_helper.memory import cpu, gpu, get_cuda_free_memory_gb, move_model_to_device_with_memory_preservation, offload_model_from_device_for_memory_preservation, fake_diffusers_current_device, DynamicSwapInstaller, unload_complete_models, load_model_as_complete
24 | from diffusers_helper.thread_utils import AsyncStream, async_run
25 | from diffusers_helper.gradio.progress_bar import make_progress_bar_css, make_progress_bar_html
26 | from transformers import SiglipImageProcessor, SiglipVisionModel
27 | from diffusers_helper.clip_vision import hf_clip_vision_encode
28 | from diffusers_helper.bucket_tools import find_nearest_bucket
29 |
30 |
31 | parser = argparse.ArgumentParser()
32 | parser.add_argument('--share', action='store_true')
33 | parser.add_argument("--server", type=str, default='0.0.0.0')
34 | parser.add_argument("--port", type=int, required=False)
35 | parser.add_argument("--inbrowser", action='store_true')
36 | args = parser.parse_args()
37 |
38 | # for win desktop probably use --server 127.0.0.1 --inbrowser
39 | # For linux server probably use --server 127.0.0.1 or do not use any cmd flags
40 |
41 | print(args)
42 |
43 | free_mem_gb = get_cuda_free_memory_gb(gpu)
44 | high_vram = free_mem_gb > 60
45 |
46 | print(f'Free VRAM {free_mem_gb} GB')
47 | print(f'High-VRAM Mode: {high_vram}')
48 |
49 | text_encoder = LlamaModel.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='text_encoder', torch_dtype=torch.float16).cpu()
50 | text_encoder_2 = CLIPTextModel.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='text_encoder_2', torch_dtype=torch.float16).cpu()
51 | tokenizer = LlamaTokenizerFast.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='tokenizer')
52 | tokenizer_2 = CLIPTokenizer.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='tokenizer_2')
53 | vae = AutoencoderKLHunyuanVideo.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='vae', torch_dtype=torch.float16).cpu()
54 |
55 | feature_extractor = SiglipImageProcessor.from_pretrained("lllyasviel/flux_redux_bfl", subfolder='feature_extractor')
56 | image_encoder = SiglipVisionModel.from_pretrained("lllyasviel/flux_redux_bfl", subfolder='image_encoder', torch_dtype=torch.float16).cpu()
57 |
58 | transformer = HunyuanVideoTransformer3DModelPacked.from_pretrained('lllyasviel/FramePack_F1_I2V_HY_20250503', torch_dtype=torch.bfloat16).cpu()
59 |
60 | vae.eval()
61 | text_encoder.eval()
62 | text_encoder_2.eval()
63 | image_encoder.eval()
64 | transformer.eval()
65 |
66 | if not high_vram:
67 | vae.enable_slicing()
68 | vae.enable_tiling()
69 |
70 | transformer.high_quality_fp32_output_for_inference = True
71 | print('transformer.high_quality_fp32_output_for_inference = True')
72 |
73 | transformer.to(dtype=torch.bfloat16)
74 | vae.to(dtype=torch.float16)
75 | image_encoder.to(dtype=torch.float16)
76 | text_encoder.to(dtype=torch.float16)
77 | text_encoder_2.to(dtype=torch.float16)
78 |
79 | vae.requires_grad_(False)
80 | text_encoder.requires_grad_(False)
81 | text_encoder_2.requires_grad_(False)
82 | image_encoder.requires_grad_(False)
83 | transformer.requires_grad_(False)
84 |
85 | if not high_vram:
86 | # DynamicSwapInstaller is same as huggingface's enable_sequential_offload but 3x faster
87 | DynamicSwapInstaller.install_model(transformer, device=gpu)
88 | DynamicSwapInstaller.install_model(text_encoder, device=gpu)
89 | else:
90 | text_encoder.to(gpu)
91 | text_encoder_2.to(gpu)
92 | image_encoder.to(gpu)
93 | vae.to(gpu)
94 | transformer.to(gpu)
95 |
96 | stream = AsyncStream()
97 |
98 | outputs_folder = './outputs/'
99 | os.makedirs(outputs_folder, exist_ok=True)
100 |
101 |
102 | @torch.no_grad()
103 | def worker(input_image, prompt, n_prompt, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, mp4_crf):
104 | total_latent_sections = (total_second_length * 30) / (latent_window_size * 4)
105 | total_latent_sections = int(max(round(total_latent_sections), 1))
106 |
107 | job_id = generate_timestamp()
108 |
109 | stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Starting ...'))))
110 |
111 | try:
112 | # Clean GPU
113 | if not high_vram:
114 | unload_complete_models(
115 | text_encoder, text_encoder_2, image_encoder, vae, transformer
116 | )
117 |
118 | # Text encoding
119 |
120 | stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Text encoding ...'))))
121 |
122 | if not high_vram:
123 | fake_diffusers_current_device(text_encoder, gpu) # since we only encode one text - that is one model move and one encode, offload is same time consumption since it is also one load and one encode.
124 | load_model_as_complete(text_encoder_2, target_device=gpu)
125 |
126 | llama_vec, clip_l_pooler = encode_prompt_conds(prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2)
127 |
128 | if cfg == 1:
129 | llama_vec_n, clip_l_pooler_n = torch.zeros_like(llama_vec), torch.zeros_like(clip_l_pooler)
130 | else:
131 | llama_vec_n, clip_l_pooler_n = encode_prompt_conds(n_prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2)
132 |
133 | llama_vec, llama_attention_mask = crop_or_pad_yield_mask(llama_vec, length=512)
134 | llama_vec_n, llama_attention_mask_n = crop_or_pad_yield_mask(llama_vec_n, length=512)
135 |
136 | # Processing input image
137 |
138 | stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Image processing ...'))))
139 |
140 | H, W, C = input_image.shape
141 | height, width = find_nearest_bucket(H, W, resolution=640)
142 | input_image_np = resize_and_center_crop(input_image, target_width=width, target_height=height)
143 |
144 | Image.fromarray(input_image_np).save(os.path.join(outputs_folder, f'{job_id}.png'))
145 |
146 | input_image_pt = torch.from_numpy(input_image_np).float() / 127.5 - 1
147 | input_image_pt = input_image_pt.permute(2, 0, 1)[None, :, None]
148 |
149 | # VAE encoding
150 |
151 | stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'VAE encoding ...'))))
152 |
153 | if not high_vram:
154 | load_model_as_complete(vae, target_device=gpu)
155 |
156 | start_latent = vae_encode(input_image_pt, vae)
157 |
158 | # CLIP Vision
159 |
160 | stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'CLIP Vision encoding ...'))))
161 |
162 | if not high_vram:
163 | load_model_as_complete(image_encoder, target_device=gpu)
164 |
165 | image_encoder_output = hf_clip_vision_encode(input_image_np, feature_extractor, image_encoder)
166 | image_encoder_last_hidden_state = image_encoder_output.last_hidden_state
167 |
168 | # Dtype
169 |
170 | llama_vec = llama_vec.to(transformer.dtype)
171 | llama_vec_n = llama_vec_n.to(transformer.dtype)
172 | clip_l_pooler = clip_l_pooler.to(transformer.dtype)
173 | clip_l_pooler_n = clip_l_pooler_n.to(transformer.dtype)
174 | image_encoder_last_hidden_state = image_encoder_last_hidden_state.to(transformer.dtype)
175 |
176 | # Sampling
177 |
178 | stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Start sampling ...'))))
179 |
180 | rnd = torch.Generator("cpu").manual_seed(seed)
181 |
182 | history_latents = torch.zeros(size=(1, 16, 16 + 2 + 1, height // 8, width // 8), dtype=torch.float32).cpu()
183 | history_pixels = None
184 |
185 | history_latents = torch.cat([history_latents, start_latent.to(history_latents)], dim=2)
186 | total_generated_latent_frames = 1
187 |
188 | for section_index in range(total_latent_sections):
189 | if stream.input_queue.top() == 'end':
190 | stream.output_queue.push(('end', None))
191 | return
192 |
193 | print(f'section_index = {section_index}, total_latent_sections = {total_latent_sections}')
194 |
195 | if not high_vram:
196 | unload_complete_models()
197 | move_model_to_device_with_memory_preservation(transformer, target_device=gpu, preserved_memory_gb=gpu_memory_preservation)
198 |
199 | if use_teacache:
200 | transformer.initialize_teacache(enable_teacache=True, num_steps=steps)
201 | else:
202 | transformer.initialize_teacache(enable_teacache=False)
203 |
204 | def callback(d):
205 | preview = d['denoised']
206 | preview = vae_decode_fake(preview)
207 |
208 | preview = (preview * 255.0).detach().cpu().numpy().clip(0, 255).astype(np.uint8)
209 | preview = einops.rearrange(preview, 'b c t h w -> (b h) (t w) c')
210 |
211 | if stream.input_queue.top() == 'end':
212 | stream.output_queue.push(('end', None))
213 | raise KeyboardInterrupt('User ends the task.')
214 |
215 | current_step = d['i'] + 1
216 | percentage = int(100.0 * current_step / steps)
217 | hint = f'Sampling {current_step}/{steps}'
218 | desc = f'Total generated frames: {int(max(0, total_generated_latent_frames * 4 - 3))}, Video length: {max(0, (total_generated_latent_frames * 4 - 3) / 30) :.2f} seconds (FPS-30). The video is being extended now ...'
219 | stream.output_queue.push(('progress', (preview, desc, make_progress_bar_html(percentage, hint))))
220 | return
221 |
222 | indices = torch.arange(0, sum([1, 16, 2, 1, latent_window_size])).unsqueeze(0)
223 | clean_latent_indices_start, clean_latent_4x_indices, clean_latent_2x_indices, clean_latent_1x_indices, latent_indices = indices.split([1, 16, 2, 1, latent_window_size], dim=1)
224 | clean_latent_indices = torch.cat([clean_latent_indices_start, clean_latent_1x_indices], dim=1)
225 |
226 | clean_latents_4x, clean_latents_2x, clean_latents_1x = history_latents[:, :, -sum([16, 2, 1]):, :, :].split([16, 2, 1], dim=2)
227 | clean_latents = torch.cat([start_latent.to(history_latents), clean_latents_1x], dim=2)
228 |
229 | generated_latents = sample_hunyuan(
230 | transformer=transformer,
231 | sampler='unipc',
232 | width=width,
233 | height=height,
234 | frames=latent_window_size * 4 - 3,
235 | real_guidance_scale=cfg,
236 | distilled_guidance_scale=gs,
237 | guidance_rescale=rs,
238 | # shift=3.0,
239 | num_inference_steps=steps,
240 | generator=rnd,
241 | prompt_embeds=llama_vec,
242 | prompt_embeds_mask=llama_attention_mask,
243 | prompt_poolers=clip_l_pooler,
244 | negative_prompt_embeds=llama_vec_n,
245 | negative_prompt_embeds_mask=llama_attention_mask_n,
246 | negative_prompt_poolers=clip_l_pooler_n,
247 | device=gpu,
248 | dtype=torch.bfloat16,
249 | image_embeddings=image_encoder_last_hidden_state,
250 | latent_indices=latent_indices,
251 | clean_latents=clean_latents,
252 | clean_latent_indices=clean_latent_indices,
253 | clean_latents_2x=clean_latents_2x,
254 | clean_latent_2x_indices=clean_latent_2x_indices,
255 | clean_latents_4x=clean_latents_4x,
256 | clean_latent_4x_indices=clean_latent_4x_indices,
257 | callback=callback,
258 | )
259 |
260 | total_generated_latent_frames += int(generated_latents.shape[2])
261 | history_latents = torch.cat([history_latents, generated_latents.to(history_latents)], dim=2)
262 |
263 | if not high_vram:
264 | offload_model_from_device_for_memory_preservation(transformer, target_device=gpu, preserved_memory_gb=8)
265 | load_model_as_complete(vae, target_device=gpu)
266 |
267 | real_history_latents = history_latents[:, :, -total_generated_latent_frames:, :, :]
268 |
269 | if history_pixels is None:
270 | history_pixels = vae_decode(real_history_latents, vae).cpu()
271 | else:
272 | section_latent_frames = latent_window_size * 2
273 | overlapped_frames = latent_window_size * 4 - 3
274 |
275 | current_pixels = vae_decode(real_history_latents[:, :, -section_latent_frames:], vae).cpu()
276 | history_pixels = soft_append_bcthw(history_pixels, current_pixels, overlapped_frames)
277 |
278 | if not high_vram:
279 | unload_complete_models()
280 |
281 | output_filename = os.path.join(outputs_folder, f'{job_id}_{total_generated_latent_frames}.mp4')
282 |
283 |             save_bcthw_as_mp4(history_pixels, output_filename, fps=30, crf=mp4_crf)  # crf comes from the MP4 Compression slider
284 |
285 | print(f'Decoded. Current latent shape {real_history_latents.shape}; pixel shape {history_pixels.shape}')
286 |
287 | stream.output_queue.push(('file', output_filename))
288 | except:
289 | traceback.print_exc()
290 |
291 | if not high_vram:
292 | unload_complete_models(
293 | text_encoder, text_encoder_2, image_encoder, vae, transformer
294 | )
295 |
296 | stream.output_queue.push(('end', None))
297 | return
298 |
299 |
300 | def process(input_image, prompt, n_prompt, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, mp4_crf):
301 | global stream
302 | assert input_image is not None, 'No input image!'
303 |
304 | yield None, None, '', '', gr.update(interactive=False), gr.update(interactive=True)
305 |
306 | stream = AsyncStream()
307 |
308 | async_run(worker, input_image, prompt, n_prompt, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, mp4_crf)
309 |
310 | output_filename = None
311 |
312 | while True:
313 | flag, data = stream.output_queue.next()
314 |
315 | if flag == 'file':
316 | output_filename = data
317 | yield output_filename, gr.update(), gr.update(), gr.update(), gr.update(interactive=False), gr.update(interactive=True)
318 |
319 | if flag == 'progress':
320 | preview, desc, html = data
321 | yield gr.update(), gr.update(visible=True, value=preview), desc, html, gr.update(interactive=False), gr.update(interactive=True)
322 |
323 | if flag == 'end':
324 | yield output_filename, gr.update(visible=False), gr.update(), '', gr.update(interactive=True), gr.update(interactive=False)
325 | break
326 |
327 |
328 | def end_process():
329 | stream.input_queue.push('end')
330 |
331 |
332 | quick_prompts = [
333 | 'The girl dances gracefully, with clear movements, full of charm.',
334 | 'A character doing some simple body movements.',
335 | ]
336 | quick_prompts = [[x] for x in quick_prompts]
337 |
338 |
339 | css = make_progress_bar_css()
340 | block = gr.Blocks(css=css).queue()
341 | with block:
342 | gr.Markdown('# FramePack-F1')
343 | with gr.Row():
344 | with gr.Column():
345 | input_image = gr.Image(sources='upload', type="numpy", label="Image", height=320)
346 | prompt = gr.Textbox(label="Prompt", value='')
347 | example_quick_prompts = gr.Dataset(samples=quick_prompts, label='Quick List', samples_per_page=1000, components=[prompt])
348 | example_quick_prompts.click(lambda x: x[0], inputs=[example_quick_prompts], outputs=prompt, show_progress=False, queue=False)
349 |
350 | with gr.Row():
351 | start_button = gr.Button(value="Start Generation")
352 | end_button = gr.Button(value="End Generation", interactive=False)
353 |
354 | with gr.Group():
355 | use_teacache = gr.Checkbox(label='Use TeaCache', value=True, info='Faster speed, but often makes hands and fingers slightly worse.')
356 |
357 | n_prompt = gr.Textbox(label="Negative Prompt", value="", visible=False) # Not used
358 | seed = gr.Number(label="Seed", value=31337, precision=0)
359 |
360 | total_second_length = gr.Slider(label="Total Video Length (Seconds)", minimum=1, maximum=120, value=5, step=0.1)
361 | latent_window_size = gr.Slider(label="Latent Window Size", minimum=1, maximum=33, value=9, step=1, visible=False) # Should not change
362 | steps = gr.Slider(label="Steps", minimum=1, maximum=100, value=25, step=1, info='Changing this value is not recommended.')
363 |
364 | cfg = gr.Slider(label="CFG Scale", minimum=1.0, maximum=32.0, value=1.0, step=0.01, visible=False) # Should not change
365 | gs = gr.Slider(label="Distilled CFG Scale", minimum=1.0, maximum=32.0, value=10.0, step=0.01, info='Changing this value is not recommended.')
366 | rs = gr.Slider(label="CFG Re-Scale", minimum=0.0, maximum=1.0, value=0.0, step=0.01, visible=False) # Should not change
367 |
368 | gpu_memory_preservation = gr.Slider(label="GPU Inference Preserved Memory (GB) (larger means slower)", minimum=6, maximum=128, value=6, step=0.1, info="Set this number to a larger value if you encounter OOM. Larger value causes slower speed.")
369 |
370 | mp4_crf = gr.Slider(label="MP4 Compression", minimum=0, maximum=100, value=16, step=1, info="Lower means better quality. 0 is uncompressed. Change to 16 if you get black outputs. ")
371 |
372 | with gr.Column():
373 | preview_image = gr.Image(label="Next Latents", height=200, visible=False)
374 | result_video = gr.Video(label="Finished Frames", autoplay=True, show_share_button=False, height=512, loop=True)
375 | progress_desc = gr.Markdown('', elem_classes='no-generating-animation')
376 | progress_bar = gr.HTML('', elem_classes='no-generating-animation')
377 |
378 | gr.HTML('')
379 |
380 | ips = [input_image, prompt, n_prompt, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, mp4_crf]
381 | start_button.click(fn=process, inputs=ips, outputs=[result_video, preview_image, progress_desc, progress_bar, start_button, end_button])
382 | end_button.click(fn=end_process)
383 |
384 |
385 | block.launch(
386 | server_name=args.server,
387 | server_port=args.port,
388 | share=args.share,
389 | inbrowser=args.inbrowser,
390 | )
--------------------------------------------------------------------------------
/demo_gradio_k.py:
--------------------------------------------------------------------------------
1 | from diffusers_helper.hf_login import login
2 |
3 | import os
4 |
5 | os.environ['HF_HOME'] = os.path.abspath(os.path.realpath(os.path.join(os.path.dirname(__file__), './hf_download')))
6 |
7 | import gradio as gr
8 | import torch
9 | import traceback
10 | import einops
11 | import safetensors.torch as sf
12 | import numpy as np
13 | import argparse
14 | import math
15 |
16 | from PIL import Image
17 | from diffusers import AutoencoderKLHunyuanVideo
18 | from transformers import LlamaModel, CLIPTextModel, LlamaTokenizerFast, CLIPTokenizer
19 | from diffusers_helper.hunyuan import encode_prompt_conds, vae_decode, vae_encode, vae_decode_fake
20 | from diffusers_helper.utils import save_bcthw_as_mp4, crop_or_pad_yield_mask, soft_append_bcthw, resize_and_center_crop, state_dict_weighted_merge, state_dict_offset_merge, generate_timestamp
21 | from diffusers_helper.models.hunyuan_video_packed import HunyuanVideoTransformer3DModelPacked
22 | from diffusers_helper.pipelines.k_diffusion_hunyuan import sample_hunyuan
23 | from diffusers_helper.memory import cpu, gpu, get_cuda_free_memory_gb, move_model_to_device_with_memory_preservation, offload_model_from_device_for_memory_preservation, fake_diffusers_current_device, DynamicSwapInstaller, unload_complete_models, load_model_as_complete
24 | from diffusers_helper.thread_utils import AsyncStream, async_run
25 | from diffusers_helper.gradio.progress_bar import make_progress_bar_css, make_progress_bar_html
26 | from transformers import SiglipImageProcessor, SiglipVisionModel
27 | from diffusers_helper.clip_vision import hf_clip_vision_encode
28 | from diffusers_helper.bucket_tools import find_nearest_bucket
29 |
30 |
31 | parser = argparse.ArgumentParser()
32 | parser.add_argument('--share', action='store_true')
33 | parser.add_argument("--server", type=str, default='0.0.0.0')
34 | parser.add_argument("--port", type=int, required=False)
35 | parser.add_argument("--inbrowser", action='store_true')
36 | args = parser.parse_args()
37 |
38 | # for win desktop probably use --server 127.0.0.1 --inbrowser
39 | # For linux server probably use --server 127.0.0.1 or do not use any cmd flags
40 | print(args)
41 |
42 | free_mem_gb = get_cuda_free_memory_gb(gpu)
43 | high_vram = free_mem_gb > 60
44 |
45 | print(f'Free VRAM {free_mem_gb} GB')
46 | print(f'High-VRAM Mode: {high_vram}')
47 |
48 | text_encoder = LlamaModel.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='text_encoder', torch_dtype=torch.float16).cpu()
49 | text_encoder_2 = CLIPTextModel.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='text_encoder_2', torch_dtype=torch.float16).cpu()
50 | tokenizer = LlamaTokenizerFast.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='tokenizer')
51 | tokenizer_2 = CLIPTokenizer.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='tokenizer_2')
52 | vae = AutoencoderKLHunyuanVideo.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='vae', torch_dtype=torch.float16).cpu()
53 |
54 | feature_extractor = SiglipImageProcessor.from_pretrained("lllyasviel/flux_redux_bfl", subfolder='feature_extractor')
55 | image_encoder = SiglipVisionModel.from_pretrained("lllyasviel/flux_redux_bfl", subfolder='image_encoder', torch_dtype=torch.float16).cpu()
56 |
57 | transformer = HunyuanVideoTransformer3DModelPacked.from_pretrained('lllyasviel/FramePackI2V_HY', torch_dtype=torch.bfloat16).cpu()
58 |
59 | vae.eval()
60 | text_encoder.eval()
61 | text_encoder_2.eval()
62 | image_encoder.eval()
63 | transformer.eval()
64 |
65 | if not high_vram:
66 | vae.enable_slicing()
67 | vae.enable_tiling()
68 |
69 | transformer.high_quality_fp32_output_for_inference = True
70 | print('transformer.high_quality_fp32_output_for_inference = True')
71 |
72 | transformer.to(dtype=torch.bfloat16)
73 | vae.to(dtype=torch.float16)
74 | image_encoder.to(dtype=torch.float16)
75 | text_encoder.to(dtype=torch.float16)
76 | text_encoder_2.to(dtype=torch.float16)
77 |
78 | vae.requires_grad_(False)
79 | text_encoder.requires_grad_(False)
80 | text_encoder_2.requires_grad_(False)
81 | image_encoder.requires_grad_(False)
82 | transformer.requires_grad_(False)
83 |
84 | if not high_vram:
85 | # DynamicSwapInstaller is same as huggingface's enable_sequential_offload but 3x faster
86 | DynamicSwapInstaller.install_model(transformer, device=gpu)
87 | DynamicSwapInstaller.install_model(text_encoder, device=gpu)
88 | else:
89 | text_encoder.to(gpu)
90 | text_encoder_2.to(gpu)
91 | image_encoder.to(gpu)
92 | vae.to(gpu)
93 | transformer.to(gpu)
94 |
95 | stream = AsyncStream()
96 |
97 | outputs_folder = './outputs/'
98 | os.makedirs(outputs_folder, exist_ok=True)
99 |
100 |
101 | @torch.no_grad()
102 | def worker(input_image, end_image, prompt, n_prompt, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, mp4_crf):
103 | total_latent_sections = (total_second_length * 30) / (latent_window_size * 4)
104 | total_latent_sections = int(max(round(total_latent_sections), 1))
105 |
106 | job_id = generate_timestamp()
107 |
108 | stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Starting ...'))))
109 |
110 | try:
111 | # Clean GPU
112 | if not high_vram:
113 | unload_complete_models(
114 | text_encoder, text_encoder_2, image_encoder, vae, transformer
115 | )
116 |
117 | # Text encoding
118 |
119 | stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Text encoding ...'))))
120 |
121 | if not high_vram:
122 | fake_diffusers_current_device(text_encoder, gpu) # since we only encode one text - that is one model move and one encode, offload is same time consumption since it is also one load and one encode.
123 | load_model_as_complete(text_encoder_2, target_device=gpu)
124 |
125 | llama_vec, clip_l_pooler = encode_prompt_conds(prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2)
126 |
127 | if cfg == 1:
128 | llama_vec_n, clip_l_pooler_n = torch.zeros_like(llama_vec), torch.zeros_like(clip_l_pooler)
129 | else:
130 | llama_vec_n, clip_l_pooler_n = encode_prompt_conds(n_prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2)
131 |
132 | llama_vec, llama_attention_mask = crop_or_pad_yield_mask(llama_vec, length=512)
133 | llama_vec_n, llama_attention_mask_n = crop_or_pad_yield_mask(llama_vec_n, length=512)
134 |
135 | # Processing input image (start frame)
136 | stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Processing start frame ...'))))
137 |
138 | H, W, C = input_image.shape
139 | height, width = find_nearest_bucket(H, W, resolution=640)
140 | input_image_np = resize_and_center_crop(input_image, target_width=width, target_height=height)
141 |
142 | Image.fromarray(input_image_np).save(os.path.join(outputs_folder, f'{job_id}_start.png'))
143 |
144 | input_image_pt = torch.from_numpy(input_image_np).float() / 127.5 - 1
145 | input_image_pt = input_image_pt.permute(2, 0, 1)[None, :, None]
146 |
147 | # Processing end image (if provided)
148 | has_end_image = end_image is not None
149 | if has_end_image:
150 | stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Processing end frame ...'))))
151 |
152 | H_end, W_end, C_end = end_image.shape
153 | end_image_np = resize_and_center_crop(end_image, target_width=width, target_height=height)
154 |
155 | Image.fromarray(end_image_np).save(os.path.join(outputs_folder, f'{job_id}_end.png'))
156 |
157 | end_image_pt = torch.from_numpy(end_image_np).float() / 127.5 - 1
158 | end_image_pt = end_image_pt.permute(2, 0, 1)[None, :, None]
159 |
160 | # VAE encoding
161 | stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'VAE encoding ...'))))
162 |
163 | if not high_vram:
164 | load_model_as_complete(vae, target_device=gpu)
165 |
166 | start_latent = vae_encode(input_image_pt, vae)
167 |
168 | if has_end_image:
169 | end_latent = vae_encode(end_image_pt, vae)
170 |
171 | # CLIP Vision
172 | stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'CLIP Vision encoding ...'))))
173 |
174 | if not high_vram:
175 | load_model_as_complete(image_encoder, target_device=gpu)
176 |
177 | image_encoder_output = hf_clip_vision_encode(input_image_np, feature_extractor, image_encoder)
178 | image_encoder_last_hidden_state = image_encoder_output.last_hidden_state
179 |
180 | if has_end_image:
181 | end_image_encoder_output = hf_clip_vision_encode(end_image_np, feature_extractor, image_encoder)
182 | end_image_encoder_last_hidden_state = end_image_encoder_output.last_hidden_state
183 | # Combine both image embeddings or use a weighted approach
184 | image_encoder_last_hidden_state = (image_encoder_last_hidden_state + end_image_encoder_last_hidden_state) / 2
185 |
186 | # Dtype
187 | llama_vec = llama_vec.to(transformer.dtype)
188 | llama_vec_n = llama_vec_n.to(transformer.dtype)
189 | clip_l_pooler = clip_l_pooler.to(transformer.dtype)
190 | clip_l_pooler_n = clip_l_pooler_n.to(transformer.dtype)
191 | image_encoder_last_hidden_state = image_encoder_last_hidden_state.to(transformer.dtype)
192 |
193 | # Sampling
194 | stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Start sampling ...'))))
195 |
196 | rnd = torch.Generator("cpu").manual_seed(seed)
197 | num_frames = latent_window_size * 4 - 3
198 |
199 | history_latents = torch.zeros(size=(1, 16, 1 + 2 + 16, height // 8, width // 8), dtype=torch.float32).cpu()
200 | history_pixels = None
201 | total_generated_latent_frames = 0
202 |
203 |         # Convert the iterator to a list
204 | latent_paddings = list(reversed(range(total_latent_sections)))
205 |
206 | if total_latent_sections > 4:
207 | # In theory the latent_paddings should follow the above sequence, but it seems that duplicating some
208 | # items looks better than expanding it when total_latent_sections > 4
209 | # One can try to remove below trick and just
210 | # use `latent_paddings = list(reversed(range(total_latent_sections)))` to compare
211 | latent_paddings = [3] + [2] * (total_latent_sections - 3) + [1, 0]
212 |
213 | for latent_padding in latent_paddings:
214 | is_last_section = latent_padding == 0
215 | is_first_section = latent_padding == latent_paddings[0]
216 | latent_padding_size = latent_padding * latent_window_size
217 |
218 | if stream.input_queue.top() == 'end':
219 | stream.output_queue.push(('end', None))
220 | return
221 |
222 | print(f'latent_padding_size = {latent_padding_size}, is_last_section = {is_last_section}, is_first_section = {is_first_section}')
223 |
224 | indices = torch.arange(0, sum([1, latent_padding_size, latent_window_size, 1, 2, 16])).unsqueeze(0)
225 | clean_latent_indices_pre, blank_indices, latent_indices, clean_latent_indices_post, clean_latent_2x_indices, clean_latent_4x_indices = indices.split([1, latent_padding_size, latent_window_size, 1, 2, 16], dim=1)
226 | clean_latent_indices = torch.cat([clean_latent_indices_pre, clean_latent_indices_post], dim=1)
227 |
228 | clean_latents_pre = start_latent.to(history_latents)
229 | clean_latents_post, clean_latents_2x, clean_latents_4x = history_latents[:, :, :1 + 2 + 16, :, :].split([1, 2, 16], dim=2)
230 | clean_latents = torch.cat([clean_latents_pre, clean_latents_post], dim=2)
231 |
232 | # Use end image latent for the first section if provided
233 | if has_end_image and is_first_section:
234 | clean_latents_post = end_latent.to(history_latents)
235 | clean_latents = torch.cat([clean_latents_pre, clean_latents_post], dim=2)
236 |
237 | if not high_vram:
238 | unload_complete_models()
239 | move_model_to_device_with_memory_preservation(transformer, target_device=gpu, preserved_memory_gb=gpu_memory_preservation)
240 |
241 | if use_teacache:
242 | transformer.initialize_teacache(enable_teacache=True, num_steps=steps)
243 | else:
244 | transformer.initialize_teacache(enable_teacache=False)
245 |
246 | def callback(d):
247 | preview = d['denoised']
248 | preview = vae_decode_fake(preview)
249 |
250 | preview = (preview * 255.0).detach().cpu().numpy().clip(0, 255).astype(np.uint8)
251 | preview = einops.rearrange(preview, 'b c t h w -> (b h) (t w) c')
252 |
253 | if stream.input_queue.top() == 'end':
254 | stream.output_queue.push(('end', None))
255 | raise KeyboardInterrupt('User ends the task.')
256 |
257 | current_step = d['i'] + 1
258 | percentage = int(100.0 * current_step / steps)
259 | hint = f'Sampling {current_step}/{steps}'
260 | desc = f'Total generated frames: {int(max(0, total_generated_latent_frames * 4 - 3))}, Video length: {max(0, (total_generated_latent_frames * 4 - 3) / 30) :.2f} seconds (FPS-30). The video is being extended now ...'
261 | stream.output_queue.push(('progress', (preview, desc, make_progress_bar_html(percentage, hint))))
262 | return
263 |
264 | generated_latents = sample_hunyuan(
265 | transformer=transformer,
266 | sampler='unipc',
267 | width=width,
268 | height=height,
269 | frames=num_frames,
270 | real_guidance_scale=cfg,
271 | distilled_guidance_scale=gs,
272 | guidance_rescale=rs,
273 | # shift=3.0,
274 | num_inference_steps=steps,
275 | generator=rnd,
276 | prompt_embeds=llama_vec,
277 | prompt_embeds_mask=llama_attention_mask,
278 | prompt_poolers=clip_l_pooler,
279 | negative_prompt_embeds=llama_vec_n,
280 | negative_prompt_embeds_mask=llama_attention_mask_n,
281 | negative_prompt_poolers=clip_l_pooler_n,
282 | device=gpu,
283 | dtype=torch.bfloat16,
284 | image_embeddings=image_encoder_last_hidden_state,
285 | latent_indices=latent_indices,
286 | clean_latents=clean_latents,
287 | clean_latent_indices=clean_latent_indices,
288 | clean_latents_2x=clean_latents_2x,
289 | clean_latent_2x_indices=clean_latent_2x_indices,
290 | clean_latents_4x=clean_latents_4x,
291 | clean_latent_4x_indices=clean_latent_4x_indices,
292 | callback=callback,
293 | )
294 |
295 | if is_last_section:
296 | generated_latents = torch.cat([start_latent.to(generated_latents), generated_latents], dim=2)
297 |
298 | total_generated_latent_frames += int(generated_latents.shape[2])
299 | history_latents = torch.cat([generated_latents.to(history_latents), history_latents], dim=2)
300 |
301 | if not high_vram:
302 | offload_model_from_device_for_memory_preservation(transformer, target_device=gpu, preserved_memory_gb=8)
303 | load_model_as_complete(vae, target_device=gpu)
304 |
305 | real_history_latents = history_latents[:, :, :total_generated_latent_frames, :, :]
306 |
307 | if history_pixels is None:
308 | history_pixels = vae_decode(real_history_latents, vae).cpu()
309 | else:
310 | section_latent_frames = (latent_window_size * 2 + 1) if is_last_section else (latent_window_size * 2)
311 | overlapped_frames = latent_window_size * 4 - 3
312 |
313 | current_pixels = vae_decode(real_history_latents[:, :, :section_latent_frames], vae).cpu()
314 | history_pixels = soft_append_bcthw(current_pixels, history_pixels, overlapped_frames)
315 |
316 | if not high_vram:
317 | unload_complete_models()
318 |
319 |             output_filename = os.path.join(outputs_folder, f'{job_id}_{total_generated_latent_frames}.mp4')
320 |
321 |             save_bcthw_as_mp4(history_pixels, output_filename, fps=30, crf=mp4_crf)  # crf comes from the MP4 Compression slider
322 |
323 |             print(f'Decoded. Current latent shape {real_history_latents.shape}; pixel shape {history_pixels.shape}')
324 |
325 |             stream.output_queue.push(('file', output_filename))
326 |
327 |             if is_last_section:
328 |                 break
329 | except:
330 | traceback.print_exc()
331 |
332 | if not high_vram:
333 | unload_complete_models(
334 | text_encoder, text_encoder_2, image_encoder, vae, transformer
335 | )
336 |
337 | stream.output_queue.push(('end', None))
338 | return
339 |
340 |
341 | def process(input_image, end_image, prompt, n_prompt, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, mp4_crf):
342 | global stream
343 | assert input_image is not None, 'No input image!'
344 |
345 | yield None, None, '', '', gr.update(interactive=False), gr.update(interactive=True)
346 |
347 | stream = AsyncStream()
348 |
349 | async_run(worker, input_image, end_image, prompt, n_prompt, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, mp4_crf)
350 |
351 | output_filename = None
352 |
353 | while True:
354 | flag, data = stream.output_queue.next()
355 |
356 | if flag == 'file':
357 | output_filename = data
358 | yield output_filename, gr.update(), gr.update(), gr.update(), gr.update(interactive=False), gr.update(interactive=True)
359 |
360 | if flag == 'progress':
361 | preview, desc, html = data
362 | yield gr.update(), gr.update(visible=True, value=preview), desc, html, gr.update(interactive=False), gr.update(interactive=True)
363 |
364 | if flag == 'end':
365 | yield output_filename, gr.update(visible=False), gr.update(), '', gr.update(interactive=True), gr.update(interactive=False)
366 | break
367 |
368 |
369 | def end_process():
370 | stream.input_queue.push('end')
371 |
372 |
373 | quick_prompts = [
374 | 'The girl dances gracefully, with clear movements, full of charm.',
375 | 'A character doing some simple body movements.',
376 | ]
377 | quick_prompts = [[x] for x in quick_prompts]
378 |
379 |
380 | css = make_progress_bar_css()
381 | block = gr.Blocks(css=css).queue()
382 | with block:
383 | gr.Markdown('# FramePack')
384 | with gr.Row():
385 | with gr.Column():
386 | with gr.Row():
387 | with gr.Column():
388 | input_image = gr.Image(sources='upload', type="numpy", label="Start Frame", height=320)
389 | with gr.Column():
390 | end_image = gr.Image(sources='upload', type="numpy", label="End Frame (Optional)", height=320)
391 |
392 | prompt = gr.Textbox(label="Prompt", value='')
393 | example_quick_prompts = gr.Dataset(samples=quick_prompts, label='Quick List', samples_per_page=1000, components=[prompt])
394 | example_quick_prompts.click(lambda x: x[0], inputs=[example_quick_prompts], outputs=prompt, show_progress=False, queue=False)
395 |
396 | with gr.Row():
397 | start_button = gr.Button(value="Start Generation")
398 | end_button = gr.Button(value="End Generation", interactive=False)
399 |
400 | with gr.Group():
401 | use_teacache = gr.Checkbox(label='Use TeaCache', value=True, info='Faster speed, but often makes hands and fingers slightly worse.')
402 |
403 | n_prompt = gr.Textbox(label="Negative Prompt", value="", visible=False) # Not used
404 | seed = gr.Number(label="Seed", value=31337, precision=0)
405 |
406 | total_second_length = gr.Slider(label="Total Video Length (Seconds)", minimum=1, maximum=120, value=5, step=0.1)
407 | latent_window_size = gr.Slider(label="Latent Window Size", minimum=1, maximum=33, value=9, step=1, visible=False) # Should not change
408 | steps = gr.Slider(label="Steps", minimum=1, maximum=100, value=25, step=1, info='Changing this value is not recommended.')
409 |
410 | cfg = gr.Slider(label="CFG Scale", minimum=1.0, maximum=32.0, value=1.0, step=0.01, visible=False) # Should not change
411 | gs = gr.Slider(label="Distilled CFG Scale", minimum=1.0, maximum=32.0, value=10.0, step=0.01, info='Changing this value is not recommended.')
412 | rs = gr.Slider(label="CFG Re-Scale", minimum=0.0, maximum=1.0, value=0.0, step=0.01, visible=False) # Should not change
413 |
414 | gpu_memory_preservation = gr.Slider(label="GPU Inference Preserved Memory (GB) (larger means slower)", minimum=6, maximum=128, value=6, step=0.1, info="Set this number to a larger value if you encounter OOM. Larger value causes slower speed.")
415 |
416 | mp4_crf = gr.Slider(label="MP4 Compression", minimum=0, maximum=100, value=16, step=1, info="Lower means better quality. 0 is uncompressed. Change to 16 if you get black outputs. ")
417 |
418 | with gr.Column():
419 | preview_image = gr.Image(label="Next Latents", height=200, visible=False)
420 | result_video = gr.Video(label="Finished Frames", autoplay=True, show_share_button=False, height=512, loop=True)
421 | gr.Markdown('When using only a start frame, the ending actions will be generated before the starting actions due to the inverted sampling. If using both start and end frames, the model will try to create a smooth transition between them.')
422 | progress_desc = gr.Markdown('', elem_classes='no-generating-animation')
423 | progress_bar = gr.HTML('', elem_classes='no-generating-animation')
424 |
425 | gr.HTML('')
426 |
427 | ips = [input_image, end_image, prompt, n_prompt, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, mp4_crf]
428 | start_button.click(fn=process, inputs=ips, outputs=[result_video, preview_image, progress_desc, progress_bar, start_button, end_button])
429 | end_button.click(fn=end_process)
430 |
431 |
432 | block.launch(
433 | server_name=args.server,
434 | server_port=args.port,
435 | share=args.share,
436 | inbrowser=args.inbrowser,
437 | )
438 |
--------------------------------------------------------------------------------