├── .github
│   └── FUNDING.yml
├── Animate_X.ipynb
├── Basic_Wan2_1_VACE_&_CausVid_LoRA_4_Image_to_Video.ipynb
├── Basic_Wan2_1_VACE_&_CausVid_LoRA_4_Text_to_Video(WIP).ipynb
├── Fast_Wan2_1_14B_I2V_480p_GGUF_&_LoRA.ipynb
├── Flux_Upscaler.ipynb
├── FrameInterpolationRIFE.ipynb
├── FramePack.ipynb
├── Hidream_T2V_GGUF_Q5.ipynb
├── Hidream_fp8.ipynb
├── ICEdit.ipynb
├── LTXV_0_9_7_13B_Distilled_Image_to_Video.ipynb
├── LTX_Video_Img_to_Vid.ipynb
├── LTX_Video_Tx_to_Vid.ipynb
├── LTX_Video_with_Start_&_End_frames.ipynb
├── LatentSync.ipynb
├── README.md
├── Wan2_1_14B_I2V_GGUF_&_LoRA.ipynb
├── Wan2_1_14B_I2V_GGUF_Free.ipynb
├── Wan2_1_14B_T2V_GGUF_Free.ipynb
├── Wan2_1_1_3B_T2V_Free.ipynb
├── Wan2_1_I2V_14B.ipynb
├── Wan2_1_T2V_14B.ipynb
├── Wan2_1_T2V_1_3B_DiffSynth.ipynb
├── ZonosTTS.ipynb
└── video2Gif_(Basic).ipynb
/.github/FUNDING.yml:
--------------------------------------------------------------------------------
1 | # These are supported funding model platforms
2 |
3 | github: # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2]
4 | patreon: # Replace with a single Patreon username
5 | open_collective: # Replace with a single Open Collective username
6 | ko_fi: # Replace with a single Ko-fi username
7 | tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel
8 | community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry
9 | liberapay: # Replace with a single Liberapay username
10 | issuehunt: # Replace with a single IssueHunt username
11 | lfx_crowdfunding: # Replace with a single LFX Crowdfunding project-name e.g., cloud-foundry
12 | polar: # Replace with a single Polar username
13 | buy_me_a_coffee: isiomo # Replace with a single Buy Me a Coffee username
14 | thanks_dev: # Replace with a single thanks.dev username
15 | custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2']
16 |
--------------------------------------------------------------------------------
/FrameInterpolationRIFE.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "nbformat": 4,
3 | "nbformat_minor": 0,
4 | "metadata": {
5 | "colab": {
6 | "provenance": [],
7 | "gpuType": "T4"
8 | },
9 | "kernelspec": {
10 | "name": "python3",
11 | "display_name": "Python 3"
12 | },
13 | "language_info": {
14 | "name": "python"
15 | },
16 | "accelerator": "GPU"
17 | },
18 | "cells": [
19 | {
20 | "cell_type": "markdown",
21 | "source": [
22 | "\n",
23 | "\n",
24 | "**SETUP ENVIRONMENT**\n"
25 | ],
26 | "metadata": {
27 | "id": "YCtg1bUxGXeB"
28 | }
29 | },
30 | {
31 | "cell_type": "code",
32 | "execution_count": null,
33 | "metadata": {
34 | "id": "CVRBIyZ7gVso",
35 | "cellView": "form"
36 | },
37 | "outputs": [],
38 | "source": [
39 | "# @title\n",
40 | "!git clone https://github.com/Isi-dev/Practical-RIFE\n",
41 | "%cd /content/Practical-RIFE\n",
42 | "!pip install git+https://github.com/rk-exxec/scikit-video.git@numpy_deprecation\n",
43 | "!mkdir -p /content/Practical-RIFE/train_log\n",
44 | "!wget -q https://huggingface.co/Isi99999/Frame_Interpolation_Models/resolve/main/4.25/train_log/IFNet_HDv3.py -O /content/Practical-RIFE/train_log/IFNet_HDv3.py\n",
45 | "!wget -q https://huggingface.co/Isi99999/Frame_Interpolation_Models/resolve/main/4.25/train_log/RIFE_HDv3.py -O /content/Practical-RIFE/train_log/RIFE_HDv3.py\n",
46 | "!wget -q https://huggingface.co/Isi99999/Frame_Interpolation_Models/resolve/main/4.25/train_log/refine.py -O /content/Practical-RIFE/train_log/refine.py\n",
47 | "!wget -q https://huggingface.co/Isi99999/Frame_Interpolation_Models/resolve/main/4.25/train_log/flownet.pkl -O /content/Practical-RIFE/train_log/flownet.pkl\n",
48 | "!echo \"✅ All setup steps completed successfully!\"\n"
49 | ]
50 | },
51 | {
52 | "cell_type": "markdown",
53 | "source": [
54 | "**IMAGE INTERPOLATION to GIF**"
55 | ],
56 | "metadata": {
57 | "id": "8upil1YGGlLn"
58 | }
59 | },
60 | {
61 | "cell_type": "markdown",
62 | "source": [
63 | "Run and Upload Two Images. Make sure both images have the same alpha channel profile. If one image is RGB and the other is RGBA, the interpolation will fail. The number of frames in the output GIF file is 2^EXP (2 raised to the power of EXP). FPS is the number of frames per second at which the output GIF plays."
64 | ],
65 | "metadata": {
66 | "id": "81GMlI0pGqHG"
67 | }
68 | },
69 | {
70 | "cell_type": "code",
71 | "source": [
72 | "# @title\n",
73 | "EXP = 4 # @param {\"type\":\"number\"}\n",
74 | "FPS = 10 # @param {\"type\":\"number\"}\n",
75 | "\n",
76 | "\n",
77 | "from google.colab import files\n",
78 | "import os\n",
79 | "import shutil\n",
80 | "# Delete the existing output folder if it exists\n",
81 | "output_dir = \"/content/Practical-RIFE/output\"\n",
82 | "if os.path.exists(output_dir):\n",
83 | " shutil.rmtree(output_dir)\n",
84 | "from PIL import Image\n",
85 | "uploaded = files.upload()\n",
86 | "target_dir = \"/content/Practical-RIFE\"\n",
87 | "os.makedirs(target_dir, exist_ok=True)\n",
88 | "\n",
89 | "image_paths = []\n",
90 | "for i, (filename, file_data) in enumerate(uploaded.items()):\n",
91 | " ext = os.path.splitext(filename)[1].lower()\n",
92 | " image_path = os.path.join(target_dir, f\"{i}{ext}\")\n",
93 | "\n",
94 | " with open(image_path, \"wb\") as f:\n",
95 | " f.write(file_data)\n",
96 | "\n",
97 | " image_paths.append(image_path)\n",
98 | "\n",
99 | "image_0 = image_paths[0] if len(image_paths) > 0 else None\n",
100 | "image_1 = image_paths[1] if len(image_paths) > 1 else None\n",
101 | "\n",
102 | "if image_0 is not None and image_1 is not None:\n",
103 | " with Image.open(image_0) as img:\n",
104 | " width, height = img.size\n",
105 | " !python3 inference_img.py --img {image_0} {image_1} --exp {EXP}\n",
106 | " !ffmpeg -r {FPS} -f image2 -i output/img%d.png -s {width}x{height} -vf \"split[s0][s1];[s0]palettegen=stats_mode=single[p];[s1][p]paletteuse=new=1\" output/slomo.gif -loglevel error -y\n",
107 | " from IPython.display import display, Image\n",
108 | " display(Image(filename=\"output/slomo.gif\"))\n",
109 | "else:\n",
110 | " print(\"You need to select two images!\")"
111 | ],
112 | "metadata": {
113 | "cellView": "form",
114 | "id": "Q667ddsZGonP"
115 | },
116 | "execution_count": null,
117 | "outputs": []
118 | },
119 | {
120 | "cell_type": "markdown",
121 | "source": [
122 | "**IMAGE INTERPOLATION to VIDEO**"
123 | ],
124 | "metadata": {
125 | "id": "lZ896eaQaGXt"
126 | }
127 | },
128 | {
129 | "cell_type": "markdown",
130 | "source": [
131 | "Run and Upload Two Images. Make sure both images have the same alpha channel profile. If one image is RGB and the other is RGBA, the interpolation will fail. The number of frames in the output video is 2^EXP (2 raised to the power of EXP). FPS is the number of frames per second of the output video."
132 | ],
133 | "metadata": {
134 | "id": "XsBCbd6-naMN"
135 | }
136 | },
137 | {
138 | "cell_type": "code",
139 | "source": [
140 | "# @title\n",
141 | "EXP = 4 # @param {\"type\":\"number\"}\n",
142 | "FPS = 30 # @param {\"type\":\"number\"}\n",
143 | "\n",
144 | "\n",
145 | "from google.colab import files\n",
146 | "import os\n",
147 | "import shutil\n",
148 | "# Delete the existing output folder if it exists\n",
149 | "output_dir = \"/content/Practical-RIFE/output\"\n",
150 | "if os.path.exists(output_dir):\n",
151 | " shutil.rmtree(output_dir)\n",
152 | "from PIL import Image\n",
153 | "uploaded = files.upload()\n",
154 | "target_dir = \"/content/Practical-RIFE\"\n",
155 | "os.makedirs(target_dir, exist_ok=True)\n",
156 | "\n",
157 | "image_paths = []\n",
158 | "for i, (filename, file_data) in enumerate(uploaded.items()):\n",
159 | " ext = os.path.splitext(filename)[1].lower()\n",
160 | " image_path = os.path.join(target_dir, f\"{i}{ext}\")\n",
161 | "\n",
162 | " with open(image_path, \"wb\") as f:\n",
163 | " f.write(file_data)\n",
164 | "\n",
165 | " image_paths.append(image_path)\n",
166 | "\n",
167 | "image_0 = image_paths[0] if len(image_paths) > 0 else None\n",
168 | "image_1 = image_paths[1] if len(image_paths) > 1 else None\n",
169 | "\n",
170 | "if image_0 is not None and image_1 is not None:\n",
171 | " with Image.open(image_0) as img:\n",
172 | " width, height = img.size\n",
173 | " !python3 inference_img.py --img {image_0} {image_1} --exp {EXP}\n",
174 | " !ffmpeg -r {FPS} -f image2 -i output/img%d.png -s {width}x{height} -vcodec libx264 -crf 23 -preset fast -pix_fmt yuv420p output/vid_from_images.mp4 -loglevel error -y\n",
175 | " from IPython.display import display as displayVid, Video as outVid\n",
176 | " displayVid(outVid(\"output/vid_from_images.mp4\", embed=True))\n",
177 | "else:\n",
178 | " print(\"You need to select two images!\")"
179 | ],
180 | "metadata": {
181 | "cellView": "form",
182 | "id": "9d_6DEs-aMDI"
183 | },
184 | "execution_count": null,
185 | "outputs": []
186 | },
187 | {
188 | "cell_type": "markdown",
189 | "source": [
190 | "**VIDEO INTERPOLATION**"
191 | ],
192 | "metadata": {
193 | "id": "Ed2JbLwIMoYI"
194 | }
195 | },
196 | {
197 | "cell_type": "markdown",
198 | "source": [
199 | "- Run and Upload your Video. If your video has 30 frames, then a **FRAME_MULTIPLIER** of 2 will double the number of frames to 60. If the video is 15fps, then using the default **FPS** (30) will give you a smoother output video at 30fps. The duration will remain the same. Increasing the **FPS** will reduce the duration of the video, resulting in a speed-up effect. Reducing the **FPS** will increase the duration, resulting in a slow-motion effect.\n",
200 | "- The **SCALE** parameter should be set to 0.5 for 4K videos (according to the authors of the model).\n",
201 | "- Enabling **INCLUDE_AUDIO** will include the audio (from the input video) in the output video, and the **FPS** parameter will not be used. In that case the **FRAME_MULTIPLIER** determines the fps of the output video; for example, a value of 3 turns a 10fps video into a 30fps video with no change in duration."
202 | ],
203 | "metadata": {
204 | "id": "erVrRUA6MyOZ"
205 | }
206 | },
207 | {
208 | "cell_type": "code",
209 | "source": [
210 | "# @title\n",
211 | "FRAME_MULTIPLIER = 2 # @param {\"type\":\"number\"}\n",
212 | "FPS = 30 # @param {\"type\":\"number\"}\n",
213 | "SCALE = 1 # @param {\"type\":\"number\"}\n",
214 | "INCLUDE_AUDIO = False # @param {\"type\":\"boolean\"}\n",
215 | "from google.colab import files\n",
216 | "import os\n",
217 | "import glob\n",
218 | "from IPython.display import display as displayVid, Video as outVid\n",
219 | "\n",
220 | "uploaded = files.upload()\n",
221 | "\n",
222 | "# Define target directory\n",
223 | "target_dir = \"/content/Practical-RIFE\"\n",
224 | "os.makedirs(target_dir, exist_ok=True)\n",
225 | "\n",
226 | "video_path = None\n",
227 | "for filename, file_data in uploaded.items():\n",
228 | " ext = os.path.splitext(filename)[1].lower()\n",
229 | " if ext in [\".mp4\", \".avi\", \".mov\", \".mkv\"]: # Ensure it's a valid video format\n",
230 | " video_path = os.path.join(target_dir, f\"uploaded_video{ext}\")\n",
231 | " with open(video_path, \"wb\") as f:\n",
232 | " f.write(file_data)\n",
233 | " break\n",
234 | "\n",
235 | "# Suppress ALSA errors\n",
236 | "os.environ[\"XDG_RUNTIME_DIR\"] = \"/tmp\"\n",
237 | "os.environ[\"SDL_AUDIODRIVER\"] = \"dummy\"\n",
238 | "\n",
239 | "# Disable warnings from ffmpeg about missing audio\n",
240 | "os.environ[\"PYGAME_HIDE_SUPPORT_PROMPT\"] = \"1\"\n",
241 | "os.environ[\"FFMPEG_LOGLEVEL\"] = \"quiet\"\n",
242 | "\n",
243 | "if INCLUDE_AUDIO:\n",
244 | " !python3 inference_video.py --multi={FRAME_MULTIPLIER} --video={video_path} --scale={SCALE}\n",
245 | "else:\n",
246 | " !python3 inference_video.py --multi={FRAME_MULTIPLIER} --fps={FPS} --video={video_path} --scale={SCALE}\n",
247 | "\n",
248 | "\n",
249 | "video_folder = \"/content/Practical-RIFE/\"\n",
250 | "\n",
251 | "# Find the latest MP4 file\n",
252 | "video_files = glob.glob(os.path.join(video_folder, \"*.mp4\"))\n",
253 | "\n",
254 | "if video_files:\n",
255 | " latest_video = max(video_files, key=os.path.getctime)\n",
256 | " !ffmpeg -i \"{latest_video}\" -vcodec libx264 -crf 23 -preset fast output_converted.mp4 -loglevel error -y\n",
257 | "\n",
258 | " print(f\"Displaying video: {latest_video}\")\n",
259 | " displayVid(outVid(\"output_converted.mp4\", embed=True))\n",
260 | " # displayVid(outVid(latest_video, embed=True))\n",
261 | "else:\n",
262 | "    print(\"❌ No video found in /content/Practical-RIFE/\")\n",
263 | "\n"
264 | ],
265 | "metadata": {
266 | "cellView": "form",
267 | "id": "L_4Hi6cFMsMG"
268 | },
269 | "execution_count": null,
270 | "outputs": []
271 | }
272 | ]
273 | }
--------------------------------------------------------------------------------
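
A minimal sketch (not part of the repository files) of the frame-count and timing rules the FrameInterpolationRIFE notebook above describes: the two-image cells produce 2^EXP frames played back at FPS, and the video cell multiplies the input frame count by FRAME_MULTIPLIER, with FPS (or the input fps when INCLUDE_AUDIO is enabled) setting the output duration. The names EXP, FPS, FRAME_MULTIPLIER and INCLUDE_AUDIO mirror the notebook's form fields.

def gif_from_two_images(exp: int, fps: float) -> dict:
    """Two-image interpolation: the notebook states the output GIF has 2**exp frames."""
    frames = 2 ** exp
    return {"frames": frames, "duration_s": frames / fps}

def video_interpolation(input_frames: int, input_fps: float, frame_multiplier: int,
                        output_fps: float, include_audio: bool) -> dict:
    """Video interpolation: FRAME_MULTIPLIER scales the frame count.
    With INCLUDE_AUDIO the output fps becomes input_fps * frame_multiplier (duration
    unchanged); otherwise FPS sets the playback rate and therefore the duration."""
    frames = input_frames * frame_multiplier
    fps = input_fps * frame_multiplier if include_audio else output_fps
    return {"frames": frames, "fps": fps, "duration_s": frames / fps}

print(gif_from_two_images(exp=4, fps=10))           # 16 frames -> a 1.6 s GIF
print(video_interpolation(450, 15, 2, 30, False))   # a 30 s 15fps clip stays 30 s at 30fps
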
/Hidream_fp8.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "nbformat": 4,
3 | "nbformat_minor": 0,
4 | "metadata": {
5 | "colab": {
6 | "provenance": [],
7 | "gpuType": "L4"
8 | },
9 | "kernelspec": {
10 | "name": "python3",
11 | "display_name": "Python 3"
12 | },
13 | "language_info": {
14 | "name": "python"
15 | },
16 | "accelerator": "GPU"
17 | },
18 | "cells": [
19 | {
20 | "cell_type": "markdown",
21 | "source": [
22 | "# **Hidream_fp8 for Text to Image Generation**\n",
23 | "- Choose one of the three HiDream model versions before setting up the environment.\n",
24 | "\n",
25 | "- Each model is 17.1GB, so make sure you have sufficient compute units (e.g., Colab Pro or equivalent) to run the notebook smoothly.\n",
26 | "\n",
27 | "- The notebook is divided into three sections for image generation, each pre-configured with the recommended settings for its corresponding model version."
28 | ],
29 | "metadata": {
30 | "id": "qOMU9X8Yi19t"
31 | }
32 | },
33 | {
34 | "cell_type": "code",
35 | "execution_count": null,
36 | "metadata": {
37 | "cellView": "form",
38 | "id": "982f7zyAZzdc"
39 | },
40 | "outputs": [],
41 | "source": [
42 | "# @title Setup Environment\n",
43 | "%cd /content\n",
44 | "!pip install --upgrade --quiet torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118\n",
45 | "# !pip install -q torchsde einops diffusers accelerate xformers\n",
46 | "# !git clone https://github.com/comfyanonymous/ComfyUI\n",
47 | "!git clone https://github.com/Isi-dev/ComfyUI\n",
48 | "%cd /content/ComfyUI\n",
49 | "# !apt -y install -qq aria2\n",
50 | "\n",
51 | "import subprocess\n",
52 | "import sys\n",
53 | "from IPython.display import clear_output\n",
54 | "\n",
55 | "def install_pip_packages():\n",
56 | " packages = [\n",
57 | " 'torchsde',\n",
58 | " # 'av',\n",
59 | " 'diffusers',\n",
60 | " # 'transformers',\n",
61 | " 'xformers',\n",
62 | " 'accelerate',\n",
63 | " # 'omegaconf',\n",
64 | " # 'tqdm',\n",
65 | " # 'librosa',\n",
66 | " 'einops'\n",
67 | " ]\n",
68 | "\n",
69 | " for package in packages:\n",
70 | " try:\n",
71 | " # Run pip install silently (using -q)\n",
72 | " subprocess.run(\n",
73 | " [sys.executable, '-m', 'pip', 'install', '-q', package],\n",
74 | " check=True,\n",
75 | " capture_output=True\n",
76 | " )\n",
77 | " print(f\"✓ {package} installed\")\n",
78 | " except subprocess.CalledProcessError as e:\n",
79 | " print(f\"✗ Error installing {package}: {e.stderr.decode().strip() or 'Unknown error'}\")\n",
80 | "\n",
81 | "def install_apt_packages():\n",
82 | " packages = ['aria2']\n",
83 | "\n",
84 | " try:\n",
85 | " # Run apt install silently (using -qq)\n",
86 | " subprocess.run(\n",
87 | " ['apt-get', '-y', 'install', '-qq'] + packages,\n",
88 | " check=True,\n",
89 | " capture_output=True\n",
90 | " )\n",
91 | " print(\"✓ apt packages installed\")\n",
92 | " except subprocess.CalledProcessError as e:\n",
93 | " print(f\"✗ Error installing apt packages: {e.stderr.decode().strip() or 'Unknown error'}\")\n",
94 | "\n",
95 | "# Run installations\n",
96 | "print(\"Installing pip packages...\")\n",
97 | "install_pip_packages()\n",
98 | "clear_output() # Clear the pip installation output\n",
99 | "\n",
100 | "print(\"Installing apt packages...\")\n",
101 | "install_apt_packages()\n",
102 | "clear_output() # Clear the apt installation output\n",
103 | "\n",
104 | "print(\"Installation completed.\")\n",
105 | "# Note: the per-package ✓/✗ messages are wiped by the clear_output() calls above;\n",
106 | "# comment those calls out if you need to inspect individual install errors.\n",
107 | "\n",
108 | "import torch\n",
109 | "import numpy as np\n",
110 | "from PIL import Image\n",
111 | "import gc\n",
112 | "import os\n",
113 | "sys.path.insert(0, '/content/ComfyUI')\n",
114 | "\n",
115 | "from comfy import model_management\n",
116 | "\n",
117 | "from nodes import (\n",
118 | " KSampler,\n",
119 | " VAEDecode,\n",
120 | " VAELoader,\n",
121 | " CLIPTextEncode,\n",
122 | " SaveImage\n",
123 | ")\n",
124 | "\n",
125 | "from comfy_extras.nodes_model_advanced import ModelSamplingSD3\n",
126 | "from comfy_extras.nodes_sd3 import EmptySD3LatentImage\n",
127 | "from comfy_extras.nodes_hidream import QuadrupleCLIPLoader\n",
128 | "from nodes import UNETLoader\n",
129 | "\n",
130 | "from pathlib import Path\n",
131 | "\n",
132 | "def model_download(url: str, dest_dir: str, filename: str = None, silent: bool = True) -> str | bool:\n",
133 | " \"\"\"\n",
134 | " Colab-optimized download with aria2c\n",
135 | "\n",
136 | " Args:\n",
137 | " url: Download URL\n",
138 | " dest_dir: Target directory (will be created if needed)\n",
139 | " filename: Optional output filename (defaults to URL filename)\n",
140 | " silent: If True, suppresses all output (except errors)\n",
141 | "\n",
142 | " Returns:\n",
143 | "        str | bool: the downloaded filename if successful, False if failed\n",
144 | " \"\"\"\n",
145 | " try:\n",
146 | " # Create destination directory\n",
147 | " Path(dest_dir).mkdir(parents=True, exist_ok=True)\n",
148 | "\n",
149 | " # Set filename if not specified\n",
150 | " if filename is None:\n",
151 | " filename = url.split('/')[-1].split('?')[0] # Remove URL parameters\n",
152 | "\n",
153 | " # Build command\n",
154 | " cmd = [\n",
155 | " 'aria2c',\n",
156 | " '--console-log-level=error',\n",
157 | " '-c', '-x', '16', '-s', '16', '-k', '1M',\n",
158 | " '-d', dest_dir,\n",
159 | " '-o', filename,\n",
160 | " url\n",
161 | " ]\n",
162 | "\n",
163 | " # Add silent flags if requested\n",
164 | " if silent:\n",
165 | " cmd.extend(['--summary-interval=0', '--quiet'])\n",
166 | " print(f\"Downloading {filename}...\", end=' ', flush=True)\n",
167 | "\n",
168 | " # Run download\n",
169 | " result = subprocess.run(cmd, check=True, capture_output=True, text=True)\n",
170 | "\n",
171 | " if silent:\n",
172 | " print(\"Done!\")\n",
173 | " else:\n",
174 | " print(f\"Downloaded {filename} to {dest_dir}\")\n",
175 | " return filename\n",
176 | "\n",
177 | " except subprocess.CalledProcessError as e:\n",
178 | " error = e.stderr.strip() or \"Unknown error\"\n",
179 | " print(f\"\\nError downloading {filename}: {error}\")\n",
180 | " return False\n",
181 | " except Exception as e:\n",
182 | " print(f\"\\nError: {str(e)}\")\n",
183 | " return False\n",
184 | "\n",
185 | "hidream_fp8_version = \"dev\" # @param [\"fast\", \"dev\", \"full\"]\n",
186 | "\n",
187 | "hidream_model = \"hidream_i1_dev_fp8.safetensors\"\n",
188 | "\n",
189 | "if hidream_fp8_version == \"fast\":\n",
190 | " hidream_model = model_download(\"https://huggingface.co/Comfy-Org/HiDream-I1_ComfyUI/resolve/main/split_files/diffusion_models/hidream_i1_fast_fp8.safetensors\", \"/content/ComfyUI/models/diffusion_models\")\n",
191 | "elif hidream_fp8_version == \"full\":\n",
192 | " hidream_model = model_download(\"https://huggingface.co/Comfy-Org/HiDream-I1_ComfyUI/resolve/main/split_files/diffusion_models/hidream_i1_full_fp8.safetensors\", \"/content/ComfyUI/models/diffusion_models\")\n",
193 | "else:\n",
194 | " model_download(\"https://huggingface.co/Comfy-Org/HiDream-I1_ComfyUI/resolve/main/split_files/diffusion_models/hidream_i1_dev_fp8.safetensors\", \"/content/ComfyUI/models/diffusion_models\")\n",
195 | "\n",
196 | "model_download(\"https://huggingface.co/Comfy-Org/HiDream-I1_ComfyUI/resolve/main/split_files/text_encoders/clip_g_hidream.safetensors\", \"/content/ComfyUI/models/text_encoders\")\n",
197 | "model_download(\"https://huggingface.co/Comfy-Org/HiDream-I1_ComfyUI/resolve/main/split_files/text_encoders/clip_l_hidream.safetensors\", \"/content/ComfyUI/models/text_encoders\")\n",
198 | "model_download(\"https://huggingface.co/Comfy-Org/HiDream-I1_ComfyUI/resolve/main/split_files/text_encoders/llama_3.1_8b_instruct_fp8_scaled.safetensors\", \"/content/ComfyUI/models/text_encoders\")\n",
199 | "model_download(\"https://huggingface.co/Comfy-Org/HiDream-I1_ComfyUI/resolve/main/split_files/text_encoders/t5xxl_fp8_e4m3fn_scaled.safetensors\", \"/content/ComfyUI/models/text_encoders\")\n",
200 | "model_download(\"https://huggingface.co/Comfy-Org/HiDream-I1_ComfyUI/resolve/main/split_files/vae/ae.safetensors\", \"/content/ComfyUI/models/vae\")\n",
201 | "\n",
202 | "# Initialize nodes\n",
203 | "unet_loader = UNETLoader()\n",
204 | "model_sampling = ModelSamplingSD3()\n",
205 | "clip_loader = QuadrupleCLIPLoader()\n",
206 | "clip_encode_positive = CLIPTextEncode()\n",
207 | "clip_encode_negative = CLIPTextEncode()\n",
208 | "vae_loader = VAELoader()\n",
209 | "empty_latent_image = EmptySD3LatentImage()\n",
210 | "ksampler = KSampler()\n",
211 | "vae_decode = VAEDecode()\n",
212 | "save_image = SaveImage()\n",
213 | "\n",
214 | "def clear_memory():\n",
215 | " gc.collect()\n",
216 | " if torch.cuda.is_available():\n",
217 | " torch.cuda.empty_cache()\n",
218 | " torch.cuda.ipc_collect()\n",
219 | " for obj in list(globals().values()):\n",
220 | " if torch.is_tensor(obj) or (hasattr(obj, \"data\") and torch.is_tensor(obj.data)):\n",
221 | " del obj\n",
222 | " gc.collect()\n",
223 | "\n",
224 | "\n",
225 | "def generate_image(\n",
226 | " positive_prompt: str = \"anime girl with massive fennec ears and a big fluffy fox tail with long wavy blonde hair and blue eyes wearing a pink sweater a large oversized black winter coat and a long blue maxi skirt and large winter boots and a red scarf and large gloves sitting in a sled sledding fast down a snow mountain\",\n",
227 | " negative_prompt: str = \"bad ugly jpeg artifacts\",\n",
228 | " width: int = 1024,\n",
229 | " height: int = 1024,\n",
230 | " seed: int = 147638433643733,\n",
231 | " steps: int = 28,\n",
232 | " cfg_scale: float = 1.0,\n",
233 | " sampler_name: str = \"lcm\",\n",
234 | " scheduler: str = \"simple\",\n",
235 | " shift: float = 6.0\n",
236 | "):\n",
237 | " with torch.inference_mode():\n",
238 | " print(\"Loading CLIP models...\")\n",
239 | " clip = clip_loader.load_clip(\n",
240 | " \"clip_l_hidream.safetensors\",\n",
241 | " \"clip_g_hidream.safetensors\",\n",
242 | " \"t5xxl_fp8_e4m3fn_scaled.safetensors\",\n",
243 | " \"llama_3.1_8b_instruct_fp8_scaled.safetensors\"\n",
244 | " )[0]\n",
245 | "\n",
246 | " print(\"Encoding prompts...\")\n",
247 | " positive = clip_encode_positive.encode(clip, positive_prompt)[0]\n",
248 | " negative = clip_encode_negative.encode(clip, negative_prompt)[0]\n",
249 | "\n",
250 | " del clip\n",
251 | " torch.cuda.empty_cache()\n",
252 | " gc.collect()\n",
253 | "\n",
254 | " print(\"Creating empty latent...\")\n",
255 | " empty_latent = empty_latent_image.generate(width, height, 1)[0]\n",
256 | "\n",
257 | " print(\"Loading UNet model...\")\n",
258 | " model = unet_loader.load_unet(hidream_model, \"default\")[0]\n",
259 | " model = model_sampling.patch(model, shift)[0]\n",
260 | "\n",
261 | " print(\"Sampling...\")\n",
262 | " sampled = ksampler.sample(\n",
263 | " model=model,\n",
264 | " seed=seed,\n",
265 | " steps=steps,\n",
266 | " cfg=cfg_scale,\n",
267 | " sampler_name=sampler_name,\n",
268 | " scheduler=scheduler,\n",
269 | " positive=positive,\n",
270 | " negative=negative,\n",
271 | " latent_image=empty_latent\n",
272 | " )[0]\n",
273 | "\n",
274 | " del model\n",
275 | " torch.cuda.empty_cache()\n",
276 | " gc.collect()\n",
277 | "\n",
278 | " print(\"Loading VAE...\")\n",
279 | " vae = vae_loader.load_vae(\"ae.safetensors\")[0]\n",
280 | "\n",
281 | " try:\n",
282 | " print(\"Decoding image...\")\n",
283 | " decoded = vae_decode.decode(vae, sampled)[0]\n",
284 | "\n",
285 | " del vae\n",
286 | " torch.cuda.empty_cache()\n",
287 | " gc.collect()\n",
288 | "\n",
289 | " print(\"Saving image...\")\n",
290 | " output_path = save_image.save_images(decoded, \"ComfyUI\")[\"ui\"][\"images\"][0][\"filename\"]\n",
291 | " full_path = f\"/content/ComfyUI/output/{output_path}\"\n",
292 | "\n",
293 | " from IPython.display import display, Image\n",
294 | " display(Image(filename=full_path))\n",
295 | "\n",
296 | " return full_path\n",
297 | "\n",
298 | " except Exception as e:\n",
299 | " print(f\"Error during decoding/saving: {str(e)}\")\n",
300 | " raise\n",
301 | " finally:\n",
302 | " clear_memory()\n",
303 | "\n",
304 | "print(\"✅ Environment Setup Complete!\")\n",
305 | "\n",
306 | "# Example usage:\n",
307 | "# generate_image()"
308 | ]
309 | },
310 | {
311 | "cell_type": "markdown",
312 | "source": [
313 | "\n",
314 | "\n",
315 | "\n",
316 | "---\n",
317 | "\n",
318 | "\n",
319 | "\n",
320 | "---\n",
321 | "\n",
322 | "\n",
323 | "---\n",
324 | "\n"
325 | ],
326 | "metadata": {
327 | "id": "B48efDl35VBg"
328 | }
329 | },
330 | {
331 | "cell_type": "markdown",
332 | "source": [
333 | "**Use this section if you selected `fast` as the hidream_fp8_version** in \"Setup Environment\""
334 | ],
335 | "metadata": {
336 | "id": "QB3AR9Zx-QVo"
337 | }
338 | },
339 | {
340 | "cell_type": "code",
341 | "source": [
342 | "# @title Generate Image (Fast Version)\n",
343 | "positive_prompt = \"anime girl with massive fennec ears and a big fluffy fox tail with long wavy blonde hair and blue eyes wearing a pink sweater a large oversized black winter coat and a long blue maxi skirt and large winter boots and a red scarf and large gloves sitting in a sled sledding fast down a snow mountain\" # @param {\"type\":\"string\"}\n",
344 | "negative_prompt = \"bad ugly jpeg artifacts \" # @param {\"type\":\"string\"}\n",
345 | "width = 1024 # @param {\"type\":\"integer\", \"min\":512, \"max\":2048}\n",
346 | "height = 1024 # @param {\"type\":\"integer\", \"min\":512, \"max\":2048}\n",
347 | "seed = 1000 # @param {\"type\":\"integer\"}\n",
348 | "steps = 16 # @param {\"type\":\"integer\", \"min\":1, \"max\":100}\n",
349 | "cfg_scale = 1.0 # @param {\"type\":\"number\", \"min\":0.1, \"max\":20.0}\n",
350 | "sampler_name = \"lcm\" # @param [\"lcm\", \"uni_pc\", \"euler\", \"dpmpp_2m\", \"ddim\", \"lms\"]\n",
351 | "scheduler = \"simple\" # @param [\"simple\", \"normal\", \"karras\", \"exponential\"]\n",
352 | "shift = 3.0 # @param {\"type\":\"number\", \"min\":0.0, \"max\":10.0}\n",
353 | "\n",
354 | "import random\n",
355 | "seed = seed if seed != 0 else random.randint(0, 2**32 - 1)\n",
356 | "print(f\"Using seed: {seed}\")\n",
357 | "\n",
358 | "# Generate the image\n",
359 | "output_path = generate_image(\n",
360 | " positive_prompt=positive_prompt,\n",
361 | " negative_prompt=negative_prompt,\n",
362 | " width=width,\n",
363 | " height=height,\n",
364 | " seed=seed,\n",
365 | " steps=steps,\n",
366 | " cfg_scale=cfg_scale,\n",
367 | " sampler_name=sampler_name,\n",
368 | " scheduler=scheduler,\n",
369 | " shift=shift\n",
370 | ")\n"
371 | ],
372 | "metadata": {
373 | "cellView": "form",
374 | "id": "gHM0_hQFxWvB"
375 | },
376 | "execution_count": null,
377 | "outputs": []
378 | },
379 | {
380 | "cell_type": "markdown",
381 | "source": [
382 | "\n",
383 | "\n",
384 | "\n",
385 | "---\n",
386 | "\n",
387 | "---\n",
388 | "\n",
389 | "\n",
390 | "\n",
391 | "---\n",
392 | "\n",
393 | "\n",
394 | "\n",
395 | "\n",
396 | "\n"
397 | ],
398 | "metadata": {
399 | "id": "KKFt5NKo5Zrn"
400 | }
401 | },
402 | {
403 | "cell_type": "markdown",
404 | "source": [
405 | "**Use this section if you selected `dev` as the hidream_fp8_version** in \"Setup Environment\""
406 | ],
407 | "metadata": {
408 | "id": "D3cquHaK_WN7"
409 | }
410 | },
411 | {
412 | "cell_type": "code",
413 | "source": [
414 | "# @title Generate Image (Dev Version)\n",
415 | "positive_prompt = \"anime girl with massive fennec ears and a big fluffy fox tail with long wavy blonde hair and blue eyes wearing a pink sweater a large oversized black winter coat and a long blue maxi skirt and large winter boots and a red scarf and large gloves sitting in a sled sledding fast down a snow mountain\" # @param {\"type\":\"string\"}\n",
416 | "negative_prompt = \"bad ugly jpeg artifacts \" # @param {\"type\":\"string\"}\n",
417 | "width = 1024 # @param {\"type\":\"integer\", \"min\":512, \"max\":2048}\n",
418 | "height = 1024 # @param {\"type\":\"integer\", \"min\":512, \"max\":2048}\n",
419 | "seed = 1000 # @param {\"type\":\"integer\"}\n",
420 | "steps = 28 # @param {\"type\":\"integer\", \"min\":1, \"max\":100}\n",
421 | "cfg_scale = 1.0 # @param {\"type\":\"number\", \"min\":0.1, \"max\":20.0}\n",
422 | "sampler_name = \"lcm\" # @param [\"lcm\", \"uni_pc\", \"euler\", \"dpmpp_2m\", \"ddim\", \"lms\"]\n",
423 | "scheduler = \"simple\" # @param [\"simple\", \"normal\", \"karras\", \"exponential\"]\n",
424 | "shift = 6.0 # @param {\"type\":\"number\", \"min\":0.0, \"max\":10.0}\n",
425 | "\n",
426 | "import random\n",
427 | "seed = seed if seed != 0 else random.randint(0, 2**32 - 1)\n",
428 | "print(f\"Using seed: {seed}\")\n",
429 | "\n",
430 | "# Generate the image\n",
431 | "output_path = generate_image(\n",
432 | " positive_prompt=positive_prompt,\n",
433 | " negative_prompt=negative_prompt,\n",
434 | " width=width,\n",
435 | " height=height,\n",
436 | " seed=seed,\n",
437 | " steps=steps,\n",
438 | " cfg_scale=cfg_scale,\n",
439 | " sampler_name=sampler_name,\n",
440 | " scheduler=scheduler,\n",
441 | " shift=shift\n",
442 | ")\n"
443 | ],
444 | "metadata": {
445 | "collapsed": true,
446 | "cellView": "form",
447 | "id": "C-Mt6P2DZ_U9"
448 | },
449 | "execution_count": null,
450 | "outputs": []
451 | },
452 | {
453 | "cell_type": "markdown",
454 | "source": [
455 | "\n",
456 | "\n",
457 | "---\n",
458 | "\n",
459 | "\n",
460 | "\n",
461 | "---\n",
462 | "\n",
463 | "\n",
464 | "\n",
465 | "---\n",
466 | "\n"
467 | ],
468 | "metadata": {
469 | "id": "p7xJ_nSk55X6"
470 | }
471 | },
472 | {
473 | "cell_type": "markdown",
474 | "source": [
475 | "**Use this section if you selected `full` as the hidream_fp8_version** in \"Setup Environment\""
476 | ],
477 | "metadata": {
478 | "id": "9K7B3Cdy_jrJ"
479 | }
480 | },
481 | {
482 | "cell_type": "code",
483 | "source": [
484 | "# @title Generate Image (Full Version)\n",
485 | "positive_prompt = \"anime girl with massive fennec ears and a big fluffy fox tail with long wavy blonde hair and blue eyes wearing a pink sweater a large oversized black winter coat and a long blue maxi skirt and large winter boots and a red scarf and large gloves sitting in a sled sledding fast down a snow mountain\" # @param {\"type\":\"string\"}\n",
486 | "negative_prompt = \"bad ugly jpeg artifacts \" # @param {\"type\":\"string\"}\n",
487 | "width = 1024 # @param {\"type\":\"integer\", \"min\":512, \"max\":2048}\n",
488 | "height = 1024 # @param {\"type\":\"integer\", \"min\":512, \"max\":2048}\n",
489 | "seed = 1000 # @param {\"type\":\"integer\"}\n",
490 | "steps = 50 # @param {\"type\":\"integer\", \"min\":1, \"max\":100}\n",
491 | "cfg_scale = 5.0 # @param {\"type\":\"number\", \"min\":0.1, \"max\":20.0}\n",
492 | "sampler_name = \"lcm\" # @param [\"lcm\", \"uni_pc\", \"euler\", \"dpmpp_2m\", \"ddim\", \"lms\"]\n",
493 | "scheduler = \"simple\" # @param [\"simple\", \"normal\", \"karras\", \"exponential\"]\n",
494 | "shift = 3.0 # @param {\"type\":\"number\", \"min\":0.0, \"max\":10.0}\n",
495 | "\n",
496 | "import random\n",
497 | "seed = seed if seed != 0 else random.randint(0, 2**32 - 1)\n",
498 | "print(f\"Using seed: {seed}\")\n",
499 | "\n",
500 | "# Generate the image\n",
501 | "output_path = generate_image(\n",
502 | " positive_prompt=positive_prompt,\n",
503 | " negative_prompt=negative_prompt,\n",
504 | " width=width,\n",
505 | " height=height,\n",
506 | " seed=seed,\n",
507 | " steps=steps,\n",
508 | " cfg_scale=cfg_scale,\n",
509 | " sampler_name=sampler_name,\n",
510 | " scheduler=scheduler,\n",
511 | " shift=shift\n",
512 | ")\n"
513 | ],
514 | "metadata": {
515 | "cellView": "form",
516 | "id": "WlCU9T4R0bu5"
517 | },
518 | "execution_count": null,
519 | "outputs": []
520 | }
521 | ]
522 | }
--------------------------------------------------------------------------------
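
A minimal usage sketch (not part of the repository files) for the Hidream_fp8 notebook above, collecting the per-version defaults from its three "Generate Image" form cells. It assumes the notebook's "Setup Environment" cell has already been run in the same session, so that generate_image() is defined and the selected fp8 checkpoint has been downloaded.

RECOMMENDED = {
    # values taken from the fast/dev/full form cells in Hidream_fp8.ipynb
    "fast": {"steps": 16, "cfg_scale": 1.0, "shift": 3.0},
    "dev":  {"steps": 28, "cfg_scale": 1.0, "shift": 6.0},
    "full": {"steps": 50, "cfg_scale": 5.0, "shift": 3.0},
}

def run_hidream(version: str, prompt: str, seed: int = 1000) -> str:
    """Call the notebook's generate_image() with the settings recommended for `version`."""
    settings = RECOMMENDED[version]
    return generate_image(                      # defined by the Setup Environment cell
        positive_prompt=prompt,
        negative_prompt="bad ugly jpeg artifacts",
        width=1024, height=1024, seed=seed,
        sampler_name="lcm", scheduler="simple",
        **settings,
    )

# run_hidream("dev", "a watercolor painting of a lighthouse at dawn")

The version passed to run_hidream() must match the hidream_fp8_version chosen during setup, since only that checkpoint is downloaded.
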
/LTX_Video_Img_to_Vid.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "nbformat": 4,
3 | "nbformat_minor": 0,
4 | "metadata": {
5 | "colab": {
6 | "provenance": [],
7 | "gpuType": "T4"
8 | },
9 | "kernelspec": {
10 | "name": "python3",
11 | "display_name": "Python 3"
12 | },
13 | "language_info": {
14 | "name": "python"
15 | },
16 | "accelerator": "GPU"
17 | },
18 | "cells": [
19 | {
20 | "cell_type": "markdown",
21 | "source": [
22 | "# **LTX-VIDEO (Image to Video based on ComfyUI nodes library)**\n",
23 | "ComfyUI Github Repository: https://github.com/comfyanonymous/ComfyUI"
24 | ],
25 | "metadata": {
26 | "id": "f4p1ysFKMbs_"
27 | }
28 | },
29 | {
30 | "cell_type": "markdown",
31 | "source": [
32 | "- Note that this Notebook only references the ComfyUI nodes library; it does not display the ComfyUI GUI.\n",
33 | "- You can use the free T4 GPU to run this, depending on the output video resolution and number of frames. The default settings run without issues, but at a 768 by 512 output resolution with 73 frames, the decoding step exhausts the 12.7GB of system RAM and crashes the session. For faster video generation at higher resolutions and frame counts, use a higher-tier GPU.\n",
34 | "- If you want to generate a video with n frames, then set frames to n+1. e.g. To generate a video with 72 frames, set frames to 73.\n",
35 | "- You need to use detailed prompts to get decent results.\n",
36 | "- Videos are generated at 24fps."
37 | ],
38 | "metadata": {
39 | "id": "EBB00lC6q-DA"
40 | }
41 | },
42 | {
43 | "cell_type": "code",
44 | "source": [
45 | "# @title Prepare Environment\n",
46 | "!pip install --upgrade --quiet torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118\n",
47 | "%cd /content\n",
48 | "Always_Load_Models_for_Inference = False\n",
49 | "Use_t5xxl_fp16 = False\n",
50 | "\n",
51 | "!pip install -q torchsde einops diffusers accelerate xformers\n",
52 | "!pip install av\n",
53 | "!git clone https://github.com/Isi-dev/ComfyUI\n",
54 | "%cd /content/ComfyUI\n",
55 | "!apt -y install -qq aria2 ffmpeg\n",
56 | "\n",
57 | "!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/Isi99999/LTX-Video/resolve/main/ltx-video-2b-v0.9.5.safetensors -d /content/ComfyUI/models/checkpoints -o ltx-video-2b-v0.9.5.safetensors\n",
58 | "if Use_t5xxl_fp16:\n",
59 | " !aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/Isi99999/LTX-Video/resolve/main/t5xxl_fp16.safetensors -d /content/ComfyUI/models/text_encoders -o t5xxl_fp16.safetensors\n",
60 | "else:\n",
61 | " !aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/Isi99999/LTX-Video/resolve/main/t5xxl_fp8_e4m3fn_scaled.safetensors -d /content/ComfyUI/models/text_encoders -o t5xxl_fp8_e4m3fn_scaled.safetensors\n",
62 | "\n",
63 | "import torch\n",
64 | "import numpy as np\n",
65 | "from PIL import Image\n",
66 | "import gc\n",
67 | "import sys\n",
68 | "import random\n",
69 | "import os\n",
70 | "import imageio\n",
71 | "from google.colab import files\n",
72 | "from IPython.display import display, HTML\n",
73 | "sys.path.insert(0, '/content/ComfyUI')\n",
74 | "\n",
75 | "from comfy import model_management\n",
76 | "\n",
77 | "from nodes import (\n",
78 | " CheckpointLoaderSimple,\n",
79 | " CLIPLoader,\n",
80 | " CLIPTextEncode,\n",
81 | " VAEDecode,\n",
82 | " LoadImage,\n",
83 | " SaveImage\n",
84 | ")\n",
85 | "\n",
86 | "from comfy_extras.nodes_custom_sampler import (\n",
87 | " KSamplerSelect,\n",
88 | " SamplerCustom\n",
89 | ")\n",
90 | "\n",
91 | "from comfy_extras.nodes_lt import (\n",
92 | " LTXVPreprocess,\n",
93 | " LTXVImgToVideo,\n",
94 | " LTXVScheduler,\n",
95 | " LTXVConditioning\n",
96 | ")\n",
97 | "\n",
98 | "checkpoint_loader = CheckpointLoaderSimple()\n",
99 | "clip_loader = CLIPLoader()\n",
100 | "clip_encode_positive = CLIPTextEncode()\n",
101 | "clip_encode_negative = CLIPTextEncode()\n",
102 | "load_image = LoadImage()\n",
103 | "save_node = SaveImage()\n",
104 | "preprocess = LTXVPreprocess()\n",
105 | "img_to_video = LTXVImgToVideo()\n",
106 | "scheduler = LTXVScheduler()\n",
107 | "sampler_select = KSamplerSelect()\n",
108 | "conditioning = LTXVConditioning()\n",
109 | "sampler = SamplerCustom()\n",
110 | "vae_decode = VAEDecode()\n",
111 | "\n",
112 | "# if not Always_Load_Models_for_Inference:\n",
113 | "# with torch.inference_mode():\n",
114 | "# # Load models\n",
115 | "# print(\"Loading Model...\")\n",
116 | "# model, _, vae = checkpoint_loader.load_checkpoint(\"ltx-video-2b-v0.9.5.safetensors\")\n",
117 | "# print(\"Loaded model!\")\n",
118 | "# print(\"Loading Text_Encoder...\")\n",
119 | "# # if Use_t5xxl_fp16:\n",
120 | "# # clip = clip_loader.load_clip(\"t5xxl_fp16.safetensors\", \"ltxv\", \"default\")[0]\n",
121 | "# # else:\n",
122 | "# clip = clip_loader.load_clip(\"t5xxl_fp8_e4m3fn_scaled.safetensors\", \"ltxv\", \"default\")[0]\n",
123 | "# print(\"Loaded Text_Encoder!\")\n",
124 | "\n",
125 | "def clear_gpu_memory():\n",
126 | " gc.collect()\n",
127 | " if torch.cuda.is_available():\n",
128 | " torch.cuda.empty_cache()\n",
129 | " torch.cuda.ipc_collect()\n",
130 | " for obj in list(globals().values()):\n",
131 | " if torch.is_tensor(obj) or (hasattr(obj, \"data\") and torch.is_tensor(obj.data)):\n",
132 | " del obj\n",
133 | "\n",
134 | " gc.collect()\n",
135 | "\n",
136 | "\n",
137 | "def upload_image():\n",
138 | " \"\"\"Handle image upload in Colab and store in /content/ComfyUI/input/\"\"\"\n",
139 | " from google.colab import files\n",
140 | " import os\n",
141 | " import shutil\n",
142 | "\n",
143 | " os.makedirs('/content/ComfyUI/input', exist_ok=True)\n",
144 | "\n",
145 | " uploaded = files.upload()\n",
146 | "\n",
147 | " # Move each uploaded file to ComfyUI input directory\n",
148 | " for filename in uploaded.keys():\n",
149 | " src_path = f'/content/ComfyUI/{filename}'\n",
150 | " dest_path = f'/content/ComfyUI/input/{filename}'\n",
151 | "\n",
152 | " shutil.move(src_path, dest_path)\n",
153 | " print(f\"Image saved to: {dest_path}\")\n",
154 | " return dest_path\n",
155 | "\n",
156 | " return None\n",
157 | "\n",
158 | "\n",
159 | "def generate_video(\n",
160 | " image_path: str = None,\n",
161 | " positive_prompt: str = \"A red fox moving gracefully\",\n",
162 | " negative_prompt: str = \"low quality, worst quality\",\n",
163 | " width: int = 768,\n",
164 | " height: int = 512,\n",
165 | " seed: int = 0,\n",
166 | " steps: int = 30,\n",
167 | " cfg_scale: float = 2.05,\n",
168 | " sampler_name: str = \"euler\",\n",
169 | " length: int = 24, # Number of frames\n",
170 | " fps: int = 24\n",
171 | "):\n",
172 | " with torch.inference_mode():\n",
173 | " print(\"Loading Text_Encoder...\")\n",
174 | " clip = clip_loader.load_clip(\"t5xxl_fp8_e4m3fn_scaled.safetensors\", \"ltxv\", \"default\")[0]\n",
175 | " print(\"Loaded Text_Encoder!\")\n",
176 | " try:\n",
177 | "\n",
178 | " # if Always_Load_Models_for_Inference:\n",
179 | " # with torch.inference_mode():\n",
180 | " # # Load models\n",
181 | " # print(\"Loading Model...\")\n",
182 | " # model, _, vae = checkpoint_loader.load_checkpoint(\"ltx-video-2b-v0.9.5.safetensors\")\n",
183 | " # print(\"Loaded model!\")\n",
184 | " # print(\"Loading Text_Encoder...\")\n",
185 | " # clip = clip_loader.load_clip(\"t5xxl_fp8_e4m3fn_scaled.safetensors\", \"ltxv\", \"default\")[0]\n",
186 | " # print(\"Loaded Text_Encoder!\")\n",
187 | "\n",
188 | " assert width % 32 == 0, \"Width must be divisible by 32\"\n",
189 | " assert height % 32 == 0, \"Height must be divisible by 32\"\n",
190 | "\n",
191 | "\n",
192 | "\n",
193 | " positive = clip_encode_positive.encode(clip, positive_prompt)[0]\n",
194 | " negative = clip_encode_negative.encode(clip, negative_prompt)[0]\n",
195 | "\n",
196 | " del clip\n",
197 | " torch.cuda.empty_cache()\n",
198 | " gc.collect()\n",
199 | " print(\"Text_Encoder removed from memory\")\n",
200 | "\n",
201 | " if image_path is None:\n",
202 | " print(\"Please upload an image file:\")\n",
203 | " image_path = upload_image()\n",
204 | " if image_path is None:\n",
205 | "                    raise ValueError(\"No image uploaded! Re-run the cell and select an image.\")\n",
206 | " loaded_image = load_image.load_image(image_path)[0]\n",
207 | " processed_image = preprocess.preprocess(loaded_image, 40)[0]\n",
208 | "\n",
209 | " print(\"Loading model & VAE...\")\n",
210 | " model, _, vae = checkpoint_loader.load_checkpoint(\"ltx-video-2b-v0.9.5.safetensors\")\n",
211 | " print(\"Loaded model & VAE!\")\n",
212 | "\n",
213 | " video_output = img_to_video.generate(\n",
214 | " positive=positive,\n",
215 | " negative=negative,\n",
216 | " vae=vae,\n",
217 | " image=processed_image,\n",
218 | " width=width,\n",
219 | " height=height,\n",
220 | " length=length,\n",
221 | " batch_size=1\n",
222 | " )\n",
223 | "\n",
224 | " sigmas = scheduler.get_sigmas(steps, cfg_scale, 0.95, True, 0.1)[0]\n",
225 | " selected_sampler = sampler_select.get_sampler(sampler_name)[0]\n",
226 | " conditioned = conditioning.append(video_output[0], video_output[1], 25.0)\n",
227 | "\n",
228 | " print(\"Generating video...\")\n",
229 | "\n",
230 | " sampled = sampler.sample(\n",
231 | " model=model,\n",
232 | " add_noise=True,\n",
233 | " noise_seed=seed if seed != 0 else random.randint(0, 2**32),\n",
234 | " cfg=cfg_scale,\n",
235 | " positive=conditioned[0],\n",
236 | " negative=conditioned[1],\n",
237 | " sampler=selected_sampler,\n",
238 | " sigmas=sigmas,\n",
239 | " latent_image=video_output[2]\n",
240 | " )[0]\n",
241 | "\n",
242 | " # model_management.soft_empty_cache()\n",
243 | " del model\n",
244 | " torch.cuda.empty_cache()\n",
245 | " gc.collect()\n",
246 | " print(\"Model removed from memory\")\n",
247 | "\n",
248 | " with torch.no_grad():\n",
249 | " try:\n",
250 | "                print(\"Decoding Latents...\")\n",
251 | " decoded = vae_decode.decode(vae, sampled)[0].detach()\n",
252 | " # print(f\"Decoded frames shape: {decoded.shape}\")\n",
253 | " print(\"Latents Decoded!\")\n",
254 | " del vae\n",
255 | " torch.cuda.empty_cache()\n",
256 | " gc.collect()\n",
257 | " print(\"VAE removed from memory\")\n",
258 | " except Exception as e:\n",
259 | " print(f\"Error during decoding: {str(e)}\")\n",
260 | " raise\n",
261 | "\n",
262 | " # Reshape to video frames (batch, frames, H, W, C)\n",
263 | " # decoded_frames = decoded.reshape(1, length, height, width, 3)\n",
264 | "\n",
265 | " save_node.save_images(decoded, filename_prefix=\"video_frame\")\n",
266 | "\n",
267 | " output_path = \"/content/output.mp4\"\n",
268 | " frames_np = (decoded.cpu().numpy() * 255).astype(np.uint8)\n",
269 | " with imageio.get_writer(output_path, fps=fps) as writer:\n",
270 | " for frame in frames_np:\n",
271 | " writer.append_data(frame)\n",
272 | "\n",
273 | " print(f\"\\nVideo generation complete!\")\n",
274 | " print(f\"Saved {len(decoded)} frames to ComfyUI output directory\")\n",
275 | " print(f\"Video saved to: {output_path}\")\n",
276 | " display_video(output_path)\n",
277 | "\n",
278 | " except Exception as e:\n",
279 | " print(f\"Error during video generation: {str(e)}\")\n",
280 | " raise\n",
281 | " finally:\n",
282 | " clear_gpu_memory()\n",
283 | "\n",
284 | "\n",
285 | "def display_video(video_path):\n",
286 | " \"\"\"Display video in Colab notebook with proper HTML5 player\"\"\"\n",
287 | " from IPython.display import HTML\n",
288 | " from base64 import b64encode\n",
289 | "\n",
290 | " mp4 = open(video_path,'rb').read()\n",
291 | " data_url = \"data:video/mp4;base64,\" + b64encode(mp4).decode()\n",
292 | "\n",
293 | " display(HTML(f\"\"\"\n",
294 | "    <video controls>\n",
295 | "        <source src=\"{data_url}\" type=\"video/mp4\">\n",
296 | "    </video>\n",
297 | "    \"\"\"))"
298 | ],
299 | "metadata": {
300 | "id": "rrXFIT4fMfyJ"
301 | },
302 | "execution_count": null,
303 | "outputs": []
304 | },
305 | {
306 | "cell_type": "code",
307 | "source": [
308 | "# @title Run Image to Video\n",
309 | "positive_prompt = \"The woman walks forward towards the camera and smiles.\" # @param {\"type\":\"string\"}\n",
310 | "negative_prompt = \"low quality, worst quality, deformed, distorted, disfigured, motion smear, motion artifacts, fused fingers, bad anatomy, weird hand, ugly\" # @param {\"type\":\"string\"}\n",
311 | "width = 768 # @param {\"type\":\"number\"}\n",
312 | "height = 512 # @param {\"type\":\"number\"}\n",
313 | "seed = 1000 # @param {\"type\":\"integer\"}\n",
314 | "steps = 20 # @param {\"type\":\"integer\", \"min\":1, \"max\":100}\n",
315 | "cfg_scale = 2.5 # @param {\"type\":\"number\", \"min\":1, \"max\":20}\n",
316 | "sampler_name = \"euler\" # @param [\"euler\", \"dpmpp_2m\", \"ddim\", \"lms\"]\n",
317 | "frames = 73 # @param {\"type\":\"integer\", \"min\":1, \"max\":120}\n",
318 | "\n",
319 | "# Run video generation\n",
320 | "print(\"Starting video generation workflow...\")\n",
321 | "with torch.inference_mode():\n",
322 | " generate_video(\n",
323 | " image_path=None, # This will trigger upload\n",
324 | " positive_prompt=positive_prompt,\n",
325 | " negative_prompt=negative_prompt,\n",
326 | " width=width,\n",
327 | " height=height,\n",
328 | " seed=seed,\n",
329 | " steps=steps,\n",
330 | " cfg_scale=cfg_scale,\n",
331 | " sampler_name=sampler_name,\n",
332 | " length=frames\n",
333 | " )\n",
334 | "clear_gpu_memory()"
335 | ],
336 | "metadata": {
337 | "cellView": "form",
338 | "id": "roC59_oNNflb"
339 | },
340 | "execution_count": null,
341 | "outputs": []
342 | },
343 | {
344 | "cell_type": "markdown",
345 | "source": [
346 | "********************************************************************************************************************************************************************************************************************************************************************************************************************************"
347 | ],
348 | "metadata": {
349 | "id": "yWSMPSVcbmmn"
350 | }
351 | },
352 | {
353 | "cell_type": "markdown",
354 | "source": [
355 | "********************************************************************************************************************************************************************************************************************************************************************************************************************************"
356 | ],
357 | "metadata": {
358 | "id": "FGDof1EkbnHv"
359 | }
360 | },
361 | {
362 | "cell_type": "markdown",
363 | "source": [
364 | "# **LTX-VIDEO (Image to Video based on Lightricks LTX-VIDEO Github Repository)**\n",
365 | "LTX-Video Github Repository: https://github.com/Lightricks/LTX-Video"
366 | ],
367 | "metadata": {
368 | "id": "6t7--x3NBdE5"
369 | }
370 | },
371 | {
372 | "cell_type": "markdown",
373 | "source": [
374 | "- You need compute units to run this section.\n",
375 | "- Use detailed prompts to improve the generated video.\n",
376 | "- If you want to generate a video with n frames, then set NUM_FRAMES to n+1. e.g. To generate a video with 120 frames, set NUM_FRAMES to 121.\n",
377 | "- Videos are generated at 24fps.\n"
378 | ],
379 | "metadata": {
380 | "id": "KVykpe_nU7lK"
381 | }
382 | },
383 | {
384 | "cell_type": "code",
385 | "source": [
386 | "# @title Prepare Environment\n",
387 | "# Install dependencies\n",
388 | "!git clone https://github.com/Isi-dev/LTX-Video.git\n",
389 | "%cd LTX-Video\n",
390 | "\n",
391 | "# Install required packages\n",
392 | "!pip install -e \".[inference-script]\"\n",
393 | "\n",
394 | "!pip install \"huggingface_hub[cli]\"\n",
395 | "!apt-get install -y aria2\n",
396 | "import os\n",
397 | "from huggingface_hub import list_repo_files\n",
398 | "\n",
399 | "repo_id = \"Isi99999/LTX-Video\"\n",
400 | "all_files = list_repo_files(repo_id)\n",
401 | "base_url = f\"https://huggingface.co/{repo_id}/resolve/main/\"\n",
402 | "\n",
403 | "with open(\"file_list.txt\", \"w\") as f:\n",
404 | " for file_path in all_files:\n",
405 | " full_url = f\"{base_url}{file_path}\"\n",
406 | " save_path = f\"MODEL_DIR/{file_path}\"\n",
407 | " os.makedirs(os.path.dirname(save_path), exist_ok=True)\n",
408 | " f.write(f\"{full_url}\\n out={save_path}\\n\")\n",
409 | "!aria2c -x 16 -s 16 -i file_list.txt --continue=true --auto-file-renaming=false\n",
410 | "\n",
411 | "print(\"✅ All models downloaded successfully!\")"
412 | ],
413 | "metadata": {
414 | "cellView": "form",
415 | "id": "S9doZlq9B36X"
416 | },
417 | "execution_count": null,
418 | "outputs": []
419 | },
420 | {
421 | "cell_type": "code",
422 | "source": [
423 | "# @title Upload Image\n",
424 | "from google.colab import files\n",
425 | "from PIL import Image\n",
426 | "\n",
427 | "uploaded = files.upload()\n",
428 | "image_path = list(uploaded.keys())[0]\n",
429 | "image = Image.open(image_path)\n",
430 | "print(\"✅ Image loaded successfully:\", image.size)"
431 | ],
432 | "metadata": {
433 | "cellView": "form",
434 | "id": "QH2FBr4naeK2"
435 | },
436 | "execution_count": null,
437 | "outputs": []
438 | },
439 | {
440 | "cell_type": "code",
441 | "source": [
442 | "# @title Generate Video\n",
443 | "PROMPT =\"A red fox moving gracefully, its russet coat vibrant against the white landscape, leaving perfect star-shaped prints behind as steam rises from its breath in the crisp winter air. The scene is wrapped in snow-muffled silence, broken only by the gentle murmur of water still flowing beneath the ice.\" # @param {type:\"string\"}\n",
444 | "STEPS = 20 # @param {\"type\":\"number\"}\n",
445 | "Instruction_1 = \"choose from '720*1280', '1280*720', '480*832', '832*480', '480*704', '704*480' for width & height, and your input image should be of the same resolution as your selected width & height.\" # @param {\"type\":\"string\"}\n",
446 | "WIDTH = 704 # @param {\"type\":\"number\"}\n",
447 | "HEIGHT = 480 # @param {\"type\":\"number\"}\n",
448 | "Instruction_2 = \"The NUM_FRAMES should not exceed 257.\" # @param {\"type\":\"string\"}\n",
449 | "NUM_FRAMES = 121 # @param {\"type\":\"number\"}\n",
450 | "SEED = 1000 # @param {\"type\":\"number\"}\n",
451 | "\n",
452 | "\n",
453 | "total_vram = 0\n",
454 | "import torch\n",
455 | "if torch.cuda.is_available():\n",
456 | " gpu_id = torch.cuda.current_device()\n",
457 | " total_vram = torch.cuda.get_device_properties(gpu_id).total_memory / 1024**3\n",
458 | "else:\n",
459 | " print(\"No GPU found.\")\n",
460 | "if total_vram < 18:\n",
461 | "    print(\"It seems you are using the free T4 GPU, which comes with 12.7GB of system RAM. The text encoder will exhaust this RAM and crash the session. Choose a higher runtime type.\")\n",
462 | "elif total_vram > 18 and total_vram < 30:\n",
463 | " print(\"Setting low_vram flag to avoid Out of Memory Errors. Inference will be a bit slow.\")\n",
464 | " !python inference.py --ckpt_path \"MODEL_DIR/\" --output_path \"outputVidFromImage\" --low_vram --offload_to_cpu --conditioning_media_paths {image_path} --conditioning_start_frames 0 --text_encoder_model_name_or_path \"MODEL_DIR/\" --prompt \"{PROMPT}\" --prompt_enhancement_words_threshold 0 --height {HEIGHT} --width {WIDTH} --num_frames {NUM_FRAMES} --seed {SEED} --num_inference_steps {STEPS}\n",
465 | "else:\n",
466 | " !python inference.py --ckpt_path \"MODEL_DIR/\" --output_path \"outputVidFromImage\" --conditioning_media_paths {image_path} --conditioning_start_frames 0 --text_encoder_model_name_or_path \"MODEL_DIR/\" --prompt \"{PROMPT}\" --prompt_enhancement_words_threshold 0 --height {HEIGHT} --width {WIDTH} --num_frames {NUM_FRAMES} --seed {SEED} --num_inference_steps {STEPS}\n",
467 | "\n",
468 | "if total_vram > 18:\n",
469 | " import os\n",
470 | " import glob\n",
471 | " from IPython.display import display as displayVid, Video as outVid\n",
472 | "\n",
473 | " video_folder = \"outputVidFromImage/\"\n",
474 | "\n",
475 | " # Find the latest MP4 file\n",
476 | " video_files = glob.glob(os.path.join(video_folder, \"*.mp4\"))\n",
477 | "\n",
478 | " if video_files:\n",
479 | " latest_video = max(video_files, key=os.path.getctime) # Get the most recent video\n",
480 | " print(f\"Displaying video: {latest_video}\")\n",
481 | " displayVid(outVid(latest_video, embed=True))\n",
482 | " else:\n",
483 | "        print(\"❌ No video found in outputVidFromImage/\")\n"
484 | ],
485 | "metadata": {
486 | "cellView": "form",
487 | "id": "RHFnir7waoKm"
488 | },
489 | "execution_count": null,
490 | "outputs": []
491 | }
492 | ]
493 | }
--------------------------------------------------------------------------------
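
A small pre-flight sketch (not part of the repository files) for the LTX_Video_Img_to_Vid notebook above, mirroring the rules it states: width and height must be divisible by 32 (the generation cell asserts this), frames should be set to n + 1 to get n output frames, and videos are written at 24fps. It assumes the notebook's "Prepare Environment" cell has been run so that generate_video() is defined.

def check_ltx_params(width: int, height: int, frames: int, fps: int = 24) -> None:
    """Validate the settings before a (slow) generation run."""
    if width % 32 or height % 32:
        raise ValueError("width and height must both be divisible by 32")
    if (frames - 1) % 8:
        # the frame counts used in these notebooks (73, 121, 49) are all of the form 8*k + 1
        print("note: consider a frame count of the form 8*k + 1, as in the notebook examples")
    print(f"{frames} frames at {fps}fps ≈ {frames / fps:.2f} s of video")

check_ltx_params(768, 512, 73)   # the notebook's default Image-to-Video settings
# with torch.inference_mode():
#     generate_video(image_path=None, width=768, height=512, length=73)
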
/LTX_Video_Tx_to_Vid.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "nbformat": 4,
3 | "nbformat_minor": 0,
4 | "metadata": {
5 | "colab": {
6 | "provenance": [],
7 | "gpuType": "T4"
8 | },
9 | "kernelspec": {
10 | "name": "python3",
11 | "display_name": "Python 3"
12 | },
13 | "language_info": {
14 | "name": "python"
15 | },
16 | "accelerator": "GPU"
17 | },
18 | "cells": [
19 | {
20 | "cell_type": "markdown",
21 | "source": [
22 | "# **LTX-VIDEO Text to Video**"
23 | ],
24 | "metadata": {
25 | "id": "f4p1ysFKMbs_"
26 | }
27 | },
28 | {
29 | "cell_type": "markdown",
30 | "source": [
31 | "- You can use the free T4 GPU to run this, depending on the output video resolution and number of frames. The default settings run without issues, but at a 768 by 512 output resolution with 121 frames, the decoding step exhausts the 12.7GB of system RAM and crashes the session. For faster video generation at higher resolutions and frame counts, use a higher-tier GPU.\n",
32 | "- If you want to generate a video with n frames, then set frames to n+1. e.g. To generate a video with 72 frames, set frames to 73.\n",
33 | "- You need to use detailed prompts to get decent results.\n",
34 | "- Videos are generated at 24fps."
35 | ],
36 | "metadata": {
37 | "id": "EBB00lC6q-DA"
38 | }
39 | },
40 | {
41 | "cell_type": "code",
42 | "source": [
43 | "# @title Prepare Environment\n",
44 | "!pip install --upgrade --quiet torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118\n",
45 | "%cd /content\n",
46 | "Always_Load_Models_for_Inference = False\n",
47 | "Use_t5xxl_fp16 = False\n",
48 | "# Install dependencies\n",
49 | "!pip install -q torchsde einops diffusers accelerate xformers\n",
50 | "!pip install av\n",
51 | "!git clone https://github.com/Isi-dev/ComfyUI\n",
52 | "%cd /content/ComfyUI\n",
53 | "!apt -y install -qq aria2 ffmpeg\n",
54 | "\n",
55 | "# Download required models\n",
56 | "!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/Isi99999/LTX-Video/resolve/main/ltx-video-2b-v0.9.5.safetensors -d /content/ComfyUI/models/checkpoints -o ltx-video-2b-v0.9.5.safetensors\n",
57 | "if Use_t5xxl_fp16:\n",
58 | " !aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/Isi99999/LTX-Video/resolve/main/t5xxl_fp16.safetensors -d /content/ComfyUI/models/text_encoders -o t5xxl_fp16.safetensors\n",
59 | "else:\n",
60 | " !aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/Isi99999/LTX-Video/resolve/main/t5xxl_fp8_e4m3fn_scaled.safetensors -d /content/ComfyUI/models/text_encoders -o t5xxl_fp8_e4m3fn_scaled.safetensors\n",
61 | "\n",
62 | "# Initial setup\n",
63 | "import torch\n",
64 | "import numpy as np\n",
65 | "from PIL import Image\n",
66 | "import gc\n",
67 | "import sys\n",
68 | "import random\n",
69 | "import os\n",
70 | "import imageio\n",
71 | "from google.colab import files\n",
72 | "from IPython.display import display, HTML\n",
73 | "sys.path.insert(0, '/content/ComfyUI')\n",
74 | "\n",
75 | "from comfy import model_management\n",
76 | "\n",
77 | "from nodes import (\n",
78 | " CheckpointLoaderSimple,\n",
79 | " CLIPLoader,\n",
80 | " CLIPTextEncode,\n",
81 | " VAEDecode\n",
82 | ")\n",
83 | "\n",
84 | "from comfy_extras.nodes_custom_sampler import (\n",
85 | " KSamplerSelect,\n",
86 | " SamplerCustom\n",
87 | ")\n",
88 | "\n",
89 | "from comfy_extras.nodes_lt import (\n",
90 | " LTXVConditioning,\n",
91 | " LTXVScheduler,\n",
92 | " EmptyLTXVLatentVideo\n",
93 | ")\n",
94 | "\n",
95 | "checkpoint_loader = CheckpointLoaderSimple()\n",
96 | "clip_loader = CLIPLoader()\n",
97 | "clip_encode_positive = CLIPTextEncode()\n",
98 | "clip_encode_negative = CLIPTextEncode()\n",
99 | "scheduler = LTXVScheduler()\n",
100 | "sampler_select = KSamplerSelect()\n",
101 | "conditioning = LTXVConditioning()\n",
102 | "empty_latent_video = EmptyLTXVLatentVideo()\n",
103 | "sampler = SamplerCustom()\n",
104 | "vae_decode = VAEDecode()\n",
105 | "\n",
106 | "# if not Always_Load_Models_for_Inference:\n",
107 | "# with torch.inference_mode():\n",
108 | "# # Load models\n",
109 | "# print(\"Loading Model...\")\n",
110 | "# model, _, vae = checkpoint_loader.load_checkpoint(\"ltx-video-2b-v0.9.5.safetensors\")\n",
111 | "# print(\"Loaded model!\")\n",
112 | "# # print(\"Loading Text_Encoder...\")\n",
113 | "# # clip = clip_loader.load_clip(\"t5xxl_fp8_e4m3fn_scaled.safetensors\", \"ltxv\", \"default\")[0]\n",
114 | "# # print(\"Loaded Text_Encoder!\")\n",
115 | "\n",
116 | "\n",
117 | "def clear_memory():\n",
118 | " \"\"\"Frees GPU (VRAM) and CPU RAM memory.\"\"\"\n",
119 | " gc.collect()\n",
120 | " if torch.cuda.is_available():\n",
121 | " torch.cuda.empty_cache()\n",
122 | " torch.cuda.ipc_collect()\n",
123 | " for obj in list(globals().values()):\n",
124 | " if torch.is_tensor(obj) or (hasattr(obj, \"data\") and torch.is_tensor(obj.data)):\n",
125 | " del obj\n",
126 | "\n",
127 | " gc.collect()\n",
128 | "\n",
129 | "def generate_video(\n",
130 | " positive_prompt: str = \"A drone quickly rises through a bank of morning fog...\",\n",
131 | " negative_prompt: str = \"low quality, worst quality...\",\n",
132 | " width: int = 768,\n",
133 | " height: int = 512,\n",
134 | " seed: int = 0,\n",
135 | " steps: int = 30,\n",
136 | " cfg_scale: float = 2.05,\n",
137 | " sampler_name: str = \"res_multistep\",\n",
138 | " length: int = 49,\n",
139 | " fps: int = 24\n",
140 | "):\n",
141 | "\n",
142 | " with torch.inference_mode():\n",
143 | " print(\"Loading Text_Encoder...\")\n",
144 | " clip = clip_loader.load_clip(\"t5xxl_fp8_e4m3fn_scaled.safetensors\", \"ltxv\", \"default\")[0]\n",
145 | " print(\"Loaded Text_Encoder!\")\n",
146 | "\n",
147 | " try:\n",
148 | " assert width % 32 == 0, \"Width must be divisible by 32\"\n",
149 | " assert height % 32 == 0, \"Height must be divisible by 32\"\n",
150 | "\n",
151 | " positive = clip_encode_positive.encode(clip, positive_prompt)[0]\n",
152 | " negative = clip_encode_negative.encode(clip, negative_prompt)[0]\n",
153 | "\n",
154 | " del clip\n",
155 | " torch.cuda.empty_cache()\n",
156 | " gc.collect()\n",
157 | " print(\"Text_Encoder removed from memory\")\n",
158 | "\n",
159 | " empty_latent = empty_latent_video.generate(width, height, length)[0]\n",
160 | "\n",
161 | " sigmas = scheduler.get_sigmas(steps, cfg_scale, 0.95, True, 0.1)[0]\n",
162 | " selected_sampler = sampler_select.get_sampler(sampler_name)[0]\n",
163 | " conditioned = conditioning.append(positive, negative, 25.0)\n",
164 | "\n",
165 | " print(\"Loading model & VAE...\")\n",
166 | " model, _, vae = checkpoint_loader.load_checkpoint(\"ltx-video-2b-v0.9.5.safetensors\")\n",
167 | " print(\"Loaded model & VAE!\")\n",
168 | "\n",
169 | " print(\"Generating video...\")\n",
170 | " sampled = sampler.sample(\n",
171 | " model=model,\n",
172 | " add_noise=True,\n",
173 | " noise_seed=seed if seed != 0 else random.randint(0, 2**32),\n",
174 | " cfg=cfg_scale,\n",
175 | " positive=conditioned[0],\n",
176 | " negative=conditioned[1],\n",
177 | " sampler=selected_sampler,\n",
178 | " sigmas=sigmas,\n",
179 | " latent_image=empty_latent\n",
180 | " )[0]\n",
181 | "\n",
182 | " del model\n",
183 | " torch.cuda.empty_cache()\n",
184 | " gc.collect()\n",
185 | " print(\"Model removed from memory\")\n",
186 | "\n",
187 | " with torch.no_grad():\n",
188 | " try:\n",
189 | "                    print(\"Decoding Latents...\")\n",
190 | " decoded = vae_decode.decode(vae, sampled)[0].detach()\n",
191 | " print(\"Latents Decoded!\")\n",
192 | " del vae\n",
193 | " torch.cuda.empty_cache()\n",
194 | " gc.collect()\n",
195 | " print(\"VAE removed from memory\")\n",
196 | "\n",
197 | " output_path = \"/content/output.mp4\"\n",
198 | " with imageio.get_writer(output_path, fps=fps) as writer:\n",
199 | " for i, frame in enumerate(decoded):\n",
200 | " frame_np = (frame.cpu().numpy() * 255).astype(np.uint8)\n",
201 | " writer.append_data(frame_np)\n",
202 | " if i % 10 == 0: # Periodic cleanup\n",
203 | " torch.cuda.empty_cache()\n",
204 | "\n",
205 | " print(f\"Successfully processed {len(decoded)} frames\")\n",
206 | "\n",
207 | "\n",
208 | " except Exception as e:\n",
209 | " print(f\"Decoding error: {str(e)}\")\n",
210 | " raise\n",
211 | "\n",
212 | " print(\"Displaying Video...\")\n",
213 | " display_video(output_path)\n",
214 | "\n",
215 | " except Exception as e:\n",
216 | " print(f\"Video generation failed: {str(e)}\")\n",
217 | " raise\n",
218 | " finally:\n",
219 | " clear_memory()\n",
220 | "\n",
221 | "def display_video(video_path):\n",
222 | " from IPython.display import HTML\n",
223 | " from base64 import b64encode\n",
224 | "\n",
225 | " mp4 = open(video_path,'rb').read()\n",
226 | " data_url = \"data:video/mp4;base64,\" + b64encode(mp4).decode()\n",
227 | "\n",
228 | " display(HTML(f\"\"\"\n",
229 | " \n",
232 | " \"\"\"))\n",
233 | "\n",
234 | "print(\"✅ Environment Setup Complete!\")"
235 | ],
236 | "metadata": {
237 | "cellView": "form",
238 | "id": "rrXFIT4fMfyJ"
239 | },
240 | "execution_count": null,
241 | "outputs": []
242 | },
243 | {
244 | "cell_type": "code",
245 | "source": [
246 | "# @title Video Generation Parameters\n",
247 | "# example_prompt = \"A cinematic aerial view from a slowly moving drone, capturing breathtaking landscapes. The camera smoothly glides over rolling green hills, vast forests, and shimmering lakes, bathed in the golden light of sunrise. Mist gently rises from the valleys, creating a dreamy atmosphere. The drone moves gracefully, revealing majestic mountain peaks in the distance, with soft clouds drifting by. Rivers weave through the terrain like silver threads, and vibrant wildflowers dot the fields. The scene is immersive, evoking a sense of wonder and tranquility.\" # @param {\"type\":\"string\"}\n",
248 | "positive_prompt = \"A drone quickly rises through a bank of morning fog, revealing a pristine alpine lake surrounded by snow-capped mountains. The camera glides forward over the glassy water, capturing perfect reflections of the peaks. As it continues, the perspective shifts to reveal a lone wooden cabin with a curl of smoke from its chimney, nestled among tall pines at the lake's edge. The final shot tracks upward rapidly, transitioning from intimate to epic as the full mountain range comes into view, bathed in the golden light of sunrise breaking through scattered clouds.\" # @param {\"type\":\"string\"}\n",
249 | "negative_prompt = \"low quality, worst quality, deformed, distorted, disfigured, motion smear, motion artifacts, fused fingers, bad anatomy, weird hand, ugly\" # @param {\"type\":\"string\"}\n",
250 | "width = 832 # @param {\"type\":\"number\"}\n",
251 | "height = 480 # @param {\"type\":\"number\"}\n",
252 | "seed = 0 # @param {\"type\":\"integer\"}\n",
253 | "steps = 25 # @param {\"type\":\"integer\", \"min\":1, \"max\":100}\n",
254 | "cfg_scale = 2.05 # @param {\"type\":\"number\", \"min\":1, \"max\":20}\n",
255 | "sampler_name = \"res_multistep\" # @param [\"res_multistep\", \"euler\", \"dpmpp_2m\", \"ddim\", \"lms\"]\n",
256 | "frames = 73 # @param {\"type\":\"integer\", \"min\":1, \"max\":120}\n",
257 | "fps = 24 # @param {\"type\":\"integer\", \"min\":1, \"max\":60}\n",
258 | "\n",
259 | "with torch.inference_mode():\n",
260 | " generate_video(\n",
261 | " positive_prompt=positive_prompt,\n",
262 | " negative_prompt=negative_prompt,\n",
263 | " width=width,\n",
264 | " height=height,\n",
265 | " seed=seed,\n",
266 | " steps=steps,\n",
267 | " cfg_scale=cfg_scale,\n",
268 | " sampler_name=sampler_name,\n",
269 | " length=frames,\n",
270 | " fps=fps\n",
271 | " )\n",
272 | "clear_memory()"
273 | ],
274 | "metadata": {
275 | "cellView": "form",
276 | "id": "roC59_oNNflb"
277 | },
278 | "execution_count": null,
279 | "outputs": []
280 | }
281 | ]
282 | }
--------------------------------------------------------------------------------
/LTX_Video_with_Start_&_End_frames.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "nbformat": 4,
3 | "nbformat_minor": 0,
4 | "metadata": {
5 | "colab": {
6 | "provenance": [],
7 | "gpuType": "T4"
8 | },
9 | "kernelspec": {
10 | "name": "python3",
11 | "display_name": "Python 3"
12 | },
13 | "language_info": {
14 | "name": "python"
15 | },
16 | "accelerator": "GPU"
17 | },
18 | "cells": [
19 | {
20 | "cell_type": "markdown",
21 | "source": [
22 | "# **LTX-VIDEO WITH START & END FRAMES**"
23 | ],
24 | "metadata": {
25 | "id": "f4p1ysFKMbs_"
26 | }
27 | },
28 | {
29 | "cell_type": "markdown",
30 | "source": [
31 | "- You can use the free T4 GPU to run this. For faster video generation, use a higher-tier GPU.\n",
32 | "- This notebook is mainly for generating animations from simple transitions between two images. It struggles with fast motion such as walking or running.\n",
33 | "- Use detailed prompts to generate good videos."
34 | ],
35 | "metadata": {
36 | "id": "EBB00lC6q-DA"
37 | }
38 | },
39 | {
40 | "cell_type": "code",
41 | "source": [
42 | "# @title Prepare Environment\n",
43 | "!pip install --upgrade --quiet torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118\n",
44 | "%cd /content\n",
45 | "Always_Load_Models_for_Inference = False\n",
46 | "Use_t5xxl_fp16 = False\n",
47 | "\n",
48 | "!pip install -q torchsde einops diffusers accelerate xformers\n",
49 | "!pip install av\n",
50 | "!git clone https://github.com/Isi-dev/ComfyUI\n",
51 | "%cd /content/ComfyUI\n",
52 | "!apt -y install -qq aria2 ffmpeg\n",
53 | "\n",
54 | "!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/Isi99999/LTX-Video/resolve/main/ltx-video-2b-v0.9.5.safetensors -d /content/ComfyUI/models/checkpoints -o ltx-video-2b-v0.9.5.safetensors\n",
55 | "if Use_t5xxl_fp16:\n",
56 | " !aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/Isi99999/LTX-Video/resolve/main/t5xxl_fp16.safetensors -d /content/ComfyUI/models/text_encoders -o t5xxl_fp16.safetensors\n",
57 | "else:\n",
58 | " !aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/Isi99999/LTX-Video/resolve/main/t5xxl_fp8_e4m3fn_scaled.safetensors -d /content/ComfyUI/models/text_encoders -o t5xxl_fp8_e4m3fn_scaled.safetensors\n",
59 | "\n",
60 | "import torch\n",
61 | "import numpy as np\n",
62 | "from PIL import Image\n",
63 | "import gc\n",
64 | "import sys\n",
65 | "import random\n",
66 | "import os\n",
67 | "import imageio\n",
68 | "from google.colab import files\n",
69 | "from IPython.display import display, HTML\n",
70 | "sys.path.insert(0, '/content/ComfyUI')\n",
71 | "\n",
72 | "from comfy import model_management\n",
73 | "\n",
74 | "from nodes import (\n",
75 | " CheckpointLoaderSimple,\n",
76 | " CLIPLoader,\n",
77 | " CLIPTextEncode,\n",
78 | " VAEDecode,\n",
79 | " LoadImage\n",
80 | ")\n",
81 | "\n",
82 | "from comfy_extras.nodes_custom_sampler import (\n",
83 | " KSamplerSelect,\n",
84 | " SamplerCustom\n",
85 | ")\n",
86 | "\n",
87 | "from comfy_extras.nodes_lt import (\n",
88 | " EmptyLTXVLatentVideo,\n",
89 | " LTXVPreprocess,\n",
90 | " LTXVAddGuide,\n",
91 | " LTXVScheduler,\n",
92 | " LTXVConditioning,\n",
93 | " LTXVCropGuides\n",
94 | ")\n",
95 | "\n",
96 | "checkpoint_loader = CheckpointLoaderSimple()\n",
97 | "clip_loader = CLIPLoader()\n",
98 | "clip_encode_positive = CLIPTextEncode()\n",
99 | "clip_encode_negative = CLIPTextEncode()\n",
100 | "load_image = LoadImage()\n",
101 | "empty_latent = EmptyLTXVLatentVideo()\n",
102 | "preprocess = LTXVPreprocess()\n",
103 | "add_guide = LTXVAddGuide()\n",
104 | "scheduler = LTXVScheduler()\n",
105 | "sampler_select = KSamplerSelect()\n",
106 | "conditioning = LTXVConditioning()\n",
107 | "sampler = SamplerCustom()\n",
108 | "vae_decode = VAEDecode()\n",
109 | "crop_guides = LTXVCropGuides()\n",
110 | "\n",
111 | "def clear_gpu_memory():\n",
112 | " gc.collect()\n",
113 | " if torch.cuda.is_available():\n",
114 | " torch.cuda.empty_cache()\n",
115 | " torch.cuda.ipc_collect()\n",
116 | " for obj in list(globals().values()):\n",
117 | " if torch.is_tensor(obj) or (hasattr(obj, \"data\") and torch.is_tensor(obj.data)):\n",
118 | " del obj\n",
119 | " gc.collect()\n",
120 | "\n",
121 | "def upload_image():\n",
122 | " \"\"\"Handle image upload in Colab and store in /content/ComfyUI/input/\"\"\"\n",
123 | " from google.colab import files\n",
124 | " import os\n",
125 | " import shutil\n",
126 | "\n",
127 | " os.makedirs('/content/ComfyUI/input', exist_ok=True)\n",
128 | "\n",
129 | " uploaded = files.upload()\n",
130 | "\n",
131 | " # Move each uploaded file to ComfyUI input directory\n",
132 | " for filename in uploaded.keys():\n",
133 | " src_path = f'/content/ComfyUI/{filename}'\n",
134 | " dest_path = f'/content/ComfyUI/input/{filename}'\n",
135 | "\n",
136 | " shutil.move(src_path, dest_path)\n",
137 | " print(f\"Image saved to: {dest_path}\")\n",
138 | " return dest_path\n",
139 | "\n",
140 | " return None\n",
141 | "\n",
142 | "def generate_video(\n",
143 | " image_path: str = None,\n",
144 | " guide_image_path: str = None,\n",
145 | " positive_prompt: str = \"A red fox moving gracefully, its russet coat vibrant against the white landscape, leaving perfect star-shaped prints behind as steam rises from its breath in the crisp winter air. The scene is wrapped in snow-muffled silence, broken only by the gentle murmur of water still flowing beneath the ice.\",\n",
146 | " negative_prompt: str = \"low quality, worst quality, deformed, distorted, disfigured, motion smear, motion artifacts, fused fingers, bad anatomy, weird hand, ugly\",\n",
147 | " width: int = 768,\n",
148 | " height: int = 512,\n",
149 | " seed: int = 397166166231987,\n",
150 | " steps: int = 30,\n",
151 | " cfg_scale: float = 2.05,\n",
152 | " sampler_name: str = \"euler\",\n",
153 | " length: int = 97,\n",
154 | " fps: int = 24,\n",
155 | " guide_strength: float = 0.1,\n",
156 | " guide_frame: int = -1\n",
157 | "):\n",
158 | " with torch.inference_mode():\n",
159 | " print(\"Loading Text_Encoder...\")\n",
160 | " clip = clip_loader.load_clip(\"t5xxl_fp8_e4m3fn_scaled.safetensors\", \"ltxv\", \"default\")[0]\n",
161 | " print(\"Loaded Text_Encoder!\")\n",
162 | "\n",
163 | " try:\n",
164 | " assert width % 32 == 0, \"Width must be divisible by 32\"\n",
165 | " assert height % 32 == 0, \"Height must be divisible by 32\"\n",
166 | "\n",
167 | " positive = clip_encode_positive.encode(clip, positive_prompt)[0]\n",
168 | " negative = clip_encode_negative.encode(clip, negative_prompt)[0]\n",
169 | "\n",
170 | " del clip\n",
171 | " torch.cuda.empty_cache()\n",
172 | " gc.collect()\n",
173 | " print(\"Text_Encoder removed from memory\")\n",
174 | "\n",
175 | " if image_path is None:\n",
176 | " print(\"Please upload the main image file:\")\n",
177 | " image_path = upload_image()\n",
178 | " if image_path is None:\n",
179 | " print(\"No main image uploaded!\")\n",
180 | "\n",
181 | " if guide_image_path is None:\n",
182 | " print(\"Please upload the guide image file:\")\n",
183 | " guide_image_path = upload_image()\n",
184 | " if guide_image_path is None:\n",
185 | " print(\"No guide image uploaded!\")\n",
186 | "\n",
187 | " loaded_image = load_image.load_image(image_path)[0]\n",
188 | " processed_image = preprocess.preprocess(loaded_image, 35)[0]\n",
189 | "\n",
190 | " loaded_guide_image = load_image.load_image(guide_image_path)[0]\n",
191 | " processed_guide_image = preprocess.preprocess(loaded_guide_image, 40)[0]\n",
192 | "\n",
193 | " print(\"Loading model & VAE...\")\n",
194 | " model, _, vae = checkpoint_loader.load_checkpoint(\"ltx-video-2b-v0.9.5.safetensors\")\n",
195 | " print(\"Loaded model & VAE!\")\n",
196 | "\n",
197 | " # Create empty latent video\n",
198 | " latent_video = empty_latent.generate(width, height, length)[0]\n",
199 | "\n",
200 | " # First guide pass\n",
201 | " guided_positive, guided_negative, guided_latent_1 = add_guide.generate(\n",
202 | " positive=positive,\n",
203 | " negative=negative,\n",
204 | " vae=vae,\n",
205 | " latent=latent_video,\n",
206 | " image=processed_image,\n",
207 | " frame_idx=0,\n",
208 | " strength=1\n",
209 | " )\n",
210 | "\n",
211 | " # Second guide pass (from the other image)\n",
212 | " guided_positive, guided_negative, guided_latent = add_guide.generate(\n",
213 | " positive=guided_positive,\n",
214 | " negative=guided_negative,\n",
215 | " vae=vae,\n",
216 | " latent=guided_latent_1,\n",
217 | " image=processed_guide_image,\n",
218 | " frame_idx=guide_frame,\n",
219 | " strength=guide_strength\n",
220 | " )\n",
221 | "\n",
222 | " # Get sigmas for sampling\n",
223 | " sigmas = scheduler.get_sigmas(steps, cfg_scale, 0.95, True, 0.1, guided_latent_1)[0]\n",
224 | " selected_sampler = sampler_select.get_sampler(sampler_name)[0]\n",
225 | "\n",
226 | " # Apply conditioning\n",
227 | " conditioned_positive, conditioned_negative = conditioning.append(\n",
228 | " guided_positive,\n",
229 | " guided_negative,\n",
230 | " 25.0\n",
231 | " )\n",
232 | "\n",
233 | " print(\"Generating video...\")\n",
234 | "\n",
235 | " # Sample the video\n",
236 | " sampled = sampler.sample(\n",
237 | " model=model,\n",
238 | " add_noise=True,\n",
239 | " noise_seed=seed if seed != 0 else random.randint(0, 2**32),\n",
240 | " cfg=cfg_scale,\n",
241 | " positive=conditioned_positive,\n",
242 | " negative=conditioned_negative,\n",
243 | " sampler=selected_sampler,\n",
244 | " sigmas=sigmas,\n",
245 | " latent_image=guided_latent\n",
246 | " )[0]\n",
247 | "\n",
248 | " # Crop guides if needed\n",
249 | " cropped_latent = crop_guides.crop(\n",
250 | " conditioned_positive,\n",
251 | " conditioned_negative,\n",
252 | " sampled\n",
253 | " )[2]\n",
254 | "\n",
255 | " del model\n",
256 | " torch.cuda.empty_cache()\n",
257 | " gc.collect()\n",
258 | " print(\"Model removed from memory\")\n",
259 | "\n",
260 | " with torch.no_grad():\n",
261 | " try:\n",
262 | " print(\"Decoding Latents...\")\n",
263 | " decoded = vae_decode.decode(vae, cropped_latent)[0].detach()\n",
264 | " print(\"Latents Decoded!\")\n",
265 | " del vae\n",
266 | " torch.cuda.empty_cache()\n",
267 | " gc.collect()\n",
268 | " print(\"VAE removed from memory\")\n",
269 | " except Exception as e:\n",
270 | " print(f\"Error during decoding: {str(e)}\")\n",
271 | " raise\n",
272 | "\n",
273 | " # Save as MP4\n",
274 | " output_path = \"/content/output.mp4\"\n",
275 | " frames_np = (decoded.cpu().numpy() * 255).astype(np.uint8)\n",
276 | " with imageio.get_writer(output_path, fps=fps) as writer:\n",
277 | " for frame in frames_np:\n",
278 | " writer.append_data(frame)\n",
279 | "\n",
280 | " print(f\"\\nVideo generation complete!\")\n",
281 | " print(f\"Saved {len(decoded)} frames to {output_path}\")\n",
282 | " display_video(output_path)\n",
283 | "\n",
284 | " except Exception as e:\n",
285 | " print(f\"Error during video generation: {str(e)}\")\n",
286 | " raise\n",
287 | " finally:\n",
288 | " clear_gpu_memory()\n",
289 | "\n",
290 | "def display_video(video_path):\n",
291 | " \"\"\"Display video in Colab notebook with proper HTML5 player\"\"\"\n",
292 | " from IPython.display import HTML\n",
293 | " from base64 import b64encode\n",
294 | "\n",
295 | " mp4 = open(video_path,'rb').read()\n",
296 | " data_url = \"data:video/mp4;base64,\" + b64encode(mp4).decode()\n",
297 | "\n",
298 | " display(HTML(f\"\"\"\n",
299 | " \n",
302 | " \"\"\"))\n",
303 | "\n"
304 | ],
305 | "metadata": {
306 | "cellView": "form",
307 | "id": "rrXFIT4fMfyJ"
308 | },
309 | "execution_count": null,
310 | "outputs": []
311 | },
312 | {
313 | "cell_type": "code",
314 | "source": [
315 | "# @title Run Video Generation\n",
316 | "positive_prompt = \"Flowers growing from the sides of a vase\" # @param {\"type\":\"string\"}\n",
317 | "negative_prompt = \"low quality, worst quality, deformed, distorted, disfigured, motion smear, motion artifacts, fused fingers, bad anatomy, weird hand, ugly\" # @param {\"type\":\"string\"}\n",
318 | "width = 512 # @param {\"type\":\"number\"}\n",
319 | "height = 768 # @param {\"type\":\"number\"}\n",
320 | "seed = 397166166231987 # @param {\"type\":\"integer\"}\n",
321 | "steps = 25 # @param {\"type\":\"integer\", \"min\":1, \"max\":100}\n",
322 | "cfg_scale = 2.05 # @param {\"type\":\"number\", \"min\":1, \"max\":20}\n",
323 | "sampler_name = \"euler\" # @param [\"euler\", \"dpmpp_2m\", \"ddim\", \"lms\"]\n",
324 | "frames = 49 # @param {\"type\":\"integer\", \"min\":1, \"max\":120}\n",
325 | "guide_strength = 1 # @param {\"type\":\"number\", \"min\":0, \"max\":1}\n",
326 | "guide_frame = -1 # @param {\"type\":\"integer\"}\n",
327 | "\n",
328 | "# @title Run Video Generation\n",
329 | "print(\"Starting video generation workflow...\")\n",
330 | "with torch.inference_mode():\n",
331 | " generate_video(\n",
332 | " image_path=None,\n",
333 | " guide_image_path=None,\n",
334 | " positive_prompt=positive_prompt,\n",
335 | " negative_prompt=negative_prompt,\n",
336 | " width=width,\n",
337 | " height=height,\n",
338 | " seed=seed,\n",
339 | " steps=steps,\n",
340 | " cfg_scale=cfg_scale,\n",
341 | " sampler_name=sampler_name,\n",
342 | " length=frames,\n",
343 | " guide_strength=guide_strength,\n",
344 | " guide_frame=guide_frame\n",
345 | " )\n",
346 | "clear_gpu_memory()"
347 | ],
348 | "metadata": {
349 | "cellView": "form",
350 | "id": "roC59_oNNflb"
351 | },
352 | "execution_count": null,
353 | "outputs": []
354 | }
355 | ]
356 | }
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Google-Colab_Notebooks
2 | A Collection of Google Colab Notebooks for scripts & projects
3 |
4 | | Notebook | Info |
5 | | --- | --- |
6 | [](https://colab.research.google.com/github/Isi-dev/Google-Colab_Notebooks/blob/main/video2Gif_(Basic).ipynb) | Video to Gif (basic)
7 | [](https://colab.research.google.com/github/Isi-dev/Google-Colab_Notebooks/blob/main/LatentSync.ipynb) | Lip Sync with LatentSync
8 | [](https://colab.research.google.com/github/Isi-dev/Google-Colab_Notebooks/blob/main/Animate_X.ipynb) | Animation with Animate-X
9 | [](https://colab.research.google.com/github/Isi-dev/Google-Colab_Notebooks/blob/main/ZonosTTS.ipynb) | TTS with ZonosTTS
10 | [](https://colab.research.google.com/github/Isi-dev/Google-Colab_Notebooks/blob/main/Wan2_1_T2V_1_3B_DiffSynth.ipynb) | T2V & T2I with Wan2.1_T2V_1.3B (Compute units Required)
11 | [](https://colab.research.google.com/github/Isi-dev/Google-Colab_Notebooks/blob/main/Wan2_1_T2V_14B.ipynb) | T2V & T2I with Wan2_1_T2V_14B (Compute units Required)
12 | [](https://colab.research.google.com/github/Isi-dev/Google-Colab_Notebooks/blob/main/Wan2_1_I2V_14B.ipynb) | Image to Video with Wan2_1_I2V_14B-480P (Compute units Required)
13 | [](https://colab.research.google.com/github/Isi-dev/Google-Colab_Notebooks/blob/main/FrameInterpolationRIFE.ipynb) | Frame Interpolation with RIFE
14 | [](https://colab.research.google.com/github/Isi-dev/Google-Colab_Notebooks/blob/main/LTX_Video_Img_to_Vid.ipynb) | Basic Image to Video with LTX-Video
15 | [](https://colab.research.google.com/github/Isi-dev/Google-Colab_Notebooks/blob/main/LTX_Video_Tx_to_Vid.ipynb) | Text to Video with LTX-Video
16 | [](https://colab.research.google.com/github/Isi-dev/Google-Colab_Notebooks/blob/main/LTX_Video_with_Start_&_End_frames.ipynb) | Two Images to Video with LTX-Video
17 | [](https://colab.research.google.com/github/Isi-dev/Google-Colab_Notebooks/blob/main/Wan2_1_1_3B_T2V_Free.ipynb) | T2V & T2I with Wan2.1_T2V_1.3B
18 | [](https://colab.research.google.com/github/Isi-dev/Google-Colab_Notebooks/blob/main/Wan2_1_14B_T2V_GGUF_Free.ipynb) | T2V & T2I with Wan2.1_T2V_14B GGUF Models
19 | [](https://colab.research.google.com/github/Isi-dev/Google-Colab_Notebooks/blob/main/Wan2_1_14B_I2V_GGUF_Free.ipynb) | Image to Video with Wan2.1_I2V_14B-480p GGUF Models
20 | [](https://colab.research.google.com/github/Isi-dev/Google-Colab_Notebooks/blob/main/Wan2_1_14B_I2V_GGUF_&_LoRA.ipynb) | I2V with Wan2.1_I2V_14B-480p GGUF Models & LoRA
21 | [](https://colab.research.google.com/github/Isi-dev/Google-Colab_Notebooks/blob/main/Fast_Wan2_1_14B_I2V_480p_GGUF_&_LoRA.ipynb) | Faster I2V with Wan2.1_I2V_14B-480p GGUF Models, LoRA & Teacache
22 | [](https://colab.research.google.com/github/Isi-dev/Google-Colab_Notebooks/blob/main/Hidream_fp8.ipynb) | Hidream_fp8 for Text to Image Generation (Compute units Required)
23 | [](https://colab.research.google.com/github/Isi-dev/Google-Colab_Notebooks/blob/main/Hidream_T2V_GGUF_Q5.ipynb) | Hidream_GGUF_Q5 for Text to Image Generation
24 | [](https://colab.research.google.com/github/Isi-dev/Google-Colab_Notebooks/blob/main/FramePack.ipynb) | FramePack_fp8 for Image to Video Generation (Compute units Required)
25 | [](https://colab.research.google.com/github/Isi-dev/Google-Colab_Notebooks/blob/main/Flux_Upscaler.ipynb) | Upscale Images & Videos with Flux Upscaler
26 | [](https://colab.research.google.com/github/Isi-dev/Google-Colab_Notebooks/blob/main/ICEdit.ipynb) | Use Prompts to Edit Images with In-Context Edit
27 | [](https://colab.research.google.com/github/Isi-dev/Google-Colab_Notebooks/blob/main/LTXV_0_9_7_13B_Distilled_Image_to_Video.ipynb) | LTXV-0.9.7-13B_Distilled_GGUF_Q6 for Image to Video Generation
28 | [](https://colab.research.google.com/github/Isi-dev/Google-Colab_Notebooks/blob/main/Basic_Wan2_1_VACE_&_CausVid_LoRA_4_Text_to_Video(WIP).ipynb) | Wan 2.1 VACE 14B & CausVid LoRA for faster Text to Video Generation
29 | [](https://colab.research.google.com/github/Isi-dev/Google-Colab_Notebooks/blob/main/Basic_Wan2_1_VACE_&_CausVid_LoRA_4_Image_to_Video.ipynb) | Wan 2.1 VACE 14B & CausVid LoRA for faster Image to Video Generation
30 |
31 |
32 | ## Notebook Guide
33 | Watch how these notebooks work and what they can create in [this YouTube playlist](https://www.youtube.com/playlist?list=PLdi1sS5pbSYeA470Sb1wARR4OieCBIqMv)
34 |
35 | ## Support
36 | If you find these notebooks helpful, you can support me here: [](https://buymeacoffee.com/isiomo)
37 |
38 | ## Contributing
39 | I currently don't have the time to review pull requests. If you find any bugs or run into issues, please report them in the Issues section. Thanks!
40 |
41 |
42 |
--------------------------------------------------------------------------------
/Wan2_1_14B_I2V_GGUF_&_LoRA.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "nbformat": 4,
3 | "nbformat_minor": 0,
4 | "metadata": {
5 | "colab": {
6 | "provenance": [],
7 | "gpuType": "T4"
8 | },
9 | "kernelspec": {
10 | "name": "python3",
11 | "display_name": "Python 3"
12 | },
13 | "language_info": {
14 | "name": "python"
15 | },
16 | "accelerator": "GPU"
17 | },
18 | "cells": [
19 | {
20 | "cell_type": "markdown",
21 | "source": [
22 | "# **WAN IMAGE TO VIDEO WITH Q4 & Q6 GGUF MODELS AND LORA SUPPORT**\n",
23 | "- You can use the free T4 GPU to run this notebook with the default Q4 GGUF model. I recommend a higher-tier GPU for the Q6 GGUF model.\n",
24 | "- **To use a LoRA, put its Hugging Face download link in the `lora_hf_download_url` textbox, or select the `download_lora_from_civitai` checkbox and enter the LoRA's download link and your Civitai token, before running the `Prepare Environment` cell. Remember to describe the main subject of the image and include the LoRA's trigger words in the prompt; for the default rotation LoRA, the trigger words are `r0t4tion 360 degrees rotation` (see the example cell below). You can get LoRAs from this Hugging Face repository: https://huggingface.co/collections/Remade-AI/wan21-14b-480p-i2v-loras-67d0e26f08092436b585919b and from Civitai: https://civitai.com/models. On Civitai, apply the `Wan Video` and `LoRA` filters to see the Wan LoRAs.**\n",
25 | "- Generating a video from this Flux image (https://comfyanonymous.github.io/ComfyUI_examples/flux/) with the default settings (512x512, 20 steps, 49 frames) using the Q4 GGUF model and the L4 GPU took about 10 minutes. Generating a video with the Q4 GGUF model and the free T4 GPU at 480x480, 20 steps, and 65 frames took about 33 minutes.\n",
26 | "- The videos are generated at 16fps. You can use the `Frame Interpolation` notebook in this GitHub repository (https://github.com/Isi-dev/Google-Colab_Notebooks) to increase the frame rate."
27 | ],
28 | "metadata": {
29 | "id": "D2lupAO0HdyH"
30 | }
31 | },
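{
"cell_type": "markdown",
"source": [
"*Added sketch (not part of the original workflow): hypothetical form values illustrating the LoRA setup described above. The download URL is the notebook's default rotation LoRA; the prompt subject is made up.*\n",
"\n",
"```python\n",
"# `Prepare Environment` form: Hugging Face LoRA download\n",
"lora_hf_download_url = \"https://huggingface.co/Remade-AI/Rotate/resolve/main/rotate_20_epochs.safetensors\"\n",
"download_lora_from_civitai = False   # set True (and fill in the URL and token) to download from Civitai instead\n",
"\n",
"# `Generate Video` form: describe the image's subject and include the LoRA trigger words\n",
"positive_prompt = \"A knight in silver armour performs a r0t4tion 360 degrees rotation.\"\n",
"```"
],
"metadata": {}
},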
32 | {
33 | "cell_type": "code",
34 | "execution_count": null,
35 | "metadata": {
36 | "id": "96675e45HYsu",
37 | "cellView": "form"
38 | },
39 | "outputs": [],
40 | "source": [
41 | "# @title Prepare Environment\n",
42 | "!pip install --upgrade --quiet torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118\n",
43 | "%cd /content\n",
44 | "\n",
45 | "!pip install -q torchsde einops diffusers accelerate xformers\n",
46 | "!pip install av\n",
47 | "!git clone https://github.com/Isi-dev/ComfyUI\n",
48 | "%cd /content/ComfyUI/custom_nodes\n",
49 | "!git clone https://github.com/Isi-dev/ComfyUI_GGUF.git\n",
50 | "%cd /content/ComfyUI/custom_nodes/ComfyUI_GGUF\n",
51 | "!pip install -r requirements.txt\n",
52 | "%cd /content/ComfyUI\n",
53 | "!apt -y install -qq aria2 ffmpeg\n",
54 | "\n",
55 | "useQ6 = False # @param {\"type\":\"boolean\"}\n",
56 | "\n",
57 | "lora = \"flyingEffect.safetensors\"\n",
58 | "\n",
59 | "lora_hf_download_url = \"https://huggingface.co/Remade-AI/Rotate/resolve/main/rotate_20_epochs.safetensors\"# @param {\"type\":\"string\"}\n",
60 | "download_lora_from_civitai = False # @param {\"type\":\"boolean\"}\n",
61 | "lora_civitai_download_url = \"https://civitai.com/api/download/models/1542806?type=Model&format=SafeTensor\"# @param {\"type\":\"string\"}\n",
62 | "civitai_token = \"Put your civitai token here\"# @param {\"type\":\"string\"}\n",
63 | "\n",
64 | "def download_with_aria2c(link, folder=\"/content/ComfyUI/models/loras\"):\n",
65 | " import os\n",
66 | "\n",
67 | " filename = link.split(\"/\")[-1]\n",
68 | " command = f\"aria2c --console-log-level=error -c -x 16 -s 16 -k 1M {link} -d {folder} -o {filename}\"\n",
69 | "\n",
70 | " print(\"Executing download command:\")\n",
71 | " print(command)\n",
72 | "\n",
73 | " os.makedirs(folder, exist_ok=True)\n",
74 | " get_ipython().system(command)\n",
75 | "\n",
76 | " return filename\n",
77 | "\n",
78 | "\n",
79 | "\n",
80 | "def download_civitai_model(civitai_link, civitai_token, folder=\"/content/ComfyUI/models/loras\"):\n",
81 | " import os\n",
82 | " import time\n",
83 | "\n",
84 | " os.makedirs(folder, exist_ok=True)\n",
85 | "\n",
86 | " try:\n",
87 | " model_id = civitai_link.split(\"/models/\")[1].split(\"?\")[0]\n",
88 | " except IndexError:\n",
89 | " raise ValueError(\"Invalid Civitai URL format. Please use a link like: https://civitai.com/api/download/models/1523247?...\")\n",
90 | "\n",
91 | " civitai_url = f\"https://civitai.com/api/download/models/{model_id}?type=Model&format=SafeTensor\"\n",
92 | " if civitai_token:\n",
93 | " civitai_url += f\"&token={civitai_token}\"\n",
94 | "\n",
95 | " timestamp = time.strftime(\"%Y%m%d_%H%M%S\")\n",
96 | " filename = f\"model_{timestamp}.safetensors\"\n",
97 | "\n",
98 | " full_path = os.path.join(folder, filename)\n",
99 | "\n",
100 | " download_command = f\"wget --max-redirect=10 --show-progress \\\"{civitai_url}\\\" -O \\\"{full_path}\\\"\"\n",
101 | " print(\"Downloading from Civitai...\")\n",
102 | "\n",
103 | " os.system(download_command)\n",
104 | "\n",
105 | " local_path = os.path.join(folder, filename)\n",
106 | " if os.path.exists(local_path) and os.path.getsize(local_path) > 0:\n",
107 | " print(f\"LoRA downloaded successfully: {local_path}\")\n",
108 | " else:\n",
109 | " print(f\"❌ LoRA download failed or file is empty: {local_path}\")\n",
110 | "\n",
111 | " return filename\n",
112 | "\n",
113 | "if download_lora_from_civitai:\n",
114 | " lora = download_civitai_model(lora_civitai_download_url, civitai_token)\n",
115 | "else:\n",
116 | " lora = download_with_aria2c(lora_hf_download_url)\n",
117 | "\n",
118 | "\n",
119 | "if useQ6:\n",
120 | " !aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/city96/Wan2.1-I2V-14B-480P-gguf/resolve/main/wan2.1-i2v-14b-480p-Q6_K.gguf -d /content/ComfyUI/models/unet -o wan2.1-i2v-14b-480p-Q6_K.gguf\n",
121 | "else:\n",
122 | " !aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/city96/Wan2.1-I2V-14B-480P-gguf/resolve/main/wan2.1-i2v-14b-480p-Q4_0.gguf -d /content/ComfyUI/models/unet -o wan2.1-i2v-14b-480p-Q4_0.gguf\n",
123 | "\n",
124 | "!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/resolve/main/split_files/text_encoders/umt5_xxl_fp8_e4m3fn_scaled.safetensors -d /content/ComfyUI/models/text_encoders -o umt5_xxl_fp8_e4m3fn_scaled.safetensors\n",
125 | "!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/resolve/main/split_files/vae/wan_2.1_vae.safetensors -d /content/ComfyUI/models/vae -o wan_2.1_vae.safetensors\n",
126 | "!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/resolve/main/split_files/clip_vision/clip_vision_h.safetensors -d /content/ComfyUI/models/clip_vision -o clip_vision_h.safetensors\n",
127 | "\n",
128 | "\n",
129 | "\n",
130 | "import torch\n",
131 | "import numpy as np\n",
132 | "from PIL import Image\n",
133 | "import gc\n",
134 | "import sys\n",
135 | "import random\n",
136 | "import os\n",
137 | "import imageio\n",
138 | "import subprocess\n",
139 | "from google.colab import files\n",
140 | "from IPython.display import display, HTML, Image as IPImage\n",
141 | "sys.path.insert(0, '/content/ComfyUI')\n",
142 | "\n",
143 | "from comfy import model_management\n",
144 | "\n",
145 | "from nodes import (\n",
146 | " CheckpointLoaderSimple,\n",
147 | " CLIPLoader,\n",
148 | " CLIPTextEncode,\n",
149 | " VAEDecode,\n",
150 | " VAELoader,\n",
151 | " KSampler,\n",
152 | " UNETLoader,\n",
153 | " LoadImage,\n",
154 | " CLIPVisionLoader,\n",
155 | " CLIPVisionEncode,\n",
156 | " LoraLoaderModelOnly\n",
157 | ")\n",
158 | "\n",
159 | "from custom_nodes.ComfyUI_GGUF.nodes import UnetLoaderGGUF\n",
160 | "from comfy_extras.nodes_model_advanced import ModelSamplingSD3\n",
161 | "from comfy_extras.nodes_images import SaveAnimatedWEBP\n",
162 | "from comfy_extras.nodes_video import SaveWEBM\n",
163 | "from comfy_extras.nodes_wan import WanImageToVideo\n",
164 | "\n",
165 | "# Initialize nodes\n",
166 | "unet_loader = UnetLoaderGGUF()\n",
167 | "model_sampling = ModelSamplingSD3()\n",
168 | "clip_loader = CLIPLoader()\n",
169 | "clip_encode_positive = CLIPTextEncode()\n",
170 | "clip_encode_negative = CLIPTextEncode()\n",
171 | "vae_loader = VAELoader()\n",
172 | "clip_vision_loader = CLIPVisionLoader()\n",
173 | "clip_vision_encode = CLIPVisionEncode()\n",
174 | "load_image = LoadImage()\n",
175 | "wan_image_to_video = WanImageToVideo()\n",
176 | "ksampler = KSampler()\n",
177 | "vae_decode = VAEDecode()\n",
178 | "save_webp = SaveAnimatedWEBP()\n",
179 | "save_webm = SaveWEBM()\n",
180 | "load_lora = LoraLoaderModelOnly()\n",
181 | "\n",
182 | "def clear_memory():\n",
183 | " gc.collect()\n",
184 | " if torch.cuda.is_available():\n",
185 | " torch.cuda.empty_cache()\n",
186 | " torch.cuda.ipc_collect()\n",
187 | " for obj in list(globals().values()):\n",
188 | " if torch.is_tensor(obj) or (hasattr(obj, \"data\") and torch.is_tensor(obj.data)):\n",
189 | " del obj\n",
190 | " gc.collect()\n",
191 | "\n",
192 | "def save_as_mp4(images, filename_prefix, fps, output_dir=\"/content/ComfyUI/output\"):\n",
193 | " os.makedirs(output_dir, exist_ok=True)\n",
194 | " output_path = f\"{output_dir}/{filename_prefix}.mp4\"\n",
195 | "\n",
196 | " frames = [(img.cpu().numpy() * 255).astype(np.uint8) for img in images]\n",
197 | "\n",
198 | " with imageio.get_writer(output_path, fps=fps) as writer:\n",
199 | " for frame in frames:\n",
200 | " writer.append_data(frame)\n",
201 | "\n",
202 | " return output_path\n",
203 | "\n",
204 | "def save_as_webp(images, filename_prefix, fps, quality=90, lossless=False, method=4, output_dir=\"/content/ComfyUI/output\"):\n",
205 | " \"\"\"Save images as animated WEBP using imageio.\"\"\"\n",
206 | " os.makedirs(output_dir, exist_ok=True)\n",
207 | " output_path = f\"{output_dir}/{filename_prefix}.webp\"\n",
208 | "\n",
209 | "\n",
210 | " frames = [(img.cpu().numpy() * 255).astype(np.uint8) for img in images]\n",
211 | "\n",
212 | "\n",
213 | " kwargs = {\n",
214 | " 'fps': int(fps),\n",
215 | " 'quality': int(quality),\n",
216 | " 'lossless': bool(lossless),\n",
217 | " 'method': int(method)\n",
218 | " }\n",
219 | "\n",
220 | " with imageio.get_writer(\n",
221 | " output_path,\n",
222 | " format='WEBP',\n",
223 | " mode='I',\n",
224 | " **kwargs\n",
225 | " ) as writer:\n",
226 | " for frame in frames:\n",
227 | " writer.append_data(frame)\n",
228 | "\n",
229 | " return output_path\n",
230 | "\n",
231 | "def save_as_webm(images, filename_prefix, fps, codec=\"vp9\", quality=32, output_dir=\"/content/ComfyUI/output\"):\n",
232 | " \"\"\"Save images as WEBM using imageio.\"\"\"\n",
233 | " os.makedirs(output_dir, exist_ok=True)\n",
234 | " output_path = f\"{output_dir}/{filename_prefix}.webm\"\n",
235 | "\n",
236 | "\n",
237 | " frames = [(img.cpu().numpy() * 255).astype(np.uint8) for img in images]\n",
238 | "\n",
239 | "\n",
240 | " kwargs = {\n",
241 | " 'fps': int(fps),\n",
242 | " 'quality': int(quality),\n",
243 | " 'codec': str(codec),\n",
244 | " 'output_params': ['-crf', str(int(quality))]\n",
245 | " }\n",
246 | "\n",
247 | " with imageio.get_writer(\n",
248 | " output_path,\n",
249 | " format='FFMPEG',\n",
250 | " mode='I',\n",
251 | " **kwargs\n",
252 | " ) as writer:\n",
253 | " for frame in frames:\n",
254 | " writer.append_data(frame)\n",
255 | "\n",
256 | " return output_path\n",
257 | "\n",
258 | "def save_as_image(image, filename_prefix, output_dir=\"/content/ComfyUI/output\"):\n",
259 | " \"\"\"Save single frame as PNG image.\"\"\"\n",
260 | " os.makedirs(output_dir, exist_ok=True)\n",
261 | " output_path = f\"{output_dir}/{filename_prefix}.png\"\n",
262 | "\n",
263 | " frame = (image.cpu().numpy() * 255).astype(np.uint8)\n",
264 | "\n",
265 | " Image.fromarray(frame).save(output_path)\n",
266 | "\n",
267 | " return output_path\n",
268 | "\n",
269 | "\n",
270 | "def upload_image():\n",
271 | " \"\"\"Handle image upload in Colab and store in /content/ComfyUI/input/\"\"\"\n",
272 | " from google.colab import files\n",
273 | " import os\n",
274 | " import shutil\n",
275 | "\n",
276 | " os.makedirs('/content/ComfyUI/input', exist_ok=True)\n",
277 | "\n",
278 | " uploaded = files.upload()\n",
279 | "\n",
280 | " # Move each uploaded file to ComfyUI input directory\n",
281 | " for filename in uploaded.keys():\n",
282 | " src_path = f'/content/ComfyUI/{filename}'\n",
283 | " dest_path = f'/content/ComfyUI/input/{filename}'\n",
284 | "\n",
285 | " shutil.move(src_path, dest_path)\n",
286 | " print(f\"Image saved to: {dest_path}\")\n",
287 | " return dest_path\n",
288 | "\n",
289 | " return None\n",
290 | "\n",
291 | "def generate_video(\n",
292 | " image_path: str = None,\n",
293 | " LoRA_Strength: float = 1.00,\n",
294 | " positive_prompt: str = \"a cute anime girl with massive fennec ears and a big fluffy tail wearing a maid outfit turning around\",\n",
295 | " negative_prompt: str = \"色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走\",\n",
296 | " width: int = 832,\n",
297 | " height: int = 480,\n",
298 | " seed: int = 82628696717253,\n",
299 | " steps: int = 20,\n",
300 | " cfg_scale: float = 1.0,\n",
301 | " sampler_name: str = \"uni_pc\",\n",
302 | " scheduler: str = \"simple\",\n",
303 | " frames: int = 33,\n",
304 | " fps: int = 16,\n",
305 | " output_format: str = \"mp4\",\n",
306 | " overwrite: bool = False\n",
307 | "):\n",
308 | "\n",
309 | " with torch.inference_mode():\n",
310 | " print(\"Loading Text_Encoder...\")\n",
311 | " clip = clip_loader.load_clip(\"umt5_xxl_fp8_e4m3fn_scaled.safetensors\", \"wan\", \"default\")[0]\n",
312 | "\n",
313 | " positive = clip_encode_positive.encode(clip, positive_prompt)[0]\n",
314 | " negative = clip_encode_negative.encode(clip, negative_prompt)[0]\n",
315 | "\n",
316 | " del clip\n",
317 | " torch.cuda.empty_cache()\n",
318 | " gc.collect()\n",
319 | "\n",
320 | " if image_path is None:\n",
321 | " print(\"Please upload an image file:\")\n",
322 | " image_path = upload_image()\n",
323 | " if image_path is None:\n",
324 | " print(\"No image uploaded!\")\n",
325 | " loaded_image = load_image.load_image(image_path)[0]\n",
326 | " clip_vision = clip_vision_loader.load_clip(\"clip_vision_h.safetensors\")[0]\n",
327 | " clip_vision_output = clip_vision_encode.encode(clip_vision, loaded_image, \"none\")[0]\n",
328 | "\n",
329 | " del clip_vision\n",
330 | " torch.cuda.empty_cache()\n",
331 | " gc.collect()\n",
332 | "\n",
333 | " print(\"Loading VAE...\")\n",
334 | " vae = vae_loader.load_vae(\"wan_2.1_vae.safetensors\")[0]\n",
335 | "\n",
336 | " positive_out, negative_out, latent = wan_image_to_video.encode(\n",
337 | " positive, negative, vae, width, height, frames, 1, loaded_image, clip_vision_output\n",
338 | " )\n",
339 | "\n",
340 | " print(\"Loading Unet Model...\")\n",
341 | " if useQ6:\n",
342 | " model = unet_loader.load_unet(\"wan2.1-i2v-14b-480p-Q6_K.gguf\")[0]\n",
343 | " else:\n",
344 | " model = unet_loader.load_unet(\"wan2.1-i2v-14b-480p-Q4_0.gguf\")[0]\n",
345 | " model = model_sampling.patch(model, 8)[0]\n",
346 | "\n",
347 | " print(\"Loading Lora...\")\n",
348 | " model = load_lora.load_lora_model_only(model, lora, LoRA_Strength)[0]\n",
349 | "\n",
350 | " print(\"Generating video...\")\n",
351 | " sampled = ksampler.sample(\n",
352 | " model=model,\n",
353 | " seed=seed,\n",
354 | " steps=steps,\n",
355 | " cfg=cfg_scale,\n",
356 | " sampler_name=sampler_name,\n",
357 | " scheduler=scheduler,\n",
358 | " positive=positive_out,\n",
359 | " negative=negative_out,\n",
360 | " latent_image=latent\n",
361 | " )[0]\n",
362 | "\n",
363 | " del model\n",
364 | " torch.cuda.empty_cache()\n",
365 | " gc.collect()\n",
366 | "\n",
367 | " try:\n",
368 | " print(\"Decoding latents...\")\n",
369 | " decoded = vae_decode.decode(vae, sampled)[0]\n",
370 | "\n",
371 | " del vae\n",
372 | " torch.cuda.empty_cache()\n",
373 | " gc.collect()\n",
374 | "\n",
375 | " output_path = \"\"\n",
376 | " import datetime\n",
377 | " base_name = \"ComfyUI\"\n",
378 | " if not overwrite:\n",
379 | " timestamp = datetime.datetime.now().strftime(\"%Y%m%d_%H%M%S\")\n",
380 | " base_name += f\"_{timestamp}\"\n",
381 | " if frames == 1:\n",
382 | " print(\"Single frame detected - saving as PNG image...\")\n",
383 | " output_path = save_as_image(decoded[0], base_name)\n",
384 | " # print(f\"Image saved as PNG: {output_path}\")\n",
385 | "\n",
386 | " display(IPImage(filename=output_path))\n",
387 | " else:\n",
388 | " if output_format.lower() == \"webm\":\n",
389 | " print(\"Saving as WEBM...\")\n",
390 | " output_path = save_as_webm(\n",
391 | " decoded,\n",
392 | " base_name,\n",
393 | " fps=fps,\n",
394 | " codec=\"vp9\",\n",
395 | " quality=10\n",
396 | " )\n",
397 | " elif output_format.lower() == \"mp4\":\n",
398 | "\n",
399 | " print(\"Saving as MP4...\")\n",
400 | "\n",
401 | " output_path = save_as_mp4(decoded, base_name, fps)\n",
402 | "\n",
403 | " # output_path = save_as_mp4(decoded, \"ComfyUI\", fps)\n",
404 | " else:\n",
405 | " raise ValueError(f\"Unsupported output format: {output_format}\")\n",
406 | "\n",
407 | " # print(f\"Video saved as {output_format.upper()}: {output_path}\")\n",
408 | "\n",
409 | " display_video(output_path)\n",
410 | "\n",
411 | " except Exception as e:\n",
412 | " print(f\"Error during decoding/saving: {str(e)}\")\n",
413 | " raise\n",
414 | " finally:\n",
415 | " clear_memory()\n",
416 | "\n",
417 | "def display_video(video_path):\n",
418 | " from IPython.display import HTML\n",
419 | " from base64 import b64encode\n",
420 | "\n",
421 | " video_data = open(video_path,'rb').read()\n",
422 | "\n",
423 | " # Determine MIME type based on file extension\n",
424 | " if video_path.lower().endswith('.mp4'):\n",
425 | " mime_type = \"video/mp4\"\n",
426 | " elif video_path.lower().endswith('.webm'):\n",
427 | " mime_type = \"video/webm\"\n",
428 | " elif video_path.lower().endswith('.webp'):\n",
429 | " mime_type = \"image/webp\"\n",
430 | " else:\n",
431 | " mime_type = \"video/mp4\" # default\n",
432 | "\n",
433 | " data_url = f\"data:{mime_type};base64,\" + b64encode(video_data).decode()\n",
434 | "\n",
435 | " display(HTML(f\"\"\"\n",
436 | " \n",
439 | " \"\"\"))\n",
440 | "\n",
441 | "print(\"✅ Environment Setup Complete!\")\n",
442 | "\n",
443 | "\n",
444 | "\n"
445 | ]
446 | },
447 | {
448 | "cell_type": "markdown",
449 | "source": [],
450 | "metadata": {
451 | "id": "Qsx-E5IV4PrR"
452 | }
453 | },
454 | {
455 | "cell_type": "code",
456 | "source": [
457 | "# @title Generate Video\n",
458 | "\n",
459 | "LoRA_Strength = 1.0 # @param {\"type\":\"slider\",\"min\":-100,\"max\":100,\"step\":0.01}\n",
460 | "positive_prompt = \"Anime girl with massive fennec ears and a big fluffy tail wearing a maid outfit performs a r0t4tion 360 degrees rotation.\" # @param {\"type\":\"string\"}\n",
461 | "negative_prompt = \"色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走\" # @param {\"type\":\"string\"}\n",
462 | "width = 512 # @param {\"type\":\"number\"}\n",
463 | "height = 512 # @param {\"type\":\"number\"}\n",
464 | "seed = 0 # @param {\"type\":\"integer\"}\n",
465 | "steps = 20 # @param {\"type\":\"integer\", \"min\":1, \"max\":100}\n",
466 | "cfg_scale = 3 # @param {\"type\":\"number\", \"min\":1, \"max\":20}\n",
467 | "sampler_name = \"uni_pc\" # @param [\"uni_pc\", \"euler\", \"dpmpp_2m\", \"ddim\", \"lms\"]\n",
468 | "scheduler = \"simple\" # @param [\"simple\", \"normal\", \"karras\", \"exponential\"]\n",
469 | "frames = 49 # @param {\"type\":\"integer\", \"min\":1, \"max\":120}\n",
470 | "fps = 16 # @param {\"type\":\"integer\", \"min\":1, \"max\":60}\n",
471 | "output_format = \"mp4\" # @param [\"mp4\", \"webm\"]\n",
472 | "overwrite_previous_video = False # @param {type:\"boolean\"}\n",
473 | "\n",
474 | "import random\n",
475 | "seed = seed if seed != 0 else random.randint(0, 2**32 - 1)\n",
476 | "print(f\"Using seed: {seed}\")\n",
477 | "\n",
478 | "# with torch.inference_mode():\n",
479 | "generate_video(\n",
480 | " image_path=None,\n",
481 | " LoRA_Strength=LoRA_Strength,\n",
482 | " positive_prompt=positive_prompt,\n",
483 | " negative_prompt=negative_prompt,\n",
484 | " width=width,\n",
485 | " height=height,\n",
486 | " seed=seed,\n",
487 | " steps=steps,\n",
488 | " cfg_scale=cfg_scale,\n",
489 | " sampler_name=sampler_name,\n",
490 | " scheduler=scheduler,\n",
491 | " frames=frames,\n",
492 | " fps=fps,\n",
493 | " output_format=output_format,\n",
494 | " overwrite=overwrite_previous_video\n",
495 | ")\n",
496 | "clear_memory()"
497 | ],
498 | "metadata": {
499 | "cellView": "form",
500 | "id": "fcsJjujta1K9"
501 | },
502 | "execution_count": null,
503 | "outputs": []
504 | }
505 | ]
506 | }
--------------------------------------------------------------------------------
/Wan2_1_14B_I2V_GGUF_Free.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "nbformat": 4,
3 | "nbformat_minor": 0,
4 | "metadata": {
5 | "colab": {
6 | "provenance": [],
7 | "gpuType": "T4"
8 | },
9 | "kernelspec": {
10 | "name": "python3",
11 | "display_name": "Python 3"
12 | },
13 | "language_info": {
14 | "name": "python"
15 | },
16 | "accelerator": "GPU"
17 | },
18 | "cells": [
19 | {
20 | "cell_type": "markdown",
21 | "source": [
22 | "# **WAN IMAGE TO VIDEO WITH Q4 & Q6 GGUF MODELS**\n",
23 | "- You can use the free T4 GPU to run this notebook with the default Q4 GGUF model. I recommend a higher-tier GPU for the Q6 GGUF model. The main Hugging Face repository for the I2V 14B-480P GGUF models is: https://huggingface.co/city96/Wan2.1-I2V-14B-480P-gguf/tree/main\n",
24 | "- Generating a video from this flux image (https://comfyanonymous.github.io/ComfyUI_examples/flux/) with the default settings (512x512, 20 steps, 49 frames) using the Q4 GGUF model and the free T4 GPU took about 26 minutes.\n",
25 | "- Generating a video from a 720x1280 Image with a setting of 480x832, 20 steps, and 33 frames using the Q4 GGUF model and the free T4 GPU took 26 minutes 30 seconds. Generating the same video using the Q6 GGUF model and the L4 GPU took 10 minutes 10 seconds.\n",
26 | "- The videos are generated at 16fps. You can use the `Frame Interpolation` notebook in this GitHub repository (https://github.com/Isi-dev/Google-Colab_Notebooks) to increase the frame rate."
27 | ],
28 | "metadata": {
29 | "id": "_2gwCJ8f3mjA"
30 | }
31 | },
32 | {
33 | "cell_type": "code",
34 | "execution_count": null,
35 | "metadata": {
36 | "id": "t089iwSddWDL"
37 | },
38 | "outputs": [],
39 | "source": [
40 | "# @title Prepare Environment\n",
41 | "!pip install --upgrade --quiet torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118\n",
42 | "%cd /content\n",
43 | "\n",
44 | "!pip install -q torchsde einops diffusers accelerate xformers\n",
45 | "!pip install av\n",
46 | "!git clone https://github.com/Isi-dev/ComfyUI\n",
47 | "%cd /content/ComfyUI/custom_nodes\n",
48 | "!git clone https://github.com/Isi-dev/ComfyUI_GGUF.git\n",
49 | "%cd /content/ComfyUI/custom_nodes/ComfyUI_GGUF\n",
50 | "!pip install -r requirements.txt\n",
51 | "%cd /content/ComfyUI\n",
52 | "!apt -y install -qq aria2 ffmpeg\n",
53 | "\n",
54 | "useQ6 = False # @param {\"type\":\"boolean\"}\n",
55 | "\n",
56 | "if useQ6:\n",
57 | " !aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/city96/Wan2.1-I2V-14B-480P-gguf/resolve/main/wan2.1-i2v-14b-480p-Q6_K.gguf -d /content/ComfyUI/models/unet -o wan2.1-i2v-14b-480p-Q6_K.gguf\n",
58 | "else:\n",
59 | " !aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/city96/Wan2.1-I2V-14B-480P-gguf/resolve/main/wan2.1-i2v-14b-480p-Q4_0.gguf -d /content/ComfyUI/models/unet -o wan2.1-i2v-14b-480p-Q4_0.gguf\n",
60 | "\n",
61 | "!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/resolve/main/split_files/text_encoders/umt5_xxl_fp8_e4m3fn_scaled.safetensors -d /content/ComfyUI/models/text_encoders -o umt5_xxl_fp8_e4m3fn_scaled.safetensors\n",
62 | "!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/resolve/main/split_files/vae/wan_2.1_vae.safetensors -d /content/ComfyUI/models/vae -o wan_2.1_vae.safetensors\n",
63 | "!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/resolve/main/split_files/clip_vision/clip_vision_h.safetensors -d /content/ComfyUI/models/clip_vision -o clip_vision_h.safetensors\n",
64 | "\n",
65 | "import torch\n",
66 | "import numpy as np\n",
67 | "from PIL import Image\n",
68 | "import gc\n",
69 | "import sys\n",
70 | "import random\n",
71 | "import os\n",
72 | "import imageio\n",
73 | "import subprocess\n",
74 | "from google.colab import files\n",
75 | "from IPython.display import display, HTML, Image as IPImage\n",
76 | "sys.path.insert(0, '/content/ComfyUI')\n",
77 | "\n",
78 | "from comfy import model_management\n",
79 | "\n",
80 | "from nodes import (\n",
81 | " CheckpointLoaderSimple,\n",
82 | " CLIPLoader,\n",
83 | " CLIPTextEncode,\n",
84 | " VAEDecode,\n",
85 | " VAELoader,\n",
86 | " KSampler,\n",
87 | " UNETLoader,\n",
88 | " LoadImage,\n",
89 | " CLIPVisionLoader,\n",
90 | " CLIPVisionEncode\n",
91 | ")\n",
92 | "\n",
93 | "from custom_nodes.ComfyUI_GGUF.nodes import UnetLoaderGGUF\n",
94 | "from comfy_extras.nodes_model_advanced import ModelSamplingSD3\n",
95 | "from comfy_extras.nodes_images import SaveAnimatedWEBP\n",
96 | "from comfy_extras.nodes_video import SaveWEBM\n",
97 | "from comfy_extras.nodes_wan import WanImageToVideo\n",
98 | "\n",
99 | "# Initialize nodes\n",
100 | "unet_loader = UnetLoaderGGUF()\n",
101 | "model_sampling = ModelSamplingSD3()\n",
102 | "clip_loader = CLIPLoader()\n",
103 | "clip_encode_positive = CLIPTextEncode()\n",
104 | "clip_encode_negative = CLIPTextEncode()\n",
105 | "vae_loader = VAELoader()\n",
106 | "clip_vision_loader = CLIPVisionLoader()\n",
107 | "clip_vision_encode = CLIPVisionEncode()\n",
108 | "load_image = LoadImage()\n",
109 | "wan_image_to_video = WanImageToVideo()\n",
110 | "ksampler = KSampler()\n",
111 | "vae_decode = VAEDecode()\n",
112 | "save_webp = SaveAnimatedWEBP()\n",
113 | "save_webm = SaveWEBM()\n",
114 | "\n",
115 | "def clear_memory():\n",
116 | " gc.collect()\n",
117 | " if torch.cuda.is_available():\n",
118 | " torch.cuda.empty_cache()\n",
119 | " torch.cuda.ipc_collect()\n",
120 | " for obj in list(globals().values()):\n",
121 | " if torch.is_tensor(obj) or (hasattr(obj, \"data\") and torch.is_tensor(obj.data)):\n",
122 | " del obj\n",
123 | " gc.collect()\n",
124 | "\n",
125 | "def save_as_mp4(images, filename_prefix, fps, output_dir=\"/content/ComfyUI/output\"):\n",
126 | " os.makedirs(output_dir, exist_ok=True)\n",
127 | " output_path = f\"{output_dir}/{filename_prefix}.mp4\"\n",
128 | "\n",
129 | " frames = [(img.cpu().numpy() * 255).astype(np.uint8) for img in images]\n",
130 | "\n",
131 | " with imageio.get_writer(output_path, fps=fps) as writer:\n",
132 | " for frame in frames:\n",
133 | " writer.append_data(frame)\n",
134 | "\n",
135 | " return output_path\n",
136 | "\n",
137 | "def save_as_webp(images, filename_prefix, fps, quality=90, lossless=False, method=4, output_dir=\"/content/ComfyUI/output\"):\n",
138 | " \"\"\"Save images as animated WEBP using imageio.\"\"\"\n",
139 | " os.makedirs(output_dir, exist_ok=True)\n",
140 | " output_path = f\"{output_dir}/{filename_prefix}.webp\"\n",
141 | "\n",
142 | "\n",
143 | " frames = [(img.cpu().numpy() * 255).astype(np.uint8) for img in images]\n",
144 | "\n",
145 | "\n",
146 | " kwargs = {\n",
147 | " 'fps': int(fps),\n",
148 | " 'quality': int(quality),\n",
149 | " 'lossless': bool(lossless),\n",
150 | " 'method': int(method)\n",
151 | " }\n",
152 | "\n",
153 | " with imageio.get_writer(\n",
154 | " output_path,\n",
155 | " format='WEBP',\n",
156 | " mode='I',\n",
157 | " **kwargs\n",
158 | " ) as writer:\n",
159 | " for frame in frames:\n",
160 | " writer.append_data(frame)\n",
161 | "\n",
162 | " return output_path\n",
163 | "\n",
164 | "def save_as_webm(images, filename_prefix, fps, codec=\"vp9\", quality=32, output_dir=\"/content/ComfyUI/output\"):\n",
165 | " \"\"\"Save images as WEBM using imageio.\"\"\"\n",
166 | " os.makedirs(output_dir, exist_ok=True)\n",
167 | " output_path = f\"{output_dir}/{filename_prefix}.webm\"\n",
168 | "\n",
169 | "\n",
170 | " frames = [(img.cpu().numpy() * 255).astype(np.uint8) for img in images]\n",
171 | "\n",
172 | "\n",
173 | " kwargs = {\n",
174 | " 'fps': int(fps),\n",
175 | " 'quality': int(quality),\n",
176 | " 'codec': str(codec),\n",
177 | " 'output_params': ['-crf', str(int(quality))]\n",
178 | " }\n",
179 | "\n",
180 | " with imageio.get_writer(\n",
181 | " output_path,\n",
182 | " format='FFMPEG',\n",
183 | " mode='I',\n",
184 | " **kwargs\n",
185 | " ) as writer:\n",
186 | " for frame in frames:\n",
187 | " writer.append_data(frame)\n",
188 | "\n",
189 | " return output_path\n",
190 | "\n",
191 | "def save_as_image(image, filename_prefix, output_dir=\"/content/ComfyUI/output\"):\n",
192 | " \"\"\"Save single frame as PNG image.\"\"\"\n",
193 | " os.makedirs(output_dir, exist_ok=True)\n",
194 | " output_path = f\"{output_dir}/{filename_prefix}.png\"\n",
195 | "\n",
196 | " frame = (image.cpu().numpy() * 255).astype(np.uint8)\n",
197 | "\n",
198 | " Image.fromarray(frame).save(output_path)\n",
199 | "\n",
200 | " return output_path\n",
201 | "\n",
202 | "\n",
203 | "def upload_image():\n",
204 | " \"\"\"Handle image upload in Colab and store in /content/ComfyUI/input/\"\"\"\n",
205 | " from google.colab import files\n",
206 | " import os\n",
207 | " import shutil\n",
208 | "\n",
209 | " os.makedirs('/content/ComfyUI/input', exist_ok=True)\n",
210 | "\n",
211 | " uploaded = files.upload()\n",
212 | "\n",
213 | " # Move each uploaded file to ComfyUI input directory\n",
214 | " for filename in uploaded.keys():\n",
215 | " src_path = f'/content/ComfyUI/{filename}'\n",
216 | " dest_path = f'/content/ComfyUI/input/{filename}'\n",
217 | "\n",
218 | " shutil.move(src_path, dest_path)\n",
219 | " print(f\"Image saved to: {dest_path}\")\n",
220 | " return dest_path\n",
221 | "\n",
222 | " return None\n",
223 | "\n",
224 | "def generate_video(\n",
225 | " image_path: str = None,\n",
226 | " positive_prompt: str = \"a cute anime girl with massive fennec ears and a big fluffy tail wearing a maid outfit turning around\",\n",
227 | " negative_prompt: str = \"色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走\",\n",
228 | " width: int = 832,\n",
229 | " height: int = 480,\n",
230 | " seed: int = 82628696717253,\n",
231 | " steps: int = 20,\n",
232 | " cfg_scale: float = 1.0,\n",
233 | " sampler_name: str = \"uni_pc\",\n",
234 | " scheduler: str = \"simple\",\n",
235 | " frames: int = 33,\n",
236 | " fps: int = 16,\n",
237 | " output_format: str = \"mp4\"\n",
238 | "):\n",
239 | "\n",
240 | " with torch.inference_mode():\n",
241 | " print(\"Loading Text_Encoder...\")\n",
242 | " clip = clip_loader.load_clip(\"umt5_xxl_fp8_e4m3fn_scaled.safetensors\", \"wan\", \"default\")[0]\n",
243 | "\n",
244 | " positive = clip_encode_positive.encode(clip, positive_prompt)[0]\n",
245 | " negative = clip_encode_negative.encode(clip, negative_prompt)[0]\n",
246 | "\n",
247 | " del clip\n",
248 | " torch.cuda.empty_cache()\n",
249 | " gc.collect()\n",
250 | "\n",
251 | " if image_path is None:\n",
252 | " print(\"Please upload an image file:\")\n",
253 | " image_path = upload_image()\n",
254 | " if image_path is None:\n",
255 | " print(\"No image uploaded!\")\n",
256 | " loaded_image = load_image.load_image(image_path)[0]\n",
257 | " clip_vision = clip_vision_loader.load_clip(\"clip_vision_h.safetensors\")[0]\n",
258 | " clip_vision_output = clip_vision_encode.encode(clip_vision, loaded_image, \"none\")[0]\n",
259 | "\n",
260 | " del clip_vision\n",
261 | " torch.cuda.empty_cache()\n",
262 | " gc.collect()\n",
263 | "\n",
264 | " print(\"Loading VAE...\")\n",
265 | " vae = vae_loader.load_vae(\"wan_2.1_vae.safetensors\")[0]\n",
266 | "\n",
267 | " positive_out, negative_out, latent = wan_image_to_video.encode(\n",
268 | " positive, negative, vae, width, height, frames, 1, loaded_image, clip_vision_output\n",
269 | " )\n",
270 | "\n",
271 | " print(\"Loading Unet Model...\")\n",
272 | " if useQ6:\n",
273 | " model = unet_loader.load_unet(\"wan2.1-i2v-14b-480p-Q6_K.gguf\")[0]\n",
274 | " else:\n",
275 | " model = unet_loader.load_unet(\"wan2.1-i2v-14b-480p-Q4_0.gguf\")[0]\n",
276 | " model = model_sampling.patch(model, 8)[0]\n",
277 | "\n",
278 | " print(\"Generating video...\")\n",
279 | " sampled = ksampler.sample(\n",
280 | " model=model,\n",
281 | " seed=seed,\n",
282 | " steps=steps,\n",
283 | " cfg=cfg_scale,\n",
284 | " sampler_name=sampler_name,\n",
285 | " scheduler=scheduler,\n",
286 | " positive=positive_out,\n",
287 | " negative=negative_out,\n",
288 | " latent_image=latent\n",
289 | " )[0]\n",
290 | "\n",
291 | " del model\n",
292 | " torch.cuda.empty_cache()\n",
293 | " gc.collect()\n",
294 | "\n",
295 | " try:\n",
296 | " print(\"Decoding latents...\")\n",
297 | " decoded = vae_decode.decode(vae, sampled)[0]\n",
298 | "\n",
299 | " del vae\n",
300 | " torch.cuda.empty_cache()\n",
301 | " gc.collect()\n",
302 | "\n",
303 | " output_path = \"\"\n",
304 | " if frames == 1:\n",
305 | " print(\"Single frame detected - saving as PNG image...\")\n",
306 | " output_path = save_as_image(decoded[0], \"ComfyUI\")\n",
307 | " # print(f\"Image saved as PNG: {output_path}\")\n",
308 | "\n",
309 | " display(IPImage(filename=output_path))\n",
310 | " else:\n",
311 | " if output_format.lower() == \"webm\":\n",
312 | " print(\"Saving as WEBM...\")\n",
313 | " output_path = save_as_webm(\n",
314 | " decoded,\n",
315 | " \"ComfyUI\",\n",
316 | " fps=fps,\n",
317 | " codec=\"vp9\",\n",
318 | " quality=10\n",
319 | " )\n",
320 | " elif output_format.lower() == \"mp4\":\n",
321 | " print(\"Saving as MP4...\")\n",
322 | " output_path = save_as_mp4(decoded, \"ComfyUI\", fps)\n",
323 | " else:\n",
324 | " raise ValueError(f\"Unsupported output format: {output_format}\")\n",
325 | "\n",
326 | " # print(f\"Video saved as {output_format.upper()}: {output_path}\")\n",
327 | "\n",
328 | " display_video(output_path)\n",
329 | "\n",
330 | " except Exception as e:\n",
331 | " print(f\"Error during decoding/saving: {str(e)}\")\n",
332 | " raise\n",
333 | " finally:\n",
334 | " clear_memory()\n",
335 | "\n",
336 | "def display_video(video_path):\n",
337 | " from IPython.display import HTML\n",
338 | " from base64 import b64encode\n",
339 | "\n",
340 | " video_data = open(video_path,'rb').read()\n",
341 | "\n",
342 | " # Determine MIME type based on file extension\n",
343 | " if video_path.lower().endswith('.mp4'):\n",
344 | " mime_type = \"video/mp4\"\n",
345 | " elif video_path.lower().endswith('.webm'):\n",
346 | " mime_type = \"video/webm\"\n",
347 | " elif video_path.lower().endswith('.webp'):\n",
348 | " mime_type = \"image/webp\"\n",
349 | " else:\n",
350 | " mime_type = \"video/mp4\" # default\n",
351 | "\n",
352 | " data_url = f\"data:{mime_type};base64,\" + b64encode(video_data).decode()\n",
353 | "\n",
354 | " display(HTML(f\"\"\"\n",
355 | " \n",
358 | " \"\"\"))\n",
359 | "\n",
360 | "print(\"✅ Environment Setup Complete!\")\n",
361 | "\n",
362 | "\n",
363 | "\n"
364 | ]
365 | },
366 | {
367 | "cell_type": "code",
368 | "source": [
369 | "# @title Generate Video\n",
370 | "\n",
371 | "positive_prompt = \"A cute anime girl with massive fennec ears and a big fluffy tail wearing a maid outfit turning around.\" # @param {\"type\":\"string\"}\n",
372 | "negative_prompt = \"色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走\" # @param {\"type\":\"string\"}\n",
373 | "width = 512 # @param {\"type\":\"number\"}\n",
374 | "height = 512 # @param {\"type\":\"number\"}\n",
375 | "seed = 0 # @param {\"type\":\"integer\"}\n",
376 | "steps = 20 # @param {\"type\":\"integer\", \"min\":1, \"max\":100}\n",
377 | "cfg_scale = 3 # @param {\"type\":\"number\", \"min\":1, \"max\":20}\n",
378 | "sampler_name = \"uni_pc\" # @param [\"uni_pc\", \"euler\", \"dpmpp_2m\", \"ddim\", \"lms\"]\n",
379 | "scheduler = \"simple\" # @param [\"simple\", \"normal\", \"karras\", \"exponential\"]\n",
380 | "frames = 49 # @param {\"type\":\"integer\", \"min\":1, \"max\":120}\n",
381 | "fps = 16 # @param {\"type\":\"integer\", \"min\":1, \"max\":60}\n",
382 | "output_format = \"mp4\" # @param [\"mp4\", \"webm\"]\n",
383 | "\n",
384 | "import random\n",
385 | "seed = seed if seed != 0 else random.randint(0, 2**32 - 1)\n",
386 | "print(f\"Using seed: {seed}\")\n",
387 | "\n",
388 | "# with torch.inference_mode():\n",
389 | "generate_video(\n",
390 | " image_path=None,\n",
391 | " positive_prompt=positive_prompt,\n",
392 | " negative_prompt=negative_prompt,\n",
393 | " width=width,\n",
394 | " height=height,\n",
395 | " seed=seed,\n",
396 | " steps=steps,\n",
397 | " cfg_scale=cfg_scale,\n",
398 | " sampler_name=sampler_name,\n",
399 | " scheduler=scheduler,\n",
400 | " frames=frames,\n",
401 | " fps=fps,\n",
402 | " output_format=output_format\n",
403 | ")\n",
404 | "clear_memory()"
405 | ],
406 | "metadata": {
407 | "cellView": "form",
408 | "id": "wo8w6tKerJMJ"
409 | },
410 | "execution_count": null,
411 | "outputs": []
412 | }
413 | ]
414 | }
--------------------------------------------------------------------------------
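The form cell in the notebook above treats seed == 0 as "pick a random seed" and switches to PNG output when frames == 1. Below is a minimal usage sketch, assuming the setup cell above has already been run (so useQ6, generate_video and clear_memory are defined); the image path is a hypothetical example, and passing image_path=None instead triggers the interactive upload prompt:

import random

seed = random.randint(0, 2**32 - 1)  # same randomization the form cell applies when seed == 0
generate_video(
    image_path="/content/ComfyUI/input/ref.png",  # hypothetical file; use None to upload interactively
    positive_prompt="a cute anime girl turning around",
    width=512, height=512,
    seed=seed, steps=20, cfg_scale=1.0,
    frames=1,              # a single frame is saved and displayed as a PNG instead of a video
    output_format="mp4",   # ignored when frames == 1
)
clear_memory()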
/Wan2_1_14B_T2V_GGUF_Free.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "nbformat": 4,
3 | "nbformat_minor": 0,
4 | "metadata": {
5 | "colab": {
6 | "provenance": [],
7 | "gpuType": "T4"
8 | },
9 | "kernelspec": {
10 | "name": "python3",
11 | "display_name": "Python 3"
12 | },
13 | "language_info": {
14 | "name": "python"
15 | },
16 | "accelerator": "GPU"
17 | },
18 | "cells": [
19 | {
20 | "cell_type": "markdown",
21 | "source": [
22 | "# **Wan2.1-14b Quantized GGUF Models for Text to Video**"
23 | ],
24 | "metadata": {
25 | "id": "f4p1ysFKMbs_"
26 | }
27 | },
28 | {
29 | "cell_type": "markdown",
30 | "source": [
31 | "- You can use the free T4 GPU to run this. For faster video generation, use higher GPUs.\n",
32 | "- Generating a video with 32 frames at 512 by 910 resolution can take up to 28 minutes on the T4 GPU using the Q5 model.\n",
33 | "- It's possible to generate up to 24 frames at 480 by 832 resolution for free using the Q6 model. The generation takes about 20 minutes.\n",
34 | "- Set `frames` to 1 to generate an image. You can generate high quality images with 720 by 1280 resolution using either the Q5 or Q6 gguf models. The generation takes less than 7 minutes."
35 | ],
36 | "metadata": {
37 | "id": "EBB00lC6q-DA"
38 | }
39 | },
40 | {
41 | "cell_type": "code",
42 | "source": [
43 | "# @title Prepare Environment\n",
44 | "!pip install --upgrade --quiet torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118\n",
45 | "%cd /content\n",
46 | "\n",
47 | "!pip install -q torchsde einops diffusers accelerate xformers\n",
48 | "!pip install av\n",
49 | "!git clone https://github.com/Isi-dev/ComfyUI\n",
50 | "%cd /content/ComfyUI/custom_nodes\n",
51 | "!git clone https://github.com/Isi-dev/ComfyUI_GGUF.git\n",
52 | "%cd /content/ComfyUI/custom_nodes/ComfyUI_GGUF\n",
53 | "!pip install -r requirements.txt\n",
54 | "%cd /content/ComfyUI\n",
55 | "!apt -y install -qq aria2 ffmpeg\n",
56 | "\n",
57 | "useQ6 = False # @param {\"type\":\"boolean\"}\n",
58 | "\n",
59 | "if useQ6:\n",
60 | " !aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/city96/Wan2.1-T2V-14B-gguf/resolve/main/wan2.1-t2v-14b-Q6_K.gguf -d /content/ComfyUI/models/unet -o wan2.1-t2v-14b-Q6_K.gguf\n",
61 | "else:\n",
62 | " !aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/city96/Wan2.1-T2V-14B-gguf/resolve/main/wan2.1-t2v-14b-Q5_0.gguf -d /content/ComfyUI/models/unet -o wan2.1-t2v-14b-Q5_0.gguf\n",
63 | "\n",
64 | "!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/resolve/main/split_files/text_encoders/umt5_xxl_fp8_e4m3fn_scaled.safetensors -d /content/ComfyUI/models/text_encoders -o umt5_xxl_fp8_e4m3fn_scaled.safetensors\n",
65 | "!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/resolve/main/split_files/vae/wan_2.1_vae.safetensors -d /content/ComfyUI/models/vae -o wan_2.1_vae.safetensors\n",
66 | "\n",
67 | "import torch\n",
68 | "import numpy as np\n",
69 | "from PIL import Image\n",
70 | "import gc\n",
71 | "import sys\n",
72 | "import random\n",
73 | "import os\n",
74 | "import imageio\n",
75 | "import subprocess\n",
76 | "from google.colab import files\n",
77 | "from IPython.display import display, HTML, Image as IPImage\n",
78 | "sys.path.insert(0, '/content/ComfyUI')\n",
79 | "\n",
80 | "from comfy import model_management\n",
81 | "\n",
82 | "from nodes import (\n",
83 | " CheckpointLoaderSimple,\n",
84 | " CLIPLoader,\n",
85 | " CLIPTextEncode,\n",
86 | " VAEDecode,\n",
87 | " VAELoader,\n",
88 | " KSampler,\n",
89 | " UNETLoader\n",
90 | ")\n",
91 | "\n",
92 | "from custom_nodes.ComfyUI_GGUF.nodes import UnetLoaderGGUF\n",
93 | "from comfy_extras.nodes_model_advanced import ModelSamplingSD3\n",
94 | "from comfy_extras.nodes_hunyuan import EmptyHunyuanLatentVideo\n",
95 | "from comfy_extras.nodes_images import SaveAnimatedWEBP\n",
96 | "from comfy_extras.nodes_video import SaveWEBM\n",
97 | "\n",
98 | "# unet_loader = UNETLoader()\n",
99 | "unet_loader = UnetLoaderGGUF()\n",
100 | "# model_sampling = ModelSamplingSD3()\n",
101 | "clip_loader = CLIPLoader()\n",
102 | "clip_encode_positive = CLIPTextEncode()\n",
103 | "clip_encode_negative = CLIPTextEncode()\n",
104 | "vae_loader = VAELoader()\n",
105 | "empty_latent_video = EmptyHunyuanLatentVideo()\n",
106 | "ksampler = KSampler()\n",
107 | "vae_decode = VAEDecode()\n",
108 | "save_webp = SaveAnimatedWEBP()\n",
109 | "save_webm = SaveWEBM()\n",
110 | "\n",
111 | "def clear_memory():\n",
112 | " gc.collect()\n",
113 | " if torch.cuda.is_available():\n",
114 | " torch.cuda.empty_cache()\n",
115 | " torch.cuda.ipc_collect()\n",
116 | " for obj in list(globals().values()):\n",
117 | " if torch.is_tensor(obj) or (hasattr(obj, \"data\") and torch.is_tensor(obj.data)):\n",
118 | " del obj\n",
119 | " gc.collect()\n",
120 | "\n",
121 | "def save_as_mp4(images, filename_prefix, fps, output_dir=\"/content/ComfyUI/output\"):\n",
122 | " os.makedirs(output_dir, exist_ok=True)\n",
123 | " output_path = f\"{output_dir}/{filename_prefix}.mp4\"\n",
124 | "\n",
125 | " frames = [(img.cpu().numpy() * 255).astype(np.uint8) for img in images]\n",
126 | "\n",
127 | " with imageio.get_writer(output_path, fps=fps) as writer:\n",
128 | " for frame in frames:\n",
129 | " writer.append_data(frame)\n",
130 | "\n",
131 | " return output_path\n",
132 | "\n",
133 | "def save_as_webp(images, filename_prefix, fps, quality=90, lossless=False, method=4, output_dir=\"/content/ComfyUI/output\"):\n",
134 | " \"\"\"Save images as animated WEBP using imageio.\"\"\"\n",
135 | " os.makedirs(output_dir, exist_ok=True)\n",
136 | " output_path = f\"{output_dir}/{filename_prefix}.webp\"\n",
137 | "\n",
138 | "\n",
139 | " frames = [(img.cpu().numpy() * 255).astype(np.uint8) for img in images]\n",
140 | "\n",
141 | "\n",
142 | " kwargs = {\n",
143 | " 'fps': int(fps),\n",
144 | " 'quality': int(quality),\n",
145 | " 'lossless': bool(lossless),\n",
146 | " 'method': int(method)\n",
147 | " }\n",
148 | "\n",
149 | " with imageio.get_writer(\n",
150 | " output_path,\n",
151 | " format='WEBP',\n",
152 | " mode='I',\n",
153 | " **kwargs\n",
154 | " ) as writer:\n",
155 | " for frame in frames:\n",
156 | " writer.append_data(frame)\n",
157 | "\n",
158 | " return output_path\n",
159 | "\n",
160 | "def save_as_webm(images, filename_prefix, fps, codec=\"vp9\", quality=32, output_dir=\"/content/ComfyUI/output\"):\n",
161 | " \"\"\"Save images as WEBM using imageio.\"\"\"\n",
162 | " os.makedirs(output_dir, exist_ok=True)\n",
163 | " output_path = f\"{output_dir}/{filename_prefix}.webm\"\n",
164 | "\n",
165 | "\n",
166 | " frames = [(img.cpu().numpy() * 255).astype(np.uint8) for img in images]\n",
167 | "\n",
168 | "\n",
169 | " kwargs = {\n",
170 | " 'fps': int(fps),\n",
171 | " 'quality': int(quality),\n",
172 | " 'codec': str(codec),\n",
173 | " 'output_params': ['-crf', str(int(quality))]\n",
174 | " }\n",
175 | "\n",
176 | " with imageio.get_writer(\n",
177 | " output_path,\n",
178 | " format='FFMPEG',\n",
179 | " mode='I',\n",
180 | " **kwargs\n",
181 | " ) as writer:\n",
182 | " for frame in frames:\n",
183 | " writer.append_data(frame)\n",
184 | "\n",
185 | " return output_path\n",
186 | "\n",
187 | "def save_as_image(image, filename_prefix, output_dir=\"/content/ComfyUI/output\"):\n",
188 | " \"\"\"Save single frame as PNG image.\"\"\"\n",
189 | " os.makedirs(output_dir, exist_ok=True)\n",
190 | " output_path = f\"{output_dir}/{filename_prefix}.png\"\n",
191 | "\n",
192 | " frame = (image.cpu().numpy() * 255).astype(np.uint8)\n",
193 | "\n",
194 | " Image.fromarray(frame).save(output_path)\n",
195 | "\n",
196 | " return output_path\n",
197 | "\n",
198 | "def generate_video(\n",
199 | " positive_prompt: str = \"a fox moving quickly in a beautiful winter scenery nature trees mountains daytime tracking camera\",\n",
200 | " negative_prompt: str = \"色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走\",\n",
201 | " width: int = 832,\n",
202 | " height: int = 480,\n",
203 | " seed: int = 82628696717253,\n",
204 | " steps: int = 30,\n",
205 | " cfg_scale: float = 1.0,\n",
206 | " sampler_name: str = \"uni_pc\",\n",
207 | " scheduler: str = \"simple\",\n",
208 | " frames: int = 33,\n",
209 | " fps: int = 16,\n",
210 | " output_format: str = \"mp4\"\n",
211 | "):\n",
212 | "\n",
213 | " with torch.inference_mode():\n",
214 | " print(\"Loading Text_Encoder...\")\n",
215 | " clip = clip_loader.load_clip(\"umt5_xxl_fp8_e4m3fn_scaled.safetensors\", \"wan\", \"default\")[0]\n",
216 | "\n",
217 | " positive = clip_encode_positive.encode(clip, positive_prompt)[0]\n",
218 | " negative = clip_encode_negative.encode(clip, negative_prompt)[0]\n",
219 | "\n",
220 | " del clip\n",
221 | " torch.cuda.empty_cache()\n",
222 | " gc.collect()\n",
223 | "\n",
224 | " empty_latent = empty_latent_video.generate(width, height, frames, 1)[0]\n",
225 | "\n",
226 | " print(\"Loading Unet Model...\")\n",
227 | " if useQ6:\n",
228 | " model = unet_loader.load_unet(\"wan2.1-t2v-14b-Q6_K.gguf\")[0]\n",
229 | " else:\n",
230 | " model = unet_loader.load_unet(\"wan2.1-t2v-14b-Q5_0.gguf\")[0]\n",
231 | " # model = model_sampling.patch(model, 8)[0]\n",
232 | "\n",
233 | " print(\"Generating video...\")\n",
234 | " sampled = ksampler.sample(\n",
235 | " model=model,\n",
236 | " seed=seed,\n",
237 | " steps=steps,\n",
238 | " cfg=cfg_scale,\n",
239 | " sampler_name=sampler_name,\n",
240 | " scheduler=scheduler,\n",
241 | " positive=positive,\n",
242 | " negative=negative,\n",
243 | " latent_image=empty_latent\n",
244 | " )[0]\n",
245 | "\n",
246 | " del model\n",
247 | " torch.cuda.empty_cache()\n",
248 | " gc.collect()\n",
249 | "\n",
250 | " print(\"Loading VAE...\")\n",
251 | " vae = vae_loader.load_vae(\"wan_2.1_vae.safetensors\")[0]\n",
252 | "\n",
253 | " try:\n",
254 | " print(\"Decoding latents...\")\n",
255 | " decoded = vae_decode.decode(vae, sampled)[0]\n",
256 | "\n",
257 | " del vae\n",
258 | " torch.cuda.empty_cache()\n",
259 | " gc.collect()\n",
260 | "\n",
261 | " output_path = \"\"\n",
262 | " if frames == 1:\n",
263 | " print(\"Single frame detected - saving as PNG image...\")\n",
264 | " output_path = save_as_image(decoded[0], \"ComfyUI\")\n",
265 | " # print(f\"Image saved as PNG: {output_path}\")\n",
266 | "\n",
267 | " display(IPImage(filename=output_path))\n",
268 | " else:\n",
269 | " if output_format.lower() == \"webm\":\n",
270 | " print(\"Saving as WEBM...\")\n",
271 | " output_path = save_as_webm(\n",
272 | " decoded,\n",
273 | " \"ComfyUI\",\n",
274 | " fps=fps,\n",
275 | " codec=\"vp9\",\n",
276 | " quality=10\n",
277 | " )\n",
278 | " elif output_format.lower() == \"mp4\":\n",
279 | " print(\"Saving as MP4...\")\n",
280 | " output_path = save_as_mp4(decoded, \"ComfyUI\", fps)\n",
281 | " else:\n",
282 | " raise ValueError(f\"Unsupported output format: {output_format}\")\n",
283 | "\n",
284 | " # print(f\"Video saved as {output_format.upper()}: {output_path}\")\n",
285 | "\n",
286 | " display_video(output_path)\n",
287 | "\n",
288 | " except Exception as e:\n",
289 | " print(f\"Error during decoding/saving: {str(e)}\")\n",
290 | " raise\n",
291 | " finally:\n",
292 | " clear_memory()\n",
293 | "\n",
294 | "def display_video(video_path):\n",
295 | " from IPython.display import HTML\n",
296 | " from base64 import b64encode\n",
297 | "\n",
298 | " video_data = open(video_path,'rb').read()\n",
299 | "\n",
300 | " # Determine MIME type based on file extension\n",
301 | " if video_path.lower().endswith('.mp4'):\n",
302 | " mime_type = \"video/mp4\"\n",
303 | " elif video_path.lower().endswith('.webm'):\n",
304 | " mime_type = \"video/webm\"\n",
305 | " elif video_path.lower().endswith('.webp'):\n",
306 | " mime_type = \"image/webp\"\n",
307 | " else:\n",
308 | " mime_type = \"video/mp4\" # default\n",
309 | "\n",
310 | " data_url = f\"data:{mime_type};base64,\" + b64encode(video_data).decode()\n",
311 | "\n",
312 | " display(HTML(f\"\"\"\n",
313 | " \n",
316 | " \"\"\"))\n",
317 | "\n",
318 | "print(\"✅ Environment Setup Complete!\")\n",
319 | "\n"
320 | ],
321 | "metadata": {
322 | "id": "rrXFIT4fMfyJ",
323 | "cellView": "form"
324 | },
325 | "execution_count": null,
326 | "outputs": []
327 | },
328 | {
329 | "cell_type": "code",
330 | "source": [
331 | "# @title Generate Video/Image\n",
332 | "\n",
333 | "positive_prompt = \"Close up view of a stunning, confident girl with curvaceous, wide hips, medium-weight figure, and a beautiful face walking with her partner down an empty street.\" # @param {\"type\":\"string\"}\n",
334 | "negative_prompt = \"色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走\" # @param {\"type\":\"string\"}\n",
335 | "width = 832 # @param {\"type\":\"number\"}\n",
336 | "height = 480 # @param {\"type\":\"number\"}\n",
337 | "seed = 82628696717258 # @param {\"type\":\"integer\"}\n",
338 | "steps = 20 # @param {\"type\":\"integer\", \"min\":1, \"max\":100}\n",
339 | "cfg_scale = 3 # @param {\"type\":\"number\", \"min\":1, \"max\":20}\n",
340 | "sampler_name = \"uni_pc\" # @param [\"uni_pc\", \"euler\", \"dpmpp_2m\", \"ddim\", \"lms\"]\n",
341 | "scheduler = \"simple\" # @param [\"simple\", \"normal\", \"karras\", \"exponential\"]\n",
342 | "frames = 24 # @param {\"type\":\"integer\", \"min\":1, \"max\":120}\n",
343 | "fps = 16 # @param {\"type\":\"integer\", \"min\":1, \"max\":60}\n",
344 | "output_format = \"mp4\" # @param [\"mp4\", \"webm\"]\n",
345 | "\n",
346 | "# with torch.inference_mode():\n",
347 | "generate_video(\n",
348 | " positive_prompt=positive_prompt,\n",
349 | " negative_prompt=negative_prompt,\n",
350 | " width=width,\n",
351 | " height=height,\n",
352 | " seed=seed,\n",
353 | " steps=steps,\n",
354 | " cfg_scale=cfg_scale,\n",
355 | " sampler_name=sampler_name,\n",
356 | " scheduler=scheduler,\n",
357 | " frames=frames,\n",
358 | " fps=fps,\n",
359 | " output_format=output_format\n",
360 | ")\n",
361 | "clear_memory()"
362 | ],
363 | "metadata": {
364 | "cellView": "form",
365 | "id": "roC59_oNNflb"
366 | },
367 | "execution_count": null,
368 | "outputs": []
369 | }
370 | ]
371 | }
--------------------------------------------------------------------------------
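The `frames` parameter above feeds EmptyHunyuanLatentVideo, which compresses frames roughly 4x in time and 8x spatially before sampling. A small sketch of that shape arithmetic, assuming the 16-channel, 8x-spatial / 4x-temporal layout used by the Wan 2.1 VAE in the ComfyUI fork these notebooks clone:

def wan_latent_shape(width, height, frames, batch=1, channels=16):
    # Frame counts of the form 4n + 1 (33, 49, 81, ...) map exactly onto whole latent steps.
    return (batch, channels, (frames - 1) // 4 + 1, height // 8, width // 8)

print(wan_latent_shape(832, 480, 33))  # (1, 16, 9, 60, 104)
print(wan_latent_shape(832, 480, 1))   # (1, 16, 1, 60, 104) -> the single-image case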
/Wan2_1_1_3B_T2V_Free.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "nbformat": 4,
3 | "nbformat_minor": 0,
4 | "metadata": {
5 | "colab": {
6 | "provenance": [],
7 | "gpuType": "T4"
8 | },
9 | "kernelspec": {
10 | "name": "python3",
11 | "display_name": "Python 3"
12 | },
13 | "language_info": {
14 | "name": "python"
15 | },
16 | "accelerator": "GPU"
17 | },
18 | "cells": [
19 | {
20 | "cell_type": "markdown",
21 | "source": [
22 | "# **WAN2.1 1.3B Text to Video**"
23 | ],
24 | "metadata": {
25 | "id": "f4p1ysFKMbs_"
26 | }
27 | },
28 | {
29 | "cell_type": "markdown",
30 | "source": [
31 | "- You can use the free T4 GPU to run this. For faster video generation, use higher GPUs.\n",
32 | "- Generating a video with 81 frames at 832 by 480 resolution can take up to 31 minutes on the T4 GPU.\n",
33 | "- Set `frames` to 1 to generate an image."
34 | ],
35 | "metadata": {
36 | "id": "EBB00lC6q-DA"
37 | }
38 | },
39 | {
40 | "cell_type": "code",
41 | "source": [
42 | "# @title Prepare Environment\n",
43 | "!pip install --upgrade --quiet torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118\n",
44 | "%cd /content\n",
45 | "\n",
46 | "!pip install -q torchsde einops diffusers accelerate xformers\n",
47 | "!pip install av\n",
48 | "!git clone https://github.com/Isi-dev/ComfyUI\n",
49 | "%cd /content/ComfyUI\n",
50 | "!apt -y install -qq aria2 ffmpeg\n",
51 | "\n",
52 | "!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/resolve/main/split_files/diffusion_models/wan2.1_t2v_1.3B_fp16.safetensors -d /content/ComfyUI/models/diffusion_models -o wan2.1_t2v_1.3B_fp16.safetensors\n",
53 | "!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/resolve/main/split_files/text_encoders/umt5_xxl_fp8_e4m3fn_scaled.safetensors -d /content/ComfyUI/models/text_encoders -o umt5_xxl_fp8_e4m3fn_scaled.safetensors\n",
54 | "!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/resolve/main/split_files/vae/wan_2.1_vae.safetensors -d /content/ComfyUI/models/vae -o wan_2.1_vae.safetensors\n",
55 | "\n",
56 | "import torch\n",
57 | "import numpy as np\n",
58 | "from PIL import Image\n",
59 | "import gc\n",
60 | "import sys\n",
61 | "import random\n",
62 | "import os\n",
63 | "import imageio\n",
64 | "import subprocess\n",
65 | "from google.colab import files\n",
66 | "from IPython.display import display, HTML, Image as IPImage\n",
67 | "sys.path.insert(0, '/content/ComfyUI')\n",
68 | "\n",
69 | "from comfy import model_management\n",
70 | "\n",
71 | "from nodes import (\n",
72 | " CheckpointLoaderSimple,\n",
73 | " CLIPLoader,\n",
74 | " CLIPTextEncode,\n",
75 | " VAEDecode,\n",
76 | " VAELoader,\n",
77 | " KSampler,\n",
78 | " UNETLoader\n",
79 | ")\n",
80 | "\n",
81 | "from comfy_extras.nodes_model_advanced import ModelSamplingSD3\n",
82 | "from comfy_extras.nodes_hunyuan import EmptyHunyuanLatentVideo\n",
83 | "from comfy_extras.nodes_images import SaveAnimatedWEBP\n",
84 | "from comfy_extras.nodes_video import SaveWEBM\n",
85 | "\n",
86 | "unet_loader = UNETLoader()\n",
87 | "model_sampling = ModelSamplingSD3()\n",
88 | "clip_loader = CLIPLoader()\n",
89 | "clip_encode_positive = CLIPTextEncode()\n",
90 | "clip_encode_negative = CLIPTextEncode()\n",
91 | "vae_loader = VAELoader()\n",
92 | "empty_latent_video = EmptyHunyuanLatentVideo()\n",
93 | "ksampler = KSampler()\n",
94 | "vae_decode = VAEDecode()\n",
95 | "save_webp = SaveAnimatedWEBP()\n",
96 | "save_webm = SaveWEBM()\n",
97 | "\n",
98 | "def clear_memory():\n",
99 | " gc.collect()\n",
100 | " if torch.cuda.is_available():\n",
101 | " torch.cuda.empty_cache()\n",
102 | " torch.cuda.ipc_collect()\n",
103 | " for obj in list(globals().values()):\n",
104 | " if torch.is_tensor(obj) or (hasattr(obj, \"data\") and torch.is_tensor(obj.data)):\n",
105 | " del obj\n",
106 | " gc.collect()\n",
107 | "\n",
108 | "def save_as_mp4(images, filename_prefix, fps, output_dir=\"/content/ComfyUI/output\"):\n",
109 | " os.makedirs(output_dir, exist_ok=True)\n",
110 | " output_path = f\"{output_dir}/{filename_prefix}.mp4\"\n",
111 | "\n",
112 | " frames = [(img.cpu().numpy() * 255).astype(np.uint8) for img in images]\n",
113 | "\n",
114 | " with imageio.get_writer(output_path, fps=fps) as writer:\n",
115 | " for frame in frames:\n",
116 | " writer.append_data(frame)\n",
117 | "\n",
118 | " return output_path\n",
119 | "\n",
120 | "def save_as_webp(images, filename_prefix, fps, quality=90, lossless=False, method=4, output_dir=\"/content/ComfyUI/output\"):\n",
121 | " \"\"\"Save images as animated WEBP using imageio.\"\"\"\n",
122 | " os.makedirs(output_dir, exist_ok=True)\n",
123 | " output_path = f\"{output_dir}/{filename_prefix}.webp\"\n",
124 | "\n",
125 | "\n",
126 | " frames = [(img.cpu().numpy() * 255).astype(np.uint8) for img in images]\n",
127 | "\n",
128 | "\n",
129 | " kwargs = {\n",
130 | " 'fps': int(fps),\n",
131 | " 'quality': int(quality),\n",
132 | " 'lossless': bool(lossless),\n",
133 | " 'method': int(method)\n",
134 | " }\n",
135 | "\n",
136 | " with imageio.get_writer(\n",
137 | " output_path,\n",
138 | " format='WEBP',\n",
139 | " mode='I',\n",
140 | " **kwargs\n",
141 | " ) as writer:\n",
142 | " for frame in frames:\n",
143 | " writer.append_data(frame)\n",
144 | "\n",
145 | " return output_path\n",
146 | "\n",
147 | "def save_as_webm(images, filename_prefix, fps, codec=\"vp9\", quality=32, output_dir=\"/content/ComfyUI/output\"):\n",
148 | " \"\"\"Save images as WEBM using imageio.\"\"\"\n",
149 | " os.makedirs(output_dir, exist_ok=True)\n",
150 | " output_path = f\"{output_dir}/{filename_prefix}.webm\"\n",
151 | "\n",
152 | "\n",
153 | " frames = [(img.cpu().numpy() * 255).astype(np.uint8) for img in images]\n",
154 | "\n",
155 | "\n",
156 | " kwargs = {\n",
157 | " 'fps': int(fps),\n",
158 | " 'quality': int(quality),\n",
159 | " 'codec': str(codec),\n",
160 | " 'output_params': ['-crf', str(int(quality))]\n",
161 | " }\n",
162 | "\n",
163 | " with imageio.get_writer(\n",
164 | " output_path,\n",
165 | " format='FFMPEG',\n",
166 | " mode='I',\n",
167 | " **kwargs\n",
168 | " ) as writer:\n",
169 | " for frame in frames:\n",
170 | " writer.append_data(frame)\n",
171 | "\n",
172 | " return output_path\n",
173 | "\n",
174 | "def save_as_image(image, filename_prefix, output_dir=\"/content/ComfyUI/output\"):\n",
175 | " \"\"\"Save single frame as PNG image.\"\"\"\n",
176 | " os.makedirs(output_dir, exist_ok=True)\n",
177 | " output_path = f\"{output_dir}/{filename_prefix}.png\"\n",
178 | "\n",
179 | " frame = (image.cpu().numpy() * 255).astype(np.uint8)\n",
180 | "\n",
181 | " Image.fromarray(frame).save(output_path)\n",
182 | "\n",
183 | " return output_path\n",
184 | "\n",
185 | "def generate_video(\n",
186 | " positive_prompt: str = \"a fox moving quickly in a beautiful winter scenery nature trees mountains daytime tracking camera\",\n",
187 | " negative_prompt: str = \"色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走\",\n",
188 | " width: int = 832,\n",
189 | " height: int = 480,\n",
190 | " seed: int = 82628696717253,\n",
191 | " steps: int = 30,\n",
192 | " cfg_scale: float = 1.0,\n",
193 | " sampler_name: str = \"uni_pc\",\n",
194 | " scheduler: str = \"simple\",\n",
195 | " frames: int = 33,\n",
196 | " fps: int = 16,\n",
197 | " output_format: str = \"mp4\"\n",
198 | "):\n",
199 | "\n",
200 | " with torch.inference_mode():\n",
201 | " print(\"Loading Text_Encoder...\")\n",
202 | " clip = clip_loader.load_clip(\"umt5_xxl_fp8_e4m3fn_scaled.safetensors\", \"wan\", \"default\")[0]\n",
203 | "\n",
204 | " positive = clip_encode_positive.encode(clip, positive_prompt)[0]\n",
205 | " negative = clip_encode_negative.encode(clip, negative_prompt)[0]\n",
206 | "\n",
207 | " del clip\n",
208 | " torch.cuda.empty_cache()\n",
209 | " gc.collect()\n",
210 | "\n",
211 | " empty_latent = empty_latent_video.generate(width, height, frames, 1)[0]\n",
212 | "\n",
213 | " print(\"Loading Unet Model...\")\n",
214 | " model = unet_loader.load_unet(\"wan2.1_t2v_1.3B_fp16.safetensors\", \"default\")[0]\n",
215 | " model = model_sampling.patch(model, 8)[0]\n",
216 | "\n",
217 | " print(\"Generating video...\")\n",
218 | " sampled = ksampler.sample(\n",
219 | " model=model,\n",
220 | " seed=seed,\n",
221 | " steps=steps,\n",
222 | " cfg=cfg_scale,\n",
223 | " sampler_name=sampler_name,\n",
224 | " scheduler=scheduler,\n",
225 | " positive=positive,\n",
226 | " negative=negative,\n",
227 | " latent_image=empty_latent\n",
228 | " )[0]\n",
229 | "\n",
230 | " del model\n",
231 | " torch.cuda.empty_cache()\n",
232 | " gc.collect()\n",
233 | "\n",
234 | " print(\"Loading VAE...\")\n",
235 | " vae = vae_loader.load_vae(\"wan_2.1_vae.safetensors\")[0]\n",
236 | "\n",
237 | " try:\n",
238 | " print(\"Decoding latents...\")\n",
239 | " decoded = vae_decode.decode(vae, sampled)[0]\n",
240 | "\n",
241 | " del vae\n",
242 | " torch.cuda.empty_cache()\n",
243 | " gc.collect()\n",
244 | "\n",
245 | " output_path = \"\"\n",
246 | " if frames == 1:\n",
247 | " # Single frame - save as image\n",
248 | " print(\"Single frame detected - saving as PNG image...\")\n",
249 | " output_path = save_as_image(decoded[0], \"ComfyUI\")\n",
250 | " # print(f\"Image saved as PNG: {output_path}\")\n",
251 | "\n",
252 | " # Display the image\n",
253 | " display(IPImage(filename=output_path))\n",
254 | " else:\n",
255 | " # # Multiple frames - save as video\n",
256 | " # if output_format.lower() == \"webp\":\n",
257 | " # print(\"Saving as WEBP...\")\n",
258 | " # save_webp.save_images(\n",
259 | " # images=decoded,\n",
260 | " # fps=fps,\n",
261 | " # filename_prefix=\"ComfyUI\",\n",
262 | " # lossless=False,\n",
263 | " # quality=90,\n",
264 | " # method=\"default\"\n",
265 | " # )\n",
266 | " # output_path = \"/content/ComfyUI/output/ComfyUI.webp\"\n",
267 | " # elif output_format.lower() == \"webm\":\n",
268 | " # print(\"Saving as WEBM...\")\n",
269 | " # save_webm.save_images(\n",
270 | " # images=decoded,\n",
271 | " # codec=\"vp9\",\n",
272 | " # fps=fps,\n",
273 | " # filename_prefix=\"ComfyUI\",\n",
274 | " # crf=32\n",
275 | " # )\n",
276 | " # output_path = \"/content/ComfyUI/output/ComfyUI.webm\"\n",
277 | " if output_format.lower() == \"webm\":\n",
278 | " print(\"Saving as WEBM...\")\n",
279 | " output_path = save_as_webm(\n",
280 | " decoded,\n",
281 | " \"ComfyUI\",\n",
282 | " fps=fps,\n",
283 | " codec=\"vp9\",\n",
284 | " quality=10\n",
285 | " )\n",
286 | " elif output_format.lower() == \"mp4\":\n",
287 | " print(\"Saving as MP4...\")\n",
288 | " output_path = save_as_mp4(decoded, \"ComfyUI\", fps)\n",
289 | " else:\n",
290 | " raise ValueError(f\"Unsupported output format: {output_format}\")\n",
291 | "\n",
292 | " # print(f\"Video saved as {output_format.upper()}: {output_path}\")\n",
293 | "\n",
294 | " display_video(output_path)\n",
295 | "\n",
296 | " except Exception as e:\n",
297 | " print(f\"Error during decoding/saving: {str(e)}\")\n",
298 | " raise\n",
299 | " finally:\n",
300 | " clear_memory()\n",
301 | "\n",
302 | "def display_video(video_path):\n",
303 | " from IPython.display import HTML\n",
304 | " from base64 import b64encode\n",
305 | "\n",
306 | " video_data = open(video_path,'rb').read()\n",
307 | "\n",
308 | " # Determine MIME type based on file extension\n",
309 | " if video_path.lower().endswith('.mp4'):\n",
310 | " mime_type = \"video/mp4\"\n",
311 | " elif video_path.lower().endswith('.webm'):\n",
312 | " mime_type = \"video/webm\"\n",
313 | " elif video_path.lower().endswith('.webp'):\n",
314 | " mime_type = \"image/webp\"\n",
315 | " else:\n",
316 | " mime_type = \"video/mp4\" # default\n",
317 | "\n",
318 | " data_url = f\"data:{mime_type};base64,\" + b64encode(video_data).decode()\n",
319 | "\n",
320 | " display(HTML(f\"\"\"\n",
321 | " \n",
324 | " \"\"\"))\n",
325 | "\n",
326 | "print(\"✅ Environment Setup Complete!\")\n",
327 | "\n"
328 | ],
329 | "metadata": {
330 | "cellView": "form",
331 | "id": "rrXFIT4fMfyJ"
332 | },
333 | "execution_count": null,
334 | "outputs": []
335 | },
336 | {
337 | "cell_type": "code",
338 | "source": [
339 | "# @title Generate Video/Image\n",
340 | "positive_prompt = \"a fox moving quickly in a beautiful winter scenery nature trees mountains daytime tracking camera\" # @param {\"type\":\"string\"}\n",
341 | "negative_prompt = \"色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走\" # @param {\"type\":\"string\"}\n",
342 | "width = 832 # @param {\"type\":\"number\"}\n",
343 | "height = 480 # @param {\"type\":\"number\"}\n",
344 | "seed = 82628696717253 # @param {\"type\":\"integer\"}\n",
345 | "steps = 30 # @param {\"type\":\"integer\", \"min\":1, \"max\":100}\n",
346 | "cfg_scale = 3 # @param {\"type\":\"number\", \"min\":1, \"max\":20}\n",
347 | "sampler_name = \"uni_pc\" # @param [\"uni_pc\", \"euler\", \"dpmpp_2m\", \"ddim\", \"lms\"]\n",
348 | "scheduler = \"simple\" # @param [\"simple\", \"normal\", \"karras\", \"exponential\"]\n",
349 | "frames = 33 # @param {\"type\":\"integer\", \"min\":1, \"max\":120}\n",
350 | "fps = 16 # @param {\"type\":\"integer\", \"min\":1, \"max\":60}\n",
351 | "output_format = \"mp4\" # @param [\"mp4\", \"webm\"]\n",
352 | "\n",
353 | "# with torch.inference_mode():\n",
354 | "generate_video(\n",
355 | " positive_prompt=positive_prompt,\n",
356 | " negative_prompt=negative_prompt,\n",
357 | " width=width,\n",
358 | " height=height,\n",
359 | " seed=seed,\n",
360 | " steps=steps,\n",
361 | " cfg_scale=cfg_scale,\n",
362 | " sampler_name=sampler_name,\n",
363 | " scheduler=scheduler,\n",
364 | " frames=frames,\n",
365 | " fps=fps,\n",
366 | " output_format=output_format\n",
367 | ")\n",
368 | "clear_memory()"
369 | ],
370 | "metadata": {
371 | "cellView": "form",
372 | "id": "roC59_oNNflb"
373 | },
374 | "execution_count": null,
375 | "outputs": []
376 | }
377 | ]
378 | }
--------------------------------------------------------------------------------
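Every save_as_* helper in these notebooks writes to /content/ComfyUI/output with the fixed prefix "ComfyUI", so each new run overwrites the previous result. A small sketch for copying a result out of the Colab runtime before re-running; the destination name is just an illustrative choice:

import shutil
from google.colab import files

src = "/content/ComfyUI/output/ComfyUI.mp4"   # or ComfyUI.png / ComfyUI.webm, depending on the run
dst = "/content/ComfyUI/output/fox_33frames_seed82628696717253.mp4"  # hypothetical unique name
shutil.copy(src, dst)
files.download(dst)   # triggers a browser download from the Colab runtime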
/Wan2_1_I2V_14B.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "nbformat": 4,
3 | "nbformat_minor": 0,
4 | "metadata": {
5 | "colab": {
6 | "provenance": [],
7 | "gpuType": "A100"
8 | },
9 | "kernelspec": {
10 | "name": "python3",
11 | "display_name": "Python 3"
12 | },
13 | "language_info": {
14 | "name": "python"
15 | },
16 | "accelerator": "GPU"
17 | },
18 | "cells": [
19 | {
20 | "cell_type": "markdown",
21 | "source": [
22 | "**PREPARE ENVIRONMENT**"
23 | ],
24 | "metadata": {
25 | "id": "WkDRjQexxNJL"
26 | }
27 | },
28 | {
29 | "cell_type": "markdown",
30 | "source": [
31 | "- You can set **parameters_in_vram** to 10 or less to reduce the VRAM used, especially if you want to generate a video with a resolution greater than 480 by 832 or 832 by 480. The VRAM used depends on the value of **parameters_in_vram** and the resolutions of the input image & output video.\n",
32 | "- Setting **parameters_in_vram** to 15 with a video resolution of 480 by 832 will result in a generation time of approximately 10 minutes. Lower values of **parameters_in_vram** will result in longer generation times. Higher values can reduce the video generation time, but increase the risk of getting **Out of Memory Error**s."
33 | ],
34 | "metadata": {
35 | "id": "wC_tqstYe52w"
36 | }
37 | },
38 | {
39 | "cell_type": "code",
40 | "execution_count": null,
41 | "metadata": {
42 | "collapsed": true,
43 | "cellView": "form",
44 | "id": "X2XHqPAaJSJl"
45 | },
46 | "outputs": [],
47 | "source": [
48 | "# @title\n",
49 | "!git clone https://github.com/Isi-dev/DiffSynth-Studio.git\n",
50 | "%cd DiffSynth-Studio\n",
51 | "!pip install -e .\n",
52 | "!pip install \"huggingface_hub[cli]\"\n",
53 | "!apt-get install -y aria2\n",
54 | "import os\n",
55 | "from huggingface_hub import list_repo_files\n",
56 | "\n",
57 | "repo_id = \"Isi99999/Wan2.1-I2V-14B-480P\"\n",
58 | "all_files = list_repo_files(repo_id)\n",
59 | "base_url = f\"https://huggingface.co/{repo_id}/resolve/main/\"\n",
60 | "\n",
61 | "with open(\"file_list.txt\", \"w\") as f:\n",
62 | " for file_path in all_files:\n",
63 | " full_url = f\"{base_url}{file_path}\"\n",
64 | " save_path = f\"models/Wan-AI/Wan2.1-I2V-14B-480P/{file_path}\"\n",
65 | " os.makedirs(os.path.dirname(save_path), exist_ok=True)\n",
66 | " f.write(f\"{full_url}\\n out={save_path}\\n\")\n",
67 | "!aria2c -x 16 -s 16 -i file_list.txt --continue=true --auto-file-renaming=false\n",
68 | "\n",
69 | "print(\"✅ All models downloaded successfully!\")\n",
70 | "\n",
71 | "import torch\n",
72 | "from diffsynth import ModelManager, WanVideoPipeline, VideoData, save_video\n",
73 | "\n",
74 | "model_manager = ModelManager(device=\"cpu\")\n",
75 | "model_manager.load_models(\n",
76 | " [\"models/Wan-AI/Wan2.1-I2V-14B-480P/models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth\"],\n",
77 | " torch_dtype=torch.float16, # Image Encoder is loaded with float16\n",
78 | ")\n",
79 | "model_manager.load_models(\n",
80 | " [\n",
81 | " \"models/Wan-AI/Wan2.1-I2V-14B-480P/diffusion_pytorch_model.safetensors\",\n",
82 | " \"models/Wan-AI/Wan2.1-I2V-14B-480P/models_t5_umt5-xxl-enc-bf16.safetensors\",\n",
83 | " \"models/Wan-AI/Wan2.1-I2V-14B-480P/Wan2.1_VAE.safetensors\",\n",
84 | " ],\n",
85 | " torch_dtype=torch.torch.bfloat16, # You can set `torch_dtype=torch.float8_e4m3fn` or `torch_dtype=torch.bfloat16` to disable FP8 quantization.\n",
86 | ")\n",
87 | "\n",
88 | "pipe = WanVideoPipeline.from_model_manager(model_manager, torch_dtype=torch.bfloat16, device=\"cuda\")\n",
89 | "parameters_in_vram = 14 # @param {\"type\":\"number\"}\n",
90 | "pipe.enable_vram_management(num_persistent_param_in_dit=parameters_in_vram*10**9) # You can set `num_persistent_param_in_dit` to a small number to reduce VRAM required.\n",
91 | "print(\"✅ All models loaded successfully!\")"
92 | ]
93 | },
94 | {
95 | "cell_type": "markdown",
96 | "source": [
97 | "**RUN TO UPLOAD IMAGE**"
98 | ],
99 | "metadata": {
100 | "id": "v6g0SqdGaKAr"
101 | }
102 | },
103 | {
104 | "cell_type": "code",
105 | "source": [
106 | "# @title\n",
107 | "from google.colab import files\n",
108 | "from PIL import Image\n",
109 | "\n",
110 | "uploaded = files.upload()\n",
111 | "image_path = list(uploaded.keys())[0]\n",
112 | "image = Image.open(image_path)\n",
113 | "\n",
114 | "print(\"✅Image loaded successfully:\", image.size)\n",
115 | "\n"
116 | ],
117 | "metadata": {
118 | "cellView": "form",
119 | "id": "_jBdEI7BS7cL"
120 | },
121 | "execution_count": null,
122 | "outputs": []
123 | },
124 | {
125 | "cell_type": "markdown",
126 | "source": [
127 | "**RUN IMAGE TO VIDEO**"
128 | ],
129 | "metadata": {
130 | "id": "S3zCgOnBaO4h"
131 | }
132 | },
133 | {
134 | "cell_type": "code",
135 | "source": [
136 | "\n",
137 | "\n",
138 | "prompt = \"The lady smiles and waves at the camera.\" # @param {type:\"string\"}\n",
139 | "sample_steps = 30 # @param {\"type\":\"number\"}\n",
140 | "Instruction = \"choose from '720*1280', '1280*720', '480*832', '832*480', '1024*1024 for output video's width & height.\" # @param {\"type\":\"string\"}\n",
141 | "width = 480 # @param {\"type\":\"number\"}\n",
142 | "height = 832 # @param {\"type\":\"number\"}\n",
143 | "num_frames = 81 # @param {\"type\":\"number\"}\n",
144 | "seed = 1 # @param {\"type\":\"number\"}\n",
145 | "\n",
146 | "# Generate video from text prompt and Image\n",
147 | "video = pipe(\n",
148 | " prompt=prompt,\n",
149 | " negative_prompt=\"色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走\",\n",
150 | " input_image=image,\n",
151 | " height = height,\n",
152 | " width = width,\n",
153 | " num_frames=num_frames,\n",
154 | " num_inference_steps=sample_steps,\n",
155 | " seed=seed, tiled=True\n",
156 | ")\n",
157 | "\n",
158 | "# # Save the generated video\n",
159 | "save_video(video, \"video.mp4\", fps=15, quality=5)\n",
160 | "\n",
161 | "from IPython.display import display as displayVid, Video as outVid\n",
162 | "import os\n",
163 | "\n",
164 | "# Function to display video\n",
165 | "def show_video(video_path):\n",
166 | " if os.path.exists(video_path):\n",
167 | " displayVid(outVid(video_path, embed=True))\n",
168 | " else:\n",
169 | " print(f\"Error: {video_path} not found!\")\n",
170 | "\n",
171 | "# Show the video\n",
172 | "show_video(\"video.mp4\")"
173 | ],
174 | "metadata": {
175 | "cellView": "form",
176 | "id": "6CyP7yjoMVn9"
177 | },
178 | "execution_count": null,
179 | "outputs": []
180 | }
181 | ]
182 | }
--------------------------------------------------------------------------------
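The parameters_in_vram form field above is multiplied by 10**9 and passed to enable_vram_management as the number of DiT parameters kept resident on the GPU. A rough back-of-the-envelope sketch of what that means in gigabytes, assuming bf16 weights at 2 bytes per parameter; activations, the text encoder, the VAE and CUDA overhead come on top of this figure:

def persistent_vram_gb(parameters_in_vram, bytes_per_param=2):
    # parameters_in_vram is in billions of parameters, matching the form field above.
    return parameters_in_vram * 1e9 * bytes_per_param / 1024**3

print(f"{persistent_vram_gb(14):.1f} GB")  # ~26.1 GB resident with the default of 14
print(f"{persistent_vram_gb(10):.1f} GB")  # ~18.6 GB with the lower value suggested in the notes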
/Wan2_1_T2V_14B.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "nbformat": 4,
3 | "nbformat_minor": 0,
4 | "metadata": {
5 | "colab": {
6 | "provenance": [],
7 | "gpuType": "A100"
8 | },
9 | "kernelspec": {
10 | "name": "python3",
11 | "display_name": "Python 3"
12 | },
13 | "language_info": {
14 | "name": "python"
15 | },
16 | "accelerator": "GPU"
17 | },
18 | "cells": [
19 | {
20 | "cell_type": "markdown",
21 | "source": [
22 | "**Note**: Downloading and loading the models takes 3 minutes 30 seconds to 4 minutes.\n",
23 | "\n",
24 | "For generating a 5-second video with 30 steps:\n",
25 | "\n",
26 | "-480×832 resolution\n",
27 | "\n",
28 | "- L4 GPU: ~40 minutes 10 seconds\n",
29 | "- A100 GPU: ~10 minutes 24 seconds (faster and potentially more cost-effective)\n",
30 | "\n",
31 | "-720×1280 resolution\n",
32 | "\n",
33 | "- A100 GPU: ~34 minutes"
34 | ],
35 | "metadata": {
36 | "id": "RKfyDLMBMWqc"
37 | }
38 | },
39 | {
40 | "cell_type": "markdown",
41 | "source": [
42 | "**DOWNLOAD AND LOAD LIBRARIES & MODELS**"
43 | ],
44 | "metadata": {
45 | "id": "O7QKd93FL2aF"
46 | }
47 | },
48 | {
49 | "cell_type": "code",
50 | "execution_count": null,
51 | "metadata": {
52 | "collapsed": true,
53 | "cellView": "form",
54 | "id": "fu5UnKb8JB3Z"
55 | },
56 | "outputs": [],
57 | "source": [
58 | "# @title\n",
59 | "!git clone https://github.com/Isi-dev/DiffSynth-Studio.git\n",
60 | "%cd DiffSynth-Studio\n",
61 | "!pip install -e .\n",
62 | "!pip install \"huggingface_hub[cli]\"\n",
63 | "!apt-get install -y aria2\n",
64 | "import os\n",
65 | "from huggingface_hub import list_repo_files\n",
66 | "\n",
67 | "repo_id = \"Isi99999/Wan2.1-T2V-14B\"\n",
68 | "# repo_id = \"Wan-AI/Wan2.1-T2V-14B\"\n",
69 | "all_files = list_repo_files(repo_id)\n",
70 | "base_url = f\"https://huggingface.co/{repo_id}/resolve/main/\"\n",
71 | "\n",
72 | "with open(\"file_list.txt\", \"w\") as f:\n",
73 | " for file_path in all_files:\n",
74 | " full_url = f\"{base_url}{file_path}\"\n",
75 | " save_path = f\"models/Wan-AI/Wan2.1-T2V-14B/{file_path}\"\n",
76 | " os.makedirs(os.path.dirname(save_path), exist_ok=True)\n",
77 | " f.write(f\"{full_url}\\n out={save_path}\\n\")\n",
78 | "!aria2c -x 16 -s 16 -i file_list.txt --continue=true --auto-file-renaming=false\n",
79 | "\n",
80 | "print(\"✅ All models downloaded successfully!\")\n",
81 | "\n",
82 | "import torch\n",
83 | "from diffsynth import ModelManager, WanVideoPipeline\n",
84 | "\n",
85 | "# Initialize model manager and load the model\n",
86 | "model_manager = ModelManager(device=\"cpu\")\n",
87 | "model_manager.load_models(\n",
88 | " [\n",
89 | " \"models/Wan-AI/Wan2.1-T2V-14B/diffusion_pytorch_model.safetensors\",\n",
90 | " \"models/Wan-AI/Wan2.1-T2V-14B/models_t5_umt5-xxl-enc-bf16.safetensors\",\n",
91 | " \"models/Wan-AI/Wan2.1-T2V-14B/Wan2.1_VAE.pth\",\n",
92 | " ],\n",
93 | " torch_dtype=torch.float8_e4m3fn # You can set `torch.float8_e4m3fn` or `torch_dtype=torch.bfloat16` to disable FP8 quantization.\n",
94 | ")\n",
95 | "\n",
96 | "# Initialize the video pipeline\n",
97 | "pipe = WanVideoPipeline.from_model_manager(model_manager, torch_dtype=torch.bfloat16, device=\"cuda\")\n",
98 | "pipe.enable_vram_management(num_persistent_param_in_dit=None)\n",
99 | "print(\"✅ All models loaded successfully!\")\n",
100 | "from diffsynth import save_video\n",
101 | "from diffsynth import VideoData"
102 | ]
103 | },
104 | {
105 | "cell_type": "markdown",
106 | "source": [
107 | "**RUN TEXT TO VIDEO**"
108 | ],
109 | "metadata": {
110 | "id": "gjGfMPZWMC_s"
111 | }
112 | },
113 | {
114 | "cell_type": "code",
115 | "source": [
116 | "\n",
117 | "\n",
118 | "prompt = \"A highly detailed, realistic AI-generated portrait of a very beautiful female soldier representing China. She has long hair, a confident and friendly smile, and striking facial features. She is wearing a camouflage military uniform with an open front, revealing her huge cleavage. She holds a modern assault rifle in a relaxed yet ready position. She walks towards the camera as the camera moves back to track her movements.The background shows a slightly blurred battlefield with other soldiers in formation, creating a sense of military action. The Chinese flag is displayed on her uniform on her shoulder. The lighting is natural, with a warm and slightly cinematic tone. The image should have a sharp focus on her face and outfit while maintaining a professional military aesthetic.\" # @param {type:\"string\"}\n",
119 | "sample_steps = 30 # @param {\"type\":\"number\"}\n",
120 | "Instruction = \"choose from '720*1280', '1280*720', '480*832', '832*480', '1024*1024 for Width & Height\" # @param {\"type\":\"string\"}\n",
121 | "width = 480 # @param {\"type\":\"number\"}\n",
122 | "height = 832 # @param {\"type\":\"number\"}\n",
123 | "# num_frames = 1 # @param {\"type\":\"number\"}\n",
124 | "seed = 1 # @param {\"type\":\"number\"}\n",
125 | "\n",
126 | "# Generate video from text prompt\n",
127 | "video = pipe(\n",
128 | " prompt=prompt,\n",
129 | " negative_prompt=\"色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走\",\n",
130 | " height = height,\n",
131 | " width = width,\n",
132 | " num_frames=81,\n",
133 | " num_inference_steps=sample_steps,\n",
134 | " seed=seed, tiled=True\n",
135 | ")\n",
136 | "\n",
137 | "# # Save the generated video\n",
138 | "save_video(video, \"video1.mp4\", fps=15, quality=5)\n",
139 | "\n",
140 | "from IPython.display import display as displayVid, Video as outVid\n",
141 | "import os\n",
142 | "\n",
143 | "# Function to display video\n",
144 | "def show_video(video_path):\n",
145 | " if os.path.exists(video_path):\n",
146 | " displayVid(outVid(video_path, embed=True))\n",
147 | " else:\n",
148 | " print(f\"Error: {video_path} not found!\")\n",
149 | "\n",
150 | "# Show the video\n",
151 | "show_video(\"video1.mp4\")\n"
152 | ],
153 | "metadata": {
154 | "cellView": "form",
155 | "collapsed": true,
156 | "id": "JqWO3tpMKvXv"
157 | },
158 | "execution_count": null,
159 | "outputs": []
160 | },
161 | {
162 | "cell_type": "markdown",
163 | "source": [
164 | "**RUN TEXT TO IMAGE**"
165 | ],
166 | "metadata": {
167 | "id": "kAmUnHI3Zzaz"
168 | }
169 | },
170 | {
171 | "cell_type": "code",
172 | "source": [
173 | "\n",
174 | "\n",
175 | "prompt = \"A highly detailed, realistic AI-generated portrait of a very beautiful female soldier representing Canada. She has long hair, a confident and friendly smile, and striking facial features. She is wearing a camouflage military uniform with an open front, revealing her huge cleavage. She holds a modern assault rifle in a relaxed yet ready position. The background shows a slightly blurred battlefield with other soldiers in formation, creating a sense of military action. The Canadian flag is displayed on her uniform on her shoulder. The lighting is natural, with a warm and slightly cinematic tone. The image should have a sharp focus on her face and outfit while maintaining a professional military aesthetic.\" # @param {type:\"string\"}\n",
176 | "sample_steps = 30 # @param {\"type\":\"number\"}\n",
177 | "Instruction = \"choose from '720*1280', '1280*720', '480*832', '832*480', '1024*1024 for Width & Height\" # @param {\"type\":\"string\"}\n",
178 | "width = 720 # @param {\"type\":\"number\"}\n",
179 | "height = 1280 # @param {\"type\":\"number\"}\n",
180 | "seed = 1 # @param {\"type\":\"number\"}\n",
181 | "displayWidth = 480 # @param {\"type\":\"number\"}\n",
182 | "\n",
183 | "# Generate video from text prompt\n",
184 | "video = pipe(\n",
185 | " prompt=prompt,\n",
186 | " negative_prompt=\"色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走\",\n",
187 | " height = height,\n",
188 | " width = width,\n",
189 | " num_frames=1,\n",
190 | " num_inference_steps=sample_steps,\n",
191 | " seed=seed, tiled=True\n",
192 | ")\n",
193 | "\n",
194 | "\n",
195 | "# Save the generated video\n",
196 | "save_video(video, \"video2.mp4\", fps=15, quality=5)\n",
197 | "\n",
198 | "import cv2\n",
199 | "import os\n",
200 | "from IPython.display import display as displayVid, Image as outImg\n",
201 | "\n",
202 | "def show_image(video_path, display_width=480):\n",
203 | " if not os.path.exists(video_path):\n",
204 | " print(f\"Error: {video_path} not found!\")\n",
205 | " return\n",
206 | "\n",
207 | " cap = cv2.VideoCapture(video_path)\n",
208 | " success, frame = cap.read()\n",
209 | " cap.release()\n",
210 | "\n",
211 | " if success:\n",
212 | " image_path = \"single_frame.png\"\n",
213 | " cv2.imwrite(image_path, frame)\n",
214 | " displayVid(outImg(image_path, width=display_width))\n",
215 | " else:\n",
216 | " print(\"Error: Could not read the frame.\")\n",
217 | "\n",
218 | "show_image(\"video2.mp4\", display_width=displayWidth)\n"
219 | ],
220 | "metadata": {
221 | "cellView": "form",
222 | "id": "i7-Pg-CuZ2er"
223 | },
224 | "execution_count": null,
225 | "outputs": []
226 | }
227 | ]
228 | }
--------------------------------------------------------------------------------
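The text-to-video and text-to-image cells above keep the pipeline resident and each bind their output to a variable named video. If a long run ends in an out-of-memory error, a minimal cleanup sketch like the following can be run between generations; it only releases the Python-side references created by those cells, while the pipeline itself stays loaded:

import gc
import torch

try:
    del video          # frame list returned by the previous pipe(...) call
except NameError:
    pass
gc.collect()
torch.cuda.empty_cache()
print(f"{torch.cuda.memory_allocated() / 1024**3:.1f} GB still allocated on the GPU")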
/Wan2_1_T2V_1_3B_DiffSynth.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "nbformat": 4,
3 | "nbformat_minor": 0,
4 | "metadata": {
5 | "colab": {
6 | "provenance": [],
7 | "gpuType": "L4"
8 | },
9 | "kernelspec": {
10 | "name": "python3",
11 | "display_name": "Python 3"
12 | },
13 | "language_info": {
14 | "name": "python"
15 | },
16 | "accelerator": "GPU"
17 | },
18 | "cells": [
19 | {
20 | "cell_type": "markdown",
21 | "source": [
22 | "**Note**: You will need at least the L4 which comes with a RAM above 50GB to run this because of the text encoder checkpoint which causes a memory spike in the RAM up to 22GB during loading. This will crash the free T4 which is provided with a RAM of just 12.7GB."
23 | ],
24 | "metadata": {
25 | "id": "RXjAHic0nJjc"
26 | }
27 | },
28 | {
29 | "cell_type": "markdown",
30 | "source": [
31 | "**DOWNLOAD LIBRARIES & MODELS**"
32 | ],
33 | "metadata": {
34 | "id": "Q17ff9G1j_bL"
35 | }
36 | },
37 | {
38 | "cell_type": "code",
39 | "execution_count": null,
40 | "metadata": {
41 | "cellView": "form",
42 | "id": "9-CAvpGRJf23"
43 | },
44 | "outputs": [],
45 | "source": [
46 | "# @title\n",
47 | "!git clone https://github.com/Isi-dev/DiffSynth-Studio.git\n",
48 | "%cd DiffSynth-Studio\n",
49 | "!pip install -e .\n",
50 | "!pip install \"huggingface_hub[cli]\"\n",
51 | "!apt-get install -y aria2\n",
52 | "import os\n",
53 | "from huggingface_hub import list_repo_files\n",
54 | "\n",
55 | "repo_id = \"Wan-AI/Wan2.1-T2V-1.3B\"\n",
56 | "all_files = list_repo_files(repo_id)\n",
57 | "base_url = f\"https://huggingface.co/{repo_id}/resolve/main/\"\n",
58 | "\n",
59 | "with open(\"file_list.txt\", \"w\") as f:\n",
60 | " for file_path in all_files:\n",
61 | " full_url = f\"{base_url}{file_path}\"\n",
62 | " save_path = f\"models/Wan-AI/Wan2.1-T2V-1.3B/{file_path}\"\n",
63 | " os.makedirs(os.path.dirname(save_path), exist_ok=True)\n",
64 | " f.write(f\"{full_url}\\n out={save_path}\\n\")\n",
65 | "!aria2c -x 16 -s 16 -i file_list.txt --continue=true --auto-file-renaming=false\n",
66 | "\n",
67 | "print(\"✅ All models downloaded successfully!\")\n",
68 | "\n",
69 | "import torch\n",
70 | "from diffsynth import ModelManager, WanVideoPipeline\n",
71 | "\n",
72 | "# Initialize model manager and load the model\n",
73 | "model_manager = ModelManager(device=\"cuda\")\n",
74 | "model_manager.load_models(\n",
75 | " [\n",
76 | " \"models/Wan-AI/Wan2.1-T2V-1.3B/diffusion_pytorch_model.safetensors\",\n",
77 | " \"models/Wan-AI/Wan2.1-T2V-1.3B/models_t5_umt5-xxl-enc-bf16.pth\",\n",
78 | " \"models/Wan-AI/Wan2.1-T2V-1.3B/Wan2.1_VAE.pth\",\n",
79 | " ],\n",
80 | " torch_dtype=torch.bfloat16 # Use torch.float8_e4m3fn for FP8 quantization if needed\n",
81 | ")\n",
82 | "\n",
83 | "# Initialize the video pipeline\n",
84 | "pipe = WanVideoPipeline.from_model_manager(model_manager, torch_dtype=torch.bfloat16, device=\"cuda\")\n",
85 | "pipe.enable_vram_management(num_persistent_param_in_dit=None)\n",
86 | "print(\"✅ All models loaded successfully!\")\n",
87 | "from diffsynth import save_video\n",
88 | "from diffsynth import VideoData\n"
89 | ]
90 | },
91 | {
92 | "cell_type": "markdown",
93 | "source": [
94 | "**RUN TEXT TO VIDEO**"
95 | ],
96 | "metadata": {
97 | "id": "FpnKloqwkPOg"
98 | }
99 | },
100 | {
101 | "cell_type": "code",
102 | "source": [
103 | "prompt = \"A 25 years old blonde walking in the street.\" # @param {type:\"string\"}\n",
104 | "sample_steps = 30 # @param {\"type\":\"number\"}\n",
105 | "Instruction = \"choose from '480*832' & '832*480' for Width & Height\" # @param {\"type\":\"string\"}\n",
106 | "width = 480 # @param {\"type\":\"number\"}\n",
107 | "height = 832 # @param {\"type\":\"number\"}\n",
108 | "seed = 1 # @param {\"type\":\"number\"}\n",
109 | "\n",
110 | "# Generate video from text prompt\n",
111 | "video = pipe(\n",
112 | " prompt=prompt,\n",
113 | " negative_prompt=\"色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走\",\n",
114 | " height = height,\n",
115 | " width = width,\n",
116 | " num_inference_steps=sample_steps,\n",
117 | " seed=seed, tiled=True\n",
118 | ")\n",
119 | "\n",
120 | "# Save the generated video\n",
121 | "save_video(video, \"video1.mp4\", fps=15, quality=5)\n",
122 | "\n",
123 | "from IPython.display import display as displayVid, Video as outVid\n",
124 | "import os\n",
125 | "\n",
126 | "# Function to display video\n",
127 | "def show_video(video_path):\n",
128 | " if os.path.exists(video_path):\n",
129 | " displayVid(outVid(video_path, embed=True))\n",
130 | " else:\n",
131 | " print(f\"Error: {video_path} not found!\")\n",
132 | "\n",
133 | "# Show the video\n",
134 | "show_video(\"video1.mp4\")\n"
135 | ],
136 | "metadata": {
137 | "collapsed": true,
138 | "cellView": "form",
139 | "id": "UY1xCtz1OqSt"
140 | },
141 | "execution_count": null,
142 | "outputs": []
143 | },
144 | {
145 | "cell_type": "markdown",
146 | "source": [
147 | "**RUN TEXT TO IMAGE**"
148 | ],
149 | "metadata": {
150 | "id": "z92WKT5b-1et"
151 | }
152 | },
153 | {
154 | "cell_type": "code",
155 | "source": [
156 | "prompt = \"A highly detailed, realistic AI-generated portrait of a very beautiful female soldier representing Canada. She has long hair, a confident and friendly smile, and striking facial features. She is wearing a camouflage military uniform with an open front, revealing her huge cleavage. She holds a modern assault rifle in a relaxed yet ready position. The background shows a slightly blurred battlefield with other soldiers in formation, creating a sense of military action. The Canadian flag is displayed on her uniform on her shoulder. The lighting is natural, with a warm and slightly cinematic tone. The image should have a sharp focus on her face and outfit while maintaining a professional military aesthetic.\" # @param {type:\"string\"}\n",
157 | "sample_steps = 30 # @param {\"type\":\"number\"}\n",
158 | "Instruction = \"choose from '720*1280', '1280*720', '480*832', '832*480', '1024*1024 for Width & Height\" # @param {\"type\":\"string\"}\n",
159 | "width = 720 # @param {\"type\":\"number\"}\n",
160 | "height = 1280 # @param {\"type\":\"number\"}\n",
161 | "seed = 48 # @param {\"type\":\"number\"}\n",
162 | "displayWidth = 480 # @param {\"type\":\"number\"}\n",
163 | "\n",
164 | "# Generate video from text prompt\n",
165 | "video = pipe(\n",
166 | " prompt=prompt,\n",
167 | " negative_prompt=\"色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走\",\n",
168 | " height = height,\n",
169 | " width = width,\n",
170 | " num_frames=1,\n",
171 | " num_inference_steps=sample_steps,\n",
172 | " seed=seed, tiled=True\n",
173 | ")\n",
174 | "\n",
175 | "# Save the generated video\n",
176 | "save_video(video, \"vid.mp4\", fps=15, quality=5)\n",
177 | "\n",
178 | "import cv2\n",
179 | "import os\n",
180 | "from IPython.display import display as displayVid, Image as outImg\n",
181 | "\n",
182 | "def show_image(video_path, display_width=480):\n",
183 | " if not os.path.exists(video_path):\n",
184 | " print(f\"Error: {video_path} not found!\")\n",
185 | " return\n",
186 | "\n",
187 | " cap = cv2.VideoCapture(video_path)\n",
188 | " success, frame = cap.read()\n",
189 | " cap.release()\n",
190 | "\n",
191 | " if success:\n",
192 | " image_path = \"single_frame.png\"\n",
193 | " cv2.imwrite(image_path, frame)\n",
194 | " displayVid(outImg(image_path, width=display_width))\n",
195 | " else:\n",
196 | " print(\"Error: Could not read the frame.\")\n",
197 | "\n",
198 | "show_image(\"vid.mp4\", display_width=displayWidth)\n"
199 | ],
200 | "metadata": {
201 | "cellView": "form",
202 | "id": "SRIWM1Ir-6R8"
203 | },
204 | "execution_count": null,
205 | "outputs": []
206 | },
207 | {
208 | "cell_type": "markdown",
209 | "source": [
210 | "**RUN VIDEO TO VIDEO**(Experimental)"
211 | ],
212 | "metadata": {
213 | "id": "d8pCP7AEkWDy"
214 | }
215 | },
216 | {
217 | "cell_type": "code",
218 | "source": [
219 | "prompt = \"The woman wears a crown.\" # @param {type:\"string\"}\n",
220 | "sample_steps = 30 # @param {\"type\":\"number\"}\n",
221 | "Instruction = \"choose from '480*832' & '832*480' for Width & Height\" # @param {\"type\":\"string\"}\n",
222 | "width = 480 # @param {\"type\":\"number\"}\n",
223 | "height = 832 # @param {\"type\":\"number\"}\n",
224 | "seed = 1 # @param {\"type\":\"number\"}\n",
225 | "denoising_strength = 0.7 # @param {\"type\":\"number\"}\n",
226 | "upload_a_video = False # @param {\"type\":\"boolean\"}\n",
227 | "\n",
228 | "if upload_a_video:\n",
229 | " from google.colab import files\n",
230 | " import cv2\n",
231 | "\n",
232 | " # Upload video\n",
233 | " uploaded = files.upload()\n",
234 | " video_path = list(uploaded.keys())[0] # Get uploaded file name (video path)\n",
235 | "\n",
236 | " # Check if the video was uploaded correctly\n",
237 | " if not video_path:\n",
238 | " print(\"Error: No video uploaded.\")\n",
239 | " else:\n",
240 | " print(f\"Successfully uploaded: {video_path}\")\n",
241 | "\n",
242 | " # Use the uploaded video file path directly for VideoData\n",
243 | " video = VideoData(video_path, height=height, width=width)\n",
244 | "else:\n",
245 | " # Load the previously generated video\n",
246 | " video = VideoData(\"video1.mp4\", height=height, width=width)\n",
247 | "\n",
248 | "# Modify the video with a new prompt\n",
249 | "video = pipe(\n",
250 | " prompt=prompt,\n",
251 | " negative_prompt=\"色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走\",\n",
252 | " input_video=video, denoising_strength=denoising_strength,\n",
253 | " height = height,\n",
254 | " width = width,\n",
255 | " num_inference_steps=sample_steps,\n",
256 | " seed=seed, tiled=True\n",
257 | ")\n",
258 | "\n",
259 | "# Save the modified video\n",
260 | "save_video(video, \"video2.mp4\", fps=15, quality=5)\n",
261 | "\n",
262 | "# Function to display video\n",
263 | "def show_video(video_path):\n",
264 | " if os.path.exists(video_path):\n",
265 | " displayVid(outVid(video_path, embed=True))\n",
266 | " else:\n",
267 | " print(f\"Error: {video_path} not found!\")\n",
268 | "\n",
269 | "# Show the video\n",
270 | "show_video(\"video2.mp4\")\n"
271 | ],
272 | "metadata": {
273 | "cellView": "form",
274 | "id": "PXUDq-uiMPkC"
275 | },
276 | "execution_count": null,
277 | "outputs": []
278 | }
279 | ]
280 | }
--------------------------------------------------------------------------------
/ZonosTTS.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "source": [
6 | "**PREPARE ENVIRONMENT**"
7 | ],
8 | "metadata": {
9 | "id": "VPMGP0sNHzmm"
10 | }
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": null,
15 | "metadata": {
16 | "id": "5qyNaUoB-aVd",
17 | "collapsed": true,
18 | "cellView": "form"
19 | },
20 | "outputs": [],
21 | "source": [
22 | "# @title\n",
23 | "!apt update && apt install -y espeak-ng\n",
24 | "!git clone https://github.com/Isi-dev/Zonos.git\n",
25 | "%cd Zonos\n",
26 | "!pip install -e .\n",
27 | "# !pip install --no-build-isolation -e .[compile] # optional but needed to run the hybrid\n",
28 | "\n",
29 | "import torch\n",
30 | "import torchaudio\n",
31 | "from zonos.model import Zonos\n",
32 | "from zonos.conditioning import make_cond_dict\n",
33 | "\n",
34 | "# Check device\n",
35 | "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
36 | "print(f\"Using device: {device}\")\n",
37 | "\n",
38 | "# Load model\n",
39 | "print(\"Loading model...\")\n",
40 | "model = Zonos.from_pretrained(\"Isi99999/Zonos-v0.1-transformer\", device=device)\n",
41 | "print(\"Model loaded!\")"
42 | ]
43 | },
44 | {
45 | "cell_type": "markdown",
46 | "source": [
47 | "**UPLOAD 10 TO 30 SECONDS REFERENCE VOICE AUDIO** (optional)"
48 | ],
49 | "metadata": {
50 | "id": "CBzi8_NrMMeh"
51 | }
52 | },
53 | {
54 | "cell_type": "code",
55 | "source": [
56 | "# @title\n",
57 | "import os\n",
58 | "from google.colab import files\n",
59 | "\n",
60 | "os.environ[\"LC_ALL\"] = \"C.UTF-8\"\n",
61 | "os.environ[\"LANG\"] = \"C.UTF-8\"\n",
62 | "os.makedirs(\"assets\", exist_ok=True)\n",
63 | "\n",
64 | "uploaded = files.upload()\n",
65 | "for filename in uploaded.keys():\n",
66 | " new_path = \"assets/reference.mp3\"\n",
67 | " if os.path.exists(new_path):\n",
68 | " os.remove(new_path)\n",
69 | " os.rename(filename, new_path) # Rename safely\n",
70 | "\n",
71 | "print(\"Loading reference audio...\")\n",
72 | "wav, sampling_rate = torchaudio.load(\"assets/reference.mp3\")\n",
73 | "speaker = model.make_speaker_embedding(wav, sampling_rate)\n",
74 | "print(\"Reference audio loaded!\")"
75 | ],
76 | "metadata": {
77 | "id": "sfn6z67Zmy9s",
78 | "collapsed": true,
79 | "cellView": "form"
80 | },
81 | "execution_count": null,
82 | "outputs": []
83 | },
84 | {
85 | "cell_type": "markdown",
86 | "source": [
87 | "**ENTER TEXT, ADJUST SETTINGS & RUN**"
88 | ],
89 | "metadata": {
90 | "id": "CjIRWivlMY8p"
91 | }
92 | },
93 | {
94 | "cell_type": "code",
95 | "execution_count": null,
96 | "metadata": {
97 | "id": "8NOUeS9gL9fZ",
98 | "cellView": "form",
99 | "collapsed": true
100 | },
101 | "outputs": [],
102 | "source": [
103 | "text = \" I am motivated by the simple yet profound joys of being alive—the taste of a good meal, the laughter of a friend, the beauty of a sunrise, and the endless pursuit of knowledge. Even if everything about me ceases when I die, my actions, words, and ideas can leave ripples in the world, affecting others in ways I may never fully grasp. \" # @param {type:\"string\"}\n",
104 | "seed = 421 # @param {\"type\":\"number\"}\n",
105 | "use_default_speaker = True # @param {type:\"boolean\"}\n",
106 | "language = 'en-us' # @param ['af', 'am', 'an', 'ar', 'as', 'az', 'ba', 'bg', 'bn', 'bpy', 'bs', 'ca', 'cmn', 'cs', 'cy', 'da', 'de', 'el', 'en-029', 'en-gb', 'en-gb-scotland', 'en-gb-x-gbclan', 'en-gb-x-gbcwmd', 'en-gb-x-rp', 'en-us', 'eo', 'es', 'es-419', 'et', 'eu', 'fa', 'fa-latn', 'fi', 'fr-be', 'fr-ch', 'fr-fr', 'ga', 'gd', 'gn', 'grc', 'gu', 'hak', 'hi', 'hr', 'ht', 'hu', 'hy', 'hyw', 'ia', 'id', 'is', 'it', 'ja', 'jbo', 'ka', 'kk', 'kl', 'kn', 'ko', 'kok', 'ku', 'ky', 'la', 'lfn', 'lt', 'lv', 'mi', 'mk', 'ml', 'mr', 'ms', 'mt', 'my', 'nb', 'nci', 'ne', 'nl', 'om', 'or', 'pa', 'pap', 'pl', 'pt', 'pt-br', 'py', 'quc', 'ro', 'ru', 'ru-lv', 'sd', 'shn', 'si', 'sk', 'sl', 'sq', 'sr', 'sv', 'sw', 'ta', 'te', 'tn', 'tr', 'tt', 'ur', 'uz', 'vi', 'vi-vn-x-central', 'vi-vn-x-south', 'yue']\n",
107 | "happy = 0.3077 # @param {type:\"slider\", min:0.0, max:1.0, step:0.05}\n",
108 | "sad = 0.0256 # @param {type:\"slider\", min:0.0, max:1.0, step:0.05}\n",
109 | "disgust = 0.0256 # @param {type:\"slider\", min:0.0, max:1.0, step:0.05}\n",
110 | "fear = 0.0256 # @param {type:\"slider\", min:0.0, max:1.0, step:0.05}\n",
111 | "surprise = 0.0256 # @param {type:\"slider\", min:0.0, max:1.0, step:0.05}\n",
112 | "anger = 0.0256 # @param {type:\"slider\", min:0.0, max:1.0, step:0.05}\n",
113 | "other = 0.2564 # @param {type:\"slider\", min:0.0, max:1.0, step:0.05}\n",
114 | "neutral = 0.3077 # @param {type:\"slider\", min:0.0, max:1.0, step:0.05}\n",
115 | "pitch = 20 # @param {type:\"slider\", min:0, max:400, step:1}\n",
116 | "speed = 15 # @param {type:\"slider\", min:0.0, max:40.0, step:1.0}\n",
117 | "\n",
118 | "\n",
119 | "total = happy + sad + disgust + fear + surprise + anger + other + neutral\n",
120 | "if total > 0:\n",
121 | " happy = happy / total\n",
122 | " sad = sad / total\n",
123 | " disgust = disgust / total\n",
124 | " fear = fear / total\n",
125 | " surprise = surprise / total\n",
126 | " anger = anger / total\n",
127 | " other = other / total\n",
128 | " neutral = neutral / total\n",
129 | "\n",
130 | "emotions = torch.tensor(list(map(float, [happy, sad, disgust, fear, surprise, anger, other, neutral])), device=device)\n",
131 | "\n",
132 | "if use_default_speaker:\n",
133 | " print(\"Loading default audio...\")\n",
134 | " wav, sampling_rate = torchaudio.load(\"assets/exampleaudio.mp3\")\n",
135 | " speaker = model.make_speaker_embedding(wav, sampling_rate)\n",
136 | " print(\"Default audio loaded!\")\n",
137 | "\n",
138 | "\n",
139 | "def generate_speech2( text, seed = 421, language=\"en-us\", emotion_tensor= torch.tensor(list(map(float, [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0])), device=device), pitch= 20, speed= 15):\n",
140 | " \"\"\"Generate speech from text\"\"\"\n",
141 | " print(f\"Generating: {text}\")\n",
142 | "\n",
143 | " if seed >= 0:\n",
144 | " torch.manual_seed(seed)\n",
145 | " else:\n",
146 | " torch.random.seed()\n",
147 | "\n",
148 | " # Create conditioning\n",
149 | " cond_dict = make_cond_dict(\n",
150 | " text=text,\n",
151 | " language=language,\n",
152 | " speaker=speaker,\n",
153 | " emotion=emotion_tensor,\n",
154 | " pitch_std = pitch,\n",
155 | " speaking_rate=speed\n",
156 | "\n",
157 | " )\n",
158 | " conditioning = model.prepare_conditioning(cond_dict)\n",
159 | "\n",
160 | " # Generate audio\n",
161 | " codes = model.generate(conditioning)\n",
162 | " wavs = model.autoencoder.decode(codes).cpu()\n",
163 | "\n",
164 | " # Save and play\n",
165 | " filename = \"output.wav\"\n",
166 | " torchaudio.save(filename, wavs[0], model.autoencoder.sampling_rate)\n",
167 | " return filename\n",
168 | "\n",
169 | "output_file = generate_speech2(text, seed = seed, language=language, emotion_tensor= emotions, pitch = pitch, speed = speed)\n",
170 | "from IPython.display import Audio\n",
171 | "Audio(output_file)"
172 | ]
173 | }
174 | ],
175 | "metadata": {
176 | "accelerator": "GPU",
177 | "colab": {
178 | "gpuType": "T4",
179 | "provenance": []
180 | },
181 | "kernelspec": {
182 | "display_name": "Python 3",
183 | "name": "python3"
184 | },
185 | "language_info": {
186 | "name": "python"
187 | }
188 | },
189 | "nbformat": 4,
190 | "nbformat_minor": 0
191 | }
--------------------------------------------------------------------------------