├── .gitattributes ├── .github ├── FUNDING.yml └── workflows │ └── publish.yml ├── .gitignore ├── LICENSE ├── __init__.py ├── cogvideo_controlnet.py ├── cogvideox_fun └── utils.py ├── configs ├── scheduler_config_2b.json ├── scheduler_config_5b.json ├── transformer_config_2b.json ├── transformer_config_5b.json ├── transformer_config_I2V_5b.json └── vae_config.json ├── context.py ├── custom_cogvideox_transformer_3d.py ├── embeddings.py ├── enhance_a_video ├── __init__.py ├── enhance.py └── globals.py ├── example_workflows ├── cogvideox_1.0_5b_vid2vid_02.json ├── cogvideox_1_0_2b_controlnet_02.json ├── cogvideox_1_0_5b_I2V_02.json ├── cogvideox_1_0_5b_I2V_Tora_02.json ├── cogvideox_1_0_5b_I2V_noise_warp_01.json ├── cogvideox_1_0_5b_T2V_02.json ├── cogvideox_1_0_5b_interpolation_02.json ├── cogvideox_1_0_5b_vid2vid_02.json ├── cogvideox_1_5_5b_I2V_01.json ├── cogvideox_Fun_180_orbit_02.json ├── cogvideox_Fun_I2V_02.json ├── cogvideox_Fun_I2V_Tora.json ├── cogvideox_Fun_pose_02.json ├── cut_and_drag_for_noisewarp_01.json └── noise_warp_example_input_video.mp4 ├── fp8_optimization.py ├── lora_utils.py ├── model_loading.py ├── mz_enable_vae_encode_tiling.py ├── mz_gguf_loader.py ├── nodes.py ├── pipeline_cogvideox.py ├── pyproject.toml ├── readme.md ├── requirements.txt ├── tora ├── traj_module.py └── traj_utils.py └── utils.py /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | github: [kijai] 2 | custom: ["https://www.paypal.me/kijaidesign"] 3 | -------------------------------------------------------------------------------- /.github/workflows/publish.yml: -------------------------------------------------------------------------------- 1 | name: Publish to Comfy registry 2 | on: 3 | workflow_dispatch: 4 | push: 5 | branches: 6 | - main 7 | - master 8 | paths: 9 | - "pyproject.toml" 10 | 11 | jobs: 12 | publish-node: 13 | name: Publish Custom Node to registry 14 | runs-on: ubuntu-latest 15 | # if this is a forked repository. Skipping the workflow. 16 | if: github.event.repository.fork == false 17 | steps: 18 | - name: Check out code 19 | uses: actions/checkout@v4 20 | - name: Publish Custom Node 21 | uses: Comfy-Org/publish-node-action@main 22 | with: 23 | ## Add your own personal access token to your Github Repository secrets and reference it here. 24 | personal_access_token: ${{ secrets.REGISTRY_ACCESS_TOKEN }} 25 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | output/ 2 | *__pycache__/ 3 | samples*/ 4 | runs/ 5 | checkpoints/ 6 | master_ip 7 | logs/ 8 | *.DS_Store 9 | .idea 10 | *.pt 11 | tools/ -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 
11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 
134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 
193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- 1 | from .nodes import NODE_CLASS_MAPPINGS as NODES_CLASS, NODE_DISPLAY_NAME_MAPPINGS as NODES_DISPLAY 2 | from .model_loading import NODE_CLASS_MAPPINGS as MODEL_CLASS, NODE_DISPLAY_NAME_MAPPINGS as MODEL_DISPLAY 3 | 4 | NODE_CLASS_MAPPINGS = {**NODES_CLASS, **MODEL_CLASS} 5 | NODE_DISPLAY_NAME_MAPPINGS = {**NODES_DISPLAY, **MODEL_DISPLAY} 6 | 7 | __all__ = ["NODE_CLASS_MAPPINGS", "NODE_DISPLAY_NAME_MAPPINGS"] -------------------------------------------------------------------------------- /cogvideo_controlnet.py: -------------------------------------------------------------------------------- 1 | # https://github.com/TheDenk/cogvideox-controlnet/blob/main/cogvideo_controlnet.py 2 | from typing import Any, Dict, Optional, Tuple, Union 3 | 4 | import torch 5 | from torch import nn 6 | from einops import rearrange 7 | import torch.nn.functional as F 8 | from .custom_cogvideox_transformer_3d import Transformer2DModelOutput, CogVideoXBlock 9 | from diffusers.utils import is_torch_version 10 | from diffusers.loaders import PeftAdapterMixin 11 | from diffusers.models.embeddings import CogVideoXPatchEmbed, TimestepEmbedding, Timesteps 12 | from diffusers.models.modeling_utils import ModelMixin 13 | from diffusers.configuration_utils import ConfigMixin, register_to_config 14 | 15 | 16 | class CogVideoXControlnet(ModelMixin, ConfigMixin, PeftAdapterMixin): 17 | _supports_gradient_checkpointing = True 18 | 19 | @register_to_config 20 | def __init__( 21 | self, 22 | num_attention_heads: int = 30, 23 | attention_head_dim: int = 64, 24 | vae_channels: int = 16, 25 | in_channels: int = 3, 26 | downscale_coef: int = 8, 27 | flip_sin_to_cos: bool = True, 28 | freq_shift: int = 0, 29 | time_embed_dim: int = 512, 30 | num_layers: int = 8, 31 | dropout: float = 0.0, 32 | attention_bias: bool = True, 33 | sample_width: int = 90, 34 | sample_height: int = 60, 35 | sample_frames: int = 49, 36 | patch_size: int = 2, 37 | temporal_compression_ratio: int = 4, 38 | max_text_seq_length: int = 226, 39 | activation_fn: str = "gelu-approximate", 40 | timestep_activation_fn: str = "silu", 41 | norm_elementwise_affine: bool = True, 42 | norm_eps: float = 1e-5, 43 | spatial_interpolation_scale: float = 1.875, 44 | temporal_interpolation_scale: float = 1.0, 45 | use_rotary_positional_embeddings: bool = False, 46 | use_learned_positional_embeddings: bool = False, 47 | out_proj_dim = None, 48 | ): 49 | super().__init__() 50 | inner_dim = num_attention_heads * attention_head_dim 51 | 52 | if not use_rotary_positional_embeddings and use_learned_positional_embeddings: 53 | raise ValueError( 54 | "There are no CogVideoX checkpoints available with disable rotary embeddings and learned positional " 55 | "embeddings. If you're using a custom model and/or believe this should be supported, please open an " 56 | "issue at https://github.com/huggingface/diffusers/issues." 
57 | ) 58 | 59 | start_channels = in_channels * (downscale_coef ** 2) 60 | input_channels = [start_channels, start_channels // 2, start_channels // 4] 61 | self.unshuffle = nn.PixelUnshuffle(downscale_coef) 62 | 63 | self.controlnet_encode_first = nn.Sequential( 64 | nn.Conv2d(input_channels[0], input_channels[1], kernel_size=1, stride=1, padding=0), 65 | nn.GroupNorm(2, input_channels[1]), 66 | nn.ReLU(), 67 | ) 68 | 69 | self.controlnet_encode_second = nn.Sequential( 70 | nn.Conv2d(input_channels[1], input_channels[2], kernel_size=1, stride=1, padding=0), 71 | nn.GroupNorm(2, input_channels[2]), 72 | nn.ReLU(), 73 | ) 74 | 75 | # 1. Patch embedding 76 | self.patch_embed = CogVideoXPatchEmbed( 77 | patch_size=patch_size, 78 | in_channels=vae_channels + input_channels[2], 79 | embed_dim=inner_dim, 80 | bias=True, 81 | sample_width=sample_width, 82 | sample_height=sample_height, 83 | sample_frames=sample_frames, 84 | temporal_compression_ratio=temporal_compression_ratio, 85 | spatial_interpolation_scale=spatial_interpolation_scale, 86 | temporal_interpolation_scale=temporal_interpolation_scale, 87 | use_positional_embeddings=not use_rotary_positional_embeddings, 88 | use_learned_positional_embeddings=use_learned_positional_embeddings, 89 | ) 90 | 91 | self.embedding_dropout = nn.Dropout(dropout) 92 | 93 | # 2. Time embeddings 94 | self.time_proj = Timesteps(inner_dim, flip_sin_to_cos, freq_shift) 95 | self.time_embedding = TimestepEmbedding(inner_dim, time_embed_dim, timestep_activation_fn) 96 | 97 | # 3. Define spatio-temporal transformers blocks 98 | self.transformer_blocks = nn.ModuleList( 99 | [ 100 | CogVideoXBlock( 101 | dim=inner_dim, 102 | num_attention_heads=num_attention_heads, 103 | attention_head_dim=attention_head_dim, 104 | time_embed_dim=time_embed_dim, 105 | dropout=dropout, 106 | activation_fn=activation_fn, 107 | attention_bias=attention_bias, 108 | norm_elementwise_affine=norm_elementwise_affine, 109 | norm_eps=norm_eps, 110 | ) 111 | for _ in range(num_layers) 112 | ] 113 | ) 114 | 115 | self.out_projectors = None 116 | if out_proj_dim is not None: 117 | self.out_projectors = nn.ModuleList( 118 | [nn.Linear(inner_dim, out_proj_dim) for _ in range(num_layers)] 119 | ) 120 | 121 | self.gradient_checkpointing = False 122 | 123 | def _set_gradient_checkpointing(self, module, value=False): 124 | self.gradient_checkpointing = value 125 | 126 | def compress_time(self, x, num_frames): 127 | x = rearrange(x, '(b f) c h w -> b f c h w', f=num_frames) 128 | batch_size, frames, channels, height, width = x.shape 129 | x = rearrange(x, 'b f c h w -> (b h w) c f') 130 | 131 | if x.shape[-1] % 2 == 1: 132 | x_first, x_rest = x[..., 0], x[..., 1:] 133 | if x_rest.shape[-1] > 0: 134 | x_rest = F.avg_pool1d(x_rest, kernel_size=2, stride=2) 135 | 136 | x = torch.cat([x_first[..., None], x_rest], dim=-1) 137 | else: 138 | x = F.avg_pool1d(x, kernel_size=2, stride=2) 139 | x = rearrange(x, '(b h w) c f -> (b f) c h w', b=batch_size, h=height, w=width) 140 | return x 141 | 142 | def forward( 143 | self, 144 | hidden_states: torch.Tensor, 145 | encoder_hidden_states: torch.Tensor, 146 | controlnet_states: torch.Tensor, 147 | timestep: Union[int, float, torch.LongTensor], 148 | image_rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, 149 | timestep_cond: Optional[torch.Tensor] = None, 150 | return_dict: bool = True, 151 | ): 152 | batch_size, num_frames, channels, height, width = controlnet_states.shape 153 | # 0. 
Controlnet encoder 154 | controlnet_states = rearrange(controlnet_states, 'b f c h w -> (b f) c h w') 155 | controlnet_states = self.unshuffle(controlnet_states) 156 | controlnet_states = self.controlnet_encode_first(controlnet_states) 157 | controlnet_states = self.compress_time(controlnet_states, num_frames=num_frames) 158 | num_frames = controlnet_states.shape[0] // batch_size 159 | 160 | controlnet_states = self.controlnet_encode_second(controlnet_states) 161 | controlnet_states = self.compress_time(controlnet_states, num_frames=num_frames) 162 | controlnet_states = rearrange(controlnet_states, '(b f) c h w -> b f c h w', b=batch_size) 163 | 164 | hidden_states = torch.cat([hidden_states, controlnet_states], dim=2) 165 | # controlnet_states = self.controlnext_encoder(controlnet_states, timestep=timestep) 166 | # 1. Time embedding 167 | timesteps = timestep 168 | t_emb = self.time_proj(timesteps) 169 | 170 | # timesteps does not contain any weights and will always return f32 tensors 171 | # but time_embedding might actually be running in fp16. so we need to cast here. 172 | # there might be better ways to encapsulate this. 173 | t_emb = t_emb.to(dtype=hidden_states.dtype) 174 | emb = self.time_embedding(t_emb, timestep_cond) 175 | 176 | hidden_states = self.patch_embed(encoder_hidden_states, hidden_states) 177 | hidden_states = self.embedding_dropout(hidden_states) 178 | 179 | 180 | text_seq_length = encoder_hidden_states.shape[1] 181 | encoder_hidden_states = hidden_states[:, :text_seq_length] 182 | hidden_states = hidden_states[:, text_seq_length:] 183 | 184 | 185 | controlnet_hidden_states = () 186 | # 3. Transformer blocks 187 | for i, block in enumerate(self.transformer_blocks): 188 | if self.training and self.gradient_checkpointing: 189 | 190 | def create_custom_forward(module): 191 | def custom_forward(*inputs): 192 | return module(*inputs) 193 | 194 | return custom_forward 195 | 196 | ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {} 197 | hidden_states, encoder_hidden_states = torch.utils.checkpoint.checkpoint( 198 | create_custom_forward(block), 199 | hidden_states, 200 | encoder_hidden_states, 201 | emb, 202 | image_rotary_emb, 203 | **ckpt_kwargs, 204 | ) 205 | else: 206 | hidden_states, encoder_hidden_states = block( 207 | hidden_states=hidden_states, 208 | encoder_hidden_states=encoder_hidden_states, 209 | temb=emb, 210 | image_rotary_emb=image_rotary_emb, 211 | ) 212 | 213 | if self.out_projectors is not None: 214 | controlnet_hidden_states += (self.out_projectors[i](hidden_states),) 215 | else: 216 | controlnet_hidden_states += (hidden_states,) 217 | 218 | if not return_dict: 219 | return (controlnet_hidden_states,) 220 | return Transformer2DModelOutput(sample=controlnet_hidden_states) -------------------------------------------------------------------------------- /cogvideox_fun/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from PIL import Image 3 | 4 | ASPECT_RATIO_512 = { 5 | '0.25': [256.0, 1024.0], '0.26': [256.0, 992.0], '0.27': [256.0, 960.0], '0.28': [256.0, 928.0], 6 | '0.32': [288.0, 896.0], '0.33': [288.0, 864.0], '0.35': [288.0, 832.0], '0.4': [320.0, 800.0], 7 | '0.42': [320.0, 768.0], '0.48': [352.0, 736.0], '0.5': [352.0, 704.0], '0.52': [352.0, 672.0], 8 | '0.57': [384.0, 672.0], '0.6': [384.0, 640.0], '0.68': [416.0, 608.0], '0.72': [416.0, 576.0], 9 | '0.78': [448.0, 576.0], '0.82': [448.0, 544.0], '0.88': [480.0, 544.0], '0.94': 
[480.0, 512.0], 10 | '1.0': [512.0, 512.0], '1.07': [512.0, 480.0], '1.13': [544.0, 480.0], '1.21': [544.0, 448.0], 11 | '1.29': [576.0, 448.0], '1.38': [576.0, 416.0], '1.46': [608.0, 416.0], '1.67': [640.0, 384.0], 12 | '1.75': [672.0, 384.0], '2.0': [704.0, 352.0], '2.09': [736.0, 352.0], '2.4': [768.0, 320.0], 13 | '2.5': [800.0, 320.0], '2.89': [832.0, 288.0], '3.0': [864.0, 288.0], '3.11': [896.0, 288.0], 14 | '3.62': [928.0, 256.0], '3.75': [960.0, 256.0], '3.88': [992.0, 256.0], '4.0': [1024.0, 256.0] 15 | } 16 | ASPECT_RATIO_RANDOM_CROP_512 = { 17 | '0.42': [320.0, 768.0], '0.5': [352.0, 704.0], 18 | '0.57': [384.0, 672.0], '0.68': [416.0, 608.0], '0.78': [448.0, 576.0], '0.88': [480.0, 544.0], 19 | '0.94': [480.0, 512.0], '1.0': [512.0, 512.0], '1.07': [512.0, 480.0], 20 | '1.13': [544.0, 480.0], '1.29': [576.0, 448.0], '1.46': [608.0, 416.0], '1.75': [672.0, 384.0], 21 | '2.0': [704.0, 352.0], '2.4': [768.0, 320.0] 22 | } 23 | ASPECT_RATIO_RANDOM_CROP_PROB = [ 24 | 1, 2, 25 | 4, 4, 4, 4, 26 | 8, 8, 8, 27 | 4, 4, 4, 4, 28 | 2, 1 29 | ] 30 | ASPECT_RATIO_RANDOM_CROP_PROB = np.array(ASPECT_RATIO_RANDOM_CROP_PROB) / sum(ASPECT_RATIO_RANDOM_CROP_PROB) 31 | 32 | def get_closest_ratio(height: float, width: float, ratios: dict = ASPECT_RATIO_512): 33 | aspect_ratio = height / width 34 | closest_ratio = min(ratios.keys(), key=lambda ratio: abs(float(ratio) - aspect_ratio)) 35 | return ratios[closest_ratio], float(closest_ratio) 36 | 37 | def get_width_and_height_from_image_and_base_resolution(image, base_resolution): 38 | target_pixels = int(base_resolution) * int(base_resolution) 39 | original_width, original_height = Image.open(image).size 40 | ratio = (target_pixels / (original_width * original_height)) ** 0.5 41 | width_slider = round(original_width * ratio) 42 | height_slider = round(original_height * ratio) 43 | return height_slider, width_slider -------------------------------------------------------------------------------- /configs/scheduler_config_2b.json: -------------------------------------------------------------------------------- 1 | { 2 | "_class_name": "CogVideoXDDIMScheduler", 3 | "_diffusers_version": "0.30.0.dev0", 4 | "beta_end": 0.012, 5 | "beta_schedule": "scaled_linear", 6 | "beta_start": 0.00085, 7 | "clip_sample": false, 8 | "clip_sample_range": 1.0, 9 | "num_train_timesteps": 1000, 10 | "prediction_type": "v_prediction", 11 | "rescale_betas_zero_snr": true, 12 | "sample_max_value": 1.0, 13 | "set_alpha_to_one": true, 14 | "snr_shift_scale": 3.0, 15 | "steps_offset": 0, 16 | "timestep_spacing": "trailing", 17 | "trained_betas": null 18 | } 19 | -------------------------------------------------------------------------------- /configs/scheduler_config_5b.json: -------------------------------------------------------------------------------- 1 | { 2 | "_class_name": "CogVideoXDDIMScheduler", 3 | "_diffusers_version": "0.31.0.dev0", 4 | "beta_end": 0.012, 5 | "beta_schedule": "scaled_linear", 6 | "beta_start": 0.00085, 7 | "clip_sample": false, 8 | "clip_sample_range": 1.0, 9 | "num_train_timesteps": 1000, 10 | "prediction_type": "v_prediction", 11 | "rescale_betas_zero_snr": true, 12 | "sample_max_value": 1.0, 13 | "set_alpha_to_one": true, 14 | "snr_shift_scale": 1.0, 15 | "steps_offset": 0, 16 | "timestep_spacing": "trailing", 17 | "trained_betas": null 18 | } -------------------------------------------------------------------------------- /configs/transformer_config_2b.json: -------------------------------------------------------------------------------- 1 | { 2 
| "activation_fn": "gelu-approximate", 3 | "attention_bias": true, 4 | "attention_head_dim": 64, 5 | "dropout": 0.0, 6 | "flip_sin_to_cos": true, 7 | "freq_shift": 0, 8 | "in_channels": 16, 9 | "max_text_seq_length": 226, 10 | "norm_elementwise_affine": true, 11 | "norm_eps": 1e-05, 12 | "num_attention_heads": 30, 13 | "num_layers": 30, 14 | "out_channels": 16, 15 | "patch_size": 2, 16 | "sample_frames": 49, 17 | "sample_height": 60, 18 | "sample_width": 90, 19 | "spatial_interpolation_scale": 1.875, 20 | "temporal_compression_ratio": 4, 21 | "temporal_interpolation_scale": 1.0, 22 | "text_embed_dim": 4096, 23 | "time_embed_dim": 512, 24 | "timestep_activation_fn": "silu", 25 | "use_rotary_positional_embeddings": false 26 | } -------------------------------------------------------------------------------- /configs/transformer_config_5b.json: -------------------------------------------------------------------------------- 1 | { 2 | "activation_fn": "gelu-approximate", 3 | "attention_bias": true, 4 | "attention_head_dim": 64, 5 | "dropout": 0.0, 6 | "flip_sin_to_cos": true, 7 | "freq_shift": 0, 8 | "in_channels": 16, 9 | "max_text_seq_length": 226, 10 | "norm_elementwise_affine": true, 11 | "norm_eps": 1e-05, 12 | "num_attention_heads": 48, 13 | "num_layers": 42, 14 | "out_channels": 16, 15 | "patch_size": 2, 16 | "sample_frames": 49, 17 | "sample_height": 60, 18 | "sample_width": 90, 19 | "spatial_interpolation_scale": 1.875, 20 | "temporal_compression_ratio": 4, 21 | "temporal_interpolation_scale": 1.0, 22 | "text_embed_dim": 4096, 23 | "time_embed_dim": 512, 24 | "timestep_activation_fn": "silu", 25 | "use_rotary_positional_embeddings": true 26 | } -------------------------------------------------------------------------------- /configs/transformer_config_I2V_5b.json: -------------------------------------------------------------------------------- 1 | { 2 | "activation_fn": "gelu-approximate", 3 | "attention_bias": true, 4 | "attention_head_dim": 64, 5 | "dropout": 0.0, 6 | "flip_sin_to_cos": true, 7 | "freq_shift": 0, 8 | "in_channels": 32, 9 | "max_text_seq_length": 226, 10 | "norm_elementwise_affine": true, 11 | "norm_eps": 1e-05, 12 | "num_attention_heads": 48, 13 | "num_layers": 42, 14 | "out_channels": 16, 15 | "patch_size": 2, 16 | "sample_frames": 49, 17 | "sample_height": 60, 18 | "sample_width": 90, 19 | "spatial_interpolation_scale": 1.875, 20 | "temporal_compression_ratio": 4, 21 | "temporal_interpolation_scale": 1.0, 22 | "text_embed_dim": 4096, 23 | "time_embed_dim": 512, 24 | "timestep_activation_fn": "silu", 25 | "use_learned_positional_embeddings": true, 26 | "use_rotary_positional_embeddings": true 27 | } -------------------------------------------------------------------------------- /configs/vae_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "_class_name": "AutoencoderKLCogVideoX", 3 | "_diffusers_version": "0.31.0.dev0", 4 | "act_fn": "silu", 5 | "block_out_channels": [ 6 | 128, 7 | 256, 8 | 256, 9 | 512 10 | ], 11 | "down_block_types": [ 12 | "CogVideoXDownBlock3D", 13 | "CogVideoXDownBlock3D", 14 | "CogVideoXDownBlock3D", 15 | "CogVideoXDownBlock3D" 16 | ], 17 | "force_upcast": true, 18 | "in_channels": 3, 19 | "latent_channels": 16, 20 | "latents_mean": null, 21 | "latents_std": null, 22 | "layers_per_block": 3, 23 | "norm_eps": 1e-06, 24 | "norm_num_groups": 32, 25 | "out_channels": 3, 26 | "sample_height": 480, 27 | "sample_width": 720, 28 | "scaling_factor": 0.7, 29 | "shift_factor": null, 30 | 
"temporal_compression_ratio": 4, 31 | "up_block_types": [ 32 | "CogVideoXUpBlock3D", 33 | "CogVideoXUpBlock3D", 34 | "CogVideoXUpBlock3D", 35 | "CogVideoXUpBlock3D" 36 | ], 37 | "use_post_quant_conv": false, 38 | "use_quant_conv": false 39 | } 40 | -------------------------------------------------------------------------------- /context.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from typing import Callable, Optional, List 3 | 4 | 5 | def ordered_halving(val): 6 | bin_str = f"{val:064b}" 7 | bin_flip = bin_str[::-1] 8 | as_int = int(bin_flip, 2) 9 | 10 | return as_int / (1 << 64) 11 | 12 | def does_window_roll_over(window: list[int], num_frames: int) -> tuple[bool, int]: 13 | prev_val = -1 14 | for i, val in enumerate(window): 15 | val = val % num_frames 16 | if val < prev_val: 17 | return True, i 18 | prev_val = val 19 | return False, -1 20 | 21 | def shift_window_to_start(window: list[int], num_frames: int): 22 | start_val = window[0] 23 | for i in range(len(window)): 24 | # 1) subtract each element by start_val to move vals relative to the start of all frames 25 | # 2) add num_frames and take modulus to get adjusted vals 26 | window[i] = ((window[i] - start_val) + num_frames) % num_frames 27 | 28 | def shift_window_to_end(window: list[int], num_frames: int): 29 | # 1) shift window to start 30 | shift_window_to_start(window, num_frames) 31 | end_val = window[-1] 32 | end_delta = num_frames - end_val - 1 33 | for i in range(len(window)): 34 | # 2) add end_delta to each val to slide windows to end 35 | window[i] = window[i] + end_delta 36 | 37 | def get_missing_indexes(windows: list[list[int]], num_frames: int) -> list[int]: 38 | all_indexes = list(range(num_frames)) 39 | for w in windows: 40 | for val in w: 41 | try: 42 | all_indexes.remove(val) 43 | except ValueError: 44 | pass 45 | return all_indexes 46 | 47 | def uniform_looped( 48 | step: int = ..., 49 | num_steps: Optional[int] = None, 50 | num_frames: int = ..., 51 | context_size: Optional[int] = None, 52 | context_stride: int = 3, 53 | context_overlap: int = 4, 54 | closed_loop: bool = True, 55 | ): 56 | if num_frames <= context_size: 57 | yield list(range(num_frames)) 58 | return 59 | 60 | context_stride = min(context_stride, int(np.ceil(np.log2(num_frames / context_size))) + 1) 61 | 62 | for context_step in 1 << np.arange(context_stride): 63 | pad = int(round(num_frames * ordered_halving(step))) 64 | for j in range( 65 | int(ordered_halving(step) * context_step) + pad, 66 | num_frames + pad + (0 if closed_loop else -context_overlap), 67 | (context_size * context_step - context_overlap), 68 | ): 69 | yield [e % num_frames for e in range(j, j + context_size * context_step, context_step)] 70 | 71 | #from AnimateDiff-Evolved by Kosinkadink (https://github.com/Kosinkadink/ComfyUI-AnimateDiff-Evolved) 72 | def uniform_standard( 73 | step: int = ..., 74 | num_steps: Optional[int] = None, 75 | num_frames: int = ..., 76 | context_size: Optional[int] = None, 77 | context_stride: int = 3, 78 | context_overlap: int = 4, 79 | closed_loop: bool = True, 80 | ): 81 | windows = [] 82 | if num_frames <= context_size: 83 | windows.append(list(range(num_frames))) 84 | return windows 85 | 86 | context_stride = min(context_stride, int(np.ceil(np.log2(num_frames / context_size))) + 1) 87 | 88 | for context_step in 1 << np.arange(context_stride): 89 | pad = int(round(num_frames * ordered_halving(step))) 90 | for j in range( 91 | int(ordered_halving(step) * context_step) + pad, 92 | 
num_frames + pad + (0 if closed_loop else -context_overlap), 93 | (context_size * context_step - context_overlap), 94 | ): 95 | windows.append([e % num_frames for e in range(j, j + context_size * context_step, context_step)]) 96 | 97 | # now that windows are created, shift any windows that loop, and delete duplicate windows 98 | delete_idxs = [] 99 | win_i = 0 100 | while win_i < len(windows): 101 | # if the window rolls over itself, it needs to be shifted 102 | is_roll, roll_idx = does_window_roll_over(windows[win_i], num_frames) 103 | if is_roll: 104 | roll_val = windows[win_i][roll_idx] # roll_val might not be 0 for windows of higher strides 105 | shift_window_to_end(windows[win_i], num_frames=num_frames) 106 | # check if next window (cyclical) is missing roll_val 107 | if roll_val not in windows[(win_i+1) % len(windows)]: 108 | # need to insert new window here - just insert window starting at roll_val 109 | windows.insert(win_i+1, list(range(roll_val, roll_val + context_size))) 110 | # delete window if it's not unique 111 | for pre_i in range(0, win_i): 112 | if windows[win_i] == windows[pre_i]: 113 | delete_idxs.append(win_i) 114 | break 115 | win_i += 1 116 | 117 | # reverse delete_idxs so that they will be deleted in an order that doesn't break idx correlation 118 | delete_idxs.reverse() 119 | for i in delete_idxs: 120 | windows.pop(i) 121 | return windows 122 | 123 | def static_standard( 124 | step: int = ..., 125 | num_steps: Optional[int] = None, 126 | num_frames: int = ..., 127 | context_size: Optional[int] = None, 128 | context_stride: int = 3, 129 | context_overlap: int = 4, 130 | closed_loop: bool = True, 131 | ): 132 | windows = [] 133 | if num_frames <= context_size: 134 | windows.append(list(range(num_frames))) 135 | return windows 136 | # always return the same set of windows 137 | delta = context_size - context_overlap 138 | for start_idx in range(0, num_frames, delta): 139 | # if past the end of frames, move start_idx back to keep the same context_size 140 | ending = start_idx + context_size 141 | if ending >= num_frames: 142 | final_delta = ending - num_frames 143 | final_start_idx = start_idx - final_delta 144 | windows.append(list(range(final_start_idx, final_start_idx + context_size))) 145 | break 146 | windows.append(list(range(start_idx, start_idx + context_size))) 147 | return windows 148 | 149 | def get_context_scheduler(name: str) -> Callable: 150 | if name == "uniform_looped": 151 | return uniform_looped 152 | elif name == "uniform_standard": 153 | return uniform_standard 154 | elif name == "static_standard": 155 | return static_standard 156 | else: 157 | raise ValueError(f"Unknown context schedule {name}") 158 | 159 | 160 | def get_total_steps( 161 | scheduler, 162 | timesteps: List[int], 163 | num_steps: Optional[int] = None, 164 | num_frames: int = ..., 165 | context_size: Optional[int] = None, 166 | context_stride: int = 3, 167 | context_overlap: int = 4, 168 | closed_loop: bool = True, 169 | ): 170 | return sum( 171 | len( 172 | list( 173 | scheduler( 174 | i, 175 | num_steps, 176 | num_frames, 177 | context_size, 178 | context_stride, 179 | context_overlap, 180 | ) 181 | ) 182 | ) 183 | for i in range(len(timesteps)) 184 | ) 185 | -------------------------------------------------------------------------------- /embeddings.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import numpy as np 4 | from typing import Tuple, Union, Optional 5 | from diffusers.models.embeddings 
import get_3d_sincos_pos_embed, get_1d_rotary_pos_embed 6 | 7 | 8 | class CogVideoXPatchEmbed(nn.Module): 9 | def __init__( 10 | self, 11 | patch_size: int = 2, 12 | patch_size_t: Optional[int] = None, 13 | in_channels: int = 16, 14 | embed_dim: int = 1920, 15 | text_embed_dim: int = 4096, 16 | bias: bool = True, 17 | sample_width: int = 90, 18 | sample_height: int = 60, 19 | sample_frames: int = 49, 20 | temporal_compression_ratio: int = 4, 21 | max_text_seq_length: int = 226, 22 | spatial_interpolation_scale: float = 1.875, 23 | temporal_interpolation_scale: float = 1.0, 24 | use_positional_embeddings: bool = True, 25 | use_learned_positional_embeddings: bool = True, 26 | ) -> None: 27 | super().__init__() 28 | 29 | self.patch_size = patch_size 30 | self.patch_size_t = patch_size_t 31 | self.embed_dim = embed_dim 32 | self.sample_height = sample_height 33 | self.sample_width = sample_width 34 | self.sample_frames = sample_frames 35 | self.temporal_compression_ratio = temporal_compression_ratio 36 | self.max_text_seq_length = max_text_seq_length 37 | self.spatial_interpolation_scale = spatial_interpolation_scale 38 | self.temporal_interpolation_scale = temporal_interpolation_scale 39 | self.use_positional_embeddings = use_positional_embeddings 40 | self.use_learned_positional_embeddings = use_learned_positional_embeddings 41 | 42 | if patch_size_t is None: 43 | # CogVideoX 1.0 checkpoints 44 | self.proj = nn.Conv2d( 45 | in_channels, embed_dim, kernel_size=(patch_size, patch_size), stride=patch_size, bias=bias 46 | ) 47 | else: 48 | # CogVideoX 1.5 checkpoints 49 | self.proj = nn.Linear(in_channels * patch_size * patch_size * patch_size_t, embed_dim) 50 | 51 | self.text_proj = nn.Linear(text_embed_dim, embed_dim) 52 | 53 | if use_positional_embeddings or use_learned_positional_embeddings: 54 | persistent = use_learned_positional_embeddings 55 | pos_embedding = self._get_positional_embeddings(sample_height, sample_width, sample_frames) 56 | self.register_buffer("pos_embedding", pos_embedding, persistent=persistent) 57 | 58 | def _get_positional_embeddings(self, sample_height: int, sample_width: int, sample_frames: int) -> torch.Tensor: 59 | post_patch_height = sample_height // self.patch_size 60 | post_patch_width = sample_width // self.patch_size 61 | post_time_compression_frames = (sample_frames - 1) // self.temporal_compression_ratio + 1 62 | num_patches = post_patch_height * post_patch_width * post_time_compression_frames 63 | 64 | pos_embedding = get_3d_sincos_pos_embed( 65 | self.embed_dim, 66 | (post_patch_width, post_patch_height), 67 | post_time_compression_frames, 68 | self.spatial_interpolation_scale, 69 | self.temporal_interpolation_scale, 70 | ) 71 | pos_embedding = torch.from_numpy(pos_embedding).flatten(0, 1) 72 | joint_pos_embedding = torch.zeros( 73 | 1, self.max_text_seq_length + num_patches, self.embed_dim, requires_grad=False 74 | ) 75 | joint_pos_embedding.data[:, self.max_text_seq_length :].copy_(pos_embedding) 76 | 77 | return joint_pos_embedding 78 | 79 | def forward(self, text_embeds: torch.Tensor, image_embeds: torch.Tensor): 80 | r""" 81 | Args: 82 | text_embeds (`torch.Tensor`): 83 | Input text embeddings. Expected shape: (batch_size, seq_length, embedding_dim). 84 | image_embeds (`torch.Tensor`): 85 | Input image embeddings. Expected shape: (batch_size, num_frames, channels, height, width). 
86 | """ 87 | text_embeds = self.text_proj(text_embeds) 88 | 89 | batch_size, num_frames, channels, height, width = image_embeds.shape 90 | 91 | if self.patch_size_t is None: 92 | image_embeds = image_embeds.reshape(-1, channels, height, width) 93 | image_embeds = self.proj(image_embeds) 94 | image_embeds = image_embeds.view(batch_size, num_frames, *image_embeds.shape[1:]) 95 | image_embeds = image_embeds.flatten(3).transpose(2, 3) # [batch, num_frames, height x width, channels] 96 | image_embeds = image_embeds.flatten(1, 2) # [batch, num_frames x height x width, channels] 97 | else: 98 | p = self.patch_size 99 | p_t = self.patch_size_t 100 | 101 | image_embeds = image_embeds.permute(0, 1, 3, 4, 2) 102 | image_embeds = image_embeds.reshape( 103 | batch_size, num_frames // p_t, p_t, height // p, p, width // p, p, channels 104 | ) 105 | image_embeds = image_embeds.permute(0, 1, 3, 5, 7, 2, 4, 6).flatten(4, 7).flatten(1, 3) 106 | image_embeds = self.proj(image_embeds) 107 | 108 | embeds = torch.cat( 109 | [text_embeds, image_embeds], dim=1 110 | ).contiguous() # [batch, seq_length + num_frames x height x width, channels] 111 | 112 | if self.use_positional_embeddings or self.use_learned_positional_embeddings: 113 | if self.use_learned_positional_embeddings and (self.sample_width != width or self.sample_height != height): 114 | raise ValueError( 115 | "It is currently not possible to generate videos at a different resolution than the defaults. This should only be the case with 'THUDM/CogVideoX-5b-I2V'. " 116 | "If you think this is incorrect, please open an issue at https://github.com/huggingface/diffusers/issues." 117 | ) 118 | 119 | pre_time_compression_frames = (num_frames - 1) * self.temporal_compression_ratio + 1 120 | 121 | if ( 122 | self.sample_height != height 123 | or self.sample_width != width 124 | or self.sample_frames != pre_time_compression_frames 125 | ): 126 | pos_embedding = self._get_positional_embeddings(height, width, pre_time_compression_frames) 127 | pos_embedding = pos_embedding.to(embeds.device, dtype=embeds.dtype) 128 | else: 129 | pos_embedding = self.pos_embedding 130 | 131 | embeds = embeds + pos_embedding 132 | 133 | return embeds 134 | 135 | def get_3d_rotary_pos_embed( 136 | embed_dim, 137 | crops_coords, 138 | grid_size, 139 | temporal_size, 140 | theta: int = 10000, 141 | use_real: bool = True, 142 | grid_type: str = "linspace", 143 | max_size: Optional[Tuple[int, int]] = None, 144 | ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: 145 | """ 146 | RoPE for video tokens with 3D structure. 147 | 148 | Args: 149 | embed_dim (`int`): 150 | The embedding dimension size, corresponding to hidden_size_head. 151 | crops_coords (`Tuple[int]`): 152 | The top-left and bottom-right coordinates of the crop. 153 | grid_size (`Tuple[int]`): 154 | The grid size of the spatial positional embedding (height, width). 155 | temporal_size (`int`): 156 | The size of the temporal dimension. 157 | theta (`float`): 158 | Scaling factor for frequency computation. 159 | grid_type (`str`): 160 | Whether to use "linspace" or "slice" to compute grids. 161 | 162 | Returns: 163 | `torch.Tensor`: positional embedding with shape `(temporal_size * grid_size[0] * grid_size[1], embed_dim/2)`. 164 | """
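    # A worked example of the axis split below (assuming embed_dim = 64, the attention_head_dim used
    # by the bundled transformer configs): dim_t = 64 // 4 = 16 and dim_h = dim_w = 64 // 8 * 3 = 24,
    # so the per-axis rotary frequencies concatenate back to dim_t + dim_h + dim_w = 64 channels for
    # every (t, h, w) position in the grid.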
165 | if use_real is not True: 166 | raise ValueError(" `use_real = False` is not currently supported for get_3d_rotary_pos_embed") 167 | 168 | if grid_type == "linspace": 169 | start, stop = crops_coords 170 | grid_size_h, grid_size_w = grid_size 171 | grid_h = np.linspace(start[0], stop[0], grid_size_h, endpoint=False, dtype=np.float32) 172 | grid_w = np.linspace(start[1], stop[1], grid_size_w, endpoint=False, dtype=np.float32) 173 | grid_t = np.arange(temporal_size, dtype=np.float32) 174 | grid_t = np.linspace(0, temporal_size, temporal_size, endpoint=False, dtype=np.float32) 175 | elif grid_type == "slice": 176 | max_h, max_w = max_size 177 | grid_size_h, grid_size_w = grid_size 178 | grid_h = np.arange(max_h, dtype=np.float32) 179 | grid_w = np.arange(max_w, dtype=np.float32) 180 | grid_t = np.arange(temporal_size, dtype=np.float32) 181 | else: 182 | raise ValueError("Invalid value passed for `grid_type`.") 183 | 184 | # Compute dimensions for each axis 185 | dim_t = embed_dim // 4 186 | dim_h = embed_dim // 8 * 3 187 | dim_w = embed_dim // 8 * 3 188 | 189 | # Temporal frequencies 190 | freqs_t = get_1d_rotary_pos_embed(dim_t, grid_t, use_real=True) 191 | # Spatial frequencies for height and width 192 | freqs_h = get_1d_rotary_pos_embed(dim_h, grid_h, use_real=True) 193 | freqs_w = get_1d_rotary_pos_embed(dim_w, grid_w, use_real=True) 194 | 195 | # Broadcast and concatenate the temporal and spatial frequencies (height and width) into a 3D tensor 196 | def combine_time_height_width(freqs_t, freqs_h, freqs_w): 197 | freqs_t = freqs_t[:, None, None, :].expand( 198 | -1, grid_size_h, grid_size_w, -1 199 | ) # temporal_size, grid_size_h, grid_size_w, dim_t 200 | freqs_h = freqs_h[None, :, None, :].expand( 201 | temporal_size, -1, grid_size_w, -1 202 | ) # temporal_size, grid_size_h, grid_size_w, dim_h 203 | freqs_w = freqs_w[None, None, :, :].expand( 204 | temporal_size, grid_size_h, -1, -1 205 | ) # temporal_size, grid_size_h, grid_size_w, dim_w 206 | 207 | freqs = torch.cat( 208 | [freqs_t, freqs_h, freqs_w], dim=-1 209 | ) # temporal_size, grid_size_h, grid_size_w, (dim_t + dim_h + dim_w) 210 | freqs = freqs.view( 211 | temporal_size * grid_size_h * grid_size_w, -1 212 | ) # (temporal_size * grid_size_h * grid_size_w), (dim_t + dim_h + dim_w) 213 | return freqs 214 | 215 | t_cos, t_sin = freqs_t # both t_cos and t_sin have shape: temporal_size, dim_t 216 | h_cos, h_sin = freqs_h # both h_cos and h_sin have shape: grid_size_h, dim_h 217 | w_cos, w_sin = freqs_w # both w_cos and w_sin have shape: grid_size_w, dim_w 218 | 219 | if grid_type == "slice": 220 | t_cos, t_sin = t_cos[:temporal_size], t_sin[:temporal_size] 221 | h_cos, h_sin = h_cos[:grid_size_h], h_sin[:grid_size_h] 222 | w_cos, w_sin = w_cos[:grid_size_w], w_sin[:grid_size_w] 223 | 224 | cos = combine_time_height_width(t_cos, h_cos, w_cos) 225 | sin = combine_time_height_width(t_sin, h_sin, w_sin) 226 | return cos, sin -------------------------------------------------------------------------------- /enhance_a_video/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kijai/ComfyUI-CogVideoXWrapper/dbc63f622dd095391335612d0c7d7bbff8745cc8/enhance_a_video/__init__.py -------------------------------------------------------------------------------- /enhance_a_video/enhance.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from einops import rearrange 3 | from diffusers.models.attention import 
Attention 4 | from .globals import get_enhance_weight, get_num_frames 5 | 6 | # def get_feta_scores(query, key): 7 | # img_q, img_k = query, key 8 | 9 | # num_frames = get_num_frames() 10 | 11 | # B, S, N, C = img_q.shape 12 | 13 | # # Calculate spatial dimension 14 | # spatial_dim = S // num_frames 15 | 16 | # # Add time dimension between spatial and head dims 17 | # query_image = img_q.reshape(B, spatial_dim, num_frames, N, C) 18 | # key_image = img_k.reshape(B, spatial_dim, num_frames, N, C) 19 | 20 | # # Expand time dimension 21 | # query_image = query_image.expand(-1, -1, num_frames, -1, -1) # [B, S, T, N, C] 22 | # key_image = key_image.expand(-1, -1, num_frames, -1, -1) # [B, S, T, N, C] 23 | 24 | # # Reshape to match feta_score input format: [(B S) N T C] 25 | # query_image = rearrange(query_image, "b s t n c -> (b s) n t c") #torch.Size([3200, 24, 5, 128]) 26 | # key_image = rearrange(key_image, "b s t n c -> (b s) n t c") 27 | 28 | # return feta_score(query_image, key_image, C, num_frames) 29 | 30 | def get_feta_scores( 31 | attn: Attention, 32 | query: torch.Tensor, 33 | key: torch.Tensor, 34 | head_dim: int, 35 | text_seq_length: int, 36 | ) -> torch.Tensor: 37 | num_frames = get_num_frames() 38 | spatial_dim = int((query.shape[2] - text_seq_length) / num_frames) 39 | 40 | query_image = rearrange( 41 | query[:, :, text_seq_length:], 42 | "B N (T S) C -> (B S) N T C", 43 | N=attn.heads, 44 | T=num_frames, 45 | S=spatial_dim, 46 | C=head_dim, 47 | ) 48 | key_image = rearrange( 49 | key[:, :, text_seq_length:], 50 | "B N (T S) C -> (B S) N T C", 51 | N=attn.heads, 52 | T=num_frames, 53 | S=spatial_dim, 54 | C=head_dim, 55 | ) 56 | return feta_score(query_image, key_image, head_dim, num_frames) 57 | 58 | def feta_score(query_image, key_image, head_dim, num_frames): 59 | scale = head_dim**-0.5 60 | query_image = query_image * scale 61 | attn_temp = query_image @ key_image.transpose(-2, -1) # translate attn to float32 62 | attn_temp = attn_temp.to(torch.float32) 63 | attn_temp = attn_temp.softmax(dim=-1) 64 | 65 | # Reshape to [batch_size * num_tokens, num_frames, num_frames] 66 | attn_temp = attn_temp.reshape(-1, num_frames, num_frames) 67 | 68 | # Create a mask for diagonal elements 69 | diag_mask = torch.eye(num_frames, device=attn_temp.device).bool() 70 | diag_mask = diag_mask.unsqueeze(0).expand(attn_temp.shape[0], -1, -1) 71 | 72 | # Zero out diagonal elements 73 | attn_wo_diag = attn_temp.masked_fill(diag_mask, 0) 74 | 75 | # Calculate mean for each token's attention matrix 76 | # Number of off-diagonal elements per matrix is n*n - n 77 | num_off_diag = num_frames * num_frames - num_frames 78 | mean_scores = attn_wo_diag.sum(dim=(1, 2)) / num_off_diag 79 | 80 | enhance_scores = mean_scores.mean() * (num_frames + get_enhance_weight()) 81 | enhance_scores = enhance_scores.clamp(min=1) 82 | return enhance_scores 83 | -------------------------------------------------------------------------------- /enhance_a_video/globals.py: -------------------------------------------------------------------------------- 1 | NUM_FRAMES = None 2 | FETA_WEIGHT = None 3 | ENABLE_FETA = False 4 | 5 | def set_num_frames(num_frames: int): 6 | global NUM_FRAMES 7 | NUM_FRAMES = num_frames 8 | 9 | 10 | def get_num_frames() -> int: 11 | return NUM_FRAMES 12 | 13 | 14 | def enable_enhance(): 15 | global ENABLE_FETA 16 | ENABLE_FETA = True 17 | 18 | def disable_enhance(): 19 | global ENABLE_FETA 20 | ENABLE_FETA = False 21 | 22 | def is_enhance_enabled() -> bool: 23 | return ENABLE_FETA 24 | 25 | def 
set_enhance_weight(feta_weight: float): 26 | global FETA_WEIGHT 27 | FETA_WEIGHT = feta_weight 28 | 29 | 30 | def get_enhance_weight() -> float: 31 | return FETA_WEIGHT 32 | -------------------------------------------------------------------------------- /example_workflows/cogvideox_1_0_2b_controlnet_02.json: -------------------------------------------------------------------------------- 1 | { 2 | "last_node_id": 48, 3 | "last_link_id": 90, 4 | "nodes": [ 5 | { 6 | "id": 41, 7 | "type": "HEDPreprocessor", 8 | "pos": { 9 | "0": -570, 10 | "1": -76 11 | }, 12 | "size": { 13 | "0": 315, 14 | "1": 82 15 | }, 16 | "flags": {}, 17 | "order": 4, 18 | "mode": 0, 19 | "inputs": [ 20 | { 21 | "name": "image", 22 | "type": "IMAGE", 23 | "link": 73 24 | } 25 | ], 26 | "outputs": [ 27 | { 28 | "name": "IMAGE", 29 | "type": "IMAGE", 30 | "links": [ 31 | 74 32 | ], 33 | "slot_index": 0 34 | } 35 | ], 36 | "properties": { 37 | "Node name for S&R": "HEDPreprocessor" 38 | }, 39 | "widgets_values": [ 40 | "enable", 41 | 768 42 | ] 43 | }, 44 | { 45 | "id": 38, 46 | "type": "VHS_LoadVideo", 47 | "pos": { 48 | "0": -847, 49 | "1": -78 50 | }, 51 | "size": [ 52 | 247.455078125, 53 | 427.63671875 54 | ], 55 | "flags": {}, 56 | "order": 0, 57 | "mode": 0, 58 | "inputs": [ 59 | { 60 | "name": "meta_batch", 61 | "type": "VHS_BatchManager", 62 | "link": null, 63 | "shape": 7 64 | }, 65 | { 66 | "name": "vae", 67 | "type": "VAE", 68 | "link": null, 69 | "shape": 7 70 | } 71 | ], 72 | "outputs": [ 73 | { 74 | "name": "IMAGE", 75 | "type": "IMAGE", 76 | "links": [ 77 | 73 78 | ], 79 | "slot_index": 0 80 | }, 81 | { 82 | "name": "frame_count", 83 | "type": "INT", 84 | "links": null 85 | }, 86 | { 87 | "name": "audio", 88 | "type": "AUDIO", 89 | "links": null 90 | }, 91 | { 92 | "name": "video_info", 93 | "type": "VHS_VIDEOINFO", 94 | "links": null 95 | } 96 | ], 97 | "properties": { 98 | "Node name for S&R": "VHS_LoadVideo" 99 | }, 100 | "widgets_values": { 101 | "video": "car.mp4", 102 | "force_rate": 0, 103 | "force_size": "Disabled", 104 | "custom_width": 512, 105 | "custom_height": 512, 106 | "frame_load_cap": 49, 107 | "skip_first_frames": 0, 108 | "select_every_nth": 1, 109 | "choose video to upload": "image", 110 | "videopreview": { 111 | "hidden": false, 112 | "paused": false, 113 | "params": { 114 | "frame_load_cap": 49, 115 | "skip_first_frames": 0, 116 | "force_rate": 0, 117 | "filename": "car.mp4", 118 | "type": "input", 119 | "format": "video/mp4", 120 | "select_every_nth": 1 121 | }, 122 | "muted": false 123 | } 124 | } 125 | }, 126 | { 127 | "id": 39, 128 | "type": "ImageResizeKJ", 129 | "pos": { 130 | "0": -563, 131 | "1": 63 132 | }, 133 | "size": { 134 | "0": 315, 135 | "1": 266 136 | }, 137 | "flags": {}, 138 | "order": 6, 139 | "mode": 0, 140 | "inputs": [ 141 | { 142 | "name": "image", 143 | "type": "IMAGE", 144 | "link": 74 145 | }, 146 | { 147 | "name": "get_image_size", 148 | "type": "IMAGE", 149 | "link": null, 150 | "shape": 7 151 | }, 152 | { 153 | "name": "width_input", 154 | "type": "INT", 155 | "link": null, 156 | "widget": { 157 | "name": "width_input" 158 | }, 159 | "shape": 7 160 | }, 161 | { 162 | "name": "height_input", 163 | "type": "INT", 164 | "link": null, 165 | "widget": { 166 | "name": "height_input" 167 | }, 168 | "shape": 7 169 | } 170 | ], 171 | "outputs": [ 172 | { 173 | "name": "IMAGE", 174 | "type": "IMAGE", 175 | "links": [ 176 | 71 177 | ], 178 | "slot_index": 0 179 | }, 180 | { 181 | "name": "width", 182 | "type": "INT", 183 | "links": null 184 | }, 185 | { 186 | 
"name": "height", 187 | "type": "INT", 188 | "links": null 189 | } 190 | ], 191 | "properties": { 192 | "Node name for S&R": "ImageResizeKJ" 193 | }, 194 | "widgets_values": [ 195 | 720, 196 | 480, 197 | "lanczos", 198 | false, 199 | 2, 200 | 0, 201 | 0, 202 | "disabled" 203 | ] 204 | }, 205 | { 206 | "id": 30, 207 | "type": "CogVideoTextEncode", 208 | "pos": { 209 | "0": 130, 210 | "1": 350 211 | }, 212 | "size": { 213 | "0": 475.7875061035156, 214 | "1": 231.29896545410156 215 | }, 216 | "flags": {}, 217 | "order": 5, 218 | "mode": 0, 219 | "inputs": [ 220 | { 221 | "name": "clip", 222 | "type": "CLIP", 223 | "link": 54 224 | } 225 | ], 226 | "outputs": [ 227 | { 228 | "name": "conditioning", 229 | "type": "CONDITIONING", 230 | "links": [ 231 | 84 232 | ], 233 | "slot_index": 0, 234 | "shape": 3 235 | }, 236 | { 237 | "name": "clip", 238 | "type": "CLIP", 239 | "links": [ 240 | 78 241 | ], 242 | "slot_index": 1 243 | } 244 | ], 245 | "properties": { 246 | "Node name for S&R": "CogVideoTextEncode" 247 | }, 248 | "widgets_values": [ 249 | "car is moving among mountains", 250 | 1, 251 | false 252 | ] 253 | }, 254 | { 255 | "id": 31, 256 | "type": "CogVideoTextEncode", 257 | "pos": { 258 | "0": 139, 259 | "1": 643 260 | }, 261 | "size": { 262 | "0": 463.01251220703125, 263 | "1": 144 264 | }, 265 | "flags": {}, 266 | "order": 7, 267 | "mode": 0, 268 | "inputs": [ 269 | { 270 | "name": "clip", 271 | "type": "CLIP", 272 | "link": 78 273 | } 274 | ], 275 | "outputs": [ 276 | { 277 | "name": "conditioning", 278 | "type": "CONDITIONING", 279 | "links": [ 280 | 85 281 | ], 282 | "slot_index": 0, 283 | "shape": 3 284 | }, 285 | { 286 | "name": "clip", 287 | "type": "CLIP", 288 | "links": null 289 | } 290 | ], 291 | "properties": { 292 | "Node name for S&R": "CogVideoTextEncode" 293 | }, 294 | "widgets_values": [ 295 | "", 296 | 1, 297 | true 298 | ] 299 | }, 300 | { 301 | "id": 44, 302 | "type": "DownloadAndLoadCogVideoModel", 303 | "pos": { 304 | "0": 326, 305 | "1": -319 306 | }, 307 | "size": { 308 | "0": 315, 309 | "1": 218 310 | }, 311 | "flags": {}, 312 | "order": 1, 313 | "mode": 0, 314 | "inputs": [ 315 | { 316 | "name": "block_edit", 317 | "type": "TRANSFORMERBLOCKS", 318 | "link": null, 319 | "shape": 7 320 | }, 321 | { 322 | "name": "lora", 323 | "type": "COGLORA", 324 | "link": null, 325 | "shape": 7 326 | }, 327 | { 328 | "name": "compile_args", 329 | "type": "COMPILEARGS", 330 | "link": null, 331 | "shape": 7 332 | } 333 | ], 334 | "outputs": [ 335 | { 336 | "name": "model", 337 | "type": "COGVIDEOMODEL", 338 | "links": [ 339 | 83 340 | ] 341 | }, 342 | { 343 | "name": "vae", 344 | "type": "VAE", 345 | "links": [ 346 | 82 347 | ], 348 | "slot_index": 1 349 | } 350 | ], 351 | "properties": { 352 | "Node name for S&R": "DownloadAndLoadCogVideoModel" 353 | }, 354 | "widgets_values": [ 355 | "THUDM/CogVideoX-2b", 356 | "bf16", 357 | "disabled", 358 | false, 359 | "sdpa", 360 | "main_device" 361 | ] 362 | }, 363 | { 364 | "id": 20, 365 | "type": "CLIPLoader", 366 | "pos": { 367 | "0": -175, 368 | "1": -317 369 | }, 370 | "size": { 371 | "0": 452.912353515625, 372 | "1": 82 373 | }, 374 | "flags": {}, 375 | "order": 2, 376 | "mode": 0, 377 | "inputs": [], 378 | "outputs": [ 379 | { 380 | "name": "CLIP", 381 | "type": "CLIP", 382 | "links": [ 383 | 54 384 | ], 385 | "slot_index": 0, 386 | "shape": 3 387 | } 388 | ], 389 | "properties": { 390 | "Node name for S&R": "CLIPLoader" 391 | }, 392 | "widgets_values": [ 393 | "t5\\google_t5-v1_1-xxl_encoderonly-fp8_e4m3fn.safetensors", 394 | "sd3" 
395 | ] 396 | }, 397 | { 398 | "id": 35, 399 | "type": "DownloadAndLoadCogVideoControlNet", 400 | "pos": { 401 | "0": -105, 402 | "1": -182 403 | }, 404 | "size": { 405 | "0": 378, 406 | "1": 58 407 | }, 408 | "flags": {}, 409 | "order": 3, 410 | "mode": 0, 411 | "inputs": [], 412 | "outputs": [ 413 | { 414 | "name": "cogvideo_controlnet", 415 | "type": "COGVIDECONTROLNETMODEL", 416 | "links": [ 417 | 67 418 | ] 419 | } 420 | ], 421 | "properties": { 422 | "Node name for S&R": "DownloadAndLoadCogVideoControlNet" 423 | }, 424 | "widgets_values": [ 425 | "TheDenk/cogvideox-2b-controlnet-hed-v1" 426 | ] 427 | }, 428 | { 429 | "id": 37, 430 | "type": "CogVideoControlNet", 431 | "pos": { 432 | "0": 220, 433 | "1": 155 434 | }, 435 | "size": { 436 | "0": 367.79998779296875, 437 | "1": 126 438 | }, 439 | "flags": {}, 440 | "order": 9, 441 | "mode": 0, 442 | "inputs": [ 443 | { 444 | "name": "controlnet", 445 | "type": "COGVIDECONTROLNETMODEL", 446 | "link": 67 447 | }, 448 | { 449 | "name": "images", 450 | "type": "IMAGE", 451 | "link": 72 452 | } 453 | ], 454 | "outputs": [ 455 | { 456 | "name": "cogvideo_controlnet", 457 | "type": "COGVIDECONTROLNET", 458 | "links": [ 459 | 86 460 | ], 461 | "slot_index": 0 462 | } 463 | ], 464 | "properties": { 465 | "Node name for S&R": "CogVideoControlNet" 466 | }, 467 | "widgets_values": [ 468 | 1, 469 | 0, 470 | 1 471 | ] 472 | }, 473 | { 474 | "id": 40, 475 | "type": "GetImageSizeAndCount", 476 | "pos": { 477 | "0": -123, 478 | "1": -34 479 | }, 480 | "size": { 481 | "0": 277.20001220703125, 482 | "1": 86 483 | }, 484 | "flags": {}, 485 | "order": 8, 486 | "mode": 0, 487 | "inputs": [ 488 | { 489 | "name": "image", 490 | "type": "IMAGE", 491 | "link": 71 492 | } 493 | ], 494 | "outputs": [ 495 | { 496 | "name": "image", 497 | "type": "IMAGE", 498 | "links": [ 499 | 72, 500 | 75 501 | ], 502 | "slot_index": 0 503 | }, 504 | { 505 | "name": "720 width", 506 | "type": "INT", 507 | "links": [ 508 | 89 509 | ] 510 | }, 511 | { 512 | "name": "480 height", 513 | "type": "INT", 514 | "links": [ 515 | 90 516 | ], 517 | "slot_index": 2 518 | }, 519 | { 520 | "name": "49 count", 521 | "type": "INT", 522 | "links": null 523 | } 524 | ], 525 | "properties": { 526 | "Node name for S&R": "GetImageSizeAndCount" 527 | }, 528 | "widgets_values": [] 529 | }, 530 | { 531 | "id": 47, 532 | "type": "EmptyLatentImage", 533 | "pos": { 534 | "0": 409, 535 | "1": 77 536 | }, 537 | "size": { 538 | "0": 315, 539 | "1": 106 540 | }, 541 | "flags": { 542 | "collapsed": true 543 | }, 544 | "order": 10, 545 | "mode": 0, 546 | "inputs": [ 547 | { 548 | "name": "width", 549 | "type": "INT", 550 | "link": 89, 551 | "widget": { 552 | "name": "width" 553 | } 554 | }, 555 | { 556 | "name": "height", 557 | "type": "INT", 558 | "link": 90, 559 | "widget": { 560 | "name": "height" 561 | } 562 | } 563 | ], 564 | "outputs": [ 565 | { 566 | "name": "LATENT", 567 | "type": "LATENT", 568 | "links": [ 569 | 88 570 | ] 571 | } 572 | ], 573 | "properties": { 574 | "Node name for S&R": "EmptyLatentImage" 575 | }, 576 | "widgets_values": [ 577 | 720, 578 | 480, 579 | 1 580 | ] 581 | }, 582 | { 583 | "id": 46, 584 | "type": "CogVideoSampler", 585 | "pos": { 586 | "0": 743, 587 | "1": 49 588 | }, 589 | "size": { 590 | "0": 330, 591 | "1": 574 592 | }, 593 | "flags": {}, 594 | "order": 11, 595 | "mode": 0, 596 | "inputs": [ 597 | { 598 | "name": "model", 599 | "type": "COGVIDEOMODEL", 600 | "link": 83 601 | }, 602 | { 603 | "name": "positive", 604 | "type": "CONDITIONING", 605 | "link": 84 606 | }, 607 | { 
608 | "name": "negative", 609 | "type": "CONDITIONING", 610 | "link": 85 611 | }, 612 | { 613 | "name": "samples", 614 | "type": "LATENT", 615 | "link": 88, 616 | "shape": 7 617 | }, 618 | { 619 | "name": "image_cond_latents", 620 | "type": "LATENT", 621 | "link": null, 622 | "shape": 7 623 | }, 624 | { 625 | "name": "context_options", 626 | "type": "COGCONTEXT", 627 | "link": null, 628 | "shape": 7 629 | }, 630 | { 631 | "name": "controlnet", 632 | "type": "COGVIDECONTROLNET", 633 | "link": 86, 634 | "shape": 7 635 | }, 636 | { 637 | "name": "tora_trajectory", 638 | "type": "TORAFEATURES", 639 | "link": null, 640 | "shape": 7 641 | }, 642 | { 643 | "name": "fastercache", 644 | "type": "FASTERCACHEARGS", 645 | "link": null, 646 | "shape": 7 647 | } 648 | ], 649 | "outputs": [ 650 | { 651 | "name": "samples", 652 | "type": "LATENT", 653 | "links": [ 654 | 87 655 | ] 656 | } 657 | ], 658 | "properties": { 659 | "Node name for S&R": "CogVideoSampler" 660 | }, 661 | "widgets_values": [ 662 | 49, 663 | 40, 664 | 6, 665 | 0, 666 | "fixed", 667 | "CogVideoXDDIM", 668 | 1 669 | ] 670 | }, 671 | { 672 | "id": 45, 673 | "type": "CogVideoDecode", 674 | "pos": { 675 | "0": 758, 676 | "1": 685 677 | }, 678 | "size": { 679 | "0": 315, 680 | "1": 198 681 | }, 682 | "flags": {}, 683 | "order": 12, 684 | "mode": 0, 685 | "inputs": [ 686 | { 687 | "name": "vae", 688 | "type": "VAE", 689 | "link": 82 690 | }, 691 | { 692 | "name": "samples", 693 | "type": "LATENT", 694 | "link": 87 695 | } 696 | ], 697 | "outputs": [ 698 | { 699 | "name": "images", 700 | "type": "IMAGE", 701 | "links": [ 702 | 81 703 | ] 704 | } 705 | ], 706 | "properties": { 707 | "Node name for S&R": "CogVideoDecode" 708 | }, 709 | "widgets_values": [ 710 | true, 711 | 240, 712 | 360, 713 | 0.2, 714 | 0.2, 715 | true 716 | ] 717 | }, 718 | { 719 | "id": 42, 720 | "type": "ImageConcatMulti", 721 | "pos": { 722 | "0": 1145, 723 | "1": -24 724 | }, 725 | "size": { 726 | "0": 210, 727 | "1": 150 728 | }, 729 | "flags": {}, 730 | "order": 13, 731 | "mode": 0, 732 | "inputs": [ 733 | { 734 | "name": "image_1", 735 | "type": "IMAGE", 736 | "link": 75 737 | }, 738 | { 739 | "name": "image_2", 740 | "type": "IMAGE", 741 | "link": 81 742 | } 743 | ], 744 | "outputs": [ 745 | { 746 | "name": "images", 747 | "type": "IMAGE", 748 | "links": [ 749 | 77 750 | ], 751 | "slot_index": 0 752 | } 753 | ], 754 | "properties": {}, 755 | "widgets_values": [ 756 | 2, 757 | "right", 758 | false, 759 | null 760 | ] 761 | }, 762 | { 763 | "id": 43, 764 | "type": "VHS_VideoCombine", 765 | "pos": { 766 | "0": 1154, 767 | "1": 202 768 | }, 769 | "size": [ 770 | 778.7022705078125, 771 | 576.9007568359375 772 | ], 773 | "flags": {}, 774 | "order": 14, 775 | "mode": 0, 776 | "inputs": [ 777 | { 778 | "name": "images", 779 | "type": "IMAGE", 780 | "link": 77 781 | }, 782 | { 783 | "name": "audio", 784 | "type": "AUDIO", 785 | "link": null, 786 | "shape": 7 787 | }, 788 | { 789 | "name": "meta_batch", 790 | "type": "VHS_BatchManager", 791 | "link": null, 792 | "shape": 7 793 | }, 794 | { 795 | "name": "vae", 796 | "type": "VAE", 797 | "link": null, 798 | "shape": 7 799 | } 800 | ], 801 | "outputs": [ 802 | { 803 | "name": "Filenames", 804 | "type": "VHS_FILENAMES", 805 | "links": null, 806 | "shape": 3 807 | } 808 | ], 809 | "properties": { 810 | "Node name for S&R": "VHS_VideoCombine" 811 | }, 812 | "widgets_values": { 813 | "frame_rate": 8, 814 | "loop_count": 0, 815 | "filename_prefix": "CogVideoX_2b_controlnet", 816 | "format": "video/h264-mp4", 817 | "pix_fmt": 
"yuv420p", 818 | "crf": 19, 819 | "save_metadata": true, 820 | "pingpong": false, 821 | "save_output": true, 822 | "videopreview": { 823 | "hidden": false, 824 | "paused": false, 825 | "params": { 826 | "filename": "CogVideoX2B_controlnet_00003.mp4", 827 | "subfolder": "", 828 | "type": "temp", 829 | "format": "video/h264-mp4", 830 | "frame_rate": 8 831 | }, 832 | "muted": false 833 | } 834 | } 835 | } 836 | ], 837 | "links": [ 838 | [ 839 | 54, 840 | 20, 841 | 0, 842 | 30, 843 | 0, 844 | "CLIP" 845 | ], 846 | [ 847 | 67, 848 | 35, 849 | 0, 850 | 37, 851 | 0, 852 | "COGVIDECONTROLNETMODEL" 853 | ], 854 | [ 855 | 71, 856 | 39, 857 | 0, 858 | 40, 859 | 0, 860 | "IMAGE" 861 | ], 862 | [ 863 | 72, 864 | 40, 865 | 0, 866 | 37, 867 | 1, 868 | "IMAGE" 869 | ], 870 | [ 871 | 73, 872 | 38, 873 | 0, 874 | 41, 875 | 0, 876 | "IMAGE" 877 | ], 878 | [ 879 | 74, 880 | 41, 881 | 0, 882 | 39, 883 | 0, 884 | "IMAGE" 885 | ], 886 | [ 887 | 75, 888 | 40, 889 | 0, 890 | 42, 891 | 0, 892 | "IMAGE" 893 | ], 894 | [ 895 | 77, 896 | 42, 897 | 0, 898 | 43, 899 | 0, 900 | "IMAGE" 901 | ], 902 | [ 903 | 78, 904 | 30, 905 | 1, 906 | 31, 907 | 0, 908 | "CLIP" 909 | ], 910 | [ 911 | 81, 912 | 45, 913 | 0, 914 | 42, 915 | 1, 916 | "IMAGE" 917 | ], 918 | [ 919 | 82, 920 | 44, 921 | 1, 922 | 45, 923 | 0, 924 | "VAE" 925 | ], 926 | [ 927 | 83, 928 | 44, 929 | 0, 930 | 46, 931 | 0, 932 | "COGVIDEOMODEL" 933 | ], 934 | [ 935 | 84, 936 | 30, 937 | 0, 938 | 46, 939 | 1, 940 | "CONDITIONING" 941 | ], 942 | [ 943 | 85, 944 | 31, 945 | 0, 946 | 46, 947 | 2, 948 | "CONDITIONING" 949 | ], 950 | [ 951 | 86, 952 | 37, 953 | 0, 954 | 46, 955 | 6, 956 | "COGVIDECONTROLNET" 957 | ], 958 | [ 959 | 87, 960 | 46, 961 | 0, 962 | 45, 963 | 1, 964 | "LATENT" 965 | ], 966 | [ 967 | 88, 968 | 47, 969 | 0, 970 | 46, 971 | 3, 972 | "LATENT" 973 | ], 974 | [ 975 | 89, 976 | 40, 977 | 1, 978 | 47, 979 | 0, 980 | "INT" 981 | ], 982 | [ 983 | 90, 984 | 40, 985 | 2, 986 | 47, 987 | 1, 988 | "INT" 989 | ] 990 | ], 991 | "groups": [], 992 | "config": {}, 993 | "extra": { 994 | "ds": { 995 | "scale": 0.7627768444387069, 996 | "offset": [ 997 | 1075.4957551311677, 998 | 398.4420252790512 999 | ] 1000 | } 1001 | }, 1002 | "version": 0.4 1003 | } -------------------------------------------------------------------------------- /example_workflows/cogvideox_1_0_5b_I2V_02.json: -------------------------------------------------------------------------------- 1 | { 2 | "last_node_id": 63, 3 | "last_link_id": 149, 4 | "nodes": [ 5 | { 6 | "id": 31, 7 | "type": "CogVideoTextEncode", 8 | "pos": { 9 | "0": 497, 10 | "1": 520 11 | }, 12 | "size": { 13 | "0": 463.01251220703125, 14 | "1": 144 15 | }, 16 | "flags": {}, 17 | "order": 6, 18 | "mode": 0, 19 | "inputs": [ 20 | { 21 | "name": "clip", 22 | "type": "CLIP", 23 | "link": 149 24 | } 25 | ], 26 | "outputs": [ 27 | { 28 | "name": "conditioning", 29 | "type": "CONDITIONING", 30 | "links": [ 31 | 146 32 | ], 33 | "slot_index": 0, 34 | "shape": 3 35 | }, 36 | { 37 | "name": "clip", 38 | "type": "CLIP", 39 | "links": null 40 | } 41 | ], 42 | "properties": { 43 | "Node name for S&R": "CogVideoTextEncode" 44 | }, 45 | "widgets_values": [ 46 | "The video is not of a high quality, it has a low resolution. Watermark present in each frame. Strange motion trajectory. 
", 47 | 1, 48 | true 49 | ] 50 | }, 51 | { 52 | "id": 63, 53 | "type": "CogVideoSampler", 54 | "pos": { 55 | "0": 1142, 56 | "1": 74 57 | }, 58 | "size": [ 59 | 330, 60 | 574 61 | ], 62 | "flags": {}, 63 | "order": 7, 64 | "mode": 0, 65 | "inputs": [ 66 | { 67 | "name": "model", 68 | "type": "COGVIDEOMODEL", 69 | "link": 144 70 | }, 71 | { 72 | "name": "positive", 73 | "type": "CONDITIONING", 74 | "link": 145 75 | }, 76 | { 77 | "name": "negative", 78 | "type": "CONDITIONING", 79 | "link": 146 80 | }, 81 | { 82 | "name": "samples", 83 | "type": "LATENT", 84 | "link": null, 85 | "shape": 7 86 | }, 87 | { 88 | "name": "image_cond_latents", 89 | "type": "LATENT", 90 | "link": 147, 91 | "shape": 7 92 | }, 93 | { 94 | "name": "context_options", 95 | "type": "COGCONTEXT", 96 | "link": null, 97 | "shape": 7 98 | }, 99 | { 100 | "name": "controlnet", 101 | "type": "COGVIDECONTROLNET", 102 | "link": null, 103 | "shape": 7 104 | }, 105 | { 106 | "name": "tora_trajectory", 107 | "type": "TORAFEATURES", 108 | "link": null, 109 | "shape": 7 110 | }, 111 | { 112 | "name": "fastercache", 113 | "type": "FASTERCACHEARGS", 114 | "link": null, 115 | "shape": 7 116 | } 117 | ], 118 | "outputs": [ 119 | { 120 | "name": "samples", 121 | "type": "LATENT", 122 | "links": [ 123 | 148 124 | ] 125 | } 126 | ], 127 | "properties": { 128 | "Node name for S&R": "CogVideoSampler" 129 | }, 130 | "widgets_values": [ 131 | 49, 132 | 25, 133 | 6, 134 | 0, 135 | "fixed", 136 | "CogVideoXDDIM", 137 | 1 138 | ] 139 | }, 140 | { 141 | "id": 62, 142 | "type": "CogVideoImageEncode", 143 | "pos": { 144 | "0": 1149, 145 | "1": 711 146 | }, 147 | "size": { 148 | "0": 315, 149 | "1": 122 150 | }, 151 | "flags": {}, 152 | "order": 5, 153 | "mode": 0, 154 | "inputs": [ 155 | { 156 | "name": "vae", 157 | "type": "VAE", 158 | "link": 141 159 | }, 160 | { 161 | "name": "start_image", 162 | "type": "IMAGE", 163 | "link": 142 164 | }, 165 | { 166 | "name": "end_image", 167 | "type": "IMAGE", 168 | "link": null, 169 | "shape": 7 170 | } 171 | ], 172 | "outputs": [ 173 | { 174 | "name": "samples", 175 | "type": "LATENT", 176 | "links": [ 177 | 147 178 | ] 179 | } 180 | ], 181 | "properties": { 182 | "Node name for S&R": "CogVideoImageEncode" 183 | }, 184 | "widgets_values": [ 185 | false, 186 | 0 187 | ] 188 | }, 189 | { 190 | "id": 59, 191 | "type": "DownloadAndLoadCogVideoModel", 192 | "pos": { 193 | "0": 622, 194 | "1": -25 195 | }, 196 | "size": { 197 | "0": 315, 198 | "1": 218 199 | }, 200 | "flags": {}, 201 | "order": 0, 202 | "mode": 0, 203 | "inputs": [ 204 | { 205 | "name": "block_edit", 206 | "type": "TRANSFORMERBLOCKS", 207 | "link": null, 208 | "shape": 7 209 | }, 210 | { 211 | "name": "lora", 212 | "type": "COGLORA", 213 | "link": null, 214 | "shape": 7 215 | }, 216 | { 217 | "name": "compile_args", 218 | "type": "COMPILEARGS", 219 | "link": null, 220 | "shape": 7 221 | } 222 | ], 223 | "outputs": [ 224 | { 225 | "name": "model", 226 | "type": "COGVIDEOMODEL", 227 | "links": [ 228 | 144 229 | ] 230 | }, 231 | { 232 | "name": "vae", 233 | "type": "VAE", 234 | "links": [ 235 | 132, 236 | 141 237 | ], 238 | "slot_index": 1 239 | } 240 | ], 241 | "properties": { 242 | "Node name for S&R": "DownloadAndLoadCogVideoModel" 243 | }, 244 | "widgets_values": [ 245 | "THUDM/CogVideoX-5b-I2V", 246 | "bf16", 247 | "disabled", 248 | false, 249 | "sdpa", 250 | "main_device" 251 | ] 252 | }, 253 | { 254 | "id": 30, 255 | "type": "CogVideoTextEncode", 256 | "pos": { 257 | "0": 493, 258 | "1": 303 259 | }, 260 | "size": { 261 | "0": 
471.90142822265625, 262 | "1": 168.08047485351562 263 | }, 264 | "flags": {}, 265 | "order": 4, 266 | "mode": 0, 267 | "inputs": [ 268 | { 269 | "name": "clip", 270 | "type": "CLIP", 271 | "link": 54 272 | } 273 | ], 274 | "outputs": [ 275 | { 276 | "name": "conditioning", 277 | "type": "CONDITIONING", 278 | "links": [ 279 | 145 280 | ], 281 | "slot_index": 0, 282 | "shape": 3 283 | }, 284 | { 285 | "name": "clip", 286 | "type": "CLIP", 287 | "links": [ 288 | 149 289 | ], 290 | "slot_index": 1 291 | } 292 | ], 293 | "properties": { 294 | "Node name for S&R": "CogVideoTextEncode" 295 | }, 296 | "widgets_values": [ 297 | "a majestic stag is grazing in an enhanced forest, basking in the setting sun filtered by the trees", 298 | 1, 299 | false 300 | ] 301 | }, 302 | { 303 | "id": 37, 304 | "type": "ImageResizeKJ", 305 | "pos": { 306 | "0": 784, 307 | "1": 731 308 | }, 309 | "size": { 310 | "0": 315, 311 | "1": 266 312 | }, 313 | "flags": {}, 314 | "order": 3, 315 | "mode": 0, 316 | "inputs": [ 317 | { 318 | "name": "image", 319 | "type": "IMAGE", 320 | "link": 71 321 | }, 322 | { 323 | "name": "get_image_size", 324 | "type": "IMAGE", 325 | "link": null, 326 | "shape": 7 327 | }, 328 | { 329 | "name": "width_input", 330 | "type": "INT", 331 | "link": null, 332 | "widget": { 333 | "name": "width_input" 334 | } 335 | }, 336 | { 337 | "name": "height_input", 338 | "type": "INT", 339 | "link": null, 340 | "widget": { 341 | "name": "height_input" 342 | } 343 | } 344 | ], 345 | "outputs": [ 346 | { 347 | "name": "IMAGE", 348 | "type": "IMAGE", 349 | "links": [ 350 | 142 351 | ], 352 | "slot_index": 0, 353 | "shape": 3 354 | }, 355 | { 356 | "name": "width", 357 | "type": "INT", 358 | "links": null, 359 | "shape": 3 360 | }, 361 | { 362 | "name": "height", 363 | "type": "INT", 364 | "links": null, 365 | "shape": 3 366 | } 367 | ], 368 | "properties": { 369 | "Node name for S&R": "ImageResizeKJ" 370 | }, 371 | "widgets_values": [ 372 | 720, 373 | 480, 374 | "lanczos", 375 | false, 376 | 16, 377 | 0, 378 | 0, 379 | "disabled" 380 | ] 381 | }, 382 | { 383 | "id": 36, 384 | "type": "LoadImage", 385 | "pos": { 386 | "0": 335, 387 | "1": 731 388 | }, 389 | "size": { 390 | "0": 402.06353759765625, 391 | "1": 396.6225891113281 392 | }, 393 | "flags": {}, 394 | "order": 1, 395 | "mode": 0, 396 | "inputs": [], 397 | "outputs": [ 398 | { 399 | "name": "IMAGE", 400 | "type": "IMAGE", 401 | "links": [ 402 | 71 403 | ], 404 | "slot_index": 0, 405 | "shape": 3 406 | }, 407 | { 408 | "name": "MASK", 409 | "type": "MASK", 410 | "links": null, 411 | "shape": 3 412 | } 413 | ], 414 | "properties": { 415 | "Node name for S&R": "LoadImage" 416 | }, 417 | "widgets_values": [ 418 | "sd3stag.png", 419 | "image" 420 | ] 421 | }, 422 | { 423 | "id": 20, 424 | "type": "CLIPLoader", 425 | "pos": { 426 | "0": -2, 427 | "1": 304 428 | }, 429 | "size": { 430 | "0": 451.30548095703125, 431 | "1": 82 432 | }, 433 | "flags": {}, 434 | "order": 2, 435 | "mode": 0, 436 | "inputs": [], 437 | "outputs": [ 438 | { 439 | "name": "CLIP", 440 | "type": "CLIP", 441 | "links": [ 442 | 54 443 | ], 444 | "slot_index": 0, 445 | "shape": 3 446 | } 447 | ], 448 | "properties": { 449 | "Node name for S&R": "CLIPLoader" 450 | }, 451 | "widgets_values": [ 452 | "t5\\google_t5-v1_1-xxl_encoderonly-fp8_e4m3fn.safetensors", 453 | "sd3" 454 | ] 455 | }, 456 | { 457 | "id": 60, 458 | "type": "CogVideoDecode", 459 | "pos": { 460 | "0": 1523, 461 | "1": -6 462 | }, 463 | "size": { 464 | "0": 315, 465 | "1": 198 466 | }, 467 | "flags": {}, 468 | "order": 8, 
469 | "mode": 0, 470 | "inputs": [ 471 | { 472 | "name": "vae", 473 | "type": "VAE", 474 | "link": 132 475 | }, 476 | { 477 | "name": "samples", 478 | "type": "LATENT", 479 | "link": 148 480 | } 481 | ], 482 | "outputs": [ 483 | { 484 | "name": "images", 485 | "type": "IMAGE", 486 | "links": [ 487 | 134 488 | ] 489 | } 490 | ], 491 | "properties": { 492 | "Node name for S&R": "CogVideoDecode" 493 | }, 494 | "widgets_values": [ 495 | true, 496 | 240, 497 | 360, 498 | 0.2, 499 | 0.2, 500 | true 501 | ] 502 | }, 503 | { 504 | "id": 44, 505 | "type": "VHS_VideoCombine", 506 | "pos": { 507 | "0": 1884, 508 | "1": -6 509 | }, 510 | "size": [ 511 | 605.3909912109375, 512 | 714.2606608072917 513 | ], 514 | "flags": {}, 515 | "order": 9, 516 | "mode": 0, 517 | "inputs": [ 518 | { 519 | "name": "images", 520 | "type": "IMAGE", 521 | "link": 134 522 | }, 523 | { 524 | "name": "audio", 525 | "type": "AUDIO", 526 | "link": null, 527 | "shape": 7 528 | }, 529 | { 530 | "name": "meta_batch", 531 | "type": "VHS_BatchManager", 532 | "link": null, 533 | "shape": 7 534 | }, 535 | { 536 | "name": "vae", 537 | "type": "VAE", 538 | "link": null, 539 | "shape": 7 540 | } 541 | ], 542 | "outputs": [ 543 | { 544 | "name": "Filenames", 545 | "type": "VHS_FILENAMES", 546 | "links": null, 547 | "shape": 3 548 | } 549 | ], 550 | "properties": { 551 | "Node name for S&R": "VHS_VideoCombine" 552 | }, 553 | "widgets_values": { 554 | "frame_rate": 8, 555 | "loop_count": 0, 556 | "filename_prefix": "CogVideoX-I2V", 557 | "format": "video/h264-mp4", 558 | "pix_fmt": "yuv420p", 559 | "crf": 19, 560 | "save_metadata": true, 561 | "pingpong": false, 562 | "save_output": true, 563 | "videopreview": { 564 | "hidden": false, 565 | "paused": false, 566 | "params": { 567 | "filename": "CogVideoX-I2V_00001.mp4", 568 | "subfolder": "", 569 | "type": "temp", 570 | "format": "video/h264-mp4", 571 | "frame_rate": 8 572 | }, 573 | "muted": false 574 | } 575 | } 576 | } 577 | ], 578 | "links": [ 579 | [ 580 | 54, 581 | 20, 582 | 0, 583 | 30, 584 | 0, 585 | "CLIP" 586 | ], 587 | [ 588 | 71, 589 | 36, 590 | 0, 591 | 37, 592 | 0, 593 | "IMAGE" 594 | ], 595 | [ 596 | 132, 597 | 59, 598 | 1, 599 | 60, 600 | 0, 601 | "VAE" 602 | ], 603 | [ 604 | 134, 605 | 60, 606 | 0, 607 | 44, 608 | 0, 609 | "IMAGE" 610 | ], 611 | [ 612 | 141, 613 | 59, 614 | 1, 615 | 62, 616 | 0, 617 | "VAE" 618 | ], 619 | [ 620 | 142, 621 | 37, 622 | 0, 623 | 62, 624 | 1, 625 | "IMAGE" 626 | ], 627 | [ 628 | 144, 629 | 59, 630 | 0, 631 | 63, 632 | 0, 633 | "COGVIDEOMODEL" 634 | ], 635 | [ 636 | 145, 637 | 30, 638 | 0, 639 | 63, 640 | 1, 641 | "CONDITIONING" 642 | ], 643 | [ 644 | 146, 645 | 31, 646 | 0, 647 | 63, 648 | 2, 649 | "CONDITIONING" 650 | ], 651 | [ 652 | 147, 653 | 62, 654 | 0, 655 | 63, 656 | 4, 657 | "LATENT" 658 | ], 659 | [ 660 | 148, 661 | 63, 662 | 0, 663 | 60, 664 | 1, 665 | "LATENT" 666 | ], 667 | [ 668 | 149, 669 | 30, 670 | 1, 671 | 31, 672 | 0, 673 | "CLIP" 674 | ] 675 | ], 676 | "groups": [], 677 | "config": {}, 678 | "extra": { 679 | "ds": { 680 | "scale": 0.7627768444387059, 681 | "offset": [ 682 | 648.7113591814891, 683 | 185.9907078691075 684 | ] 685 | } 686 | }, 687 | "version": 0.4 688 | } -------------------------------------------------------------------------------- /example_workflows/cogvideox_1_0_5b_T2V_02.json: -------------------------------------------------------------------------------- 1 | { 2 | "last_node_id": 37, 3 | "last_link_id": 72, 4 | "nodes": [ 5 | { 6 | "id": 30, 7 | "type": "CogVideoTextEncode", 8 | "pos": { 9 | "0": 500, 
10 | "1": 308 11 | }, 12 | "size": [ 13 | 470.99399664051055, 14 | 237.5088638951354 15 | ], 16 | "flags": {}, 17 | "order": 3, 18 | "mode": 0, 19 | "inputs": [ 20 | { 21 | "name": "clip", 22 | "type": "CLIP", 23 | "link": 54 24 | } 25 | ], 26 | "outputs": [ 27 | { 28 | "name": "conditioning", 29 | "type": "CONDITIONING", 30 | "links": [ 31 | 67 32 | ], 33 | "slot_index": 0, 34 | "shape": 3 35 | }, 36 | { 37 | "name": "clip", 38 | "type": "CLIP", 39 | "links": [ 40 | 65 41 | ], 42 | "slot_index": 1 43 | } 44 | ], 45 | "properties": { 46 | "Node name for S&R": "CogVideoTextEncode" 47 | }, 48 | "widgets_values": [ 49 | "A golden retriever, sporting sleek black sunglasses, with its lengthy fur flowing in the breeze, sprints playfully across a rooftop terrace, recently refreshed by a light rain. The scene unfolds from a distance, the dog's energetic bounds growing larger as it approaches the camera, its tail wagging with unrestrained joy, while droplets of water glisten on the concrete behind it. The overcast sky provides a dramatic backdrop, emphasizing the vibrant golden coat of the canine as it dashes towards the viewer.\n\n", 50 | 1, 51 | false 52 | ] 53 | }, 54 | { 55 | "id": 31, 56 | "type": "CogVideoTextEncode", 57 | "pos": { 58 | "0": 503, 59 | "1": 602 60 | }, 61 | "size": [ 62 | 464.4980515341475, 63 | 169.87479027400514 64 | ], 65 | "flags": {}, 66 | "order": 4, 67 | "mode": 0, 68 | "inputs": [ 69 | { 70 | "name": "clip", 71 | "type": "CLIP", 72 | "link": 65 73 | } 74 | ], 75 | "outputs": [ 76 | { 77 | "name": "conditioning", 78 | "type": "CONDITIONING", 79 | "links": [ 80 | 68 81 | ], 82 | "slot_index": 0, 83 | "shape": 3 84 | }, 85 | { 86 | "name": "clip", 87 | "type": "CLIP", 88 | "links": null 89 | } 90 | ], 91 | "properties": { 92 | "Node name for S&R": "CogVideoTextEncode" 93 | }, 94 | "widgets_values": [ 95 | "", 96 | 1, 97 | true 98 | ] 99 | }, 100 | { 101 | "id": 11, 102 | "type": "CogVideoDecode", 103 | "pos": { 104 | "0": 1416, 105 | "1": 40 106 | }, 107 | "size": { 108 | "0": 300.396484375, 109 | "1": 198 110 | }, 111 | "flags": {}, 112 | "order": 6, 113 | "mode": 0, 114 | "inputs": [ 115 | { 116 | "name": "vae", 117 | "type": "VAE", 118 | "link": 71 119 | }, 120 | { 121 | "name": "samples", 122 | "type": "LATENT", 123 | "link": 69 124 | } 125 | ], 126 | "outputs": [ 127 | { 128 | "name": "images", 129 | "type": "IMAGE", 130 | "links": [ 131 | 59 132 | ], 133 | "slot_index": 0, 134 | "shape": 3 135 | } 136 | ], 137 | "properties": { 138 | "Node name for S&R": "CogVideoDecode" 139 | }, 140 | "widgets_values": [ 141 | false, 142 | 240, 143 | 360, 144 | 0.2, 145 | 0.2, 146 | true 147 | ] 148 | }, 149 | { 150 | "id": 36, 151 | "type": "DownloadAndLoadCogVideoModel", 152 | "pos": { 153 | "0": 645, 154 | "1": 17 155 | }, 156 | "size": { 157 | "0": 315, 158 | "1": 218 159 | }, 160 | "flags": {}, 161 | "order": 0, 162 | "mode": 0, 163 | "inputs": [ 164 | { 165 | "name": "block_edit", 166 | "type": "TRANSFORMERBLOCKS", 167 | "link": null, 168 | "shape": 7 169 | }, 170 | { 171 | "name": "lora", 172 | "type": "COGLORA", 173 | "link": null, 174 | "shape": 7 175 | }, 176 | { 177 | "name": "compile_args", 178 | "type": "COMPILEARGS", 179 | "link": null, 180 | "shape": 7 181 | } 182 | ], 183 | "outputs": [ 184 | { 185 | "name": "model", 186 | "type": "COGVIDEOMODEL", 187 | "links": [ 188 | 70 189 | ] 190 | }, 191 | { 192 | "name": "vae", 193 | "type": "VAE", 194 | "links": [ 195 | 71 196 | ], 197 | "slot_index": 1 198 | } 199 | ], 200 | "properties": { 201 | "Node name for S&R": 
"DownloadAndLoadCogVideoModel" 202 | }, 203 | "widgets_values": [ 204 | "THUDM/CogVideoX-5b", 205 | "bf16", 206 | "disabled", 207 | false, 208 | "sdpa", 209 | "main_device" 210 | ] 211 | }, 212 | { 213 | "id": 20, 214 | "type": "CLIPLoader", 215 | "pos": { 216 | "0": 5, 217 | "1": 308 218 | }, 219 | "size": { 220 | "0": 451.30548095703125, 221 | "1": 82 222 | }, 223 | "flags": {}, 224 | "order": 1, 225 | "mode": 0, 226 | "inputs": [], 227 | "outputs": [ 228 | { 229 | "name": "CLIP", 230 | "type": "CLIP", 231 | "links": [ 232 | 54 233 | ], 234 | "slot_index": 0, 235 | "shape": 3 236 | } 237 | ], 238 | "properties": { 239 | "Node name for S&R": "CLIPLoader" 240 | }, 241 | "widgets_values": [ 242 | "t5\\google_t5-v1_1-xxl_encoderonly-fp8_e4m3fn.safetensors", 243 | "sd3" 244 | ] 245 | }, 246 | { 247 | "id": 37, 248 | "type": "EmptyLatentImage", 249 | "pos": { 250 | "0": 643, 251 | "1": 827 252 | }, 253 | "size": { 254 | "0": 315, 255 | "1": 106 256 | }, 257 | "flags": {}, 258 | "order": 2, 259 | "mode": 0, 260 | "inputs": [], 261 | "outputs": [ 262 | { 263 | "name": "LATENT", 264 | "type": "LATENT", 265 | "links": [ 266 | 72 267 | ] 268 | } 269 | ], 270 | "properties": { 271 | "Node name for S&R": "EmptyLatentImage" 272 | }, 273 | "widgets_values": [ 274 | 720, 275 | 480, 276 | 1 277 | ] 278 | }, 279 | { 280 | "id": 35, 281 | "type": "CogVideoSampler", 282 | "pos": { 283 | "0": 1042, 284 | "1": 291 285 | }, 286 | "size": [ 287 | 330, 288 | 574 289 | ], 290 | "flags": {}, 291 | "order": 5, 292 | "mode": 0, 293 | "inputs": [ 294 | { 295 | "name": "model", 296 | "type": "COGVIDEOMODEL", 297 | "link": 70 298 | }, 299 | { 300 | "name": "positive", 301 | "type": "CONDITIONING", 302 | "link": 67 303 | }, 304 | { 305 | "name": "negative", 306 | "type": "CONDITIONING", 307 | "link": 68 308 | }, 309 | { 310 | "name": "samples", 311 | "type": "LATENT", 312 | "link": 72, 313 | "shape": 7 314 | }, 315 | { 316 | "name": "image_cond_latents", 317 | "type": "LATENT", 318 | "link": null, 319 | "shape": 7 320 | }, 321 | { 322 | "name": "context_options", 323 | "type": "COGCONTEXT", 324 | "link": null, 325 | "shape": 7 326 | }, 327 | { 328 | "name": "controlnet", 329 | "type": "COGVIDECONTROLNET", 330 | "link": null, 331 | "shape": 7 332 | }, 333 | { 334 | "name": "tora_trajectory", 335 | "type": "TORAFEATURES", 336 | "link": null, 337 | "shape": 7 338 | }, 339 | { 340 | "name": "fastercache", 341 | "type": "FASTERCACHEARGS", 342 | "link": null, 343 | "shape": 7 344 | } 345 | ], 346 | "outputs": [ 347 | { 348 | "name": "samples", 349 | "type": "LATENT", 350 | "links": [ 351 | 69 352 | ] 353 | } 354 | ], 355 | "properties": { 356 | "Node name for S&R": "CogVideoSampler" 357 | }, 358 | "widgets_values": [ 359 | 49, 360 | 50, 361 | 6, 362 | 0, 363 | "fixed", 364 | "CogVideoXDDIM", 365 | 1 366 | ] 367 | }, 368 | { 369 | "id": 33, 370 | "type": "VHS_VideoCombine", 371 | "pos": { 372 | "0": 1767, 373 | "1": 39 374 | }, 375 | "size": [ 376 | 778.7022705078125, 377 | 829.801513671875 378 | ], 379 | "flags": {}, 380 | "order": 7, 381 | "mode": 0, 382 | "inputs": [ 383 | { 384 | "name": "images", 385 | "type": "IMAGE", 386 | "link": 59 387 | }, 388 | { 389 | "name": "audio", 390 | "type": "AUDIO", 391 | "link": null, 392 | "shape": 7 393 | }, 394 | { 395 | "name": "meta_batch", 396 | "type": "VHS_BatchManager", 397 | "link": null, 398 | "shape": 7 399 | }, 400 | { 401 | "name": "vae", 402 | "type": "VAE", 403 | "link": null, 404 | "shape": 7 405 | } 406 | ], 407 | "outputs": [ 408 | { 409 | "name": "Filenames", 410 | 
"type": "VHS_FILENAMES", 411 | "links": null, 412 | "shape": 3 413 | } 414 | ], 415 | "properties": { 416 | "Node name for S&R": "VHS_VideoCombine" 417 | }, 418 | "widgets_values": { 419 | "frame_rate": 8, 420 | "loop_count": 0, 421 | "filename_prefix": "CogVideoX5B-T2V", 422 | "format": "video/h264-mp4", 423 | "pix_fmt": "yuv420p", 424 | "crf": 19, 425 | "save_metadata": true, 426 | "pingpong": false, 427 | "save_output": false, 428 | "videopreview": { 429 | "hidden": false, 430 | "paused": false, 431 | "params": { 432 | "filename": "CogVideoX5B_00001.mp4", 433 | "subfolder": "", 434 | "type": "temp", 435 | "format": "video/h264-mp4", 436 | "frame_rate": 8 437 | }, 438 | "muted": false 439 | } 440 | } 441 | } 442 | ], 443 | "links": [ 444 | [ 445 | 54, 446 | 20, 447 | 0, 448 | 30, 449 | 0, 450 | "CLIP" 451 | ], 452 | [ 453 | 59, 454 | 11, 455 | 0, 456 | 33, 457 | 0, 458 | "IMAGE" 459 | ], 460 | [ 461 | 65, 462 | 30, 463 | 1, 464 | 31, 465 | 0, 466 | "CLIP" 467 | ], 468 | [ 469 | 67, 470 | 30, 471 | 0, 472 | 35, 473 | 1, 474 | "CONDITIONING" 475 | ], 476 | [ 477 | 68, 478 | 31, 479 | 0, 480 | 35, 481 | 2, 482 | "CONDITIONING" 483 | ], 484 | [ 485 | 69, 486 | 35, 487 | 0, 488 | 11, 489 | 1, 490 | "LATENT" 491 | ], 492 | [ 493 | 70, 494 | 36, 495 | 0, 496 | 35, 497 | 0, 498 | "COGVIDEOMODEL" 499 | ], 500 | [ 501 | 71, 502 | 36, 503 | 1, 504 | 11, 505 | 0, 506 | "VAE" 507 | ], 508 | [ 509 | 72, 510 | 37, 511 | 0, 512 | 35, 513 | 3, 514 | "LATENT" 515 | ] 516 | ], 517 | "groups": [], 518 | "config": {}, 519 | "extra": { 520 | "ds": { 521 | "scale": 0.7627768444387061, 522 | "offset": [ 523 | 734.1791945221892, 524 | 237.29437844909364 525 | ] 526 | } 527 | }, 528 | "version": 0.4 529 | } -------------------------------------------------------------------------------- /example_workflows/cogvideox_1_0_5b_interpolation_02.json: -------------------------------------------------------------------------------- 1 | { 2 | "last_node_id": 68, 3 | "last_link_id": 155, 4 | "nodes": [ 5 | { 6 | "id": 31, 7 | "type": "CogVideoTextEncode", 8 | "pos": { 9 | "0": 497, 10 | "1": 520 11 | }, 12 | "size": { 13 | "0": 463.01251220703125, 14 | "1": 144 15 | }, 16 | "flags": {}, 17 | "order": 6, 18 | "mode": 0, 19 | "inputs": [ 20 | { 21 | "name": "clip", 22 | "type": "CLIP", 23 | "link": 149 24 | } 25 | ], 26 | "outputs": [ 27 | { 28 | "name": "conditioning", 29 | "type": "CONDITIONING", 30 | "links": [ 31 | 146 32 | ], 33 | "slot_index": 0, 34 | "shape": 3 35 | }, 36 | { 37 | "name": "clip", 38 | "type": "CLIP", 39 | "links": null 40 | } 41 | ], 42 | "properties": { 43 | "Node name for S&R": "CogVideoTextEncode" 44 | }, 45 | "widgets_values": [ 46 | "The video is not of a high quality, it has a low resolution. Watermark present in each frame. Strange motion trajectory. 
", 47 | 1, 48 | true 49 | ] 50 | }, 51 | { 52 | "id": 63, 53 | "type": "CogVideoSampler", 54 | "pos": { 55 | "0": 1142, 56 | "1": 74 57 | }, 58 | "size": [ 59 | 330, 60 | 574 61 | ], 62 | "flags": {}, 63 | "order": 9, 64 | "mode": 0, 65 | "inputs": [ 66 | { 67 | "name": "model", 68 | "type": "COGVIDEOMODEL", 69 | "link": 144 70 | }, 71 | { 72 | "name": "positive", 73 | "type": "CONDITIONING", 74 | "link": 145 75 | }, 76 | { 77 | "name": "negative", 78 | "type": "CONDITIONING", 79 | "link": 146 80 | }, 81 | { 82 | "name": "samples", 83 | "type": "LATENT", 84 | "link": null, 85 | "shape": 7 86 | }, 87 | { 88 | "name": "image_cond_latents", 89 | "type": "LATENT", 90 | "link": 147, 91 | "shape": 7 92 | }, 93 | { 94 | "name": "context_options", 95 | "type": "COGCONTEXT", 96 | "link": null, 97 | "shape": 7 98 | }, 99 | { 100 | "name": "controlnet", 101 | "type": "COGVIDECONTROLNET", 102 | "link": null, 103 | "shape": 7 104 | }, 105 | { 106 | "name": "tora_trajectory", 107 | "type": "TORAFEATURES", 108 | "link": null, 109 | "shape": 7 110 | }, 111 | { 112 | "name": "fastercache", 113 | "type": "FASTERCACHEARGS", 114 | "link": null, 115 | "shape": 7 116 | } 117 | ], 118 | "outputs": [ 119 | { 120 | "name": "samples", 121 | "type": "LATENT", 122 | "links": [ 123 | 148 124 | ] 125 | } 126 | ], 127 | "properties": { 128 | "Node name for S&R": "CogVideoSampler" 129 | }, 130 | "widgets_values": [ 131 | 49, 132 | 25, 133 | 6, 134 | 0, 135 | "fixed", 136 | "CogVideoXDDIM", 137 | 1 138 | ] 139 | }, 140 | { 141 | "id": 30, 142 | "type": "CogVideoTextEncode", 143 | "pos": { 144 | "0": 493, 145 | "1": 303 146 | }, 147 | "size": { 148 | "0": 471.90142822265625, 149 | "1": 168.08047485351562 150 | }, 151 | "flags": {}, 152 | "order": 4, 153 | "mode": 0, 154 | "inputs": [ 155 | { 156 | "name": "clip", 157 | "type": "CLIP", 158 | "link": 54 159 | } 160 | ], 161 | "outputs": [ 162 | { 163 | "name": "conditioning", 164 | "type": "CONDITIONING", 165 | "links": [ 166 | 145 167 | ], 168 | "slot_index": 0, 169 | "shape": 3 170 | }, 171 | { 172 | "name": "clip", 173 | "type": "CLIP", 174 | "links": [ 175 | 149 176 | ], 177 | "slot_index": 1 178 | } 179 | ], 180 | "properties": { 181 | "Node name for S&R": "CogVideoTextEncode" 182 | }, 183 | "widgets_values": [ 184 | "a majestic stag is grazing in an enhanced forest, basking in the setting sun filtered by the trees", 185 | 1, 186 | false 187 | ] 188 | }, 189 | { 190 | "id": 20, 191 | "type": "CLIPLoader", 192 | "pos": { 193 | "0": -2, 194 | "1": 304 195 | }, 196 | "size": { 197 | "0": 451.30548095703125, 198 | "1": 82 199 | }, 200 | "flags": {}, 201 | "order": 0, 202 | "mode": 0, 203 | "inputs": [], 204 | "outputs": [ 205 | { 206 | "name": "CLIP", 207 | "type": "CLIP", 208 | "links": [ 209 | 54 210 | ], 211 | "slot_index": 0, 212 | "shape": 3 213 | } 214 | ], 215 | "properties": { 216 | "Node name for S&R": "CLIPLoader" 217 | }, 218 | "widgets_values": [ 219 | "t5\\google_t5-v1_1-xxl_encoderonly-fp8_e4m3fn.safetensors", 220 | "sd3" 221 | ] 222 | }, 223 | { 224 | "id": 36, 225 | "type": "LoadImage", 226 | "pos": { 227 | "0": 105, 228 | "1": 732 229 | }, 230 | "size": { 231 | "0": 402.06353759765625, 232 | "1": 396.6225891113281 233 | }, 234 | "flags": {}, 235 | "order": 1, 236 | "mode": 0, 237 | "inputs": [], 238 | "outputs": [ 239 | { 240 | "name": "IMAGE", 241 | "type": "IMAGE", 242 | "links": [ 243 | 71 244 | ], 245 | "slot_index": 0, 246 | "shape": 3 247 | }, 248 | { 249 | "name": "MASK", 250 | "type": "MASK", 251 | "links": null, 252 | "shape": 3 253 | } 254 | ], 
255 | "properties": { 256 | "Node name for S&R": "LoadImage" 257 | }, 258 | "widgets_values": [ 259 | "sd3stag.png", 260 | "image" 261 | ] 262 | }, 263 | { 264 | "id": 64, 265 | "type": "LoadImage", 266 | "pos": { 267 | "0": 105, 268 | "1": 1189 269 | }, 270 | "size": { 271 | "0": 402.06353759765625, 272 | "1": 396.6225891113281 273 | }, 274 | "flags": {}, 275 | "order": 2, 276 | "mode": 0, 277 | "inputs": [], 278 | "outputs": [ 279 | { 280 | "name": "IMAGE", 281 | "type": "IMAGE", 282 | "links": [ 283 | 151 284 | ], 285 | "slot_index": 0, 286 | "shape": 3 287 | }, 288 | { 289 | "name": "MASK", 290 | "type": "MASK", 291 | "links": null, 292 | "shape": 3 293 | } 294 | ], 295 | "properties": { 296 | "Node name for S&R": "LoadImage" 297 | }, 298 | "widgets_values": [ 299 | "sd3stag.png", 300 | "image" 301 | ] 302 | }, 303 | { 304 | "id": 65, 305 | "type": "ImageResizeKJ", 306 | "pos": { 307 | "0": 607, 308 | "1": 1188 309 | }, 310 | "size": [ 311 | 315, 312 | 266 313 | ], 314 | "flags": {}, 315 | "order": 7, 316 | "mode": 0, 317 | "inputs": [ 318 | { 319 | "name": "image", 320 | "type": "IMAGE", 321 | "link": 151 322 | }, 323 | { 324 | "name": "get_image_size", 325 | "type": "IMAGE", 326 | "link": null, 327 | "shape": 7 328 | }, 329 | { 330 | "name": "width_input", 331 | "type": "INT", 332 | "link": null, 333 | "widget": { 334 | "name": "width_input" 335 | }, 336 | "shape": 7 337 | }, 338 | { 339 | "name": "height_input", 340 | "type": "INT", 341 | "link": null, 342 | "widget": { 343 | "name": "height_input" 344 | }, 345 | "shape": 7 346 | }, 347 | { 348 | "name": "width", 349 | "type": "INT", 350 | "link": 152, 351 | "widget": { 352 | "name": "width" 353 | } 354 | }, 355 | { 356 | "name": "height", 357 | "type": "INT", 358 | "link": 153, 359 | "widget": { 360 | "name": "height" 361 | } 362 | } 363 | ], 364 | "outputs": [ 365 | { 366 | "name": "IMAGE", 367 | "type": "IMAGE", 368 | "links": [ 369 | 155 370 | ], 371 | "slot_index": 0, 372 | "shape": 3 373 | }, 374 | { 375 | "name": "width", 376 | "type": "INT", 377 | "links": null, 378 | "shape": 3 379 | }, 380 | { 381 | "name": "height", 382 | "type": "INT", 383 | "links": null, 384 | "shape": 3 385 | } 386 | ], 387 | "properties": { 388 | "Node name for S&R": "ImageResizeKJ" 389 | }, 390 | "widgets_values": [ 391 | 720, 392 | 480, 393 | "lanczos", 394 | false, 395 | 16, 396 | 0, 397 | 0, 398 | "disabled" 399 | ] 400 | }, 401 | { 402 | "id": 37, 403 | "type": "ImageResizeKJ", 404 | "pos": { 405 | "0": 593, 406 | "1": 731 407 | }, 408 | "size": { 409 | "0": 315, 410 | "1": 266 411 | }, 412 | "flags": {}, 413 | "order": 5, 414 | "mode": 0, 415 | "inputs": [ 416 | { 417 | "name": "image", 418 | "type": "IMAGE", 419 | "link": 71 420 | }, 421 | { 422 | "name": "get_image_size", 423 | "type": "IMAGE", 424 | "link": null, 425 | "shape": 7 426 | }, 427 | { 428 | "name": "width_input", 429 | "type": "INT", 430 | "link": null, 431 | "widget": { 432 | "name": "width_input" 433 | } 434 | }, 435 | { 436 | "name": "height_input", 437 | "type": "INT", 438 | "link": null, 439 | "widget": { 440 | "name": "height_input" 441 | } 442 | } 443 | ], 444 | "outputs": [ 445 | { 446 | "name": "IMAGE", 447 | "type": "IMAGE", 448 | "links": [ 449 | 142 450 | ], 451 | "slot_index": 0, 452 | "shape": 3 453 | }, 454 | { 455 | "name": "width", 456 | "type": "INT", 457 | "links": [ 458 | 152 459 | ], 460 | "shape": 3, 461 | "slot_index": 1 462 | }, 463 | { 464 | "name": "height", 465 | "type": "INT", 466 | "links": [ 467 | 153 468 | ], 469 | "shape": 3, 470 | "slot_index": 2 
471 | } 472 | ], 473 | "properties": { 474 | "Node name for S&R": "ImageResizeKJ" 475 | }, 476 | "widgets_values": [ 477 | 720, 478 | 480, 479 | "lanczos", 480 | false, 481 | 16, 482 | 0, 483 | 0, 484 | "disabled" 485 | ] 486 | }, 487 | { 488 | "id": 60, 489 | "type": "CogVideoDecode", 490 | "pos": { 491 | "0": 1526, 492 | "1": -4 493 | }, 494 | "size": { 495 | "0": 315, 496 | "1": 198 497 | }, 498 | "flags": {}, 499 | "order": 10, 500 | "mode": 0, 501 | "inputs": [ 502 | { 503 | "name": "vae", 504 | "type": "VAE", 505 | "link": 132 506 | }, 507 | { 508 | "name": "samples", 509 | "type": "LATENT", 510 | "link": 148 511 | } 512 | ], 513 | "outputs": [ 514 | { 515 | "name": "images", 516 | "type": "IMAGE", 517 | "links": [ 518 | 134 519 | ] 520 | } 521 | ], 522 | "properties": { 523 | "Node name for S&R": "CogVideoDecode" 524 | }, 525 | "widgets_values": [ 526 | true, 527 | 240, 528 | 360, 529 | 0.2, 530 | 0.2, 531 | true 532 | ] 533 | }, 534 | { 535 | "id": 62, 536 | "type": "CogVideoImageEncode", 537 | "pos": { 538 | "0": 1152, 539 | "1": 706 540 | }, 541 | "size": { 542 | "0": 315, 543 | "1": 122 544 | }, 545 | "flags": {}, 546 | "order": 8, 547 | "mode": 0, 548 | "inputs": [ 549 | { 550 | "name": "vae", 551 | "type": "VAE", 552 | "link": 141 553 | }, 554 | { 555 | "name": "start_image", 556 | "type": "IMAGE", 557 | "link": 142 558 | }, 559 | { 560 | "name": "end_image", 561 | "type": "IMAGE", 562 | "link": 155, 563 | "shape": 7 564 | } 565 | ], 566 | "outputs": [ 567 | { 568 | "name": "samples", 569 | "type": "LATENT", 570 | "links": [ 571 | 147 572 | ] 573 | } 574 | ], 575 | "properties": { 576 | "Node name for S&R": "CogVideoImageEncode" 577 | }, 578 | "widgets_values": [ 579 | false, 580 | 0 581 | ] 582 | }, 583 | { 584 | "id": 44, 585 | "type": "VHS_VideoCombine", 586 | "pos": { 587 | "0": 1884, 588 | "1": -3 589 | }, 590 | "size": [ 591 | 605.3909912109375, 592 | 714.2606608072917 593 | ], 594 | "flags": {}, 595 | "order": 11, 596 | "mode": 0, 597 | "inputs": [ 598 | { 599 | "name": "images", 600 | "type": "IMAGE", 601 | "link": 134 602 | }, 603 | { 604 | "name": "audio", 605 | "type": "AUDIO", 606 | "link": null, 607 | "shape": 7 608 | }, 609 | { 610 | "name": "meta_batch", 611 | "type": "VHS_BatchManager", 612 | "link": null, 613 | "shape": 7 614 | }, 615 | { 616 | "name": "vae", 617 | "type": "VAE", 618 | "link": null, 619 | "shape": 7 620 | } 621 | ], 622 | "outputs": [ 623 | { 624 | "name": "Filenames", 625 | "type": "VHS_FILENAMES", 626 | "links": null, 627 | "shape": 3 628 | } 629 | ], 630 | "properties": { 631 | "Node name for S&R": "VHS_VideoCombine" 632 | }, 633 | "widgets_values": { 634 | "frame_rate": 8, 635 | "loop_count": 0, 636 | "filename_prefix": "CogVideoX-Interpolation", 637 | "format": "video/h264-mp4", 638 | "pix_fmt": "yuv420p", 639 | "crf": 19, 640 | "save_metadata": true, 641 | "pingpong": false, 642 | "save_output": true, 643 | "videopreview": { 644 | "hidden": false, 645 | "paused": false, 646 | "params": { 647 | "filename": "CogVideoX-I2V_00003.mp4", 648 | "subfolder": "", 649 | "type": "temp", 650 | "format": "video/h264-mp4", 651 | "frame_rate": 8 652 | }, 653 | "muted": false 654 | } 655 | } 656 | }, 657 | { 658 | "id": 59, 659 | "type": "DownloadAndLoadCogVideoModel", 660 | "pos": { 661 | "0": 622, 662 | "1": -25 663 | }, 664 | "size": [ 665 | 347.24594407027485, 666 | 218 667 | ], 668 | "flags": {}, 669 | "order": 3, 670 | "mode": 0, 671 | "inputs": [ 672 | { 673 | "name": "block_edit", 674 | "type": "TRANSFORMERBLOCKS", 675 | "link": null, 676 | 
"shape": 7 677 | }, 678 | { 679 | "name": "lora", 680 | "type": "COGLORA", 681 | "link": null, 682 | "shape": 7 683 | }, 684 | { 685 | "name": "compile_args", 686 | "type": "COMPILEARGS", 687 | "link": null, 688 | "shape": 7 689 | } 690 | ], 691 | "outputs": [ 692 | { 693 | "name": "model", 694 | "type": "COGVIDEOMODEL", 695 | "links": [ 696 | 144 697 | ] 698 | }, 699 | { 700 | "name": "vae", 701 | "type": "VAE", 702 | "links": [ 703 | 132, 704 | 141 705 | ], 706 | "slot_index": 1 707 | } 708 | ], 709 | "properties": { 710 | "Node name for S&R": "DownloadAndLoadCogVideoModel" 711 | }, 712 | "widgets_values": [ 713 | "feizhengcong/CogvideoX-Interpolation", 714 | "bf16", 715 | "disabled", 716 | false, 717 | "sdpa", 718 | "main_device" 719 | ] 720 | } 721 | ], 722 | "links": [ 723 | [ 724 | 54, 725 | 20, 726 | 0, 727 | 30, 728 | 0, 729 | "CLIP" 730 | ], 731 | [ 732 | 71, 733 | 36, 734 | 0, 735 | 37, 736 | 0, 737 | "IMAGE" 738 | ], 739 | [ 740 | 132, 741 | 59, 742 | 1, 743 | 60, 744 | 0, 745 | "VAE" 746 | ], 747 | [ 748 | 134, 749 | 60, 750 | 0, 751 | 44, 752 | 0, 753 | "IMAGE" 754 | ], 755 | [ 756 | 141, 757 | 59, 758 | 1, 759 | 62, 760 | 0, 761 | "VAE" 762 | ], 763 | [ 764 | 142, 765 | 37, 766 | 0, 767 | 62, 768 | 1, 769 | "IMAGE" 770 | ], 771 | [ 772 | 144, 773 | 59, 774 | 0, 775 | 63, 776 | 0, 777 | "COGVIDEOMODEL" 778 | ], 779 | [ 780 | 145, 781 | 30, 782 | 0, 783 | 63, 784 | 1, 785 | "CONDITIONING" 786 | ], 787 | [ 788 | 146, 789 | 31, 790 | 0, 791 | 63, 792 | 2, 793 | "CONDITIONING" 794 | ], 795 | [ 796 | 147, 797 | 62, 798 | 0, 799 | 63, 800 | 4, 801 | "LATENT" 802 | ], 803 | [ 804 | 148, 805 | 63, 806 | 0, 807 | 60, 808 | 1, 809 | "LATENT" 810 | ], 811 | [ 812 | 149, 813 | 30, 814 | 1, 815 | 31, 816 | 0, 817 | "CLIP" 818 | ], 819 | [ 820 | 151, 821 | 64, 822 | 0, 823 | 65, 824 | 0, 825 | "IMAGE" 826 | ], 827 | [ 828 | 152, 829 | 37, 830 | 1, 831 | 65, 832 | 4, 833 | "INT" 834 | ], 835 | [ 836 | 153, 837 | 37, 838 | 2, 839 | 65, 840 | 5, 841 | "INT" 842 | ], 843 | [ 844 | 155, 845 | 65, 846 | 0, 847 | 62, 848 | 2, 849 | "IMAGE" 850 | ] 851 | ], 852 | "groups": [], 853 | "config": {}, 854 | "extra": { 855 | "ds": { 856 | "scale": 0.7627768444387061, 857 | "offset": [ 858 | 630.1733472923837, 859 | 148.14641794691272 860 | ] 861 | } 862 | }, 863 | "version": 0.4 864 | } -------------------------------------------------------------------------------- /example_workflows/cogvideox_1_5_5b_I2V_01.json: -------------------------------------------------------------------------------- 1 | { 2 | "last_node_id": 64, 3 | "last_link_id": 149, 4 | "nodes": [ 5 | { 6 | "id": 63, 7 | "type": "CogVideoSampler", 8 | "pos": { 9 | "0": 1142, 10 | "1": 74 11 | }, 12 | "size": { 13 | "0": 330, 14 | "1": 574 15 | }, 16 | "flags": {}, 17 | "order": 7, 18 | "mode": 0, 19 | "inputs": [ 20 | { 21 | "name": "model", 22 | "type": "COGVIDEOMODEL", 23 | "link": 144 24 | }, 25 | { 26 | "name": "positive", 27 | "type": "CONDITIONING", 28 | "link": 145 29 | }, 30 | { 31 | "name": "negative", 32 | "type": "CONDITIONING", 33 | "link": 146 34 | }, 35 | { 36 | "name": "samples", 37 | "type": "LATENT", 38 | "link": null, 39 | "shape": 7 40 | }, 41 | { 42 | "name": "image_cond_latents", 43 | "type": "LATENT", 44 | "link": 147, 45 | "shape": 7 46 | }, 47 | { 48 | "name": "context_options", 49 | "type": "COGCONTEXT", 50 | "link": null, 51 | "shape": 7 52 | }, 53 | { 54 | "name": "controlnet", 55 | "type": "COGVIDECONTROLNET", 56 | "link": null, 57 | "shape": 7 58 | }, 59 | { 60 | "name": "tora_trajectory", 61 | "type": 
"TORAFEATURES", 62 | "link": null, 63 | "shape": 7 64 | }, 65 | { 66 | "name": "fastercache", 67 | "type": "FASTERCACHEARGS", 68 | "link": null, 69 | "shape": 7 70 | } 71 | ], 72 | "outputs": [ 73 | { 74 | "name": "samples", 75 | "type": "LATENT", 76 | "links": [ 77 | 148 78 | ] 79 | } 80 | ], 81 | "properties": { 82 | "Node name for S&R": "CogVideoSampler" 83 | }, 84 | "widgets_values": [ 85 | 49, 86 | 25, 87 | 6, 88 | 0, 89 | "fixed", 90 | "CogVideoXDDIM", 91 | 1 92 | ] 93 | }, 94 | { 95 | "id": 62, 96 | "type": "CogVideoImageEncode", 97 | "pos": { 98 | "0": 1149, 99 | "1": 711 100 | }, 101 | "size": { 102 | "0": 315, 103 | "1": 122 104 | }, 105 | "flags": {}, 106 | "order": 5, 107 | "mode": 0, 108 | "inputs": [ 109 | { 110 | "name": "vae", 111 | "type": "VAE", 112 | "link": 141 113 | }, 114 | { 115 | "name": "start_image", 116 | "type": "IMAGE", 117 | "link": 142 118 | }, 119 | { 120 | "name": "end_image", 121 | "type": "IMAGE", 122 | "link": null, 123 | "shape": 7 124 | } 125 | ], 126 | "outputs": [ 127 | { 128 | "name": "samples", 129 | "type": "LATENT", 130 | "links": [ 131 | 147 132 | ] 133 | } 134 | ], 135 | "properties": { 136 | "Node name for S&R": "CogVideoImageEncode" 137 | }, 138 | "widgets_values": [ 139 | false, 140 | 0 141 | ] 142 | }, 143 | { 144 | "id": 30, 145 | "type": "CogVideoTextEncode", 146 | "pos": { 147 | "0": 493, 148 | "1": 303 149 | }, 150 | "size": { 151 | "0": 471.90142822265625, 152 | "1": 168.08047485351562 153 | }, 154 | "flags": {}, 155 | "order": 4, 156 | "mode": 0, 157 | "inputs": [ 158 | { 159 | "name": "clip", 160 | "type": "CLIP", 161 | "link": 54 162 | } 163 | ], 164 | "outputs": [ 165 | { 166 | "name": "conditioning", 167 | "type": "CONDITIONING", 168 | "links": [ 169 | 145 170 | ], 171 | "slot_index": 0, 172 | "shape": 3 173 | }, 174 | { 175 | "name": "clip", 176 | "type": "CLIP", 177 | "links": [ 178 | 149 179 | ], 180 | "slot_index": 1 181 | } 182 | ], 183 | "properties": { 184 | "Node name for S&R": "CogVideoTextEncode" 185 | }, 186 | "widgets_values": [ 187 | "a majestic stag is grazing in an enhanced forest, basking in the setting sun filtered by the trees", 188 | 1, 189 | false 190 | ] 191 | }, 192 | { 193 | "id": 36, 194 | "type": "LoadImage", 195 | "pos": { 196 | "0": 335, 197 | "1": 731 198 | }, 199 | "size": { 200 | "0": 402.06353759765625, 201 | "1": 396.6225891113281 202 | }, 203 | "flags": {}, 204 | "order": 0, 205 | "mode": 0, 206 | "inputs": [], 207 | "outputs": [ 208 | { 209 | "name": "IMAGE", 210 | "type": "IMAGE", 211 | "links": [ 212 | 71 213 | ], 214 | "slot_index": 0, 215 | "shape": 3 216 | }, 217 | { 218 | "name": "MASK", 219 | "type": "MASK", 220 | "links": null, 221 | "shape": 3 222 | } 223 | ], 224 | "properties": { 225 | "Node name for S&R": "LoadImage" 226 | }, 227 | "widgets_values": [ 228 | "sd3stag.png", 229 | "image" 230 | ] 231 | }, 232 | { 233 | "id": 20, 234 | "type": "CLIPLoader", 235 | "pos": { 236 | "0": -2, 237 | "1": 304 238 | }, 239 | "size": { 240 | "0": 451.30548095703125, 241 | "1": 82 242 | }, 243 | "flags": {}, 244 | "order": 1, 245 | "mode": 0, 246 | "inputs": [], 247 | "outputs": [ 248 | { 249 | "name": "CLIP", 250 | "type": "CLIP", 251 | "links": [ 252 | 54 253 | ], 254 | "slot_index": 0, 255 | "shape": 3 256 | } 257 | ], 258 | "properties": { 259 | "Node name for S&R": "CLIPLoader" 260 | }, 261 | "widgets_values": [ 262 | "t5\\google_t5-v1_1-xxl_encoderonly-fp8_e4m3fn.safetensors", 263 | "sd3" 264 | ] 265 | }, 266 | { 267 | "id": 60, 268 | "type": "CogVideoDecode", 269 | "pos": { 270 | "0": 1523, 
271 | "1": -6 272 | }, 273 | "size": { 274 | "0": 315, 275 | "1": 198 276 | }, 277 | "flags": {}, 278 | "order": 8, 279 | "mode": 0, 280 | "inputs": [ 281 | { 282 | "name": "vae", 283 | "type": "VAE", 284 | "link": 132 285 | }, 286 | { 287 | "name": "samples", 288 | "type": "LATENT", 289 | "link": 148 290 | } 291 | ], 292 | "outputs": [ 293 | { 294 | "name": "images", 295 | "type": "IMAGE", 296 | "links": [ 297 | 134 298 | ] 299 | } 300 | ], 301 | "properties": { 302 | "Node name for S&R": "CogVideoDecode" 303 | }, 304 | "widgets_values": [ 305 | true, 306 | 240, 307 | 360, 308 | 0.2, 309 | 0.2, 310 | true 311 | ] 312 | }, 313 | { 314 | "id": 37, 315 | "type": "ImageResizeKJ", 316 | "pos": { 317 | "0": 784, 318 | "1": 731 319 | }, 320 | "size": { 321 | "0": 315, 322 | "1": 266 323 | }, 324 | "flags": {}, 325 | "order": 3, 326 | "mode": 0, 327 | "inputs": [ 328 | { 329 | "name": "image", 330 | "type": "IMAGE", 331 | "link": 71 332 | }, 333 | { 334 | "name": "get_image_size", 335 | "type": "IMAGE", 336 | "link": null, 337 | "shape": 7 338 | }, 339 | { 340 | "name": "width_input", 341 | "type": "INT", 342 | "link": null, 343 | "widget": { 344 | "name": "width_input" 345 | } 346 | }, 347 | { 348 | "name": "height_input", 349 | "type": "INT", 350 | "link": null, 351 | "widget": { 352 | "name": "height_input" 353 | } 354 | } 355 | ], 356 | "outputs": [ 357 | { 358 | "name": "IMAGE", 359 | "type": "IMAGE", 360 | "links": [ 361 | 142 362 | ], 363 | "slot_index": 0, 364 | "shape": 3 365 | }, 366 | { 367 | "name": "width", 368 | "type": "INT", 369 | "links": null, 370 | "shape": 3 371 | }, 372 | { 373 | "name": "height", 374 | "type": "INT", 375 | "links": null, 376 | "shape": 3 377 | } 378 | ], 379 | "properties": { 380 | "Node name for S&R": "ImageResizeKJ" 381 | }, 382 | "widgets_values": [ 383 | 1360, 384 | 768, 385 | "lanczos", 386 | false, 387 | 16, 388 | 0, 389 | 0, 390 | "disabled" 391 | ] 392 | }, 393 | { 394 | "id": 31, 395 | "type": "CogVideoTextEncode", 396 | "pos": { 397 | "0": 497, 398 | "1": 520 399 | }, 400 | "size": { 401 | "0": 463.01251220703125, 402 | "1": 144 403 | }, 404 | "flags": {}, 405 | "order": 6, 406 | "mode": 0, 407 | "inputs": [ 408 | { 409 | "name": "clip", 410 | "type": "CLIP", 411 | "link": 149 412 | } 413 | ], 414 | "outputs": [ 415 | { 416 | "name": "conditioning", 417 | "type": "CONDITIONING", 418 | "links": [ 419 | 146 420 | ], 421 | "slot_index": 0, 422 | "shape": 3 423 | }, 424 | { 425 | "name": "clip", 426 | "type": "CLIP", 427 | "links": null 428 | } 429 | ], 430 | "properties": { 431 | "Node name for S&R": "CogVideoTextEncode" 432 | }, 433 | "widgets_values": [ 434 | "", 435 | 1, 436 | true 437 | ] 438 | }, 439 | { 440 | "id": 59, 441 | "type": "DownloadAndLoadCogVideoModel", 442 | "pos": { 443 | "0": 622, 444 | "1": -25 445 | }, 446 | "size": { 447 | "0": 315, 448 | "1": 218 449 | }, 450 | "flags": {}, 451 | "order": 2, 452 | "mode": 0, 453 | "inputs": [ 454 | { 455 | "name": "block_edit", 456 | "type": "TRANSFORMERBLOCKS", 457 | "link": null, 458 | "shape": 7 459 | }, 460 | { 461 | "name": "lora", 462 | "type": "COGLORA", 463 | "link": null, 464 | "shape": 7 465 | }, 466 | { 467 | "name": "compile_args", 468 | "type": "COMPILEARGS", 469 | "link": null, 470 | "shape": 7 471 | } 472 | ], 473 | "outputs": [ 474 | { 475 | "name": "model", 476 | "type": "COGVIDEOMODEL", 477 | "links": [ 478 | 144 479 | ] 480 | }, 481 | { 482 | "name": "vae", 483 | "type": "VAE", 484 | "links": [ 485 | 132, 486 | 141 487 | ], 488 | "slot_index": 1 489 | } 490 | ], 491 | 
"properties": { 492 | "Node name for S&R": "DownloadAndLoadCogVideoModel" 493 | }, 494 | "widgets_values": [ 495 | "kijai/CogVideoX-5b-1.5-I2V", 496 | "bf16", 497 | "disabled", 498 | false, 499 | "sdpa", 500 | "main_device" 501 | ] 502 | }, 503 | { 504 | "id": 44, 505 | "type": "VHS_VideoCombine", 506 | "pos": { 507 | "0": 1884, 508 | "1": -6 509 | }, 510 | "size": [ 511 | 605.3909912109375, 512 | 310 513 | ], 514 | "flags": {}, 515 | "order": 9, 516 | "mode": 0, 517 | "inputs": [ 518 | { 519 | "name": "images", 520 | "type": "IMAGE", 521 | "link": 134 522 | }, 523 | { 524 | "name": "audio", 525 | "type": "AUDIO", 526 | "link": null, 527 | "shape": 7 528 | }, 529 | { 530 | "name": "meta_batch", 531 | "type": "VHS_BatchManager", 532 | "link": null, 533 | "shape": 7 534 | }, 535 | { 536 | "name": "vae", 537 | "type": "VAE", 538 | "link": null, 539 | "shape": 7 540 | } 541 | ], 542 | "outputs": [ 543 | { 544 | "name": "Filenames", 545 | "type": "VHS_FILENAMES", 546 | "links": null, 547 | "shape": 3 548 | } 549 | ], 550 | "properties": { 551 | "Node name for S&R": "VHS_VideoCombine" 552 | }, 553 | "widgets_values": { 554 | "frame_rate": 16, 555 | "loop_count": 0, 556 | "filename_prefix": "CogVideoX_1_5_I2V", 557 | "format": "video/h264-mp4", 558 | "pix_fmt": "yuv420p", 559 | "crf": 19, 560 | "save_metadata": true, 561 | "pingpong": false, 562 | "save_output": true, 563 | "videopreview": { 564 | "hidden": false, 565 | "paused": false, 566 | "params": { 567 | "filename": "CogVideoX-I2V_00004.mp4", 568 | "subfolder": "", 569 | "type": "temp", 570 | "format": "video/h264-mp4", 571 | "frame_rate": 8 572 | }, 573 | "muted": false 574 | } 575 | } 576 | } 577 | ], 578 | "links": [ 579 | [ 580 | 54, 581 | 20, 582 | 0, 583 | 30, 584 | 0, 585 | "CLIP" 586 | ], 587 | [ 588 | 71, 589 | 36, 590 | 0, 591 | 37, 592 | 0, 593 | "IMAGE" 594 | ], 595 | [ 596 | 132, 597 | 59, 598 | 1, 599 | 60, 600 | 0, 601 | "VAE" 602 | ], 603 | [ 604 | 134, 605 | 60, 606 | 0, 607 | 44, 608 | 0, 609 | "IMAGE" 610 | ], 611 | [ 612 | 141, 613 | 59, 614 | 1, 615 | 62, 616 | 0, 617 | "VAE" 618 | ], 619 | [ 620 | 142, 621 | 37, 622 | 0, 623 | 62, 624 | 1, 625 | "IMAGE" 626 | ], 627 | [ 628 | 144, 629 | 59, 630 | 0, 631 | 63, 632 | 0, 633 | "COGVIDEOMODEL" 634 | ], 635 | [ 636 | 145, 637 | 30, 638 | 0, 639 | 63, 640 | 1, 641 | "CONDITIONING" 642 | ], 643 | [ 644 | 146, 645 | 31, 646 | 0, 647 | 63, 648 | 2, 649 | "CONDITIONING" 650 | ], 651 | [ 652 | 147, 653 | 62, 654 | 0, 655 | 63, 656 | 4, 657 | "LATENT" 658 | ], 659 | [ 660 | 148, 661 | 63, 662 | 0, 663 | 60, 664 | 1, 665 | "LATENT" 666 | ], 667 | [ 668 | 149, 669 | 30, 670 | 1, 671 | 31, 672 | 0, 673 | "CLIP" 674 | ] 675 | ], 676 | "groups": [], 677 | "config": {}, 678 | "extra": { 679 | "ds": { 680 | "scale": 0.7627768444387097, 681 | "offset": [ 682 | 716.7143770104391, 683 | 291.75859557289965 684 | ] 685 | } 686 | }, 687 | "version": 0.4 688 | } -------------------------------------------------------------------------------- /example_workflows/cogvideox_Fun_I2V_02.json: -------------------------------------------------------------------------------- 1 | { 2 | "last_node_id": 51, 3 | "last_link_id": 123, 4 | "nodes": [ 5 | { 6 | "id": 48, 7 | "type": "CogVideoSampler", 8 | "pos": { 9 | "0": 1200, 10 | "1": 124 11 | }, 12 | "size": [ 13 | 330, 14 | 574 15 | ], 16 | "flags": {}, 17 | "order": 7, 18 | "mode": 0, 19 | "inputs": [ 20 | { 21 | "name": "model", 22 | "type": "COGVIDEOMODEL", 23 | "link": 114 24 | }, 25 | { 26 | "name": "positive", 27 | "type": "CONDITIONING", 28 | 
"link": 116 29 | }, 30 | { 31 | "name": "negative", 32 | "type": "CONDITIONING", 33 | "link": 117 34 | }, 35 | { 36 | "name": "samples", 37 | "type": "LATENT", 38 | "link": null, 39 | "shape": 7 40 | }, 41 | { 42 | "name": "image_cond_latents", 43 | "type": "LATENT", 44 | "link": 120, 45 | "shape": 7 46 | }, 47 | { 48 | "name": "context_options", 49 | "type": "COGCONTEXT", 50 | "link": null, 51 | "shape": 7 52 | }, 53 | { 54 | "name": "controlnet", 55 | "type": "COGVIDECONTROLNET", 56 | "link": null, 57 | "shape": 7 58 | }, 59 | { 60 | "name": "tora_trajectory", 61 | "type": "TORAFEATURES", 62 | "link": null, 63 | "shape": 7 64 | }, 65 | { 66 | "name": "fastercache", 67 | "type": "FASTERCACHEARGS", 68 | "link": null, 69 | "shape": 7 70 | } 71 | ], 72 | "outputs": [ 73 | { 74 | "name": "samples", 75 | "type": "LATENT", 76 | "links": [ 77 | 123 78 | ], 79 | "slot_index": 0 80 | } 81 | ], 82 | "properties": { 83 | "Node name for S&R": "CogVideoSampler" 84 | }, 85 | "widgets_values": [ 86 | 49, 87 | 25, 88 | 6, 89 | 458091243358272, 90 | "randomize", 91 | "CogVideoXDDIM", 92 | 1 93 | ] 94 | }, 95 | { 96 | "id": 30, 97 | "type": "CogVideoTextEncode", 98 | "pos": { 99 | "0": 490, 100 | "1": 146 101 | }, 102 | "size": { 103 | "0": 471.90142822265625, 104 | "1": 168.08047485351562 105 | }, 106 | "flags": {}, 107 | "order": 3, 108 | "mode": 0, 109 | "inputs": [ 110 | { 111 | "name": "clip", 112 | "type": "CLIP", 113 | "link": 54 114 | } 115 | ], 116 | "outputs": [ 117 | { 118 | "name": "conditioning", 119 | "type": "CONDITIONING", 120 | "links": [ 121 | 116 122 | ], 123 | "slot_index": 0, 124 | "shape": 3 125 | }, 126 | { 127 | "name": "clip", 128 | "type": "CLIP", 129 | "links": [ 130 | 110 131 | ], 132 | "slot_index": 1 133 | } 134 | ], 135 | "properties": { 136 | "Node name for S&R": "CogVideoTextEncode" 137 | }, 138 | "widgets_values": [ 139 | "fireworks display over night city. The video is of high quality, and the view is very clear. High quality, masterpiece, best quality, highres, ultra-detailed, fantastic.", 140 | 1, 141 | false 142 | ] 143 | }, 144 | { 145 | "id": 31, 146 | "type": "CogVideoTextEncode", 147 | "pos": { 148 | "0": 497, 149 | "1": 365 150 | }, 151 | "size": { 152 | "0": 463.01251220703125, 153 | "1": 144 154 | }, 155 | "flags": {}, 156 | "order": 5, 157 | "mode": 0, 158 | "inputs": [ 159 | { 160 | "name": "clip", 161 | "type": "CLIP", 162 | "link": 110 163 | } 164 | ], 165 | "outputs": [ 166 | { 167 | "name": "conditioning", 168 | "type": "CONDITIONING", 169 | "links": [ 170 | 117 171 | ], 172 | "slot_index": 0, 173 | "shape": 3 174 | }, 175 | { 176 | "name": "clip", 177 | "type": "CLIP", 178 | "links": null 179 | } 180 | ], 181 | "properties": { 182 | "Node name for S&R": "CogVideoTextEncode" 183 | }, 184 | "widgets_values": [ 185 | "The video is not of a high quality, it has a low resolution. Watermark present in each frame. Strange motion trajectory. 
", 186 | 1, 187 | true 188 | ] 189 | }, 190 | { 191 | "id": 20, 192 | "type": "CLIPLoader", 193 | "pos": { 194 | "0": -7, 195 | "1": -37 196 | }, 197 | "size": { 198 | "0": 451.30548095703125, 199 | "1": 82 200 | }, 201 | "flags": {}, 202 | "order": 0, 203 | "mode": 0, 204 | "inputs": [], 205 | "outputs": [ 206 | { 207 | "name": "CLIP", 208 | "type": "CLIP", 209 | "links": [ 210 | 54 211 | ], 212 | "slot_index": 0, 213 | "shape": 3 214 | } 215 | ], 216 | "properties": { 217 | "Node name for S&R": "CLIPLoader" 218 | }, 219 | "widgets_values": [ 220 | "t5\\google_t5-v1_1-xxl_encoderonly-fp8_e4m3fn.safetensors", 221 | "sd3" 222 | ] 223 | }, 224 | { 225 | "id": 50, 226 | "type": "CogVideoImageEncodeFunInP", 227 | "pos": { 228 | "0": 865, 229 | "1": 567 230 | }, 231 | "size": [ 232 | 253.60000610351562, 233 | 146 234 | ], 235 | "flags": {}, 236 | "order": 6, 237 | "mode": 0, 238 | "inputs": [ 239 | { 240 | "name": "vae", 241 | "type": "VAE", 242 | "link": 119 243 | }, 244 | { 245 | "name": "start_image", 246 | "type": "IMAGE", 247 | "link": 118 248 | }, 249 | { 250 | "name": "end_image", 251 | "type": "IMAGE", 252 | "link": null, 253 | "shape": 7 254 | } 255 | ], 256 | "outputs": [ 257 | { 258 | "name": "image_cond_latents", 259 | "type": "LATENT", 260 | "links": [ 261 | 120 262 | ], 263 | "slot_index": 0 264 | } 265 | ], 266 | "properties": { 267 | "Node name for S&R": "CogVideoImageEncodeFunInP" 268 | }, 269 | "widgets_values": [ 270 | 49, 271 | true, 272 | 0 273 | ] 274 | }, 275 | { 276 | "id": 37, 277 | "type": "ImageResizeKJ", 278 | "pos": { 279 | "0": 499, 280 | "1": 587 281 | }, 282 | "size": { 283 | "0": 315, 284 | "1": 266 285 | }, 286 | "flags": {}, 287 | "order": 4, 288 | "mode": 0, 289 | "inputs": [ 290 | { 291 | "name": "image", 292 | "type": "IMAGE", 293 | "link": 71 294 | }, 295 | { 296 | "name": "get_image_size", 297 | "type": "IMAGE", 298 | "link": null, 299 | "shape": 7 300 | }, 301 | { 302 | "name": "width_input", 303 | "type": "INT", 304 | "link": null, 305 | "widget": { 306 | "name": "width_input" 307 | } 308 | }, 309 | { 310 | "name": "height_input", 311 | "type": "INT", 312 | "link": null, 313 | "widget": { 314 | "name": "height_input" 315 | } 316 | } 317 | ], 318 | "outputs": [ 319 | { 320 | "name": "IMAGE", 321 | "type": "IMAGE", 322 | "links": [ 323 | 118 324 | ], 325 | "slot_index": 0, 326 | "shape": 3 327 | }, 328 | { 329 | "name": "width", 330 | "type": "INT", 331 | "links": null, 332 | "shape": 3 333 | }, 334 | { 335 | "name": "height", 336 | "type": "INT", 337 | "links": null, 338 | "shape": 3 339 | } 340 | ], 341 | "properties": { 342 | "Node name for S&R": "ImageResizeKJ" 343 | }, 344 | "widgets_values": [ 345 | 720, 346 | 480, 347 | "lanczos", 348 | false, 349 | 2, 350 | 0, 351 | 0, 352 | "disabled" 353 | ] 354 | }, 355 | { 356 | "id": 36, 357 | "type": "LoadImage", 358 | "pos": { 359 | "0": 43, 360 | "1": 587 361 | }, 362 | "size": [ 363 | 405.2986131072541, 364 | 477.48971409949377 365 | ], 366 | "flags": {}, 367 | "order": 1, 368 | "mode": 0, 369 | "inputs": [], 370 | "outputs": [ 371 | { 372 | "name": "IMAGE", 373 | "type": "IMAGE", 374 | "links": [ 375 | 71 376 | ], 377 | "slot_index": 0, 378 | "shape": 3 379 | }, 380 | { 381 | "name": "MASK", 382 | "type": "MASK", 383 | "links": null, 384 | "shape": 3 385 | } 386 | ], 387 | "properties": { 388 | "Node name for S&R": "LoadImage" 389 | }, 390 | "widgets_values": [ 391 | "6e1a7befce6daa63fc01cb66c1a22ed0.jpg", 392 | "image" 393 | ] 394 | }, 395 | { 396 | "id": 51, 397 | "type": "CogVideoDecode", 398 | "pos": 
{ 399 | "0": 1219, 400 | "1": -134 401 | }, 402 | "size": { 403 | "0": 315, 404 | "1": 198 405 | }, 406 | "flags": {}, 407 | "order": 8, 408 | "mode": 0, 409 | "inputs": [ 410 | { 411 | "name": "vae", 412 | "type": "VAE", 413 | "link": 122 414 | }, 415 | { 416 | "name": "samples", 417 | "type": "LATENT", 418 | "link": 123 419 | } 420 | ], 421 | "outputs": [ 422 | { 423 | "name": "images", 424 | "type": "IMAGE", 425 | "links": [ 426 | 121 427 | ] 428 | } 429 | ], 430 | "properties": { 431 | "Node name for S&R": "CogVideoDecode" 432 | }, 433 | "widgets_values": [ 434 | true, 435 | 240, 436 | 360, 437 | 0.2, 438 | 0.2, 439 | true 440 | ] 441 | }, 442 | { 443 | "id": 44, 444 | "type": "VHS_VideoCombine", 445 | "pos": { 446 | "0": 1602, 447 | "1": -131 448 | }, 449 | "size": [ 450 | 767.7372279260157, 451 | 822.491455078125 452 | ], 453 | "flags": {}, 454 | "order": 9, 455 | "mode": 0, 456 | "inputs": [ 457 | { 458 | "name": "images", 459 | "type": "IMAGE", 460 | "link": 121 461 | }, 462 | { 463 | "name": "audio", 464 | "type": "AUDIO", 465 | "link": null, 466 | "shape": 7 467 | }, 468 | { 469 | "name": "meta_batch", 470 | "type": "VHS_BatchManager", 471 | "link": null, 472 | "shape": 7 473 | }, 474 | { 475 | "name": "vae", 476 | "type": "VAE", 477 | "link": null, 478 | "shape": 7 479 | } 480 | ], 481 | "outputs": [ 482 | { 483 | "name": "Filenames", 484 | "type": "VHS_FILENAMES", 485 | "links": null, 486 | "shape": 3 487 | } 488 | ], 489 | "properties": { 490 | "Node name for S&R": "VHS_VideoCombine" 491 | }, 492 | "widgets_values": { 493 | "frame_rate": 8, 494 | "loop_count": 0, 495 | "filename_prefix": "CogVideoX_Fun", 496 | "format": "video/h264-mp4", 497 | "pix_fmt": "yuv420p", 498 | "crf": 19, 499 | "save_metadata": true, 500 | "pingpong": false, 501 | "save_output": true, 502 | "videopreview": { 503 | "hidden": false, 504 | "paused": false, 505 | "params": { 506 | "filename": "CogVideoX_Fun_00002.mp4", 507 | "subfolder": "", 508 | "type": "temp", 509 | "format": "video/h264-mp4", 510 | "frame_rate": 8 511 | }, 512 | "muted": false 513 | } 514 | } 515 | }, 516 | { 517 | "id": 49, 518 | "type": "DownloadAndLoadCogVideoModel", 519 | "pos": { 520 | "0": 491, 521 | "1": -167 522 | }, 523 | "size": { 524 | "0": 362.1656799316406, 525 | "1": 218 526 | }, 527 | "flags": {}, 528 | "order": 2, 529 | "mode": 0, 530 | "inputs": [ 531 | { 532 | "name": "block_edit", 533 | "type": "TRANSFORMERBLOCKS", 534 | "link": null, 535 | "shape": 7 536 | }, 537 | { 538 | "name": "lora", 539 | "type": "COGLORA", 540 | "link": null, 541 | "shape": 7 542 | }, 543 | { 544 | "name": "compile_args", 545 | "type": "COMPILEARGS", 546 | "link": null, 547 | "shape": 7 548 | } 549 | ], 550 | "outputs": [ 551 | { 552 | "name": "model", 553 | "type": "COGVIDEOMODEL", 554 | "links": [ 555 | 114 556 | ] 557 | }, 558 | { 559 | "name": "vae", 560 | "type": "VAE", 561 | "links": [ 562 | 119, 563 | 122 564 | ], 565 | "slot_index": 1 566 | } 567 | ], 568 | "properties": { 569 | "Node name for S&R": "DownloadAndLoadCogVideoModel" 570 | }, 571 | "widgets_values": [ 572 | "alibaba-pai/CogVideoX-Fun-V1.1-5b-InP", 573 | "bf16", 574 | "disabled", 575 | false, 576 | "sdpa", 577 | "main_device" 578 | ] 579 | } 580 | ], 581 | "links": [ 582 | [ 583 | 54, 584 | 20, 585 | 0, 586 | 30, 587 | 0, 588 | "CLIP" 589 | ], 590 | [ 591 | 71, 592 | 36, 593 | 0, 594 | 37, 595 | 0, 596 | "IMAGE" 597 | ], 598 | [ 599 | 110, 600 | 30, 601 | 1, 602 | 31, 603 | 0, 604 | "CLIP" 605 | ], 606 | [ 607 | 114, 608 | 49, 609 | 0, 610 | 48, 611 | 0, 612 | 
"COGVIDEOMODEL" 613 | ], 614 | [ 615 | 116, 616 | 30, 617 | 0, 618 | 48, 619 | 1, 620 | "CONDITIONING" 621 | ], 622 | [ 623 | 117, 624 | 31, 625 | 0, 626 | 48, 627 | 2, 628 | "CONDITIONING" 629 | ], 630 | [ 631 | 118, 632 | 37, 633 | 0, 634 | 50, 635 | 1, 636 | "IMAGE" 637 | ], 638 | [ 639 | 119, 640 | 49, 641 | 1, 642 | 50, 643 | 0, 644 | "VAE" 645 | ], 646 | [ 647 | 120, 648 | 50, 649 | 0, 650 | 48, 651 | 4, 652 | "LATENT" 653 | ], 654 | [ 655 | 121, 656 | 51, 657 | 0, 658 | 44, 659 | 0, 660 | "IMAGE" 661 | ], 662 | [ 663 | 122, 664 | 49, 665 | 1, 666 | 51, 667 | 0, 668 | "VAE" 669 | ], 670 | [ 671 | 123, 672 | 48, 673 | 0, 674 | 51, 675 | 1, 676 | "LATENT" 677 | ] 678 | ], 679 | "groups": [], 680 | "config": {}, 681 | "extra": { 682 | "ds": { 683 | "scale": 0.693433494944278, 684 | "offset": [ 685 | 416.0091223165226, 686 | 378.00843746369645 687 | ] 688 | } 689 | }, 690 | "version": 0.4 691 | } -------------------------------------------------------------------------------- /example_workflows/noise_warp_example_input_video.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kijai/ComfyUI-CogVideoXWrapper/dbc63f622dd095391335612d0c7d7bbff8745cc8/example_workflows/noise_warp_example_input_video.mp4 -------------------------------------------------------------------------------- /fp8_optimization.py: -------------------------------------------------------------------------------- 1 | #based on ComfyUI's and MinusZoneAI's fp8_linear optimization 2 | 3 | import torch 4 | import torch.nn as nn 5 | 6 | def fp8_linear_forward(cls, original_dtype, input): 7 | weight_dtype = cls.weight.dtype 8 | if weight_dtype in [torch.float8_e4m3fn, torch.float8_e5m2]: 9 | if len(input.shape) == 3: 10 | if weight_dtype == torch.float8_e4m3fn: 11 | inn = input.reshape(-1, input.shape[2]).to(torch.float8_e5m2) 12 | else: 13 | inn = input.reshape(-1, input.shape[2]).to(torch.float8_e4m3fn) 14 | w = cls.weight.t() 15 | 16 | scale_weight = torch.ones((1), device=input.device, dtype=torch.float32) 17 | scale_input = scale_weight 18 | 19 | bias = cls.bias.to(original_dtype) if cls.bias is not None else None 20 | out_dtype = original_dtype 21 | 22 | if bias is not None: 23 | o = torch._scaled_mm(inn, w, out_dtype=out_dtype, bias=bias, scale_a=scale_input, scale_b=scale_weight) 24 | else: 25 | o = torch._scaled_mm(inn, w, out_dtype=out_dtype, scale_a=scale_input, scale_b=scale_weight) 26 | 27 | if isinstance(o, tuple): 28 | o = o[0] 29 | 30 | return o.reshape((-1, input.shape[1], cls.weight.shape[0])) 31 | else: 32 | cls.to(original_dtype) 33 | out = cls.original_forward(input.to(original_dtype)) 34 | cls.to(original_dtype) 35 | return out 36 | else: 37 | return cls.original_forward(input) 38 | 39 | def convert_fp8_linear(module, original_dtype, params_to_keep={}): 40 | setattr(module, "fp8_matmul_enabled", True) 41 | 42 | for name, module in module.named_modules(): 43 | if not any(keyword in name for keyword in params_to_keep): 44 | if isinstance(module, nn.Linear): 45 | original_forward = module.forward 46 | setattr(module, "original_forward", original_forward) 47 | setattr(module, "forward", lambda input, m=module: fp8_linear_forward(m, original_dtype, input)) 48 | -------------------------------------------------------------------------------- /lora_utils.py: -------------------------------------------------------------------------------- 1 | # LoRA network module 2 | # reference: 3 | # https://github.com/microsoft/LoRA/blob/main/loralib/layers.py 
4 | # https://github.com/cloneofsimo/lora/blob/master/lora_diffusion/lora.py 5 | # https://github.com/bmaltais/kohya_ss 6 | 7 | import hashlib 8 | import math 9 | import os 10 | from collections import defaultdict 11 | from io import BytesIO 12 | from typing import List, Optional, Type, Union 13 | 14 | import safetensors.torch 15 | import torch 16 | import torch.utils.checkpoint 17 | from diffusers.models.lora import LoRACompatibleConv, LoRACompatibleLinear 18 | from safetensors.torch import load_file 19 | from transformers import T5EncoderModel 20 | 21 | 22 | class LoRAModule(torch.nn.Module): 23 | """ 24 | replaces forward method of the original Linear, instead of replacing the original Linear module. 25 | """ 26 | 27 | def __init__( 28 | self, 29 | lora_name, 30 | org_module: torch.nn.Module, 31 | multiplier=1.0, 32 | lora_dim=4, 33 | alpha=1, 34 | dropout=None, 35 | rank_dropout=None, 36 | module_dropout=None, 37 | ): 38 | """if alpha == 0 or None, alpha is rank (no scaling).""" 39 | super().__init__() 40 | self.lora_name = lora_name 41 | 42 | if org_module.__class__.__name__ == "Conv2d": 43 | in_dim = org_module.in_channels 44 | out_dim = org_module.out_channels 45 | else: 46 | in_dim = org_module.in_features 47 | out_dim = org_module.out_features 48 | 49 | self.lora_dim = lora_dim 50 | if org_module.__class__.__name__ == "Conv2d": 51 | kernel_size = org_module.kernel_size 52 | stride = org_module.stride 53 | padding = org_module.padding 54 | self.lora_down = torch.nn.Conv2d(in_dim, self.lora_dim, kernel_size, stride, padding, bias=False) 55 | self.lora_up = torch.nn.Conv2d(self.lora_dim, out_dim, (1, 1), (1, 1), bias=False) 56 | else: 57 | self.lora_down = torch.nn.Linear(in_dim, self.lora_dim, bias=False) 58 | self.lora_up = torch.nn.Linear(self.lora_dim, out_dim, bias=False) 59 | 60 | if type(alpha) == torch.Tensor: 61 | alpha = alpha.detach().float().numpy() # without casting, bf16 causes error 62 | alpha = self.lora_dim if alpha is None or alpha == 0 else alpha 63 | self.scale = alpha / self.lora_dim 64 | self.register_buffer("alpha", torch.tensor(alpha)) 65 | 66 | # same as microsoft's 67 | torch.nn.init.kaiming_uniform_(self.lora_down.weight, a=math.sqrt(5)) 68 | torch.nn.init.zeros_(self.lora_up.weight) 69 | 70 | self.multiplier = multiplier 71 | self.org_module = org_module # remove in applying 72 | self.dropout = dropout 73 | self.rank_dropout = rank_dropout 74 | self.module_dropout = module_dropout 75 | 76 | def apply_to(self): 77 | self.org_forward = self.org_module.forward 78 | self.org_module.forward = self.forward 79 | del self.org_module 80 | 81 | def forward(self, x, *args, **kwargs): 82 | weight_dtype = x.dtype 83 | org_forwarded = self.org_forward(x) 84 | 85 | # module dropout 86 | if self.module_dropout is not None and self.training: 87 | if torch.rand(1) < self.module_dropout: 88 | return org_forwarded 89 | 90 | lx = self.lora_down(x.to(self.lora_down.weight.dtype)) 91 | 92 | # normal dropout 93 | if self.dropout is not None and self.training: 94 | lx = torch.nn.functional.dropout(lx, p=self.dropout) 95 | 96 | # rank dropout 97 | if self.rank_dropout is not None and self.training: 98 | mask = torch.rand((lx.size(0), self.lora_dim), device=lx.device) > self.rank_dropout 99 | if len(lx.size()) == 3: 100 | mask = mask.unsqueeze(1) # for Text Encoder 101 | elif len(lx.size()) == 4: 102 | mask = mask.unsqueeze(-1).unsqueeze(-1) # for Conv2d 103 | lx = lx * mask 104 | 105 | # scaling for rank dropout: treat as if the rank is changed 106 | scale = self.scale * (1.0 / 
(1.0 - self.rank_dropout)) # redundant for readability 107 | else: 108 | scale = self.scale 109 | 110 | lx = self.lora_up(lx) 111 | 112 | return org_forwarded.to(weight_dtype) + lx.to(weight_dtype) * self.multiplier * scale 113 | 114 | 115 | def addnet_hash_legacy(b): 116 | """Old model hash used by sd-webui-additional-networks for .safetensors format files""" 117 | m = hashlib.sha256() 118 | 119 | b.seek(0x100000) 120 | m.update(b.read(0x10000)) 121 | return m.hexdigest()[0:8] 122 | 123 | 124 | def addnet_hash_safetensors(b): 125 | """New model hash used by sd-webui-additional-networks for .safetensors format files""" 126 | hash_sha256 = hashlib.sha256() 127 | blksize = 1024 * 1024 128 | 129 | b.seek(0) 130 | header = b.read(8) 131 | n = int.from_bytes(header, "little") 132 | 133 | offset = n + 8 134 | b.seek(offset) 135 | for chunk in iter(lambda: b.read(blksize), b""): 136 | hash_sha256.update(chunk) 137 | 138 | return hash_sha256.hexdigest() 139 | 140 | 141 | def precalculate_safetensors_hashes(tensors, metadata): 142 | """Precalculate the model hashes needed by sd-webui-additional-networks to 143 | save time on indexing the model later.""" 144 | 145 | # Because writing user metadata to the file can change the result of 146 | # sd_models.model_hash(), only retain the training metadata for purposes of 147 | # calculating the hash, as they are meant to be immutable 148 | metadata = {k: v for k, v in metadata.items() if k.startswith("ss_")} 149 | 150 | bytes = safetensors.torch.save(tensors, metadata) 151 | b = BytesIO(bytes) 152 | 153 | model_hash = addnet_hash_safetensors(b) 154 | legacy_hash = addnet_hash_legacy(b) 155 | return model_hash, legacy_hash 156 | 157 | 158 | class LoRANetwork(torch.nn.Module): 159 | TRANSFORMER_TARGET_REPLACE_MODULE = ["CogVideoXTransformer3DModel"] 160 | TEXT_ENCODER_TARGET_REPLACE_MODULE = ["T5LayerSelfAttention", "T5LayerFF", "BertEncoder"] 161 | LORA_PREFIX_TRANSFORMER = "lora_unet" 162 | LORA_PREFIX_TEXT_ENCODER = "lora_te" 163 | def __init__( 164 | self, 165 | text_encoder: Union[List[T5EncoderModel], T5EncoderModel], 166 | unet, 167 | multiplier: float = 1.0, 168 | lora_dim: int = 4, 169 | alpha: float = 1, 170 | dropout: Optional[float] = None, 171 | module_class: Type[object] = LoRAModule, 172 | add_lora_in_attn_temporal: bool = False, 173 | varbose: Optional[bool] = False, 174 | ) -> None: 175 | super().__init__() 176 | self.multiplier = multiplier 177 | 178 | self.lora_dim = lora_dim 179 | self.alpha = alpha 180 | self.dropout = dropout 181 | 182 | print(f"create LoRA network. 
base dim (rank): {lora_dim}, alpha: {alpha}") 183 | print(f"neuron dropout: p={self.dropout}") 184 | 185 | # create module instances 186 | def create_modules( 187 | is_unet: bool, 188 | root_module: torch.nn.Module, 189 | target_replace_modules: List[torch.nn.Module], 190 | ) -> List[LoRAModule]: 191 | prefix = ( 192 | self.LORA_PREFIX_TRANSFORMER 193 | if is_unet 194 | else self.LORA_PREFIX_TEXT_ENCODER 195 | ) 196 | loras = [] 197 | skipped = [] 198 | for name, module in root_module.named_modules(): 199 | if module.__class__.__name__ in target_replace_modules: 200 | for child_name, child_module in module.named_modules(): 201 | is_linear = child_module.__class__.__name__ == "Linear" or child_module.__class__.__name__ == "LoRACompatibleLinear" 202 | is_conv2d = child_module.__class__.__name__ == "Conv2d" or child_module.__class__.__name__ == "LoRACompatibleConv" 203 | is_conv2d_1x1 = is_conv2d and child_module.kernel_size == (1, 1) 204 | 205 | if not add_lora_in_attn_temporal: 206 | if "attn_temporal" in child_name: 207 | continue 208 | 209 | if is_linear or is_conv2d: 210 | lora_name = prefix + "." + name + "." + child_name 211 | lora_name = lora_name.replace(".", "_") 212 | 213 | dim = None 214 | alpha = None 215 | 216 | if is_linear or is_conv2d_1x1: 217 | dim = self.lora_dim 218 | alpha = self.alpha 219 | 220 | if dim is None or dim == 0: 221 | if is_linear or is_conv2d_1x1: 222 | skipped.append(lora_name) 223 | continue 224 | 225 | lora = module_class( 226 | lora_name, 227 | child_module, 228 | self.multiplier, 229 | dim, 230 | alpha, 231 | dropout=dropout, 232 | ) 233 | loras.append(lora) 234 | return loras, skipped 235 | 236 | text_encoders = text_encoder if type(text_encoder) == list else [text_encoder] 237 | 238 | self.text_encoder_loras = [] 239 | skipped_te = [] 240 | for i, text_encoder in enumerate(text_encoders): 241 | if text_encoder is not None: 242 | text_encoder_loras, skipped = create_modules(False, text_encoder, LoRANetwork.TEXT_ENCODER_TARGET_REPLACE_MODULE) 243 | self.text_encoder_loras.extend(text_encoder_loras) 244 | skipped_te += skipped 245 | print(f"create LoRA for Text Encoder: {len(self.text_encoder_loras)} modules.") 246 | 247 | self.unet_loras, skipped_un = create_modules(True, unet, LoRANetwork.TRANSFORMER_TARGET_REPLACE_MODULE) 248 | print(f"create LoRA for U-Net: {len(self.unet_loras)} modules.") 249 | 250 | # assertion 251 | names = set() 252 | for lora in self.text_encoder_loras + self.unet_loras: 253 | assert lora.lora_name not in names, f"duplicated lora name: {lora.lora_name}" 254 | names.add(lora.lora_name) 255 | 256 | def apply_to(self, text_encoder, unet, apply_text_encoder=True, apply_unet=True): 257 | if apply_text_encoder: 258 | print("enable LoRA for text encoder") 259 | else: 260 | self.text_encoder_loras = [] 261 | 262 | if apply_unet: 263 | print("enable LoRA for U-Net") 264 | else: 265 | self.unet_loras = [] 266 | 267 | for lora in self.text_encoder_loras + self.unet_loras: 268 | lora.apply_to() 269 | self.add_module(lora.lora_name, lora) 270 | 271 | def set_multiplier(self, multiplier): 272 | self.multiplier = multiplier 273 | for lora in self.text_encoder_loras + self.unet_loras: 274 | lora.multiplier = self.multiplier 275 | 276 | def load_weights(self, file): 277 | if os.path.splitext(file)[1] == ".safetensors": 278 | from safetensors.torch import load_file 279 | 280 | weights_sd = load_file(file) 281 | else: 282 | weights_sd = torch.load(file, map_location="cpu") 283 | info = self.load_state_dict(weights_sd, False) 284 | return info 285 
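    # Illustrative note (added, not part of the original file): LoRAModule.forward above
    # and merge_lora() further below apply the same low-rank update. With
    # scale = alpha / lora_dim, the effective weight is approximately
    #     W_eff = W_org + multiplier * scale * (lora_up.weight @ lora_down.weight)
    # merge_lora() bakes this product directly into the layer weights, while the
    # module-based path keeps the two factors separate at runtime.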
| 286 | def prepare_optimizer_params(self, text_encoder_lr, unet_lr, default_lr): 287 | self.requires_grad_(True) 288 | all_params = [] 289 | 290 | def enumerate_params(loras): 291 | params = [] 292 | for lora in loras: 293 | params.extend(lora.parameters()) 294 | return params 295 | 296 | if self.text_encoder_loras: 297 | param_data = {"params": enumerate_params(self.text_encoder_loras)} 298 | if text_encoder_lr is not None: 299 | param_data["lr"] = text_encoder_lr 300 | all_params.append(param_data) 301 | 302 | if self.unet_loras: 303 | param_data = {"params": enumerate_params(self.unet_loras)} 304 | if unet_lr is not None: 305 | param_data["lr"] = unet_lr 306 | all_params.append(param_data) 307 | 308 | return all_params 309 | 310 | def enable_gradient_checkpointing(self): 311 | pass 312 | 313 | def get_trainable_params(self): 314 | return self.parameters() 315 | 316 | def save_weights(self, file, dtype, metadata): 317 | if metadata is not None and len(metadata) == 0: 318 | metadata = None 319 | 320 | state_dict = self.state_dict() 321 | 322 | if dtype is not None: 323 | for key in list(state_dict.keys()): 324 | v = state_dict[key] 325 | v = v.detach().clone().to("cpu").to(dtype) 326 | state_dict[key] = v 327 | 328 | if os.path.splitext(file)[1] == ".safetensors": 329 | from safetensors.torch import save_file 330 | 331 | # Precalculate model hashes to save time on indexing 332 | if metadata is None: 333 | metadata = {} 334 | model_hash, legacy_hash = precalculate_safetensors_hashes(state_dict, metadata) 335 | metadata["sshs_model_hash"] = model_hash 336 | metadata["sshs_legacy_hash"] = legacy_hash 337 | 338 | save_file(state_dict, file, metadata) 339 | else: 340 | torch.save(state_dict, file) 341 | 342 | def create_network( 343 | multiplier: float, 344 | network_dim: Optional[int], 345 | network_alpha: Optional[float], 346 | text_encoder: Union[T5EncoderModel, List[T5EncoderModel]], 347 | transformer, 348 | neuron_dropout: Optional[float] = None, 349 | add_lora_in_attn_temporal: bool = False, 350 | **kwargs, 351 | ): 352 | if network_dim is None: 353 | network_dim = 4 # default 354 | if network_alpha is None: 355 | network_alpha = 1.0 356 | 357 | network = LoRANetwork( 358 | text_encoder, 359 | transformer, 360 | multiplier=multiplier, 361 | lora_dim=network_dim, 362 | alpha=network_alpha, 363 | dropout=neuron_dropout, 364 | add_lora_in_attn_temporal=add_lora_in_attn_temporal, 365 | varbose=True, 366 | ) 367 | return network 368 | 369 | def merge_lora(transformer, lora_path, multiplier, device='cpu', dtype=torch.float32, state_dict=None): 370 | LORA_PREFIX_TRANSFORMER = "lora_unet" 371 | LORA_PREFIX_TEXT_ENCODER = "lora_te" 372 | if state_dict is None: 373 | state_dict = load_file(lora_path, device=device) 374 | else: 375 | state_dict = state_dict 376 | updates = defaultdict(dict) 377 | for key, value in state_dict.items(): 378 | layer, elem = key.split('.', 1) 379 | updates[layer][elem] = value 380 | 381 | for layer, elems in updates.items(): 382 | 383 | # if "lora_te" in layer: 384 | # if transformer_only: 385 | # continue 386 | # else: 387 | # layer_infos = layer.split(LORA_PREFIX_TEXT_ENCODER + "_")[-1].split("_") 388 | # curr_layer = pipeline.text_encoder 389 | #else: 390 | layer_infos = layer.split(LORA_PREFIX_TRANSFORMER + "_")[-1].split("_") 391 | curr_layer = transformer 392 | 393 | temp_name = layer_infos.pop(0) 394 | while len(layer_infos) > -1: 395 | try: 396 | curr_layer = curr_layer.__getattr__(temp_name) 397 | if len(layer_infos) > 0: 398 | temp_name = layer_infos.pop(0) 
399 | elif len(layer_infos) == 0: 400 | break 401 | except Exception: 402 | if len(layer_infos) == 0: 403 | print('Error loading layer') 404 | if len(temp_name) > 0: 405 | temp_name += "_" + layer_infos.pop(0) 406 | else: 407 | temp_name = layer_infos.pop(0) 408 | 409 | weight_up = elems['lora_up.weight'].to(dtype).to(device) 410 | weight_down = elems['lora_down.weight'].to(dtype).to(device) 411 | if 'alpha' in elems.keys(): 412 | alpha = elems['alpha'].item() / weight_up.shape[1] 413 | else: 414 | alpha = 1.0 415 | 416 | curr_layer.weight.data = curr_layer.weight.data.to(device) 417 | try: 418 | if len(weight_up.shape) == 4: 419 | curr_layer.weight.data += multiplier * alpha * torch.mm(weight_up.squeeze(3).squeeze(2), 420 | weight_down.squeeze(3).squeeze(2)).unsqueeze( 421 | 2).unsqueeze(3) 422 | else: 423 | curr_layer.weight.data += multiplier * alpha * torch.mm(weight_up, weight_down) 424 | except: 425 | print(f"Could not apply LoRA weight in layer {layer}") 426 | 427 | return transformer 428 | 429 | # TODO: Refactor with merge_lora. 430 | def unmerge_lora(pipeline, lora_path, multiplier=1, device="cpu", dtype=torch.float32): 431 | """Unmerge state_dict in LoRANetwork from the pipeline in diffusers.""" 432 | LORA_PREFIX_UNET = "lora_unet" 433 | LORA_PREFIX_TEXT_ENCODER = "lora_te" 434 | state_dict = load_file(lora_path, device=device) 435 | 436 | updates = defaultdict(dict) 437 | for key, value in state_dict.items(): 438 | layer, elem = key.split('.', 1) 439 | updates[layer][elem] = value 440 | 441 | for layer, elems in updates.items(): 442 | 443 | if "lora_te" in layer: 444 | layer_infos = layer.split(LORA_PREFIX_TEXT_ENCODER + "_")[-1].split("_") 445 | curr_layer = pipeline.text_encoder 446 | else: 447 | layer_infos = layer.split(LORA_PREFIX_UNET + "_")[-1].split("_") 448 | curr_layer = pipeline.transformer 449 | 450 | temp_name = layer_infos.pop(0) 451 | while len(layer_infos) > -1: 452 | try: 453 | curr_layer = curr_layer.__getattr__(temp_name) 454 | if len(layer_infos) > 0: 455 | temp_name = layer_infos.pop(0) 456 | elif len(layer_infos) == 0: 457 | break 458 | except Exception: 459 | if len(layer_infos) == 0: 460 | print('Error loading layer') 461 | if len(temp_name) > 0: 462 | temp_name += "_" + layer_infos.pop(0) 463 | else: 464 | temp_name = layer_infos.pop(0) 465 | 466 | weight_up = elems['lora_up.weight'].to(dtype) 467 | weight_down = elems['lora_down.weight'].to(dtype) 468 | if 'alpha' in elems.keys(): 469 | alpha = elems['alpha'].item() / weight_up.shape[1] 470 | else: 471 | alpha = 1.0 472 | 473 | curr_layer.weight.data = curr_layer.weight.data.to(device) 474 | if len(weight_up.shape) == 4: 475 | curr_layer.weight.data -= multiplier * alpha * torch.mm(weight_up.squeeze(3).squeeze(2), 476 | weight_down.squeeze(3).squeeze(2)).unsqueeze(2).unsqueeze(3) 477 | else: 478 | curr_layer.weight.data -= multiplier * alpha * torch.mm(weight_up, weight_down) 479 | 480 | return pipeline 481 | 482 | def load_lora_into_transformer(lora, transformer): 483 | from peft import LoraConfig, set_peft_model_state_dict 484 | from peft.mapping import PEFT_TYPE_TO_TUNER_MAPPING 485 | from peft.tuners.tuners_utils import BaseTunerLayer 486 | from diffusers.utils.peft_utils import get_peft_kwargs 487 | from diffusers.utils.import_utils import is_peft_version 488 | from diffusers.utils.state_dict_utils import convert_unet_state_dict_to_peft 489 | 490 | state_dict_list = [] 491 | adapter_name_list = [] 492 | strength_list = [] 493 | lora_config_list = [] 494 | 495 | for l in lora: 496 | state_dict = 
load_file(l["path"]) 497 | adapter_name_list.append(l["name"]) 498 | strength_list.append(l["strength"]) 499 | 500 | keys = list(state_dict.keys()) 501 | transformer_keys = [k for k in keys if k.startswith("transformer")] 502 | state_dict = { 503 | k.replace(f"transformer.", ""): v for k, v in state_dict.items() if k in transformer_keys 504 | } 505 | 506 | # check with first key if is not in peft format 507 | first_key = next(iter(state_dict.keys())) 508 | if "lora_A" not in first_key: 509 | state_dict = convert_unet_state_dict_to_peft(state_dict) 510 | 511 | rank = {} 512 | for key, val in state_dict.items(): 513 | if "lora_B" in key: 514 | rank[key] = val.shape[1] 515 | lora_config_kwargs = get_peft_kwargs(rank, network_alpha_dict=None, peft_state_dict=state_dict) 516 | if "use_dora" in lora_config_kwargs: 517 | if lora_config_kwargs["use_dora"] and is_peft_version("<", "0.9.0"): 518 | raise ValueError( 519 | "You need `peft` 0.9.0 at least to use DoRA-enabled LoRAs. Please upgrade your installation of `peft`." 520 | ) 521 | else: 522 | lora_config_kwargs.pop("use_dora") 523 | 524 | lora_config_list.append(LoraConfig(**lora_config_kwargs)) 525 | state_dict_list.append(state_dict) 526 | 527 | 528 | peft_models = [] 529 | 530 | for i in range(len(lora_config_list)): 531 | tuner_cls = PEFT_TYPE_TO_TUNER_MAPPING[lora_config_list[i].peft_type] 532 | peft_model = tuner_cls(transformer, lora_config_list[i], adapter_name=adapter_name_list[i]) 533 | incompatible_keys = set_peft_model_state_dict(peft_model.model, state_dict_list[i], adapter_name_list[i]) 534 | 535 | if incompatible_keys is not None: 536 | # check only for unexpected keys 537 | unexpected_keys = getattr(incompatible_keys, "unexpected_keys", None) 538 | if unexpected_keys: 539 | print( 540 | f"Loading adapter weights from state_dict led to unexpected keys not found in the model: " 541 | f" {unexpected_keys}. " 542 | ) 543 | 544 | peft_models.append(peft_model) 545 | 546 | if len(peft_models) > 1: 547 | peft_models[0].add_weighted_adapter( 548 | adapters=adapter_name_list, 549 | weights=strength_list, 550 | combination_type="linear", 551 | adapter_name="combined_adapter" 552 | ) 553 | peft_models[0].set_adapter("combined_adapter") 554 | else: 555 | if strength_list[0] != 1.0: 556 | for module in transformer.modules(): 557 | if isinstance(module, BaseTunerLayer): 558 | #print(f"Setting strength for {module}") 559 | module.scale_layer(strength_list[0]) 560 | return peft_model.model -------------------------------------------------------------------------------- /mz_enable_vae_encode_tiling.py: -------------------------------------------------------------------------------- 1 | # thanks to MinusZoneAI: https://github.com/MinusZoneAI/ComfyUI-CogVideoX-MZ/blob/b98b98bd04621e4c85547866c12de2ec723ae98a/mz_enable_vae_encode_tiling.py 2 | from typing import Optional 3 | import torch 4 | from diffusers.utils.accelerate_utils import apply_forward_hook 5 | from diffusers.models.autoencoders.vae import DecoderOutput, DiagonalGaussianDistribution 6 | from diffusers.models.modeling_outputs import AutoencoderKLOutput 7 | 8 | 9 | @apply_forward_hook 10 | def encode( 11 | self, x: torch.Tensor, return_dict: bool = True 12 | ): 13 | """ 14 | Encode a batch of images into latents. 15 | Args: 16 | x (`torch.Tensor`): Input batch of images. 17 | return_dict (`bool`, *optional*, defaults to `True`): 18 | Whether to return a [`~models.autoencoder_kl.AutoencoderKLOutput`] instead of a plain tuple. 
19 | Returns: 20 | The latent representations of the encoded videos. If `return_dict` is True, a 21 | [`~models.autoencoder_kl.AutoencoderKLOutput`] is returned, otherwise a plain `tuple` is returned. 22 | """ 23 | if self.use_slicing and x.shape[0] > 1: 24 | encoded_slices = [self._encode(x_slice) for x_slice in x.split(1)] 25 | h = torch.cat(encoded_slices) 26 | else: 27 | h = self._encode(x) 28 | posterior = DiagonalGaussianDistribution(h) 29 | 30 | if not return_dict: 31 | return (posterior,) 32 | return AutoencoderKLOutput(latent_dist=posterior) 33 | 34 | 35 | def tiled_encode(self, x: torch.Tensor) -> torch.Tensor: 36 | r"""Encode a batch of images using a tiled encoder. 37 | When this option is enabled, the VAE will split the input tensor into tiles to compute encoding in several 38 | steps. This is useful to keep memory use constant regardless of image size. The end result of tiled encoding is 39 | different from non-tiled encoding because each tile uses a different encoder. To avoid tiling artifacts, the 40 | tiles overlap and are blended together to form a smooth output. You may still see tile-sized changes in the 41 | output, but they should be much less noticeable. 42 | Args: 43 | x (`torch.Tensor`): Input batch of videos. 44 | Returns: 45 | `torch.Tensor`: 46 | The latent representation of the encoded videos. 47 | """ 48 | # For a rough memory estimate, take a look at the `tiled_decode` method. 49 | batch_size, num_channels, num_frames, height, width = x.shape 50 | overlap_height = int(self.tile_sample_min_height * 51 | (1 - self.tile_overlap_factor_height)) 52 | overlap_width = int(self.tile_sample_min_width * 53 | (1 - self.tile_overlap_factor_width)) 54 | blend_extent_height = int( 55 | self.tile_latent_min_height * self.tile_overlap_factor_height) 56 | blend_extent_width = int( 57 | self.tile_latent_min_width * self.tile_overlap_factor_width) 58 | row_limit_height = self.tile_latent_min_height - blend_extent_height 59 | row_limit_width = self.tile_latent_min_width - blend_extent_width 60 | frame_batch_size = 4 61 | # Split x into overlapping tiles and encode them separately. 62 | # The tiles have an overlap to avoid seams between tiles. 63 | rows = [] 64 | for i in range(0, height, overlap_height): 65 | row = [] 66 | for j in range(0, width, overlap_width): 67 | # Note: We expect the number of frames to be either `1` or `frame_batch_size * k` or `frame_batch_size * k + 1` for some k. 
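            # Illustrative worked example (added note, not in the original source): with
            # num_frames = 49 and frame_batch_size = 4, num_batches = 12 and
            # remaining_frames = 1, so the first temporal chunk covers frames 0..4
            # (5 frames) and each of the remaining 11 chunks covers 4 frames,
            # i.e. 5 + 11 * 4 = 49 frames in total.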
68 | num_batches = num_frames // frame_batch_size if num_frames > 1 else 1 69 | time = [] 70 | for k in range(num_batches): 71 | remaining_frames = num_frames % frame_batch_size 72 | start_frame = frame_batch_size * k + \ 73 | (0 if k == 0 else remaining_frames) 74 | end_frame = frame_batch_size * (k + 1) + remaining_frames 75 | tile = x[ 76 | :, 77 | :, 78 | start_frame:end_frame, 79 | i: i + self.tile_sample_min_height, 80 | j: j + self.tile_sample_min_width, 81 | ] 82 | 83 | tile = self.encoder(tile) 84 | if not isinstance(tile, tuple): 85 | tile = (tile,) 86 | if self.quant_conv is not None: 87 | tile = self.quant_conv(tile) 88 | time.append(tile[0]) 89 | try: 90 | self._clear_fake_context_parallel_cache() 91 | except: 92 | pass 93 | row.append(torch.cat(time, dim=2)) 94 | rows.append(row) 95 | result_rows = [] 96 | for i, row in enumerate(rows): 97 | result_row = [] 98 | for j, tile in enumerate(row): 99 | # blend the above tile and the left tile 100 | # to the current tile and add the current tile to the result row 101 | if i > 0: 102 | tile = self.blend_v( 103 | rows[i - 1][j], tile, blend_extent_height) 104 | if j > 0: 105 | tile = self.blend_h(row[j - 1], tile, blend_extent_width) 106 | result_row.append( 107 | tile[:, :, :, :row_limit_height, :row_limit_width]) 108 | result_rows.append(torch.cat(result_row, dim=4)) 109 | enc = torch.cat(result_rows, dim=3) 110 | return enc 111 | 112 | 113 | def _encode( 114 | self, x: torch.Tensor, return_dict: bool = True 115 | ): 116 | batch_size, num_channels, num_frames, height, width = x.shape 117 | 118 | if self.use_encode_tiling and (width > self.tile_sample_min_width or height > self.tile_sample_min_height): 119 | return self.tiled_encode(x) 120 | 121 | if num_frames == 1: 122 | h = self.encoder(x) 123 | if self.quant_conv is not None: 124 | h = self.quant_conv(h) 125 | posterior = DiagonalGaussianDistribution(h) 126 | else: 127 | frame_batch_size = 4 128 | h = [] 129 | for i in range(num_frames // frame_batch_size): 130 | remaining_frames = num_frames % frame_batch_size 131 | start_frame = frame_batch_size * i + \ 132 | (0 if i == 0 else remaining_frames) 133 | end_frame = frame_batch_size * (i + 1) + remaining_frames 134 | z_intermediate = x[:, :, start_frame:end_frame] 135 | z_intermediate = self.encoder(z_intermediate) 136 | if self.quant_conv is not None: 137 | z_intermediate = self.quant_conv(z_intermediate) 138 | h.append(z_intermediate) 139 | try: 140 | self._clear_fake_context_parallel_cache() 141 | except: 142 | pass 143 | h = torch.cat(h, dim=2) 144 | return h 145 | 146 | 147 | def enable_encode_tiling( 148 | self, 149 | tile_sample_min_height: Optional[int] = None, 150 | tile_sample_min_width: Optional[int] = None, 151 | tile_overlap_factor_height: Optional[float] = None, 152 | tile_overlap_factor_width: Optional[float] = None, 153 | ) -> None: 154 | r""" 155 | Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to 156 | compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow 157 | processing larger images. 158 | 159 | Args: 160 | tile_sample_min_height (`int`, *optional*): 161 | The minimum height required for a sample to be separated into tiles across the height dimension. 162 | tile_sample_min_width (`int`, *optional*): 163 | The minimum width required for a sample to be separated into tiles across the width dimension. 
164 | tile_overlap_factor_height (`int`, *optional*): 165 | The minimum amount of overlap between two consecutive vertical tiles. This is to ensure that there are 166 | no tiling artifacts produced across the height dimension. Must be between 0 and 1. Setting a higher 167 | value might cause more tiles to be processed leading to slow down of the decoding process. 168 | tile_overlap_factor_width (`int`, *optional*): 169 | The minimum amount of overlap between two consecutive horizontal tiles. This is to ensure that there 170 | are no tiling artifacts produced across the width dimension. Must be between 0 and 1. Setting a higher 171 | value might cause more tiles to be processed leading to slow down of the decoding process. 172 | """ 173 | self.use_encode_tiling = True 174 | self.tile_sample_min_height = tile_sample_min_height or self.tile_sample_min_height 175 | self.tile_sample_min_width = tile_sample_min_width or self.tile_sample_min_width 176 | self.tile_latent_min_height = int( 177 | self.tile_sample_min_height / 178 | (2 ** (len(self.config.block_out_channels) - 1)) 179 | ) 180 | self.tile_latent_min_width = int( 181 | self.tile_sample_min_width / (2 ** (len(self.config.block_out_channels) - 1))) 182 | self.tile_overlap_factor_height = tile_overlap_factor_height or self.tile_overlap_factor_height 183 | self.tile_overlap_factor_width = tile_overlap_factor_width or self.tile_overlap_factor_width 184 | 185 | 186 | from types import MethodType 187 | 188 | 189 | def enable_vae_encode_tiling(vae): 190 | vae.encode = MethodType(encode, vae) 191 | setattr(vae, "_encode", MethodType(_encode, vae)) 192 | setattr(vae, "tiled_encode", MethodType(tiled_encode, vae)) 193 | setattr(vae, "use_encode_tiling", True) 194 | 195 | setattr(vae, "enable_encode_tiling", MethodType(enable_encode_tiling, vae)) 196 | vae.enable_encode_tiling() 197 | return vae 198 | -------------------------------------------------------------------------------- /mz_gguf_loader.py: -------------------------------------------------------------------------------- 1 | # https://github.com/MinusZoneAI/ComfyUI-CogVideoX-MZ/blob/9616415220fd09388622f40f6609e4ed81f048a5/mz_gguf_loader.py 2 | 3 | import torch 4 | import torch.nn as nn 5 | import gc 6 | 7 | 8 | class quantize_lazy_load(): 9 | def __init__(self): 10 | self.device = None 11 | 12 | def __enter__(self): 13 | self.device = torch.device("meta") 14 | self.device.__enter__() 15 | return self 16 | 17 | def __exit__(self, exc_type, exc_value, traceback): 18 | self.device.__exit__(exc_type, exc_value, traceback) 19 | 20 | 21 | def quantize_load_state_dict(model, state_dict, device="cpu"): 22 | quant_keys = [] 23 | for key in state_dict.keys(): 24 | if key.endswith(".Q4_0_qweight"): 25 | quant_keys.append(key.replace(".Q4_0_qweight", "")) 26 | qtype = "Q4_0" 27 | elif key.endswith(".Q8_0_qweight"): 28 | quant_keys.append(key.replace(".Q8_0_qweight", "")) 29 | qtype = "Q8_0" 30 | 31 | for name, module in model.named_modules(): 32 | if name in quant_keys: 33 | q_linear = WQLinear_GGUF.from_linear( 34 | linear=module, 35 | device=device, 36 | qtype=qtype, 37 | ) 38 | set_op_by_name(model, name, q_linear) 39 | 40 | model.to_empty(device=device) 41 | model.load_state_dict(state_dict, strict=False) 42 | model.to(device) 43 | return model 44 | 45 | 46 | def set_op_by_name(layer, name, new_module): 47 | levels = name.split(".") 48 | if len(levels) > 1: 49 | mod_ = layer 50 | for l_idx in range(len(levels) - 1): 51 | if levels[l_idx].isdigit(): 52 | mod_ = mod_[int(levels[l_idx])] 53 | else: 
54 | mod_ = getattr(mod_, levels[l_idx]) 55 | setattr(mod_, levels[-1], new_module) 56 | else: 57 | setattr(layer, name, new_module) 58 | 59 | 60 | import torch.nn.functional as F 61 | 62 | 63 | class WQLinear_GGUF(nn.Module): 64 | def __init__( 65 | self, in_features, out_features, bias, dev, qtype="Q4_0" 66 | ): 67 | super().__init__() 68 | 69 | self.in_features = in_features 70 | self.out_features = out_features 71 | self.qtype = qtype 72 | 73 | qweight_shape = quant_shape_to_byte_shape( 74 | (out_features, in_features), qtype 75 | ) 76 | self.register_buffer( 77 | f"{qtype}_qweight", 78 | torch.zeros( 79 | qweight_shape, 80 | dtype=torch.uint8, 81 | device=dev, 82 | ), 83 | ) 84 | if bias: 85 | self.register_buffer( 86 | "bias", 87 | torch.zeros( 88 | (out_features), 89 | dtype=torch.float16, 90 | device=dev, 91 | ), 92 | ) 93 | else: 94 | self.bias = None 95 | 96 | @classmethod 97 | def from_linear( 98 | cls, linear, 99 | device="cpu", 100 | qtype="Q4_0", 101 | ): 102 | q_linear = cls( 103 | linear.in_features, 104 | linear.out_features, 105 | linear.bias is not None, 106 | device, 107 | qtype=qtype, 108 | ) 109 | return q_linear 110 | 111 | def extra_repr(self) -> str: 112 | return ( 113 | "in_features={}, out_features={}, bias={}, w_bit={}, group_size={}".format( 114 | self.in_features, 115 | self.out_features, 116 | self.bias is not None, 117 | self.w_bit, 118 | self.group_size, 119 | ) 120 | ) 121 | 122 | @torch.no_grad() 123 | def forward(self, x): 124 | if self.qtype == "Q4_0": 125 | dequant = dequantize_blocks_Q4_0(self.Q4_0_qweight, x.dtype) 126 | elif self.qtype == "Q8_0": 127 | dequant = dequantize_blocks_Q8_0(self.Q8_0_qweight, x.dtype) 128 | else: 129 | raise ValueError(f"Unknown qtype: {self.qtype}") 130 | 131 | return F.linear(x, dequant, bias=self.bias.to(x.dtype) if self.bias is not None else None) 132 | 133 | 134 | def split_block_dims(blocks, *args): 135 | n_max = blocks.shape[1] 136 | dims = list(args) + [n_max - sum(args)] 137 | return torch.split(blocks, dims, dim=1) 138 | 139 | 140 | def quant_shape_to_byte_shape(shape, qtype) -> tuple[int, ...]: 141 | # shape = shape[::-1] 142 | block_size, type_size = GGML_QUANT_SIZES[qtype] 143 | if shape[-1] % block_size != 0: 144 | raise ValueError( 145 | f"Quantized tensor row size ({shape[-1]}) is not a multiple of Q4_0 block size ({block_size})") 146 | return (*shape[:-1], shape[-1] // block_size * type_size) 147 | 148 | 149 | def quant_shape_from_byte_shape(shape, qtype) -> tuple[int, ...]: 150 | # shape = shape[::-1] 151 | block_size, type_size = GGML_QUANT_SIZES[qtype] 152 | if shape[-1] % type_size != 0: 153 | raise ValueError( 154 | f"Quantized tensor bytes per row ({shape[-1]}) is not a multiple of Q4_0 type size ({type_size})") 155 | return (*shape[:-1], shape[-1] // type_size * block_size) 156 | 157 | 158 | GGML_QUANT_SIZES = { 159 | "Q4_0": (32, 2 + 16), 160 | "Q8_0": (32, 2 + 32), 161 | } 162 | 163 | 164 | def dequantize_blocks_Q4_0(data, dtype=torch.float16): 165 | block_size, type_size = GGML_QUANT_SIZES["Q4_0"] 166 | 167 | data = data.to(torch.uint8) 168 | shape = data.shape 169 | 170 | rows = data.reshape( 171 | (-1, data.shape[-1]) 172 | ).view(torch.uint8) 173 | 174 | n_blocks = rows.numel() // type_size 175 | blocks = data.reshape((n_blocks, type_size)) 176 | 177 | n_blocks = blocks.shape[0] 178 | 179 | d, qs = split_block_dims(blocks, 2) 180 | d = d.view(torch.float16) 181 | 182 | qs = qs.reshape((n_blocks, -1, 1, block_size // 2)) >> torch.tensor( 183 | [0, 4], device=d.device, 
dtype=torch.uint8).reshape((1, 1, 2, 1)) 184 | qs = (qs & 0x0F).reshape((n_blocks, -1)).to(torch.int8) - 8 185 | 186 | out = (d * qs) 187 | 188 | out = out.reshape(quant_shape_from_byte_shape( 189 | shape, 190 | qtype="Q4_0", 191 | )).to(dtype) 192 | return out 193 | 194 | def dequantize_blocks_Q8_0(data, dtype=torch.float16): 195 | block_size, type_size = GGML_QUANT_SIZES["Q8_0"] 196 | 197 | data = data.to(torch.uint8) 198 | shape = data.shape 199 | 200 | rows = data.reshape( 201 | (-1, data.shape[-1]) 202 | ).view(torch.uint8) 203 | 204 | n_blocks = rows.numel() // type_size 205 | blocks = data.reshape((n_blocks, type_size)) 206 | 207 | n_blocks = blocks.shape[0] 208 | 209 | d, qs = split_block_dims(blocks, 2) 210 | d = d.view(torch.float16).to(torch.float32) 211 | 212 | qs = qs.view(torch.int8).to(torch.float32) 213 | 214 | out = (d * qs) 215 | 216 | out = out.reshape(quant_shape_from_byte_shape( 217 | shape, 218 | qtype="Q8_0", 219 | )).to(dtype) 220 | return out 221 | 222 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "comfyui-cogvideoxwrapper" 3 | description = "Diffusers wrapper for CogVideoX -models: https://github.com/THUDM/CogVideo" 4 | version = "1.5.1" 5 | license = {file = "LICENSE"} 6 | dependencies = ["huggingface_hub", "diffusers>=0.31.0", "accelerate>=0.33.0"] 7 | 8 | [project.urls] 9 | Repository = "https://github.com/kijai/ComfyUI-CogVideoXWrapper" 10 | # Used by Comfy Registry https://comfyregistry.org 11 | 12 | [tool.comfy] 13 | PublisherId = "kijai" 14 | DisplayName = "ComfyUI-CogVideoXWrapper" 15 | Icon = "" 16 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | # WORK IN PROGRESS 2 | 3 | Spreadsheet (WIP) of supported models and their supported features: https://docs.google.com/spreadsheets/d/16eA6mSL8XkTcu9fSWkPSHfRIqyAKJbR1O99xnuGdCKY/edit?usp=sharing 4 | 5 | ## Update 9 6 | Added preliminary support for [Go-with-the-Flow](https://github.com/VGenAI-Netflix-Eyeline-Research/Go-with-the-Flow) 7 | 8 | This uses LoRA weights available here: https://huggingface.co/Eyeline-Research/Go-with-the-Flow/tree/main 9 | 10 | To create the input videos for the NoiseWarp process, I've added a node to KJNodes that works alongside my SplineEditor, and either [comfyui-inpaint-nodes](https://github.com/Acly/comfyui-inpaint-nodes) or just cv2 inpainting to create the cut and drag input videos. 11 | 12 | The workflows are in the example_workflows -folder. 13 | 14 | Quick video to showcase: First mask the subject, then use the cut and drag -workflow to create a video as seen here, then that video is used as input to the NoiseWarp node in the main workflow. 15 | 16 | https://github.com/user-attachments/assets/112706b0-a38b-4c3c-b779-deba0827af4f 17 | 18 | ## BREAKING Update8 19 | 20 | This is big one, and unfortunately to do the necessary cleanup and refactoring this will break every old workflow as they are. 21 | I apologize for the inconvenience, if I don't do this now I'll keep making it worse until maintaining becomes too much of a chore, so from my pov there was no choice. 
22 | 23 | *Please either use the new workflows or fix the nodes in your old ones before posting issue reports!* 24 | 25 | The old version will be kept in a legacy branch, but not maintained 26 | 27 | - Support CogVideoX 1.5 models 28 | - Major code cleanup (it was bad, still isn't great, wip) 29 | - Merge Fun -model functionality into main pipeline: 30 | - All Fun specific nodes, besides the image encode node for Fun -InP models, are gone 31 | - Main CogVideo Sampler works with Fun models 32 | - DimensionX LoRAs now work with Fun models as well 33 | 34 | - Remove width/height from the sampler widgets and detect them from the input instead, which means text2vid now requires using empty latents 35 | - Separate the VAE from the model, allow using an fp32 VAE 36 | - Add the ability to load some of the non-GGUF models as single files (only a few available for now: https://huggingface.co/Kijai/CogVideoX-comfy) 37 | - Add some torchao quantizations as options 38 | - Add interpolation as an option for the main encode node, the old interpolation-specific node is gone 39 | - torch.compile optimizations 40 | - Remove PAB in favor of FasterCache and cleaner code 41 | - other smaller things I forgot about at this point 42 | 43 | For Fun -model based workflows it's a more drastic change; for others, migrating generally means re-setting many of the nodes. 44 | 45 | ## Update7 46 | 47 | - Refactored the Fun version's sampler to accept any resolution, which should make it a lot simpler to use with Tora. **BREAKS OLD WORKFLOWS**, old FunSampler nodes need to be remade. 48 | - The old bucket resizing is now on its own node (CogVideoXFunResizeToClosestBucket) to keep the functionality; I honestly don't know if it matters at all, but just in case. 49 | - The Fun version's vid2vid is now also in the same node, the old vid2vid node is deprecated. 50 | - Added support for FasterCache, which trades more VRAM use for speed with a slight quality hit, similar to PAB: https://github.com/Vchitect/FasterCache 51 | - Improved torch.compile support, it actually works now 52 | 53 | ## Update6 54 | 55 | Initial support for Tora (https://github.com/alibaba/Tora) 56 | 57 | Converted model (included in the autodownload node): 58 | 59 | https://huggingface.co/Kijai/CogVideoX-5b-Tora/tree/main 60 | 61 | 62 | https://github.com/user-attachments/assets/d5334237-03dc-48f5-8bec-3ae5998660c6 63 | 64 | 65 | ## Update5 66 | This week there have been some bigger updates that will most likely affect some old workflows; the sampler node especially will probably need to be refreshed (re-created) if it errors out!
67 | 68 | New features: 69 | - Initial context windowing with FreeNoise noise shuffling, mainly for vid2vid and pose2vid pipelines for longer generations; haven't figured it out for img2vid yet 70 | - GGUF models and tiled encoding for I2V and pose pipelines (thanks to MinusZoneAI) 71 | - [sageattention](https://github.com/thu-ml/SageAttention) support (Linux only) for a speed boost, I experienced a ~20-30% increase with it, stacks with fp8 fast mode, doesn't need compiling 72 | - Support CogVideoX-Fun 1.1 and its pose models with additional control strength and application step settings; this model's input does NOT have to be just dwpose skeletons, just about anything can work 73 | - Support LoRAs 74 | 75 | https://github.com/user-attachments/assets/ddeb8f38-a647-42b3-a4b1-c6936f961deb 76 | 77 | https://github.com/user-attachments/assets/c78b2832-9571-4941-8c97-fbcc1a4cc23d 78 | 79 | https://github.com/user-attachments/assets/d9ed98b1-f917-432b-a16e-e01e87efb1f9 80 | 81 | 82 | 83 | ## Update4 84 | Initial support for the official I2V version of CogVideoX: https://huggingface.co/THUDM/CogVideoX-5b-I2V 85 | 86 | **Also needs diffusers 0.30.3** 87 | 88 | https://github.com/user-attachments/assets/c672d0af-a676-495d-a42c-7e3dd802b4b0 89 | 90 | 91 | 92 | ## Update3 93 | 94 | Added initial support for CogVideoX-Fun: https://github.com/aigc-apps/CogVideoX-Fun 95 | 96 | Note that while this one can do image2vid, this is NOT the official I2V model yet, though it should also be released very soon. 97 | 98 | https://github.com/user-attachments/assets/68f9ed16-ee53-4955-b931-1799461ac561 99 | 100 | 101 | ## Update2 102 | 103 | Added **experimental** support for onediff; this reduced sampling time by ~40% for me, reaching 4.23 s/it on a 4090 with 49 frames. 104 | This requires Linux, torch 2.4.0, and an onediff and nexfort installation: 105 | 106 | `pip install --pre onediff onediffx` 107 | 108 | `pip install nexfort` 109 | 110 | The first run will take around 5 mins for the compilation. 111 | 112 | ## Update 113 | The 5b model is now also supported for basic text2vid: https://huggingface.co/THUDM/CogVideoX-5b 114 | 115 | It is also autodownloaded to `ComfyUI/models/CogVideo/CogVideoX-5b`; the text encoder is not needed as we use the ComfyUI T5. 116 | 117 | https://github.com/user-attachments/assets/991205cc-826e-4f93-831a-c10441f0f2ce 118 | 119 | Requires diffusers 0.30.1 (this is specified in requirements.txt) 120 | 121 | Uses the same T5 model as SD3 and Flux; fp8 works fine too. Memory requirements depend mostly on the video length. 122 | VAE decoding seems to be the only big step that takes a lot of VRAM when everything is offloaded; it peaks at around 13-14GB momentarily at that stage. 123 | Sampling itself takes only maybe 5-6GB. 124 | 125 | 126 | Hacked in img2img to attempt a vid2vid workflow, it works interestingly with some inputs, highly experimental.
127 | 128 | https://github.com/user-attachments/assets/e6951ef4-ea7a-4752-94f6-cf24f2503d83 129 | 130 | https://github.com/user-attachments/assets/9e41f37b-2bb3-411c-81fa-e91b80da2559 131 | 132 | Also added temporal tiling as means of generating endless videos: 133 | 134 | https://github.com/kijai/ComfyUI-CogVideoXWrapper 135 | 136 | https://github.com/user-attachments/assets/ecdac8b8-d434-48b6-abd6-90755b6b552d 137 | 138 | 139 | 140 | Original repo: 141 | https://github.com/THUDM/CogVideo 142 | 143 | CogVideoX-Fun: 144 | https://github.com/aigc-apps/CogVideoX-Fun 145 | 146 | Controlnet: 147 | https://github.com/TheDenk/cogvideox-controlnet 148 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | huggingface_hub 2 | diffusers>=0.31.0 3 | accelerate>=0.33.0 4 | einops 5 | peft 6 | opencv-python -------------------------------------------------------------------------------- /tora/traj_module.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from einops import rearrange, reduce 5 | 6 | 7 | def avg_pool_nd(dims, *args, **kwargs): 8 | """ 9 | Create a 1D, 2D, or 3D average pooling module. 10 | """ 11 | if dims == 1: 12 | return nn.AvgPool1d(*args, **kwargs) 13 | elif dims == 2: 14 | return nn.AvgPool2d(*args, **kwargs) 15 | elif dims == 3: 16 | return nn.AvgPool3d(*args, **kwargs) 17 | raise ValueError(f"unsupported dimensions: {dims}") 18 | 19 | 20 | def conv_nd(dims, *args, **kwargs): 21 | """ 22 | Create a 1D, 2D, or 3D convolution module. 23 | """ 24 | if dims == 1: 25 | return nn.Conv1d(*args, **kwargs) 26 | elif dims == 2: 27 | return nn.Conv2d(*args, **kwargs) 28 | elif dims == 3: 29 | return nn.Conv3d(*args, **kwargs) 30 | raise ValueError(f"unsupported dimensions: {dims}") 31 | 32 | 33 | class Downsample(nn.Module): 34 | """ 35 | A downsampling layer with an optional convolution. 36 | :param channels: channels in the inputs and outputs. 37 | :param use_conv: a bool determining if a convolution is applied. 38 | :param dims: determines if the signal is 1D, 2D, or 3D. If 3D, then 39 | downsampling occurs in the inner-two dimensions. 
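    Illustrative example (added note, not in the original docstring): with
    channels=64, use_conv=True and dims=2, this layer applies a 3x3 convolution
    with stride 2, roughly halving the spatial resolution while keeping 64 channels.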
40 | """ 41 | 42 | def __init__(self, channels, use_conv, dims=2, out_channels=None, padding=1): 43 | super().__init__() 44 | self.channels = channels 45 | self.out_channels = out_channels or channels 46 | self.use_conv = use_conv 47 | self.dims = dims 48 | stride = 2 if dims != 3 else (1, 2, 2) 49 | if use_conv: 50 | self.op = conv_nd( 51 | dims, 52 | self.channels, 53 | self.out_channels, 54 | 3, 55 | stride=stride, 56 | padding=padding, 57 | ) 58 | else: 59 | assert self.channels == self.out_channels 60 | self.op = avg_pool_nd(dims, kernel_size=stride, stride=stride) 61 | 62 | def forward(self, x): 63 | assert x.shape[1] == self.channels 64 | return self.op(x) 65 | 66 | 67 | class ResnetBlock(nn.Module): 68 | def __init__(self, in_c, out_c, down, ksize=3, sk=False, use_conv=True): 69 | super().__init__() 70 | ps = ksize // 2 71 | if in_c != out_c or sk == False: 72 | self.in_conv = nn.Conv2d(in_c, out_c, ksize, 1, ps) 73 | else: 74 | # print('n_in') 75 | self.in_conv = None 76 | self.block1 = nn.Conv2d(out_c, out_c, 3, 1, 1) 77 | self.act = nn.ReLU() 78 | self.block2 = nn.Conv2d(out_c, out_c, ksize, 1, ps) 79 | self.bn1 = nn.BatchNorm2d(out_c) 80 | self.bn2 = nn.BatchNorm2d(out_c) 81 | if sk == False: 82 | # self.skep = nn.Conv2d(in_c, out_c, ksize, 1, ps) # edit by zhouxiawang 83 | self.skep = nn.Conv2d(out_c, out_c, ksize, 1, ps) 84 | else: 85 | self.skep = None 86 | 87 | self.down = down 88 | if self.down == True: 89 | self.down_opt = Downsample(in_c, use_conv=use_conv) 90 | 91 | def forward(self, x): 92 | if self.down == True: 93 | x = self.down_opt(x) 94 | if self.in_conv is not None: # edit 95 | x = self.in_conv(x) 96 | 97 | h = self.bn1(x) 98 | h = self.act(h) 99 | h = self.block1(h) 100 | h = self.bn2(h) 101 | h = self.act(h) 102 | h = self.block2(h) 103 | if self.skep is not None: 104 | return h + self.skep(x) 105 | else: 106 | return h + x 107 | 108 | 109 | class VAESpatialEmulator(nn.Module): 110 | def __init__(self, kernel_size=(8, 8)): 111 | super().__init__() 112 | self.kernel_size = kernel_size 113 | 114 | def forward(self, x): 115 | """ 116 | x: torch.Tensor: shape [B C T H W] 117 | """ 118 | Hp, Wp = self.kernel_size 119 | H, W = x.shape[-2], x.shape[-1] 120 | valid_h = H - H % Hp 121 | valid_w = W - W % Wp 122 | x = x[..., :valid_h, :valid_w] 123 | x = rearrange( 124 | x, 125 | "B C T (Nh Hp) (Nw Wp) -> B (Hp Wp C) T Nh Nw", 126 | Hp=Hp, 127 | Wp=Wp, 128 | ) 129 | return x 130 | 131 | 132 | class VAETemporalEmulator(nn.Module): 133 | def __init__(self, micro_frame_size, kernel_size=4): 134 | super().__init__() 135 | self.micro_frame_size = micro_frame_size 136 | self.kernel_size = kernel_size 137 | 138 | def forward(self, x_z): 139 | """ 140 | x_z: torch.Tensor: shape [B C T H W] 141 | """ 142 | 143 | z_list = [] 144 | for i in range(0, x_z.shape[2], self.micro_frame_size): 145 | x_z_bs = x_z[:, :, i : i + self.micro_frame_size] 146 | z_list.append(x_z_bs[:, :, 0:1]) 147 | x_z_bs = x_z_bs[:, :, 1:] 148 | t_valid = x_z_bs.shape[2] - x_z_bs.shape[2] % self.kernel_size 149 | x_z_bs = x_z_bs[:, :, :t_valid] 150 | x_z_bs = reduce(x_z_bs, "B C (T n) H W -> B C T H W", n=self.kernel_size, reduction="mean") 151 | z_list.append(x_z_bs) 152 | z = torch.cat(z_list, dim=2) 153 | return z 154 | 155 | 156 | class TrajExtractor(nn.Module): 157 | def __init__( 158 | self, 159 | vae_downsize=(4, 8, 8), 160 | patch_size=2, 161 | channels=[320, 640, 1280, 1280], 162 | nums_rb=3, 163 | cin=2, 164 | ksize=3, 165 | sk=False, 166 | use_conv=True, 167 | ): 168 | super(TrajExtractor, 
self).__init__() 169 | self.vae_downsize = vae_downsize 170 | # self.vae_spatial_emulator = VAESpatialEmulator(kernel_size=vae_downsize[-2:]) 171 | self.downsize_patchify = nn.PixelUnshuffle(patch_size) 172 | self.patch_size = (1, patch_size, patch_size) 173 | self.channels = channels 174 | self.nums_rb = nums_rb 175 | self.body = [] 176 | for i in range(len(channels)): 177 | for j in range(nums_rb): 178 | if (i != 0) and (j == 0): 179 | self.body.append( 180 | ResnetBlock( 181 | channels[i - 1], 182 | channels[i], 183 | down=False, 184 | ksize=ksize, 185 | sk=sk, 186 | use_conv=use_conv, 187 | ) 188 | ) 189 | else: 190 | self.body.append( 191 | ResnetBlock( 192 | channels[i], 193 | channels[i], 194 | down=False, 195 | ksize=ksize, 196 | sk=sk, 197 | use_conv=use_conv, 198 | ) 199 | ) 200 | self.body = nn.ModuleList(self.body) 201 | cin_ = cin * patch_size**2 202 | self.conv_in = nn.Conv2d(cin_, channels[0], 3, 1, 1) 203 | 204 | # Initialize weights 205 | def conv_init(module): 206 | if isinstance(module, (nn.Conv2d, nn.Conv1d)): 207 | nn.init.kaiming_normal_(module.weight, nonlinearity="relu") 208 | if module.bias is not None: 209 | nn.init.constant_(module.bias, 0) 210 | 211 | self.apply(conv_init) 212 | 213 | def forward(self, x): 214 | """ 215 | x: torch.Tensor: shape [B C T H W] 216 | """ 217 | # downsize 218 | T, H, W = x.shape[-3:] 219 | if W % self.patch_size[2] != 0: 220 | x = F.pad(x, (0, self.patch_size[2] - W % self.patch_size[2])) 221 | if H % self.patch_size[1] != 0: 222 | x = F.pad(x, (0, 0, 0, self.patch_size[1] - H % self.patch_size[1])) 223 | if T % self.patch_size[0] != 0: 224 | x = F.pad( 225 | x, 226 | (0, 0, 0, 0, 0, self.patch_size[0] - T % self.patch_size[0]), 227 | ) 228 | x = rearrange(x, "B C T H W -> (B T) C H W") 229 | x = self.downsize_patchify(x) 230 | 231 | # extract features 232 | features = [] 233 | x = self.conv_in(x) 234 | for i in range(len(self.channels)): 235 | for j in range(self.nums_rb): 236 | idx = i * self.nums_rb + j 237 | x = self.body[idx](x) 238 | features.append(x) 239 | 240 | return features 241 | 242 | 243 | class FloatGroupNorm(nn.GroupNorm): 244 | def forward(self, x): 245 | return super().forward(x.to(self.bias.dtype)).type(x.dtype) 246 | 247 | 248 | def zero_module(module): 249 | """ 250 | Zero out the parameters of a module and return it. 
251 |     """ 252 |     for p in module.parameters(): 253 |         p.detach().zero_() 254 |     return module 255 | 256 | 257 | class MGF(nn.Module): 258 |     def __init__(self, flow_in_channel=128, out_channels=1152): 259 |         super().__init__() 260 |         self.out_channels = out_channels 261 |         self.flow_gamma_spatial = nn.Conv2d(flow_in_channel, self.out_channels // 4, 3, padding=1) 262 |         self.flow_gamma_temporal = zero_module( 263 |             nn.Conv1d( 264 |                 self.out_channels // 4, 265 |                 self.out_channels, 266 |                 kernel_size=3, 267 |                 stride=1, 268 |                 padding=1, 269 |                 padding_mode="replicate", 270 |             ) 271 |         ) 272 |         self.flow_beta_spatial = nn.Conv2d(flow_in_channel, self.out_channels // 4, 3, padding=1) 273 |         self.flow_beta_temporal = zero_module( 274 |             nn.Conv1d( 275 |                 self.out_channels // 4, 276 |                 self.out_channels, 277 |                 kernel_size=3, 278 |                 stride=1, 279 |                 padding=1, 280 |                 padding_mode="replicate", 281 |             ) 282 |         ) 283 |         self.flow_cond_norm = FloatGroupNorm(32, self.out_channels) 284 | 285 |     def forward(self, h, flow, T): 286 |         if flow is not None: 287 |             gamma_flow = self.flow_gamma_spatial(flow) 288 |             beta_flow = self.flow_beta_spatial(flow) 289 |             _, _, hh, wh = beta_flow.shape 290 | 291 |             if gamma_flow.shape[0] == 1: # Check if batch size is 1 292 |                 gamma_flow = rearrange(gamma_flow, "b c h w -> b c (h w)") 293 |                 beta_flow = rearrange(beta_flow, "b c h w -> b c (h w)") 294 |                 gamma_flow = self.flow_gamma_temporal(gamma_flow) 295 |                 beta_flow = self.flow_beta_temporal(beta_flow) 296 |                 gamma_flow = rearrange(gamma_flow, "b c (h w) -> b c h w", h=hh, w=wh) 297 |                 beta_flow = rearrange(beta_flow, "b c (h w) -> b c h w", h=hh, w=wh) 298 |             else: 299 |                 gamma_flow = rearrange(gamma_flow, "(b f) c h w -> (b h w) c f", f=T) 300 |                 beta_flow = rearrange(beta_flow, "(b f) c h w -> (b h w) c f", f=T) 301 |                 gamma_flow = self.flow_gamma_temporal(gamma_flow) 302 |                 beta_flow = self.flow_beta_temporal(beta_flow) 303 |                 gamma_flow = rearrange(gamma_flow, "(b h w) c f -> (b f) c h w", h=hh, w=wh) 304 |                 beta_flow = rearrange(beta_flow, "(b h w) c f -> (b f) c h w", h=hh, w=wh) 305 | 306 |             h = h + self.flow_cond_norm(h) * gamma_flow + beta_flow 307 |         return h 308 | -------------------------------------------------------------------------------- /tora/traj_utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import cv2 3 | import torch 4 | 5 | # Note that the coordinates passed to the model must not exceed 256. 6 | # xy range 256 7 | 8 | def pdf2(sigma_matrix, grid): 9 |     """Calculate PDF of the bivariate Gaussian distribution. 10 |     Args: 11 |         sigma_matrix (ndarray): with the shape (2, 2) 12 |         grid (ndarray): generated by :func:`mesh_grid`, 13 |             with the shape (K, K, 2), K is the kernel size. 14 |     Returns: 15 |         kernel (ndarray): un-normalized kernel. 16 |     """ 17 |     inverse_sigma = np.linalg.inv(sigma_matrix) 18 |     kernel = np.exp(-0.5 * np.sum(np.dot(grid, inverse_sigma) * grid, 2)) 19 |     return kernel 20 | 21 | 22 | def mesh_grid(kernel_size): 23 |     """Generate the mesh grid, centering at zero.
24 |     Args: 25 |         kernel_size (int): 26 |     Returns: 27 |         xy (ndarray): with the shape (kernel_size, kernel_size, 2) 28 |         xx (ndarray): with the shape (kernel_size, kernel_size) 29 |         yy (ndarray): with the shape (kernel_size, kernel_size) 30 |     """ 31 |     ax = np.arange(-kernel_size // 2 + 1.0, kernel_size // 2 + 1.0) 32 |     xx, yy = np.meshgrid(ax, ax) 33 |     xy = np.hstack( 34 |         ( 35 |             xx.reshape((kernel_size * kernel_size, 1)), 36 |             yy.reshape(kernel_size * kernel_size, 1), 37 |         ) 38 |     ).reshape(kernel_size, kernel_size, 2) 39 |     return xy, xx, yy 40 | 41 | 42 | def sigma_matrix2(sig_x, sig_y, theta): 43 |     """Calculate the rotated sigma matrix (two dimensional matrix). 44 |     Args: 45 |         sig_x (float): 46 |         sig_y (float): 47 |         theta (float): Radian measurement. 48 |     Returns: 49 |         ndarray: Rotated sigma matrix. 50 |     """ 51 |     d_matrix = np.array([[sig_x**2, 0], [0, sig_y**2]]) 52 |     u_matrix = np.array([[np.cos(theta), -np.sin(theta)], [np.sin(theta), np.cos(theta)]]) 53 |     return np.dot(u_matrix, np.dot(d_matrix, u_matrix.T)) 54 | 55 | 56 | def bivariate_Gaussian(kernel_size, sig_x, sig_y, theta, grid=None, isotropic=True): 57 |     """Generate a bivariate isotropic or anisotropic Gaussian kernel. 58 |     In the isotropic mode, only `sig_x` is used. `sig_y` and `theta` are ignored. 59 |     Args: 60 |         kernel_size (int): 61 |         sig_x (float): 62 |         sig_y (float): 63 |         theta (float): Radian measurement. 64 |         grid (ndarray, optional): generated by :func:`mesh_grid`, 65 |             with the shape (K, K, 2), K is the kernel size. Default: None 66 |         isotropic (bool): 67 |     Returns: 68 |         kernel (ndarray): normalized kernel. 69 |     """ 70 |     if grid is None: 71 |         grid, _, _ = mesh_grid(kernel_size) 72 |     if isotropic: 73 |         sigma_matrix = np.array([[sig_x**2, 0], [0, sig_x**2]]) 74 |     else: 75 |         sigma_matrix = sigma_matrix2(sig_x, sig_y, theta) 76 |     kernel = pdf2(sigma_matrix, grid) 77 |     kernel = kernel / np.sum(kernel) 78 |     return kernel 79 | 80 | size = 99 81 | sigma = 10 82 | blur_kernel = bivariate_Gaussian(size, sigma, sigma, 0, grid=None, isotropic=True) 83 | blur_kernel = blur_kernel / blur_kernel[size // 2, size // 2] 84 | 85 | canvas_width, canvas_height = 256, 256 86 | 87 | def get_flow(points, optical_flow, video_len): 88 |     for i in range(video_len - 1): 89 |         p = points[i] 90 |         p1 = points[i + 1] 91 |         optical_flow[i + 1, p[1], p[0], 0] = p1[0] - p[0] 92 |         optical_flow[i + 1, p[1], p[0], 1] = p1[1] - p[1] 93 | 94 |     return optical_flow 95 | 96 | 97 | def process_points(points, frames=49): 98 |     default_points = [[128, 128]] * frames 99 | 100 |     if len(points) < 2: 101 |         return default_points 102 | 103 |     elif len(points) >= frames: 104 |         skip = len(points) // frames 105 |         return points[::skip][: frames - 1] + points[-1:] 106 |     else: 107 |         insert_num = frames - len(points) 108 |         insert_num_dict = {} 109 |         interval = len(points) - 1 110 |         n = insert_num // interval 111 |         m = insert_num % interval 112 |         for i in range(interval): 113 |             insert_num_dict[i] = n 114 |         for i in range(m): 115 |             insert_num_dict[i] += 1 116 | 117 |         res = [] 118 |         for i in range(interval): 119 |             insert_points = [] 120 |             x0, y0 = points[i] 121 |             x1, y1 = points[i + 1] 122 | 123 |             delta_x = x1 - x0 124 |             delta_y = y1 - y0 125 |             for j in range(insert_num_dict[i]): 126 |                 x = x0 + (j + 1) / (insert_num_dict[i] + 1) * delta_x 127 |                 y = y0 + (j + 1) / (insert_num_dict[i] + 1) * delta_y 128 |                 insert_points.append([int(x), int(y)]) 129 | 130 |             res += points[i : i + 1] + insert_points 131 |         res += points[-1:] 132 |         return res 133 | 134 | 135 | def
read_points_from_list(traj_list, video_len=16, reverse=False): 136 |     points = [] 137 |     for point in traj_list: 138 |         if isinstance(point, str): 139 |             x, y = point.strip().split(",") 140 |         else: 141 |             x, y = point[0], point[1] 142 |         points.append((int(x), int(y))) 143 |     if reverse: 144 |         points = points[::-1] 145 | 146 |     if len(points) > video_len: 147 |         skip = len(points) // video_len 148 |         points = points[::skip] 149 |         points = points[:video_len] 150 | 151 |     return points 152 | 153 | 154 | def read_points_from_file(file, video_len=16, reverse=False): 155 |     with open(file, "r") as f: 156 |         lines = f.readlines() 157 |     points = [] 158 |     for line in lines: 159 |         x, y = line.strip().split(",") 160 |         points.append((int(x), int(y))) 161 |     if reverse: 162 |         points = points[::-1] 163 | 164 |     if len(points) > video_len: 165 |         skip = len(points) // video_len 166 |         points = points[::skip] 167 |         points = points[:video_len] 168 | 169 |     return points 170 | 171 | 172 | def process_traj(trajs_list, num_frames, video_size, device="cpu"): 173 |     if trajs_list and trajs_list[0] and (not isinstance(trajs_list[0][0], (list, tuple))): 174 |         tmp = trajs_list 175 |         trajs_list = [tmp] 176 | 177 |     optical_flow = np.zeros((num_frames, video_size[0], video_size[1], 2), dtype=np.float32) 178 |     processed_points = [] 179 |     for traj_list in trajs_list: 180 |         points = read_points_from_list(traj_list, video_len=num_frames) 181 |         xy_range = 256 182 |         h, w = video_size 183 |         points = process_points(points, num_frames) 184 |         points = [[int(w * x / xy_range), int(h * y / xy_range)] for x, y in points] 185 |         optical_flow = get_flow(points, optical_flow, video_len=num_frames) 186 |         processed_points.append(points) 187 | 188 |     print(f"received {len(trajs_list)} trajectory(ies)") 189 | 190 |     for i in range(1, num_frames): 191 |         optical_flow[i] = cv2.filter2D(optical_flow[i], -1, blur_kernel) 192 | 193 |     optical_flow = torch.tensor(optical_flow).to(device) 194 | 195 |     return optical_flow, processed_points 196 | 197 | 198 | def add_provided_traj(traj_name): 199 |     global traj_list 200 |     traj_list = PROVIDED_TRAJS[traj_name] 201 |     traj_str = [f"{traj}" for traj in traj_list] 202 |     return ", ".join(traj_str) 203 | 204 | 205 | def scale_traj_list_to_256(traj_list, canvas_width, canvas_height): 206 |     scale_x = 256 / canvas_width 207 |     scale_y = 256 / canvas_height 208 |     scaled_traj_list = [[int(x * scale_x), int(y * scale_y)] for x, y in traj_list] 209 |     return scaled_traj_list -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | import importlib.metadata 2 | import torch 3 | import logging 4 | logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') 5 | log = logging.getLogger(__name__) 6 | 7 | def check_diffusers_version(): 8 |     try: 9 |         version = importlib.metadata.version('diffusers') 10 |         required_version = '0.31.0' 11 |         if version < required_version: 12 |             raise AssertionError(f"diffusers version {version} is installed, but version {required_version} or higher is required.") 13 |     except importlib.metadata.PackageNotFoundError: 14 |         raise AssertionError("diffusers is not installed.") 15 | 16 | def remove_specific_blocks(model, block_indices_to_remove): 17 |     import torch.nn as nn 18 |     transformer_blocks = model.transformer_blocks 19 |     new_blocks = [block for i, block in enumerate(transformer_blocks) if i not in block_indices_to_remove] 20 |     model.transformer_blocks =
nn.ModuleList(new_blocks) 21 | 22 | return model 23 | 24 | def print_memory(device): 25 | memory = torch.cuda.memory_allocated(device) / 1024**3 26 | max_memory = torch.cuda.max_memory_allocated(device) / 1024**3 27 | max_reserved = torch.cuda.max_memory_reserved(device) / 1024**3 28 | log.info(f"Allocated memory: {memory=:.3f} GB") 29 | log.info(f"Max allocated memory: {max_memory=:.3f} GB") 30 | log.info(f"Max reserved memory: {max_reserved=:.3f} GB") --------------------------------------------------------------------------------
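A minimal usage sketch (not one of the repository files above) showing how the trajectory helpers in tora/traj_utils.py might be driven directly to build the kind of dense optical-flow conditioning tensor used for Tora trajectory control. The import path, frame count, and flow-map size below are assumptions for illustration; the coordinates stay inside the 0-255 canvas range noted at the top of traj_utils.py.

```python
# Illustrative sketch only. Assumptions: the repo root is on sys.path so the
# module is importable as tora.traj_utils; 49 frames and a 480x720 flow map
# are arbitrary example values.
from tora.traj_utils import process_traj, scale_traj_list_to_256

# A trajectory drawn on a 512x512 canvas, rescaled into the 256x256 coordinate
# space the helpers expect (coordinates must not exceed 256).
raw_points = [[50, 50], [150, 120], [260, 200], [380, 260], [500, 330]]
points_256 = scale_traj_list_to_256(raw_points, canvas_width=512, canvas_height=512)

# process_traj interpolates the points to one per frame, writes the per-frame
# displacements into a [num_frames, H, W, 2] flow map, and blurs every frame
# with the module-level 99x99 Gaussian kernel.
optical_flow, processed_points = process_traj(
    [points_256],            # a list of trajectories; a single one here
    num_frames=49,
    video_size=(480, 720),   # (H, W) of the flow map, example values
    device="cpu",
)
print(optical_flow.shape)        # torch.Size([49, 480, 720, 2])
print(len(processed_points[0]))  # 49 interpolated points, one per frame
```

The heavy Gaussian blur is what spreads a handful of single-pixel displacements into a smooth, dense motion field, which is also why blur_kernel is rescaled so its center value is 1 rather than being left sum-normalized.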